In [1]:
from typing import Tuple, List
import pandas as pd
import numpy as np
import seaborn as sns
import xgboost as xgb

import matplotlib.pyplot as plt
import plotly.graph_objects as go 
import plotly.express as px

from sklearn.model_selection import KFold, train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error
from sklearn.preprocessing import normalize, LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

from xgboost import XGBRegressor

In [2]:
# Load every CSV of the challenge into its own DataFrame

# Training data: sensor coordinates, then humidity, pressure and temperature tables
df_sensors, hum, pre, df_tem = (
    pd.read_csv(csv_path)
    for csv_path in (
        'nuclear-waste/Coordinates_Training.csv',
        'nuclear-waste/Training_data_humidity.csv',
        'nuclear-waste/Training_data_pressure.csv',
        'nuclear-waste/Training_data_temperature.csv',
    )
)

# Example of submission
ex = pd.read_csv('nuclear-waste/example_of_submission.csv')

# Test data: sensor coordinates, then humidity and pressure tables
df_test, humtest, pretest = (
    pd.read_csv(csv_path)
    for csv_path in (
        'nuclear-waste/Coordinates_Test.csv',
        'nuclear-waste/Test_Time_humidity.csv',
        'nuclear-waste/Test_Time_pressure.csv',
    )
)
               



In [None]:
# Visualize the sensor positions in 3D space with plotly's scatter_3d;
# hovering over a point shows its sensor id.
fig = px.scatter_3d(
    df_sensors,
    x='Coor X [m]',
    y='Coor Y [m]',
    z='Coor Z [m]',
    hover_name='Sensor ID',
    width=800,
    height=600,
).update_traces(marker={'size': 3})  # update_traces returns the same figure

fig.show()


In [3]:
# Defining functions

In [4]:
# Replace with value

def replacewithvalue(
    df : pd.DataFrame,
    header : str,
    val = np.nan,
    min_val = -float('inf') ,
    max_val = float('inf') ,
) -> pd.DataFrame:
    """
    Replace the values of one column that lie strictly inside an interval
    with a single value.

    Only the column named by `header` is modified; every other column is left
    untouched, even if it happens to contain the same numbers.
    The DataFrame is modified in place and also returned, so existing callers
    that ignore the return value keep working.

    df : DataFrame to modify
    header : Header of the column to scan
    val : Value used to replace, NaN by default
    min_val : exclusive lower bound of the interval
    max_val : exclusive upper bound of the interval

    Returns the same DataFrame object that was passed in.
    """
    column = df[header]
    # NaN compares False on both sides, so missing entries are never selected.
    in_interval = (column > min_val) & (column < max_val)
    # Column-scoped assignment: other columns cannot be affected.
    df.loc[in_interval, header] = val
    return df

In [5]:
# Here we will replace NaNs with another value, for example the arithmetic average (mean)


def avgcolumn(
    df : pd.DataFrame,
    header : str,
    min_val = -float('inf') ,
    max_val = float('inf') ,
) -> float:
    """
    Arithmetic mean of the values of a DataFrame column that fall strictly
    between min_val and max_val.

    NaN entries never satisfy the interval test and are therefore ignored.
    Returns 0 when no value qualifies.

    df : The target DataFrame
    header : header of column
    min_val : exclusive lower bound of the interval
    max_val : exclusive upper bound of the interval
    """
    kept = [
        value
        for value in df[header]
        if min_val < value < max_val and isinstance(value, (int, float))
    ]
    return sum(kept) / len(kept) if kept else 0

# P.S. From now on, the 'average' refers to the mean unless stated otherwise

In [6]:
# Make an average of a feature for a unique sensor

def avgsensor(
    df: pd.DataFrame, 
    sensor: str, 
    feature: str
) -> float:
    """
    Mean of one feature over the rows belonging to a single sensor.

    df : the target DataFrame (must have a 'Sensor ID' column)
    sensor : the target sensor
    feature : the target feature (column name)

    Returns 0.0 when the sensor has no rows in df.
    """
    sensor_rows = df.loc[df['Sensor ID'] == sensor]
    return 0.0 if sensor_rows.empty else sensor_rows[feature].mean()

In [7]:
# Make it so it removes sensors

def removesensor(
    df: pd.DataFrame, 
    sensor: str
)-> pd.DataFrame:
    """
    Return df without the rows of the given sensor.

    The input DataFrame itself is not modified; the original row index
    labels of the remaining rows are kept.

    df : the target DataFrame (must have a 'Sensor ID' column)
    sensor : the sensor to remove
    """
    keep_mask = df['Sensor ID'] != sensor
    return df[keep_mask]

In [8]:
def Xy(
    fulldf : pd.DataFrame
):
    """
    Split a DataFrame into the feature frame X and, when available, the target y.

    X holds the time, coordinate, radius, humidity and pressure columns plus
    the material column: 'Material' if present, otherwise 'Material_encoded'.

    Returns:
    - X, y : if the 'Temperature' column is present (y is a one-column DataFrame)
    - X : otherwise

    Raises ValueError when neither material column exists.
    """
    base_cols = ['M.Time[d]', 'Coor X [m]', 'Coor Y [m]', 'Coor Z [m]', 'R [m]', 'Humidity', 'Pressure']

    # 'Material' takes precedence over 'Material_encoded' when both exist
    material_col = next(
        (name for name in ('Material', 'Material_encoded') if name in fulldf.columns),
        None,
    )
    if material_col is None:
        raise ValueError("Missing 'Material' or 'Material_encoded' column in DataFrame.")

    X = fulldf[base_cols + [material_col]]

    if 'Temperature' not in fulldf.columns:
        return X
    return X, fulldf[['Temperature']]

In [9]:
def eval(
    y_val : pd.DataFrame,
    y_pred : pd.DataFrame,
    model
):
    """
    Print the Mean Squared Error (MSE) and the coefficient of determination (R²)
    of a prediction and, when the model exposes them, its feature importances.

    Note: this function shadows Python's built-in eval inside the notebook;
    the name is kept because later cells call it.

    y_val : validation targets
    y_pred : predictions for the same rows
    model : fitted model; `feature_importances_` is printed only if the
            attribute exists (tree ensembles such as random forest or xgboost
            have it, linear or KNN regressors do not)
    """
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)

    print(f"MSE : {mse:.4f}")
    print(f"R² : {r2:.4f}")

    # Not every regressor has feature importances; skip the line instead of
    # raising AttributeError after the metrics were already printed.
    importances = getattr(model, 'feature_importances_', None)
    if importances is not None:
        print(f"Features importances : {importances}")

In [10]:
def fill_nan_with_sensor_mean(
    row, 
    header: str, 
    sensor_means: dict
):
    """
    Return the value of one field of a row, substituting the per-sensor mean
    when that value is missing.

    row : row (mapping-like, must provide `header` and 'Sensor ID')
    header : which feature to target
    sensor_means : mapping from sensor id to mean value (anything with .get)

    If the value is missing and the sensor has no entry in sensor_means,
    NaN is returned.
    """
    current = row[header]
    if not pd.isna(current):
        return current
    return sensor_means.get(row['Sensor ID'], np.nan)

In [11]:
# Preprocessing
# 1. Make a DataFrame with a column for each feature + label
# 2. Add all the features
# 3. Treat outliers and missing values
# 4. Create X,y
# 5. Encode materials into numerical values

In [12]:
# Gather everything in a single long-format DataFrame to make the data easier to access.

# Wide temperature table (one column per sensor) -> one row per (time stamp, sensor)
df_long = pd.melt(df_tem, id_vars='M.Time[d]', var_name='Sensor ID', value_name='Temperature')

# Attach the static description of each sensor (index, material, coordinates, radius)
df_merged = pd.merge(
    df_long,
    df_sensors[['Sensor ID', 'Index', 'Material','Coor X [m]','Coor Y [m]','Coor Z [m]', 'R [m]']],
    how='left',
    on='Sensor ID',
)


In [13]:
# Let's apply these functions

In [14]:
# Humidity feature: wide humidity table -> one row per (time stamp, sensor)
df_hum = pd.melt(hum, id_vars='M.Time[d]', var_name='Sensor ID', value_name='Humidity')

# Add it as a column of the main DataFrame. Values are matched on the row index,
# not on (time stamp, sensor).
# NOTE(review): presumably relies on hum having the same sensor columns, in the
# same order, as the temperature table - confirm.
df_merged = df_merged.assign(Humidity=df_hum['Humidity'])


In [15]:
# Pressure feature: wide pressure table -> one row per (time stamp, sensor)
df_pre = pd.melt(pre, id_vars='M.Time[d]', var_name='Sensor ID', value_name='Pressure')

# Add it as a column of the main DataFrame. Values are matched on the row index,
# not on (time stamp, sensor).
# NOTE(review): presumably relies on pre having the same sensor columns, in the
# same order, as the temperature table - confirm.
df_merged = df_merged.assign(Pressure=df_pre['Pressure'])

# We didn't remove potential outliers from either humidity or pressure (apart from missing data) as we didn't find a reliable way to detect them.
# They didn't end up causing any major issues.

In [16]:
# Scrolling through the training data shows that N_442 has no temperature value at all
display(df_merged.loc[df_merged['Sensor ID'] == 'N_442'])

# Deletion: N_442 is dropped entirely for its missing temperatures,
# N_518 and N_693 for their missing pressure and humidity data
df_merged = df_merged[~df_merged['Sensor ID'].isin(['N_442', 'N_518', 'N_693'])]
#df_merged = removesensor(df_merged,'N_891')
#df_merged = removesensor(df_merged,'N_892')
#df_merged = removesensor(df_merged,'N_893')
#df_merged = removesensor(df_merged,'N_894')
#df_merged = removesensor(df_merged,'N_895')
#df_merged = removesensor(df_merged,'N_896')
#df_merged = removesensor(df_merged,'N_897')
#df_merged = removesensor(df_merged,'N_898')
#df_merged = removesensor(df_merged,'N_898')
#df_merged = removesensor(df_merged,'N_899')

# The explanation is in the next cell

Unnamed: 0,M.Time[d],Sensor ID,Temperature,Index,Material,Coor X [m],Coor Y [m],Coor Z [m],R [m],Humidity,Pressure
14112,1554,N_442,,442,VOID,-0.458902,6.606527,-0.309136,0.553314,,
14113,1556,N_442,,442,VOID,-0.458902,6.606527,-0.309136,0.553314,,
14114,1558,N_442,,442,VOID,-0.458902,6.606527,-0.309136,0.553314,,
14115,1560,N_442,,442,VOID,-0.458902,6.606527,-0.309136,0.553314,,
14116,1563,N_442,,442,VOID,-0.458902,6.606527,-0.309136,0.553314,,
14117,1567,N_442,,442,VOID,-0.458902,6.606527,-0.309136,0.553314,,
14118,1572,N_442,,442,VOID,-0.458902,6.606527,-0.309136,0.553314,,
14119,1578,N_442,,442,VOID,-0.458902,6.606527,-0.309136,0.553314,,
14120,1585,N_442,,442,VOID,-0.458902,6.606527,-0.309136,0.553314,,
14121,1595,N_442,,442,VOID,-0.458902,6.606527,-0.309136,0.553314,,


In [17]:
# The temperature data shows a huge gap; print the values on either side of it:
# the largest reading below 1000 and the smallest reading above 1000
print(df_merged.loc[df_merged['Temperature'] < 1000, 'Temperature'].max())
print(df_merged.loc[df_merged['Temperature'] > 1000, 'Temperature'].min())

# Readings above the gap are treated as outliers and turned into NaNs
# (replacewithvalue modifies df_merged in place)
replacewithvalue(df_merged, 'Temperature', min_val=1000.0)

# Mean temperature of each sensor, computed once the outliers are gone
sensor_means = df_merged.groupby('Sensor ID')['Temperature'].mean()

# Every missing temperature is replaced by the mean of its own sensor
df_merged['Temperature'] = df_merged.apply(
    fill_nan_with_sensor_mean,
    axis=1,
    args=('Temperature', sensor_means),
)

# Since an average isn't a very realistic value for missing data, we want to minimize its presence in our dataset, hence the removal of N_442

139.9373009
2518.947148359651


In [18]:
# Extract the feature frame and the temperature target from the merged training data
X, y = Xy(df_merged)

# Shorter column names for simplicity and clarity
X = X.rename(
    mapper={
        'M.Time[d]': 'Time',
        'Coor X [m]': 'X',
        'Coor Y [m]': 'Y',
        'Coor Z [m]': 'Z',
        'R [m]': 'R'
    },
    axis='columns',
)

# Hold out 25% of the rows as a validation set; the seed is fixed so the split is repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=7,
    test_size=0.25,
)

In [19]:
# Material encoder

# Label encoder, numeralize the material to make it a feature
# This is the first encoder we used, we ended up changing it

#le = LabelEncoder()
#df_merged['Material_encoded'] = le.fit_transform(df_merged['Material'])

# We opted for a target encoding, also called mean encoding:
# each material is replaced by the mean of the target values (temperature) of that material.
# The mapping is computed from the training split only.

# Scalar fallback for materials that do not occur in the training split.
# y_train is a one-column DataFrame, so y_train.mean() would be a Series;
# Series.fillna aligns a Series argument on the index and would fill nothing.
global_temp_mean = y_train['Temperature'].mean()

# material_target_mean will be re-used for test data
material_target_mean = (
    X_train.assign(temp=y_train['Temperature'])
    .groupby('Material')['temp']
    .mean()
)

# assign/drop build new frames instead of writing into the slices returned by train_test_split
X_train = X_train.assign(
    Material_encoded=X_train['Material'].map(material_target_mean)
).drop(columns='Material')
X_val = X_val.assign(
    Material_encoded=X_val['Material'].map(material_target_mean).fillna(global_temp_mean)
).drop(columns='Material')

In [None]:
# Display the training feature frame for inspection
X_train

In [None]:
# Display the validation feature frame for inspection
X_val

In [None]:
# We tested Linear regression L1, linear regression L2 and KNN Regressor --> Not good

In [None]:
# Random Forest Regressor

rf = RandomForestRegressor(n_estimators= 85, max_depth= 6, random_state= 13)

# The target is a one-column DataFrame; ravel() hands scikit-learn the 1-D array it expects.
# fit() returns the estimator itself, so its result does not need to be stored.
rf.fit(X_train, y_train.values.ravel())

# Predictions on the validation features
y_predrf = rf.predict(X_val)


In [None]:
# Print MSE, R² and feature importances of the random forest on the validation set
eval(y_val, y_predrf, rf)

In [None]:
# XGBoost

# Model with hand-picked hyperparameters
xgbm = XGBRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=6,
    min_child_weight=1.25,
    reg_alpha=0.5,
    reg_lambda=1.5,
    random_state=13,
)

# Train on the training split; the RMSE on both the training and the validation
# split is logged at every boosting round
xgbm.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    verbose=True,
)

# Predict from the features of the validation data
y_predxgbm = xgbm.predict(X_val)


[0]	validation_0-rmse:14.17753	validation_1-rmse:13.93508
[1]	validation_0-rmse:13.51471	validation_1-rmse:13.28103
[2]	validation_0-rmse:12.88740	validation_1-rmse:12.66268
[3]	validation_0-rmse:12.28980	validation_1-rmse:12.07356
[4]	validation_0-rmse:11.72322	validation_1-rmse:11.51812
[5]	validation_0-rmse:11.18481	validation_1-rmse:10.98482
[6]	validation_0-rmse:10.67340	validation_1-rmse:10.48250
[7]	validation_0-rmse:10.18644	validation_1-rmse:10.00465
[8]	validation_0-rmse:9.72523	validation_1-rmse:9.55161
[9]	validation_0-rmse:9.28626	validation_1-rmse:9.12254
[10]	validation_0-rmse:8.87036	validation_1-rmse:8.71487
[11]	validation_0-rmse:8.47616	validation_1-rmse:8.33063
[12]	validation_0-rmse:8.09860	validation_1-rmse:7.96326
[13]	validation_0-rmse:7.74121	validation_1-rmse:7.61312
[14]	validation_0-rmse:7.39980	validation_1-rmse:7.28046
[15]	validation_0-rmse:7.07665	validation_1-rmse:6.96701
[16]	validation_0-rmse:6.76924	validation_1-rmse:6.66768
[17]	validation_0-rmse:6.

In [None]:
# We then compare the prediction with the actual data that we have (y_val)
eval(y_val, y_predxgbm, xgbm)

In [None]:
# Learning curves: RMSE recorded at each boosting round during xgbm.fit

results = xgbm.evals_result()
# 'validation_0' / 'validation_1' are the first / second entry of the eval_set given to fit
train_rmse = results['validation_0']['rmse']
val_rmse = results['validation_1']['rmse']
iterations = list(range(len(train_rmse)))

# Draw the plot
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(iterations, train_rmse, label='Training RMSE', marker='o', linewidth=1)
ax.plot(iterations, val_rmse, label='Validation RMSE', marker='o', linewidth=1)

ax.set_title('RMSE per iteration (XGBoost)')
ax.set_xlabel('Iteration')
ax.set_ylabel('RMSE')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Display the training feature frame for inspection
X_train

In [None]:
# K-Folds cross-validation

kf = KFold(n_splits=5, shuffle=True, random_state=7)
scores = []

# Everything created inside the loop uses fold-local names on purpose.
# Re-using X_train, X_val, y_train, y_val or xgbm here would silently replace
# the hold-out split and the fitted model that the following cells rely on.
for train_idx, val_idx in kf.split(X):
    X_fold_train, X_fold_val = X.iloc[train_idx], X.iloc[val_idx]
    y_fold_train = y.iloc[train_idx]['Temperature']
    y_fold_val = y.iloc[val_idx]['Temperature']

    # Target encoding computed from the training part of the fold only
    fold_material_mean = (
        X_fold_train.assign(temp=y_fold_train).groupby('Material')['temp'].mean()
    )
    # Scalar fallback for materials missing from the training part of the fold
    # (a Series passed to fillna would be aligned on the index and fill nothing)
    fold_temp_mean = y_fold_train.mean()

    X_fold_train = X_fold_train.assign(
        Material_encoded=X_fold_train['Material'].map(fold_material_mean)
    ).drop(columns='Material')
    X_fold_val = X_fold_val.assign(
        Material_encoded=X_fold_val['Material'].map(fold_material_mean).fillna(fold_temp_mean)
    ).drop(columns='Material')

    # A fresh model for every fold
    cv_model = XGBRegressor(
        n_estimators=85,
        max_depth=6,
        learning_rate=0.09,
        random_state=13
    )

    # Train the model and score it on the held-out part of the fold
    cv_model.fit(X_fold_train, y_fold_train, eval_set=[(X_fold_val, y_fold_val)], verbose=True)
    fold_pred = cv_model.predict(X_fold_val)
    scores.append(mean_squared_error(y_fold_val, fold_pred))

# Per-fold RMSE is the square root of the per-fold MSE
rmse_scores = np.sqrt(scores)

print("CV RMSE mean:", rmse_scores.mean())
print("CV RMSE std:", rmse_scores.std())

print("CV MSE mean:", np.mean(scores))
print("CV MSE std:", np.std(scores))




In [None]:
# Gradient Boosting Regression

# random_state is fixed, as for the other models, so that re-running the cell gives the same model
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.08, random_state=13)

# The target is a one-column DataFrame; ravel() hands scikit-learn the 1-D array it expects
gbr.fit(X_train, y_train.values.ravel())

# Predictions on the validation features
y_predgbr = gbr.predict(X_val)


In [None]:
# Print MSE, R² and feature importances of the gradient boosting model on the validation set
eval(y_val, y_predgbr, gbr)

In [None]:
# Final Test 

In [None]:
#le2 = LabelEncoder()
#df_test['Material_encoded'] = le2.fit_transform(df_test['Material'])

# Time stamps of the temperature table, kept in a separate one-column DataFrame
df_times = df_tem[['M.Time[d]']].copy()

# Cartesian product: every test sensor is paired with every time stamp, which
# replicates the structure of df_merged (without the temperatures).
# how='cross' keeps the rows grouped by sensor, in the order of df_test, and
# needs no temporary key column, so df_test itself is left unmodified.
df_test_prepared = pd.merge(df_test, df_times, how='cross')

In [None]:
# Humidity feature: wide test humidity table -> one row per (time stamp, sensor)
df_humtest = pd.melt(humtest, id_vars='M.Time[d]', var_name='Sensor ID', value_name='Humidity')

# Add it as a column of the main test DataFrame. Values are matched on the row
# index, not on (time stamp, sensor).
# NOTE(review): presumably relies on humtest listing the sensors in the same
# order as df_test, with the same time stamps - confirm.
df_test_prepared = df_test_prepared.assign(Humidity=df_humtest['Humidity'])

In [None]:
# Pressure feature: wide test pressure table -> one row per (time stamp, sensor)
df_pretest = pd.melt(pretest, id_vars='M.Time[d]', var_name='Sensor ID', value_name='Pressure')

# Add it as a column of the main test DataFrame. Values are matched on the row
# index, not on (time stamp, sensor).
# NOTE(review): presumably relies on pretest listing the sensors in the same
# order as df_test, with the same time stamps - confirm.
df_test_prepared = df_test_prepared.assign(Pressure=df_pretest['Pressure'])

In [None]:
# Material encoder: re-use the material -> mean temperature mapping computed on the training data

# Scalar fallback for materials the mapping does not contain.
# y_train is a one-column DataFrame, so y_train.mean() would be a Series;
# Series.fillna aligns a Series argument on the index and would fill nothing.
fallback_temp = y_train['Temperature'].mean()

df_test_prepared = df_test_prepared.assign(
    Material_encoded=df_test_prepared['Material'].map(material_target_mean).fillna(fallback_temp)
).drop(columns='Material')

In [None]:
# Feature frame of the test set (only the columns the models use).
# There is no 'Temperature' column here, so Xy returns X alone.
X_test = Xy(df_test_prepared)

# Same short feature names as for the training features
X_test = X_test.rename(
    mapper={
        'M.Time[d]': 'Time',  
        'Coor X [m]': 'X',
        'Coor Y [m]': 'Y',
        'Coor Z [m]': 'Z',
        'R [m]': 'R'
    },
    axis='columns',
)




In [None]:
# Predicting the temperatures with the predict function

# MODIFY MODEL 
         # HERE 
        #   v
y_pred_t = xgbm.predict(X_test)

# Reshape the flat predictions into one row per test sensor and one column per
# time stamp. The number of columns is taken from the temperature table, whose
# time stamps become the column headers of the submission, instead of being
# hard-coded; -1 lets NumPy derive the number of sensors and raises if the
# prediction count is not a multiple of the number of time stamps.
n_times = len(df_tem['M.Time[d]'])
y_pred_t = y_pred_t.reshape(-1, n_times)

In [None]:
# Column names of the submission: the 'M.Time[d]' values converted to strings
# (NumPy does the conversion), collected in a plain list
header = list(df_tem['M.Time[d]'].to_numpy().astype(str))

In [None]:
# Take the sensor ids from the 'id' column of the submission example, as a NumPy array
ids = ex['id'].to_numpy()

In [None]:
# Final DataFrame: one row per sensor, one column per time stamp
final = pd.DataFrame(data=y_pred_t, columns=header)

# The sensor ids go in the first column
final.insert(loc=0, column="id", value=ids)

In [None]:
# Write the final DataFrame to a .csv file, without the row index.
# NOTE(review): presumably the "Results" directory must already exist - confirm.
final.to_csv("Results/final72.csv", index=False)

# DONE

In [None]:
# Line-by-line comparison of two result files (the printed messages are in French:
# "the files are identical" / "the files are different")
with open("Results/final48.csv", "r", encoding="utf-8") as f1:
    with open("Results/val.csv", "r", encoding="utf-8") as f2:
        lignes1, lignes2 = f1.readlines(), f2.readlines()

print("Les fichiers sont identiques." if lignes1 == lignes2 else "Les fichiers sont différents.")


In [None]:

# Load the two CSV files
df1 = pd.read_csv("Results/final48.csv")
df2 = pd.read_csv("Results/val.csv")

# Sort the columns by name, then the rows by value, so that the comparison
# does not depend on column or row order
df1_sorted = (
    df1.sort_index(axis=1)
    .sort_values(by=list(df1.columns))
    .reset_index(drop=True)
)
df2_sorted = (
    df2.sort_index(axis=1)
    .sort_values(by=list(df2.columns))
    .reset_index(drop=True)
)

# Compare (messages are in French: "identical" / "different")
identiques = df1_sorted.equals(df2_sorted)
if identiques:
    print("Les fichiers sont identiques.")
else:
    print("Les fichiers sont différents.")
