In [None]:
from typing import Tuple, List
import pandas as pd
import numpy as np
import seaborn as sns
import xgboost as xgb

import matplotlib.pyplot as plt
import plotly.graph_objects as go 
import plotly.express as px

from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import normalize, LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor

from xgboost import XGBRegressor

In [None]:
# Upload the data files interactively when running in Google Colab.
# Outside Colab the `google.colab` package is not installed, so the upload step
# is skipped and the CSV files are read straight from local disk by the next cell.
try:
    from google.colab import files
except ImportError:
    files = None

uploaded = files.upload() if files is not None else {}


Data

In [None]:
# Training set: sensor coordinates/metadata plus one table per measured quantity.
df_sensors = pd.read_csv('nuclear-waste/Coordinates_Training.csv')
hum = pd.read_csv('nuclear-waste/Training_data_humidity.csv')
pre = pd.read_csv('nuclear-waste/Training_data_pressure.csv')
df_tem = pd.read_csv('nuclear-waste/Training_data_temperature.csv')

# Example submission file; its 'id' column is reused when the final CSV is built.
ex = pd.read_csv('nuclear-waste/example_of_submission.csv')

# Test set: coordinates, humidity and pressure (temperature is what gets predicted).
df_test = pd.read_csv('nuclear-waste/Coordinates_Test.csv')
humtest = pd.read_csv('nuclear-waste/Test_Time_humidity.csv')
pretest = pd.read_csv('nuclear-waste/Test_Time_pressure.csv')
               



In [None]:
# Visualise the position of the training sensors in space with plotly's scatter_3d.
# Hovering over a point shows the sensor ID.
fig = px.scatter_3d(
    df_sensors,
    x='Coor X [m]', y='Coor Y [m]', z='Coor Z [m]',
    hover_name='Sensor ID',
    width=800, height=600,
)

# Small markers keep the point cloud readable.
fig.update_traces(marker={'size': 3})

fig.show()


The sensors are arranged in a cylinder shape whose circular cross-section (disc) lies in the XZ plane.

In [None]:
# Preprocessing and function definitions

In [None]:
# Gather everything in a single long-format DataFrame: the temperature table is
# melted so that each row is one (time, sensor) reading, then the sensor metadata
# (index, material, coordinates, radius) is attached to every reading.
df_long = pd.melt(
    df_tem,
    id_vars='M.Time[d]',
    var_name='Sensor ID',
    value_name='Temperature',
)
df_merged = pd.merge(
    df_long,
    df_sensors[['Sensor ID', 'Index', 'Material', 'Coor X [m]', 'Coor Y [m]', 'Coor Z [m]', 'R [m]']],
    on='Sensor ID',
    how='left',
)

# 'Material' is kept as-is here; it is turned into a numeric feature later,
# after the train/validation split.

In [None]:
def eval(
    y_val : pd.DataFrame,
    y_pred : pd.DataFrame,
    model = None
):
    """
    Prints the Mean Squared Error (MSE) and the coefficient of determination (R²)
    of a set of predictions and, when the model exposes them, its feature importances.

    Note: this function shadows Python's built-in eval() inside the notebook.

    y_val : true target values of the validation data
    y_pred : predicted values for the same rows
    model : (optional) fitted model; its `feature_importances_` are printed when the
            attribute exists (tree-based models such as random forest or xgboost).
            Models without that attribute, or no model at all, are accepted.
    """
    mse = mean_squared_error(y_val, y_pred)
    r2 = r2_score(y_val, y_pred)

    print(f"MSE : {mse:.4f}")
    print(f"R² : {r2:.4f}")

    # Not every estimator has feature importances; skip the line instead of crashing.
    importances = getattr(model, "feature_importances_", None)
    if importances is not None:
        print(f"Features importances : {importances}")

In [None]:
# Replace with value

def replacewithvalue(
    df : pd.DataFrame,
    header : str,
    val = np.nan,
    min_val = -float('inf') ,
    max_val = float('inf') ,
) -> pd.DataFrame:
    """
    Replace the values of one column that lie strictly inside an interval with a unique value.

    Only the column `header` is modified; other columns are left untouched even if
    they contain the same values. The DataFrame is modified in place and also returned.

    df : DataFrame (modified in place)
    header : Header of the column to clean
    val : Value used to replace, NaN by default
    min_val : lower bound of the interval (exclusive)
    max_val : upper bound of the interval (exclusive)

    Returns the same DataFrame object that was passed in.
    """
    # Entries that cannot be read as numbers become NaN here and are therefore
    # never selected (comparisons with NaN are False); numeric-looking strings are parsed.
    numeric = pd.to_numeric(df[header], errors='coerce')
    in_interval = (numeric > min_val) & (numeric < max_val)

    # Series.mask upcasts the dtype if needed (e.g. NaN into an integer column).
    df[header] = df[header].mask(in_interval, val)
    return df



In [None]:
# Here we will replace NaNs with another value, for example the arithmetic average

def avgcolumn(
    df : pd.DataFrame,
    header : str,
    min_val = -float('inf') ,
    max_val = float('inf') ,
) -> float:
    """
    Returns the arithmetic average of a DataFrame column, restricted to an interval.

    Only values strictly between `min_val` and `max_val` take part in the average.
    NaNs and entries that cannot be read as numbers are skipped.

    df : The target DataFrame (not modified)
    header : Header of column
    min_val : lower bound of the interval (exclusive)
    max_val : upper bound of the interval (exclusive)

    Returns 0 when no value of the column falls inside the interval.
    """
    # Non-numeric entries become NaN and drop out of the comparison below.
    numeric = pd.to_numeric(df[header], errors='coerce')
    selected = numeric[(numeric > min_val) & (numeric < max_val)]

    if selected.empty:
        return 0
    return float(selected.mean())


In [None]:
# Make it so it removes sensors

def removesensor(
    df: pd.DataFrame,
    sensor: str
)-> pd.DataFrame:
    """
    Returns the rows of the DataFrame that do not belong to the given sensor.

    df : The target DataFrame; must have a 'Sensor ID' column (not modified)
    sensor : ID of the sensor whose rows are dropped

    The original index labels of the remaining rows are kept.
    """
    keep_rows = df['Sensor ID'].ne(sensor)
    return df.loc[keep_rows]

In [None]:
def Xy(
    fulldf : pd.DataFrame
):
    """
    Splits a prepared DataFrame into the feature table X and, when available, the target y.

    The features are time, coordinates, radius, humidity and pressure, plus the
    material column: 'Material' if present, otherwise 'Material_encoded'.

    Returns:
    - X, y : if a 'Temperature' column is present (y is a one-column DataFrame)
    - X : otherwise

    Raises ValueError when neither material column exists.
    """
    base_features = ['M.Time[d]', 'Coor X [m]', 'Coor Y [m]', 'Coor Z [m]', 'R [m]', 'Humidity', 'Pressure']

    # 'Material' takes precedence over 'Material_encoded' when both exist.
    if 'Material' in fulldf.columns:
        material_column = 'Material'
    elif 'Material_encoded' in fulldf.columns:
        material_column = 'Material_encoded'
    else:
        raise ValueError("Missing 'Material' or 'Material_encoded' column in DataFrame.")

    X = fulldf[base_features + [material_column]]

    if 'Temperature' not in fulldf.columns:
        return X
    return X, fulldf[['Temperature']]

In [None]:
# Let's apply these functions

# Average temperature over the readings strictly between 0 and 1000.
# It must be computed before the replacement below, while df_merged is still unchanged.
average = avgcolumn(df_merged,'Temperature', 0.0, 1000.0)
# Temperature values above 1000 are replaced with NaN (df_merged is modified in place).
replacewithvalue(df_merged,'Temperature', min_val = 1000.0)
# NOTE(review): fillna is applied to the whole frame, so a NaN in any other column
# (coordinates, material, ...) would also receive the average temperature — confirm
# that only 'Temperature' can contain NaNs at this point.
df_merged = df_merged.fillna(average)

In [None]:
# Adding humidity as a feature

# Same long format as the temperature table: one row per (time, sensor) reading.
df_hum = hum.melt(id_vars='M.Time[d]', var_name='Sensor ID', value_name='Humidity')
# Missing humidity readings are replaced by the column average.
averageh = avgcolumn(df_hum,'Humidity')
df_hum = df_hum.fillna(averageh)
# NOTE(review): the column is attached by row index, not by (time, sensor) key — this
# assumes `hum` has the same rows and the same sensor-column order as `df_tem`; confirm.
df_merged['Humidity'] = df_hum['Humidity']


In [None]:
# Adding pressure as a feature

# Same long format as the temperature table: one row per (time, sensor) reading.
df_pre = pre.melt(id_vars='M.Time[d]', var_name='Sensor ID', value_name='Pressure')
# Missing pressure readings are replaced by the column average.
averagep = avgcolumn(df_pre,'Pressure')
df_pre = df_pre.fillna(averagep)
# NOTE(review): the column is attached by row index, not by (time, sensor) key — this
# assumes `pre` has the same rows and the same sensor-column order as `df_tem`; confirm.
df_merged['Pressure'] = df_pre['Pressure']





In [None]:
# Remove N_442
# NOTE(review): the reason for excluding this sensor is not recorded in the notebook
# (presumably faulty readings) — confirm and document it.

df_merged = removesensor(df_merged,'N_442')


In [None]:
# Split the prepared data into features X and target y, then into training and validation subsets

X, y = Xy(df_merged)
# Shorter column names; the test features are renamed the same way further down.
X = X.rename(columns={
    'M.Time[d]': 'Time',
    'Coor X [m]': 'X',
    'Coor Y [m]': 'Y',
    'Coor Z [m]': 'Z',
    'R [m]': 'R'
})


# 17.5 % of the rows are held out for validation; the seed makes the split reproducible.
# NOTE(review): rows are split at random, so readings of one sensor can land in both
# subsets — the validation score may be optimistic for unseen sensors; confirm this is intended.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.175, random_state=10)

In [None]:
# Material encoder (target encoding): every material is replaced by the mean training
# temperature observed for that material.
# material_target_mean is learned on the training split only and re-used for the test data.

material_target_mean = (
    X_train.assign(temp=y_train['Temperature'])
    .groupby('Material')['temp']
    .mean()
)

# Fallback for materials that never appear in the training split.
# It has to be a scalar: y_train.mean() is a Series indexed by 'Temperature', and
# fillna with a Series aligns on index labels, so it would leave the NaNs in place.
global_temperature_mean = float(y_train['Temperature'].mean())

# assign() returns new frames, which avoids writing into the slices returned by
# train_test_split; the encoded column replaces 'Material' as the last column.
X_train = X_train.assign(
    Material_encoded=X_train['Material'].map(material_target_mean)
).drop('Material', axis=1)
X_val = X_val.assign(
    Material_encoded=X_val['Material'].map(material_target_mean).fillna(global_temperature_mean)
).drop('Material', axis=1)

In [None]:
# Inspect the training features after the material encoding.
X_train

In [None]:
# Inspect the validation features after the material encoding.
X_val

In [None]:
# We also tested linear regression (with L1 and L2 regularisation) and a KNN regressor --> the results were not good

In [None]:
# Random Forest Regressor

rf = RandomForestRegressor(n_estimators= 85, max_depth= 6, random_state= 13)
# The target is flattened to a 1-D array; `a` only holds the return value of fit()
# and is not used anywhere else in the notebook.
a = rf.fit(X_train, y_train.values.ravel())
# Predictions on the validation split, scored in the next cell.
y_predrf = rf.predict(X_val)


In [None]:
# Validation MSE / R² and feature importances of the random forest.
eval(y_val, y_predrf,rf)

In [None]:
# XGBoost

xgbm = XGBRegressor(n_estimators= 85, max_depth= 6, learning_rate= 0.09, random_state= 13)
# eval_set + verbose=True print the metric on the training and validation data after every boosting round.
xgbm.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=True)
# Predictions on the validation split, scored in the next cell.
y_predxgbm = xgbm.predict(X_val)


In [None]:
# Validation MSE / R² and feature importances of the XGBoost model.
eval(y_val, y_predxgbm, xgbm)
# Display the training features so the importances above can be matched to column names.
X_train

In [None]:
# Gradient Boosting Regression

# NOTE(review): unlike the two models above, no random_state is set here — results may
# vary slightly between runs; consider fixing a seed.
gbr = GradientBoostingRegressor(n_estimators=100, learning_rate=0.08)
# The target is flattened to a 1-D array.
gbr.fit(X_train, y_train.values.ravel())
# Predictions on the validation split, scored in the next cell.
y_predgbr = gbr.predict(X_val)


In [None]:
# Validation MSE / R² and feature importances of the gradient boosting model.
# The fitted model is passed as third argument, like for the other two models.
eval(y_val, y_predgbr, gbr)

In [None]:
# Comparisons

# Random Forest Regressor
#
#XGBoost
#
#Gradient Boosting Regression

In [None]:
# 5-fold cross-validated MSE of the random forest on the training split.
# The scorer returns the negated MSE, hence the sign flip when printing.
cv_scores = cross_val_score(rf, X_train, y_train.values.ravel(), cv=5, scoring='neg_mean_squared_error')
print("MSE CV RF :", -cv_scores.mean())

In [None]:
# 5-fold cross-validated MSE of the XGBoost model on the training split.
# The scorer returns the negated MSE, hence the sign flip when printing.
cv_scores = cross_val_score(xgbm, X_train, y_train.values.ravel(), cv=5, scoring='neg_mean_squared_error')
print("MSE CV XGB :", -cv_scores.mean())

In [None]:
# 5-fold cross-validated MSE of the gradient boosting model on the training split.
# The scorer returns the negated MSE, hence the sign flip when printing.
cv_scores = cross_val_score(gbr, X_train, y_train.values.ravel(), cv=5, scoring='neg_mean_squared_error')
print("MSE CV GBR :", -cv_scores.mean())

In [None]:
# Hyperparameters search

In [None]:
# Grid of hyperparameter values to try
param_grid = {
    'n_estimators': [75, 80, 85, 90],
    'max_depth': [6, 7, 8],
    'learning_rate': [0.08,0.085,0.09,0.095]
}

# Set up the cross-validated search.
# The estimator has to be a model instance: `xgb` is the xgboost module itself
# and cannot be cloned or fitted by GridSearchCV.
grid_search = GridSearchCV(
    estimator=XGBRegressor(random_state=13),
    param_grid=param_grid,
    cv=5,  # number of cross-validation folds
    scoring='neg_mean_squared_error',  # or 'r2', 'neg_mean_absolute_error', etc.
    verbose=1,
    n_jobs=-1  # use every available core
)

# Run the search (4 x 3 x 4 combinations, 5 folds each)
grid_search.fit(X_train, y_train)

# Show the best parameters found
print("Meilleurs hyperparamètres :", grid_search.best_params_)

# Use the best model (refitted on the whole training split) to predict
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val)

In [None]:
# Neural Network
# The network is trained on standardised features and target. The scalers are built
# in this cell so that it runs on a fresh kernel; they are fitted on the training
# split only and re-used for the validation split.
feature_scaler = StandardScaler()
target_scaler = StandardScaler()

X_train_s = feature_scaler.fit_transform(X_train)
X_val_s = feature_scaler.transform(X_val)
y_train_s = target_scaler.fit_transform(y_train)
y_val_s = target_scaler.transform(y_val)

# Initialize the neural network
nn = MLPRegressor(
    hidden_layer_sizes=(100, 50),  # you can experiment with these
    activation='relu',
    solver='adam',
    max_iter=300,
    random_state=1
)

# Fit the model (the scaled target is a 2-D array and is flattened to 1-D)
nn.fit(X_train_s, y_train_s.ravel())

# Predict (scaled)
y_pred_scaled = nn.predict(X_val_s)

# Inverse transform predictions and targets back to temperature units
y_pred = target_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1))
y_val_real = target_scaler.inverse_transform(y_val_s)

# Evaluate
mse = mean_squared_error(y_val_real, y_pred)
r2 = r2_score(y_val_real, y_pred)

print(f"MSE : {mse:.4f}")
print(f"R²  : {r2:.4f}")

In [None]:
# Final Test 

In [None]:
# Cross join: pair every test sensor with every measurement time of the training
# temperature table, using a constant helper column as the join key.
# (The helper column "key" stays on df_test afterwards.)
df_test["key"] = 1
df_times = df_tem[['M.Time[d]']].assign(key=1)
df_test_prepared = df_test.merge(df_times, on="key").drop(columns="key")

In [None]:
# Humidity of the test sensors in long format: one row per (time, sensor) reading.
df_humtest = humtest.melt(id_vars='M.Time[d]', var_name='Sensor ID', value_name='Humidity')
# NOTE(review): attached by row index — assumes the melted rows come in the same
# (sensor, time) order as df_test_prepared; confirm. Unlike the training data,
# missing values are not filled here.
df_test_prepared['Humidity'] = df_humtest['Humidity']

In [None]:
# Pressure of the test sensors in long format: one row per (time, sensor) reading.
df_pretest = pretest.melt(id_vars='M.Time[d]', var_name='Sensor ID', value_name='Pressure')
# NOTE(review): attached by row index — assumes the melted rows come in the same
# (sensor, time) order as df_test_prepared; confirm. Unlike the training data,
# missing values are not filled here.
df_test_prepared['Pressure'] = df_pretest['Pressure']

In [None]:
# Inspect the prepared test table (one row per test sensor and time).
df_test_prepared

In [None]:
# Material encoder: re-use the per-material mean temperature learned on the training split.
# Materials that were not seen during training fall back to the overall training mean.
# The fallback has to be a scalar: y_train.mean() is a Series indexed by 'Temperature',
# and fillna with a Series aligns on index labels, so it would leave the NaNs in place.
fallback_temperature = float(y_train['Temperature'].mean())
df_test_prepared['Material_encoded'] = (
    df_test_prepared['Material'].map(material_target_mean).fillna(fallback_temperature)
)
df_test_prepared = df_test_prepared.drop('Material', axis=1)

In [None]:
# Feature table of the test set. df_test_prepared has no 'Temperature' column,
# so Xy returns only X. Columns are renamed exactly like the training features.
X_test = Xy(df_test_prepared).rename(columns={
    'M.Time[d]': 'Time',
    'Coor X [m]': 'X',
    'Coor Y [m]': 'Y',
    'Coor Z [m]': 'Z',
    'R [m]': 'R',
})




In [None]:
# Inspect the test features that are fed to the model.
X_test

In [None]:
# To build the submission with another model, change the estimator on the next line.
y_pred_t = xgbm.predict(X_test)

# Predictions come out as one long vector ordered sensor by sensor (all times of the
# first test sensor, then all times of the second, ...). Reshape to one row per sensor
# and one column per measurement time; the number of times is taken from the
# temperature table instead of being hard-coded.
n_times = len(df_tem)
y_pred_t = y_pred_t.reshape(-1, n_times)

In [None]:
# Column names of the submission: the measurement times, converted to strings.
header = df_tem['M.Time[d]'].to_numpy()
header = header.astype(str)
# NOTE(review): a time written as '0' is renamed to 'id', yet an 'id' column is also
# inserted when the submission frame is built — if '0' ever occurs this would produce
# a duplicate column name; confirm the intent.
header[header == '0'] = 'id'
header = list(header)

In [None]:
# Row identifiers of the submission, taken from the example submission file.
# (The first line is a bare expression and has no effect, since it is not the last line of the cell.)
ex['id']
ids = ex['id'].to_numpy()

In [None]:
# Submission table: one row per test sensor, one column per measurement time ...
final = pd.DataFrame(y_pred_t, columns=header)
# ... with the identifiers of the example submission as first column.
# NOTE(review): assumes the rows of `ex` are in the same order as the test sensors; confirm.
final.insert(0, "id", ids)


In [None]:
# Write the submission file (without the DataFrame index).
# NOTE(review): the 'Results' directory must already exist — to_csv does not create it.
final.to_csv("Results/final99.csv", index=False)

In [None]:
# Learning curves: refit XGBoost with more boosting rounds and track MAE, MSE and R²
# on the training and validation data after every round.
# Note that this rebinds `xgbm` to the new 300-round model.
xgbm = XGBRegressor(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.09,
    objective='reg:squarederror',  # 'reg:absoluteerror' for a native MAE objective
    random_state=6,
)
xgbm.set_params(eval_metric="mae")
xgbm.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=True)

# Number of boosting rounds actually recorded during training
n_rounds = len(xgbm.evals_result()['validation_0']['mae'])

# Predictions using only the first k trees, for k = 1 .. n_rounds
train_preds = [xgbm.predict(X_train, iteration_range=(0, k)) for k in range(1, n_rounds + 1)]
val_preds = [xgbm.predict(X_val, iteration_range=(0, k)) for k in range(1, n_rounds + 1)]

# Metrics at every round
train_mae = [mean_absolute_error(y_train, p) for p in train_preds]
val_mae   = [mean_absolute_error(y_val, p) for p in val_preds]

train_mse = [mean_squared_error(y_train, p) for p in train_preds]
val_mse   = [mean_squared_error(y_val, p) for p in val_preds]

train_r2  = [r2_score(y_train, p) for p in train_preds]
val_r2    = [r2_score(y_val, p) for p in val_preds]

# Plot the curves: one panel per metric, stacked vertically
curves = [
    ('MAE', train_mae, val_mae),
    ('MSE', train_mse, val_mse),
    ('R²', train_r2, val_r2),
]

plt.figure(figsize=(15, 12))

for panel, (metric, train_curve, val_curve) in enumerate(curves, start=1):
    plt.subplot(3, 1, panel)
    plt.plot(train_curve, label=f'Train {metric}')
    plt.plot(val_curve, label=f'Val {metric}')
    if panel == len(curves):
        # Only the bottom panel carries the x-axis label
        plt.xlabel('Boosting Rounds')
    plt.ylabel(metric)
    plt.legend()
    plt.grid(True)

plt.tight_layout()
plt.show()
