In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split


# Step 1 - load the raw dataset, separate features from the target and write
# an 80/20 train/test split to ../data/processed_data/.
df = pd.read_csv("../data/raw_data/raw.csv")

# Quick look at the data. DataFrame.info() prints its report itself and
# returns None, so it must not be wrapped in print() (that would emit an
# extra "None" line after the summary).
print(df.head(5))
df.info()

# Target column to predict.
y = df['silica_concentrate']
# Drop the date and the target from the features: the date is not a parameter
# that can be treated like the others, should it ever need to be used.
X = df.drop(columns=['date', 'silica_concentrate'])

# Split features and target into train and test sets; test_size=0.2 holds out
# 20% of the rows and the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

# Save the four splits in the data/processed_data folder (row index not written).
X_train.to_csv('../data/processed_data/X_train.csv', index=False)
X_test.to_csv('../data/processed_data/X_test.csv', index=False)
y_train.to_csv('../data/processed_data/y_train.csv', index=False)
y_test.to_csv('../data/processed_data/y_test.csv', index=False)


                  date  ave_flot_air_flow  ave_flot_level  iron_feed  \
0  2017-04-24 00:00:00         300.263166      383.982443      55.17   
1  2017-04-24 01:00:00         299.782402      386.049069      55.17   
2  2017-04-24 02:00:00         299.750052      385.250935      55.17   
3  2017-04-24 03:00:00         299.997522      389.635519      55.17   
4  2017-04-24 04:00:00         300.005220      387.810807      55.17   

   starch_flow  amina_flow  ore_pulp_flow  ore_pulp_pH  ore_pulp_density  \
0  1979.589150  599.676489     400.017222     9.774028          1.753206   
1  1758.466329  600.043100     400.484528     9.539246          1.754190   
2  2379.752428  599.948406     400.325617     9.434227          1.756873   
3  2287.130046  599.580383     399.801506     9.725607          1.727125   
4  2291.789167  599.871217     399.567333     9.845198          1.633063   

   silica_concentrate  
0            4.360000  
1            3.290000  
2            4.900000  
3            4

In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Step 2 - standardise the feature splits and save the scaled versions.
X_train = pd.read_csv('../data/processed_data/X_train.csv')
X_test = pd.read_csv('../data/processed_data/X_test.csv')

scaler = StandardScaler()

# The scaler is fitted on the training data only; the test data is merely
# transformed with those fitted statistics, to avoid data leakage.
# The scaler returns plain arrays, so each result is wrapped back into a
# DataFrame carrying the original column names.
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
)

# Persist both scaled splits next to the unscaled ones (row index not written).
X_train_scaled.to_csv('../data/processed_data/X_train_scaled.csv', index=False)
X_test_scaled.to_csv('../data/processed_data/X_test_scaled.csv', index=False)




In [22]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
import pickle


# Step 3 - grid-search the Ridge regularisation strength and save the winner.

# Load the scaled training features and the training target.
X_train = pd.read_csv('../data/processed_data/X_train_scaled.csv')
# The target CSV loads as a single-column frame; flatten it to 1-D so the
# estimator does not receive a (n, 1) shaped target.
y_train = pd.read_csv('../data/processed_data/y_train.csv').to_numpy().ravel()

# Candidate alpha values to explore.
# (An earlier grid, [0.01, 0.1, 1, 10, 100], was replaced by this one.)
param_grid = dict(
    alpha=[5, 8, 10, 12, 14, 15, 15.5, 16, 16.5, 16.75, 17, 20, 25, 50, 75],
)
model = Ridge()

# 5-fold cross-validated search, scored with R2.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Report the best parameters found.
print("Meilleurs paramètres trouvés :", grid_search.best_params_)

# Persist the best parameters so the training step can reload them.
with open('../models/best_param.pkl', 'wb') as params_file:
    pickle.dump(grid_search.best_params_, params_file)

Meilleurs paramètres trouvés : {'alpha': 16.5}


In [24]:
import pandas as pd
from sklearn.linear_model import Ridge
import pickle

# Step 4 - fit a Ridge model with the saved best parameters and pickle it.

# Scaled training features and the training target, flattened to 1-D so the
# estimator does not receive a (n, 1) shaped target.
X_train_scaled = pd.read_csv('../data/processed_data/X_train_scaled.csv')
y_train = pd.read_csv('../data/processed_data/y_train.csv').to_numpy().ravel()

# Hyper-parameters stored by the grid-search step.
with open('../models/best_param.pkl', 'rb') as params_file:
    best_params = pickle.load(params_file)

# The loaded dict is expanded into keyword arguments of the estimator.
model = Ridge(**best_params)
model.fit(X_train_scaled, y_train)

# Save the fitted model for the evaluation step.
with open('../models/best_model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)


In [None]:
import json
import pickle  # was missing: this cell calls pickle.load and must not rely on an earlier cell's import

import pandas as pd
from sklearn.linear_model import Ridge  # not referenced directly in this cell; existing import kept as-is
from sklearn.metrics import mean_squared_error, r2_score


# Step 5 - evaluate the saved model on the test split and store the scores.

# Reload the fitted model written by the training step ('rb': binary read).
with open('../models/best_model.pkl', 'rb') as f:
    model = pickle.load(f)

# Scaled test features and the test target, flattened to 1-D to match the
# shape of the predictions.
X_test_scaled = pd.read_csv('../data/processed_data/X_test_scaled.csv')
y_test = pd.read_csv('../data/processed_data/y_test.csv').values.ravel()
y_pred = model.predict(X_test_scaled)

# Compute and display the evaluation metrics.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MSE: {mse}")
print(f"R2 : {r2}")

# Build a dict holding the scores.
scores = {
    "mse": mse,
    "r2": r2
}

# Save the scores in JSON format.
with open('../metrics/scores.json', 'w') as f:
    json.dump(scores, f, indent=4)



UnsupportedOperation: read