In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
import numpy as np

In [25]:
# Read the salaries dataset from its CSV file into a DataFrame
train_data = pd.read_csv(
    filepath_or_buffer='data/ds_salaries.csv',
)

In [26]:
# Expects 'train_data' to be the DataFrame loaded earlier in the notebook.
# Carve out a fixed 1000-row sample that is kept aside for the MLOps stage;
# the remaining rows are the ones the model is built from.
mlops_data = train_data.sample(n=1000, random_state=42)
train_data_reduced = train_data.drop(mlops_data.index)

# Feature matrix: everything except the target ('salary_in_usd') and the two
# other salary columns that are dropped alongside it.
features = train_data_reduced.drop(columns=['salary_in_usd', 'salary', 'salary_currency'])

# Impute missing numeric values with the column mean.
# include='number' selects every numeric dtype (e.g. int32/float32 as well),
# not just int64/float64, so no numeric column is silently left un-imputed.
numeric_features = features.select_dtypes(include='number')
means = numeric_features.mean()
# Reassign instead of using inplace=True so that re-running the cell always
# starts from the freshly built frame and never mutates shared state.
features = features.fillna(means)

# One-hot encode the categorical columns
features = pd.get_dummies(features)

X = features
y = train_data_reduced['salary_in_usd']


In [27]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [28]:
# Base regressor trained with the squared-error objective
xgb_model = XGBRegressor(objective='reg:squarederror')

# Hyper-parameter values to try; every combination is evaluated by the search
parameters = dict(
    max_depth=[3, 5],
    learning_rate=[0.1, 0.2],
    n_estimators=[100, 200, 250],
    subsample=[0.8, 0.9],
    colsample_bytree=[0.8, 0.9, 1.0],
)

# Exhaustive search with 3-fold cross-validation, scored by negated MSE
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=parameters,
    cv=3,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train, y_train)


162 fits failed out of a total of 648.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
162 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\yzi\AppData\Roaming\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\yzi\AppData\Roaming\Python\Python39\lib\site-packages\xgboost\core.py", line 730, in inner_f
    return func(**kwargs)
  File "c:\Users\yzi\AppData\Roaming\Python\Python39\lib\site-packages\xgboost\sklearn.py", line 1090, in fit
    self._Booster = train(
  File "c:\Users\yzi\AppData\Roaming\Python\Python39\lib\site-packages\xgboost\core.py", line 730, in inner_f
    return

In [29]:
# Evaluate the best estimator found by the grid search on the held-out test set
best_xgb = grid_search.best_estimator_
y_pred = best_xgb.predict(X_test)
rmse = np.sqrt(
    mean_squared_error(y_true=y_test, y_pred=y_pred)
)
print("Optimized RMSE: ", rmse)

Optimized RMSE:  44349.249355504406


In [30]:
# Show the hyper-parameter combination selected by the grid search
print(
    "Meilleurs paramètres : ",
    grid_search.best_params_,
)

Meilleurs paramètres :  {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.9}


In [33]:
import mlflow
from mlflow import pyfunc
import mlflow.sklearn
from sklearn.metrics import mean_squared_error
import numpy as np

import os  # needed only for the tracking-URI override just below

# Location where MLflow records runs. The MLFLOW_TRACKING_URI environment
# variable takes precedence when it is set, so the notebook can run on another
# machine; otherwise the original local 'file://' store is used unchanged.
tracking_uri = (
    os.environ.get("MLFLOW_TRACKING_URI")
    or "file:///C:/Users/yzi/Desktop/Travaux/MLops/Logs"
)
mlflow.set_tracking_uri(tracking_uri)

# Get-or-create the experiment with a single lookup (the result is reused
# instead of querying the store a second time).
experiment_name = "XGBoost Regression Model"
experiment = mlflow.get_experiment_by_name(experiment_name)
if experiment is None:
    # The experiment does not exist yet: create it
    experiment_id = mlflow.create_experiment(experiment_name)
else:
    # Reuse the ID of the existing experiment
    experiment_id = experiment.experiment_id

# Start a run attached to that experiment
with mlflow.start_run(experiment_id=experiment_id):
    # Record the hyper-parameters selected by the grid search
    mlflow.log_params(grid_search.best_params_)

    # Compute the test-set RMSE and record it as a metric
    y_pred = grid_search.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mlflow.log_metric("rmse", rmse)

    # Store the fitted best estimator as a run artifact
    mlflow.sklearn.log_model(grid_search.best_estimator_, "xgboost_model")
