In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import math

df = pd.read_csv('dataset2.csv')

# separation des features et de la variable cible
X = df.drop('charges', axis=1)
Y = df['charges']
print(f'''verif des dimensions X et Y
      X (dataset sans la variable cible): {X.shape}
      Y (la variable cible) : {Y.shape}''')


# division du dataset en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
print(f''' verif du split 80 20 
80% du dataset : X train -> {X_train.shape}, Y train -> {y_train.shape}
20% du dataset : X test -> {X_test.shape}, Y test -> {y_test.shape}''')

# préprocesseur pour les variables numériques
preprocessor_num = Pipeline(steps=[
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler())
])

# preprocessing avec transformation des categories et des nums
preprocessor = ColumnTransformer(
    transformers=[
        ('num', preprocessor_num, ['age', 'bmi', 'children']),
        ('cat', OneHotEncoder(), ['region', 'sex', 'smoker'])
    ]
)

pipeline_rf = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', RandomForestRegressor())])


#test de différents hyperparam d'alpha pour le rf
param_distributions = {
    'regressor__n_estimators': [100, 200, 300],  # nb d'arbres dans la forêt
    'regressor__max_depth': [None, 10, 20, 30],  # profondeur maximale de l'arbre
}

### grid search
grid_rf = GridSearchCV(pipeline_rf, param_distributions, cv=5)


# eviter data leakage -> entraîner le pipeline sur les données d'entraînement 
grid_rf.fit(X_train, y_train)
#puis predire y sur l'ensemble de test avec le meme pipeline
y_pred = grid_rf.predict(X_test)
# print(y_pred)
score = grid_rf.score(X_train, y_train)
print(score)

#test des les differents metriques sur modele rf
#comparaison du y prédit avec le y de test
mse = mean_squared_error(y_test, y_pred)

print(f'Mean Squared Error du modèle rf: {mse}')

r2 = r2_score(y_test, y_pred)

print(f'Coefficient of Determination R² du modèle rf: {r2}')

rmse = math.sqrt(mse)
print(f'Root Mean Squared Error (RMSE) du modèle rf: {rmse}')



verif des dimensions X et Y
      X (dataset sans la variable cible): (1337, 6)
      Y (la variable cible) : (1337,)
 verif du split 80 20 
80% du dataset : X train -> (1069, 6), Y train -> (1069,)
20% du dataset : X test -> (268, 6), Y test -> (268,)
0.9646794361279235
Mean Squared Error du modèle rf: 20763715.723134566
Coefficient of Determination R² du modèle rf: 0.8870039155905598
Root Mean Squared Error (RMSE) du modèle rf: 4556.722037071668


In [34]:
# récupérer le modèle avec les meilleurs hyperparamètres
best_model = grid_rf.best_estimator_
print(f"meilleur model : {best_model}")


meilleur model : Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('poly',
                                                                   PolynomialFeatures(include_bias=False)),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'bmi', 'children']),
                                                 ('cat', OneHotEncoder(),
                                                  ['region', 'sex',
                                                   'smoker'])])),
                ('regressor',
                 RandomForestRegressor(max_depth=10, n_estimators=300))])


In [35]:
new_data = pd.DataFrame({
    'age': [35, 40, 30, 60,30],  
    'sex': ['female', 'male', 'female', 'female','male'],
    'bmi': [22.5, 30.0, 25.0, 47,20],
    'children': [1, 2, 0, 5,5],
    'smoker': ['no', 'yes', 'no', 'yes','yes'],
    'region': ['southwest', 'northeast', 'southeast', 'southeast', 'southeast']
})

# faire des prédictions avec le modèle optimisé, possible de faire best_model.predict aussi 
new_y_ped = grid_rf.predict(new_data)

# ajouter les prédictions à votre DataFrame d'origine
new_data['predicted_charges'] = new_y_ped

# afficher le DataFrame avec les prédictions
print(new_data)

   age     sex   bmi  children smoker     region  predicted_charges
0   35  female  22.5         1     no  southwest        7166.865131
1   40    male  30.0         2    yes  northeast       27939.418261
2   30  female  25.0         0     no  southeast        4513.381523
3   60  female  47.0         5    yes  southeast       48695.408371
4   30    male  20.0         5    yes  southeast       17686.034683


In [36]:
RandomForestRegressor?

[0;31mInit signature:[0m
[0mRandomForestRegressor[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mn_estimators[0m[0;34m=[0m[0;36m100[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcriterion[0m[0;34m=[0m[0;34m'squared_error'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_depth[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_samples_split[0m[0;34m=[0m[0;36m2[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_samples_leaf[0m[0;34m=[0m[0;36m1[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_weight_fraction_leaf[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_features[0m[0;34m=[0m[0;36m1.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmax_leaf_nodes[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmin_impurity_decrease[0m[0;34m=[0m[0;36m0.0[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mbootstrap[0m[0;34m=[0m[0;32mTrue[0m[0;34m,