In [1]:
import math
import pickle

import joblib
import numpy as np
import pandas as pd
import streamlit as st
from feature_engine.discretisation import ArbitraryDiscretiser
from joblib import dump
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import PolynomialFeatures, FunctionTransformer




# Fraction of the rows assigned to the training set; the remainder forms the test set.
# Kept in one place so the split and the message that reports it cannot drift apart.
TRAIN_SIZE = 0.85

df = pd.read_csv('dataset2.csv')

print(df.head(), df.shape)
# Separate the features from the target variable ('charges').
X = df.drop('charges', axis=1)
y = df[['charges']]
print(f'''verif des dimensions X et Y
      X (dataset sans la variable cible): {X.shape}
      Y (la variable cible) : {y.shape}''')


# Split the dataset into training and test sets. The split is stratified on the
# 'smoker' column and seeded so that it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, train_size=TRAIN_SIZE, random_state=42, stratify=X['smoker']
)

# Report the percentages actually used (the previous message hard-coded "80 20"
# although the split was 85/15, and leaked a stray "# " into the printed text).
train_pct = round(TRAIN_SIZE * 100)
test_pct = 100 - train_pct
print(f''' verif du split {train_pct} {test_pct}
{train_pct}% du dataset : X train -> {X_train.shape}, Y train -> {y_train.shape}
{test_pct}% du dataset : X test -> {X_test.shape}, Y test -> {y_test.shape}''')


ModuleNotFoundError: No module named 'feature_engine'

In [None]:
# préprocesseur pour les variables numériques
# def log_transform(x):
#     return np.log(x + 1)
# log_transformer = FunctionTransformer(log_transform)
def custom_bmi_discretizer(df):
    """Replace the 'bmi' column of a frame with its bin assignment.

    The bins are delimited by the edges 0, 30, 100 and +inf and are applied with
    feature_engine's ``ArbitraryDiscretiser``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'bmi' column. It is NOT modified: the work is done on
        a copy, so the caller's data (or the slice handed over by a
        ``ColumnTransformer``) keeps its original values.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which 'bmi' holds the discretised values; every other
        column is passed through unchanged.
    """
    custom_bins = [0, 30, 100, float('inf')]
    # return_object=True: presumably makes the binned column object-typed
    # (categorical-like) rather than numeric -- confirm against feature_engine docs.
    bmi_discretizer = ArbitraryDiscretiser(binning_dict={'bmi': custom_bins}, return_object=True)
    # Work on a copy: assigning into the argument would silently alter the
    # caller's frame and can raise SettingWithCopyWarning when it is a slice.
    discretized = df.copy()
    discretized[['bmi']] = bmi_discretizer.fit_transform(discretized[['bmi']])
    return discretized

# Wrap the BMI binning function so it can be used as a scikit-learn pipeline step.
bmi_discretizer_transformer = FunctionTransformer(func=custom_bmi_discretizer)

# Numeric branch: its only step is the BMI discretisation.
numeric_steps = [('bmi_discretize', bmi_discretizer_transformer)]
preprocessor_num = Pipeline(numeric_steps)

# Categorical branch: one-hot encode the columns routed to it.
categorical_steps = [('encoder', OneHotEncoder())]
preprocessor_cat = Pipeline(categorical_steps)

# Route each group of columns to its own branch.
preprocessor = ColumnTransformer([
    ('num', preprocessor_num, ['age', 'bmi', 'children']),
    ('cat', preprocessor_cat, ['region', 'sex', 'smoker']),
])

# Full modelling chain: preprocessing, degree-2 polynomial expansion (no bias
# column), standardisation, then a Lasso regression.
lasso_steps = [
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
    ('regression', Lasso()),
]
pipeline_lasso = Pipeline(lasso_steps)

# Hyper-parameters explored for the Lasso step: alpha swept from 7 up to (but not
# including) 10 in steps of 0.1; precompute and max_iter are held fixed.
param_grid_lasso = dict(
    regression__alpha=np.arange(7, 10, 0.1),
    regression__precompute=[True],
    regression__max_iter=[5000],
)

# Exhaustive search over the grid with 5-fold cross-validation.
grid_lasso = GridSearchCV(estimator=pipeline_lasso, param_grid=param_grid_lasso, cv=5)

# To avoid data leakage the whole pipeline is fitted on the training data only...
grid_lasso.fit(X_train, y_train)
# ...and the very same fitted pipeline is then used to predict the test set.
y_pred = grid_lasso.predict(X_test)
# print(y_pred)





In [None]:
# Score the Lasso model by comparing the predicted y with the held-out test y.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
# RMSE is the square root of the MSE, i.e. expressed in the target's own unit.
rmse = math.sqrt(mse)

# Report the three metrics, one message each, separated by a blank line.
metric_reports = (
    f'Mean Squared Error du modèle lasso: {mse}',
    f'Coefficient de determination R² du modèle lasso: {r2}',
    f'Root Mean Squared Error (RMSE) du modèle lasso: {rmse} \n rappel moyenne charge : 13279',
)
for report in metric_reports:
    print(report)

Mean Squared Error du modèle lasso: 11510113.043645982
Coefficient de determination R² du modèle lasso: 0.9205737818443527
Root Mean Squared Error (RMSE) du modèle lasso: 3392.6557508309006 
 rappel moyenne charge : 13279


In [None]:
# Keep a handle on the estimator the grid search selected as best.
best_model = grid_lasso.best_estimator_
# Bare trailing expression so the notebook renders the selected pipeline.
best_model

In [None]:
# Pull the Lasso estimator out of the selected pipeline and read its coefficients.
lasso_model = best_model.named_steps['regression']
coefficients = lasso_model.coef_

# List the coefficients from largest to smallest magnitude. The printed number is
# the rank in that ordering, not the position of the feature in the design matrix.
ranked_coefficients = sorted(coefficients, key=abs, reverse=True)
for rank, coef in enumerate(ranked_coefficients, start=1):
    print(f"Coefficient {rank}: {coef}")

Coefficient 1: 6180.709664267197
Coefficient 2: -5222.490141875619
Coefficient 3: 3783.2465757348227
Coefficient 4: -406.8574879399196
Coefficient 5: 378.9310313359798
Coefficient 6: 359.27056105283674
Coefficient 7: 330.8273748877076
Coefficient 8: -283.6667086778623
Coefficient 9: 271.12548387096064
Coefficient 10: 230.67320230900467
Coefficient 11: -215.51174790527747
Coefficient 12: 198.25079746718094
Coefficient 13: -183.0242700496272
Coefficient 14: 181.59450730036488
Coefficient 15: -176.24414886938342
Coefficient 16: -171.52172290065135
Coefficient 17: -170.31698069886812
Coefficient 18: 168.45139533870616
Coefficient 19: 154.1199558184476
Coefficient 20: 145.93481943607614
Coefficient 21: 113.45649698202057
Coefficient 22: -95.51733473712754
Coefficient 23: 94.45067802547645
Coefficient 24: 79.53576003427561
Coefficient 25: -64.55366767282585
Coefficient 26: -64.11086576555205
Coefficient 27: -48.26341652118636
Coefficient 28: 35.57621826442805
Coefficient 29: -18.442242474317

In [None]:
# Hand-written example profiles used to sanity-check the tuned model.
example_profiles = {
    'age': [35, 40, 30, 60, 60],
    'sex': ['female', 'male', 'male', 'female', 'female'],
    'bmi': [22.5, 30.0, 25.0, 47, 50],
    'children': [1, 2, 0, 5, 5],
    'smoker': ['yes', 'no', 'no', 'yes', 'yes'],
    'region': ['southwest', 'northeast', 'southeast', 'southeast', 'southeast'],
}
new_data = pd.DataFrame(data=example_profiles)

# Predict the charges for these profiles with the tuned pipeline.
new_y_ped = best_model.predict(new_data)

# Store the predictions next to the inputs they were computed from, then show both.
new_data['predicted_charges'] = new_y_ped

print(new_data)

   age     sex   bmi  children smoker     region  predicted_charges
0   35  female  22.5         1    yes  southwest       19552.717529
1   40    male  30.0         2     no  northeast        9138.150049
2   30    male  25.0         0     no  southeast        4049.250120
3   60  female  47.0         5    yes  southeast       50210.544218
4   60  female  50.0         5    yes  southeast       50210.544218


In [None]:
# Persist the tuned pipeline to disk (`pickle` is imported with the other
# imports in the first cell rather than here).
# NOTE: the pipeline embeds `custom_bmi_discretizer` through a FunctionTransformer,
# and pickle stores functions by reference (qualified name) only. Whatever process
# loads 'best_model.pkl' must therefore be able to resolve that function under the
# same module path, and needs feature_engine installed.
with open('best_model.pkl', 'wb') as fichier:
    pickle.dump(best_model, fichier)