In [12]:
from sklearn.preprocessing import PolynomialFeatures, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LinearRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import math
import pandas as pd
import numpy as np
import streamlit as st
from joblib import dump 
import joblib




# Fraction of the rows kept for training; the remainder forms the test set.
# Defined once so the split and the verification message cannot disagree.
TRAIN_SIZE = 0.85

# Load the dataset from the notebook's working directory.
df = pd.read_csv('dataset3.csv')

print(df.head(), df.shape)
# Separate the features from the target variable ('charges').
X = df.drop('charges', axis=1)
# Double brackets keep y as a single-column DataFrame (2-D shape).
y = df[['charges']]
print(f'''verif des dimensions X et Y
      X (dataset sans la variable cible): {X.shape}
      Y (la variable cible) : {y.shape}''')


# Split into training and test sets. The split is stratified on 'smoker' so
# both subsets keep the same proportion of smokers; random_state makes the
# shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=True,
    train_size=TRAIN_SIZE,
    random_state=42,
    stratify=X['smoker'],
)
# The percentages are derived from TRAIN_SIZE: the previous message announced
# an 80/20 split although 85 % of the rows were used for training.
print(f'''verif du split {TRAIN_SIZE:.0%} / {1 - TRAIN_SIZE:.0%}
{TRAIN_SIZE:.0%} du dataset : X train -> {X_train.shape}, Y train -> {y_train.shape}
{1 - TRAIN_SIZE:.0%} du dataset : X test -> {X_test.shape}, Y test -> {y_test.shape}''')


   age  sex     bmi  children  smoker     region      charges  \
0   19    1  27.900         0       1  southwest  16884.92400   
1   18    0  33.770         1       0  southeast   1725.55230   
2   28    0  33.000         3       0  southeast   4449.46200   
3   33    0  22.705         0       0  northwest  21984.47061   
4   32    0  28.880         0       0  northwest   3866.85520   

   bmi_smoker_interaction  
0                    27.9  
1                     0.0  
2                     0.0  
3                     0.0  
4                     0.0   (1337, 8)
verif des dimensions X et Y
      X (dataset sans la variable cible): (1337, 7)
      Y (la variable cible) : (1337, 1)
 verif du split 80 20 
80% du dataset : X train -> (1136, 7), Y train -> (1136, 1)
# 20% du dataset : X test -> (201, 7), Y test -> (201, 1)


In [13]:
# Lasso regression (the estimator used below is Lasso, not ElasticNet)
# Preprocessing for the numeric variables
def log_transform(x):
    """Return the natural logarithm of ``1 + x``, computed element-wise.

    Uses ``np.log1p`` rather than ``np.log(x + 1)``: both compute the same
    quantity, but ``log1p`` stays accurate when ``x`` is very close to zero,
    where ``x + 1`` would round to exactly 1 and the result would collapse
    to 0.

    Parameters
    ----------
    x : scalar, array-like or DataFrame of numbers
        Values to transform.

    Returns
    -------
    Same kind of object as ``x`` with ``log(1 + x)`` applied to each element.
    """
    return np.log1p(x)

# Wrap log_transform so it can be used as a step inside a scikit-learn Pipeline.
log_transformer = FunctionTransformer(log_transform)

# Numeric branch: log(1 + x), then degree-2 polynomial expansion (squares and
# pairwise products, no bias column), then standardisation.
preprocessor_num = Pipeline(steps=[
    ('log', log_transformer),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler())
])
# Categorical branch: one-hot encoding followed by the same degree-2 expansion.
# handle_unknown='ignore' encodes a category that was not seen during fit as an
# all-zero row instead of raising an error at predict time.
preprocessor_cat = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
])

# Route each group of columns to its own preprocessing branch.
# NOTE(review): 'sex' and 'smoker' are not listed below, so they are dropped
# (remainder='drop' is the default and is spelled out here on purpose). The
# smoker information only reaches the model through 'bmi_smoker_interaction'
# -- confirm that this is intended.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', preprocessor_num , ['age', 'bmi', 'children', 'bmi_smoker_interaction']),
        ('cat', preprocessor_cat , ['region']),
    ],
    remainder='drop',
)
# Full pipeline: preprocessing followed by a Lasso regression.
pipeline_lasso = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regression', Lasso())])


# Hyper-parameter grid for the Lasso step: only alpha varies; precompute and
# max_iter are pinned to a single value each.
param_grid_lasso = {
    'regression__alpha': [5, 5.1, 7],
    'regression__precompute': [True],
    'regression__max_iter': [5000],
}

# Exhaustive grid search with 5-fold cross-validation.
grid_lasso = GridSearchCV(pipeline_lasso, param_grid_lasso, cv=5)


# Fit on the training split only: the preprocessing steps live inside the
# pipeline, so they are fitted without ever seeing the test set (this avoids
# data leakage).
grid_lasso.fit(X_train, y_train)
# Then predict the target on the test set with that same fitted pipeline.
y_pred = grid_lasso.predict(X_test)
# print(y_pred)





In [14]:


# Evaluate the tuned Lasso pipeline on the held-out test set by comparing the
# predicted charges (y_pred) with the true ones (y_test).
# The messages previously labelled the model "elasticnet" although the fitted
# estimator is a Lasso; the labels now name the model that is actually scored.
mse = mean_squared_error(y_test, y_pred)

print(f'Mean Squared Error du modèle lasso: {mse}')

r2 = r2_score(y_test, y_pred)

print(f'Coefficient de determination R² du modèle lasso: {r2}')

# RMSE is expressed in the same unit as the target, which makes it directly
# comparable with the mean charge recalled in the message.
rmse = math.sqrt(mse)
print(f'Root Mean Squared Error (RMSE) du modèle lasso: {rmse} \n rappel moyenne charge : 13279')

Mean Squared Error du modèle elasticnet: 13558711.546076775
Coefficient de determination R² du modèle elasticnet: 0.9064373062988563
Root Mean Squared Error (RMSE) du modèle elasticnet: 3682.215575720245 
 rappel moyenne charge : 13279


In [15]:
# Pipeline refitted by the grid search with the best parameter combination found.
best_model = grid_lasso.best_estimator_
# Bare expression on the last line so the notebook displays the estimator.
best_model





In [16]:
# Hand-built sample profiles used to sanity-check the fitted model.
new_data = pd.DataFrame({
    'age': [35, 40, 30, 60, 60],
    'sex': [1, 0, 0, 1, 1],
    'bmi': [22.5, 30.0, 25.0, 47, 50],
    'children': [1, 2, 0, 5, 5],
    'smoker': [1, 0, 0, 1, 1],
    'region': ['southwest', 'northeast', 'southeast', 'southeast', 'southeast'],
})
# Derive the interaction feature from bmi and smoker instead of typing it by
# hand, so it cannot drift when a sample above is edited.
# NOTE(review): the rows printed by df.head() are consistent with
# bmi_smoker_interaction == bmi * smoker -- confirm against the feature
# engineering that produced the dataset.
new_data['bmi_smoker_interaction'] = new_data['bmi'] * new_data['smoker']

# Predict with the tuned model; the refitted best estimator exposed by the grid
# search would give the same predictions.
new_y_ped = grid_lasso.predict(new_data)

# Attach the predictions to the samples for side-by-side inspection.
new_data['predicted_charges'] = new_y_ped

print(new_data)

   age  sex   bmi  children  smoker     region  bmi_smoker_interaction  \
0   35    1  22.5         1       1  southwest                    22.5   
1   40    0  30.0         2       0  northeast                     0.0   
2   30    0  25.0         0       0  southeast                     0.0   
3   60    1  47.0         5       1  southeast                    47.0   
4   60    1  50.0         5       1  southeast                    50.0   

   predicted_charges  
0       17892.748882  
1        9924.218113  
2        4150.183381  
3       60316.227571  
4       63663.901442  
