In [1]:
from sklearn.preprocessing import PolynomialFeatures, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.linear_model import Lasso, Ridge, ElasticNet, LinearRegression
from sklearn.model_selection import GridSearchCV
from feature_engine.discretisation import ArbitraryDiscretiser
import math
import pandas as pd
import numpy as np

df = pd.read_csv('dataset2.csv')

print(df.head(), df.shape)
# separation des features et de la variable cible
X = df.drop('charges', axis=1)
y = df[['charges']]
print(f'''verif des dimensions X et Y
      X (dataset sans la variable cible): {X.shape}
      Y (la variable cible) : {y.shape}''')


# division du dataset en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.85, random_state=42, stratify=X['smoker'])
print(f''' verif du split 80 20 
80% du dataset : X train -> {X_train.shape}, Y train -> {y_train.shape}
20% du dataset : X test -> {X_test.shape}, Y test -> {y_test.shape}''')


   age     sex     bmi  children smoker     region      charges
0   19  female  27.900         0    yes  southwest  16884.92400
1   18    male  33.770         1     no  southeast   1725.55230
2   28    male  33.000         3     no  southeast   4449.46200
3   33    male  22.705         0     no  northwest  21984.47061
4   32    male  28.880         0     no  northwest   3866.85520 (1337, 7)
verif des dimensions X et Y
      X (dataset sans la variable cible): (1337, 6)
      Y (la variable cible) : (1337, 1)
 verif du split 80 20 
80% du dataset : X train -> (1136, 6), Y train -> (1136, 1)
20% du dataset : X test -> (201, 6), Y test -> (201, 1)


In [2]:
# function custom
def log_transform(x):
    return np.log(x + 1)

def custom_bmi_discretizer(df):
    custom_bins = [0, 30, 100, float('inf')]
    bmi_discretizer = ArbitraryDiscretiser(binning_dict={'bmi': custom_bins}, return_object=True)
    df[['bmi']] = bmi_discretizer.fit_transform(df[['bmi']])
    return df

log_transformer = FunctionTransformer(log_transform)
bmi_discretizer_transformer = FunctionTransformer(custom_bmi_discretizer)

#### ======= preprocessing
#processeur pour split le bmi 
preprocessor_bmi = Pipeline(steps=[
    ('bmi_discretize', bmi_discretizer_transformer)
])

#processeur pour les colonnes numériques 
preprocessor_num = Pipeline(steps=[
    ('log', FunctionTransformer(log_transform)),
])
#processeur pour les colonnes catégorielles 
preprocessor_cat = Pipeline(steps=[
    ('encoder', OneHotEncoder())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('bmi', preprocessor_bmi, ['bmi']),
        ('num', preprocessor_num, ['age', 'bmi', 'children']),
        ('cat', preprocessor_cat, ['region', 'sex', 'smoker'])
    ]
)

pipeline_elasticnet = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
    ('regression', ElasticNet())
])

#test de différents hyperparam d'alpha pour le lasso
param_grid_elasticnet = {
    'regression__alpha': np.arange(0.01,5,0.1),
    'regression__l1_ratio' : [0.1, 0.3, 0.5, 0.7, 0.9, 1]
}

### grid search
grid_elasticnet = GridSearchCV(pipeline_elasticnet, param_grid_elasticnet, cv=10)

# eviter data leakage -> entraîner le pipeline sur les données d'entraînement 
grid_elasticnet.fit(X_train, y_train)
best_model = grid_elasticnet.best_estimator_
#puis predire y sur l'ensemble de test avec le meme pipeline
y_pred = best_model.predict(X_test)
# print(y_pred)


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [3]:
#test des les differents metriques sur modele lasso
#comparaison du y prédit avec le y de test
mse = mean_squared_error(y_test, y_pred)


r2 = r2_score(y_test, y_pred)

print(f'Coefficient de determination R² du modèle elasticnet: {r2}')

rmse = math.sqrt(mse)
print(f'Root Mean Squared Error (RMSE) du modèle elasticnet: {rmse} \n rappel moyenne charge : 13279')

Coefficient de determination R² du modèle elasticnet: 0.9248924254833081
Root Mean Squared Error (RMSE) du modèle elasticnet: 3299.1322184479377 
 rappel moyenne charge : 13279


In [4]:
best_model


In [5]:
new_data = pd.DataFrame({
    'age': [35, 40, 30, 60,60],  
    'sex': ['female', 'male', 'male', 'female','female'],  
    'bmi': [22.5, 30.0, 25.0, 47,50],
    'children': [1, 2, 0, 5,5],
    'smoker': ['yes', 'no', 'no', 'yes','yes'],  
    'region': ['southwest', 'northeast', 'southeast', 'southeast', 'southeast'],
})

# faire des prédictions avec le modèle optimisé
new_y_ped = best_model.predict(new_data)

# ajouter les prédictions à new_data
new_data['predicted_charges'] = new_y_ped

print(new_data)

   age     sex   bmi  children smoker     region  predicted_charges
0   35  female  22.5         1    yes  southwest       18968.367265
1   40    male  30.0         2     no  northeast       10313.613412
2   30    male  25.0         0     no  southeast        4079.091128
3   60  female  47.0         5    yes  southeast       51125.630212
4   60  female  50.0         5    yes  southeast       51507.579099


In [6]:
ElasticNet?

[1;31mInit signature:[0m
[0mElasticNet[0m[1;33m([0m[1;33m
[0m    [0malpha[0m[1;33m=[0m[1;36m1.0[0m[1;33m,[0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0ml1_ratio[0m[1;33m=[0m[1;36m0.5[0m[1;33m,[0m[1;33m
[0m    [0mfit_intercept[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mprecompute[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mmax_iter[0m[1;33m=[0m[1;36m1000[0m[1;33m,[0m[1;33m
[0m    [0mcopy_X[0m[1;33m=[0m[1;32mTrue[0m[1;33m,[0m[1;33m
[0m    [0mtol[0m[1;33m=[0m[1;36m0.0001[0m[1;33m,[0m[1;33m
[0m    [0mwarm_start[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mpositive[0m[1;33m=[0m[1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0mrandom_state[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mselection[0m[1;33m=[0m[1;34m'cyclic'[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m     
Linear regression with combined L