In [956]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [957]:
donnees = pd.read_csv("df_final_assurance.csv")
donnees_originales = donnees.copy()
donnees.head()

Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,False,True,0.0,0.0,0.0,1.0
1,18,33.77,1,1725.5523,True,False,0.0,0.0,1.0,0.0
2,28,33.0,3,4449.462,True,False,0.0,0.0,1.0,0.0
3,33,22.705,0,21984.47061,True,False,0.0,1.0,0.0,0.0
4,32,28.88,0,3866.8552,True,False,0.0,1.0,0.0,0.0


In [958]:
donnees.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1337 entries, 0 to 1336
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   age               1337 non-null   int64  
 1   bmi               1337 non-null   float64
 2   children          1337 non-null   int64  
 3   charges           1337 non-null   float64
 4   sex_male          1337 non-null   bool   
 5   smoker_yes        1337 non-null   bool   
 6   region_northeast  1337 non-null   float64
 7   region_northwest  1337 non-null   float64
 8   region_southeast  1337 non-null   float64
 9   region_southwest  1337 non-null   float64
dtypes: bool(2), float64(6), int64(2)
memory usage: 86.3 KB


#### fonction pour tester le modele :

In [959]:
def test_model(model, age, bmi, children, sex_male, smoker_yes, region_northeast, region_northwest, region_southeast, region_southwest, scaler=None) : 
    x = np.array([age, bmi, children, sex_male, smoker_yes, region_northeast, region_northwest, region_southeast, region_southwest]).reshape(1,-1)
    return f"montant de l'assurance : {model.predict(x)[0]}"


#### trainset et testset

In [960]:
X = donnees.drop("charges", axis=1)     # features
y = donnees["charges"]                  # target
# print(X.isna().sum())
# print(y.isna().sum())

# 1. dummy Model : LinearRegression

In [961]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.85, random_state=42, stratify=X["smoker_yes"])

In [962]:
from sklearn.linear_model import LinearRegression

dummy_model = LinearRegression()
dummy_model.fit(X_train, y_train)

In [963]:
dummy_model.score(X_test, y_test)

0.8265441393970117

#### test 

In [964]:
# test_model(model=dummy_model,age=34,bmi=24.2,children=1,sex_male=1,smoker_yes=0,region_northeast=1,region_northwest=0,region_southeast=0,region_southwest=0)

In [965]:
data = pd.read_csv("dataset_assurance.csv")
data.drop_duplicates(inplace=True)

X = data.drop("charges", axis = 1)
y = data["charges"]
print(data.shape)
data.head()


(1337, 7)


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# pre-processing

## encodage 


In [966]:


from sklearn.preprocessing import LabelBinarizer, OneHotEncoder

encodeur_smoker = LabelBinarizer(pos_label=1, neg_label=0)
encodeur_smoker.classes_ = ["yes", "non"]

encodeur_sex = LabelBinarizer(pos_label=1, neg_label=0)
encodeur_sex.classes_ = np.array(['female', 'male'])  # on définit l'ordre

X["smoker"] = encodeur_smoker.fit_transform(X["smoker"])
X["sex"] = encodeur_sex.fit_transform(X["sex"])
X.rename(columns={"sex" : "sex_male"}, inplace=True)
X

encodeur_region = OneHotEncoder(sparse_output=False)
region_encodee = encodeur_region.fit_transform(X[["region"]])
region_encodee

df_region_encodee = pd.DataFrame(region_encodee, columns=encodeur_region.get_feature_names_out(["region"]), index = X.index)
df_region_encodee

X = pd.concat([X, df_region_encodee], axis = 1)

# suppression de la colonne region
X.drop("region", axis = 1, inplace=True)
X



Unnamed: 0,age,sex_male,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.900,0,1,0.0,0.0,0.0,1.0
1,18,1,33.770,1,0,0.0,0.0,1.0,0.0
2,28,1,33.000,3,0,0.0,0.0,1.0,0.0
3,33,1,22.705,0,0,0.0,1.0,0.0,0.0
4,32,1,28.880,0,0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,0.0,1.0,0.0,0.0
1334,18,0,31.920,0,0,1.0,0.0,0.0,0.0
1335,18,0,36.850,0,0,0.0,0.0,1.0,0.0
1336,21,0,25.800,0,0,0.0,0.0,0.0,1.0


# 2. model Lasso

split testset et dataset

In [967]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.85, random_state=42, stratify=X["smoker"])

# X_train

## normalisation

Robuste scaler qui est peu sensible aux valeurs aberrantes car il soustrait les données à la médiane qu'il divise par le IQR

In [968]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
X_train_stand = scaler.fit_transform(X_train)
X_test_stand = scaler.transform(X_test)
# X_train_stand[0]


### premier model lasso

In [969]:
from sklearn.linear_model import Lasso

lasso1 = Lasso()
lasso1.fit(X_train, y_train)

In [970]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, median_absolute_error

def metriques(model, X_test, y_test) : 
    pred_y = model.predict(X_test)
    mse = mean_squared_error(y_test, pred_y)
    mae = mean_absolute_error(y_test, pred_y)
    r2 = r2_score(y_test, pred_y)
    med_ae = median_absolute_error(y_test, pred_y)
    
    
    print(f"MSE : {mse}")
    print(f"RMSE : {np.sqrt(mse)}")
    print(f"MAE : {mae}")
    print(f"R2 : {r2}")
    print(f"MedAE : {med_ae}")      # cette métrique est moi sensible aux valeurs aberrantes
    
    return mse, mae, r2, med_ae


In [971]:
def coefficients(model, X) : 
    coefs = model.coef_
    noms_cols = X.columns
    coefs_df = pd.DataFrame({"variables" : noms_cols, "coef" : coefs})
    print("coefficients des variables")
    print(coefs_df)


In [972]:
metriques(model=lasso1, X_test=X_test, y_test=y_test)

MSE : 25131323.68988347
RMSE : 5013.115168224591
MAE : 3496.14252871994
R2 : 0.8265798093933763
MedAE : 2064.292800037346


(25131323.68988347,
 3496.14252871994,
 0.8265798093933763,
 np.float64(2064.292800037346))

In [973]:
coefficients(model=lasso1, X=X)

coefficients des variables
          variables          coef
0               age    260.030579
1          sex_male   -191.236625
2               bmi    334.943427
3          children    536.852657
4            smoker  23816.140418
5  region_northeast   1049.729829
6  region_northwest    632.830398
7  region_southeast   -139.298269
8  region_southwest    -98.614536


## Sélection du meilleur alpha

avec grid search cv, model lassoCV


In [974]:
from sklearn.model_selection import GridSearchCV

params = {
    "alpha" : [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
}

lasso_cv = GridSearchCV(lasso1, param_grid=params, cv=5, n_jobs=-1)     # n_jobs pour déterminer le nombre de cœurs CPU utilisés pour exécuter les calculs en parallèle. -1 pour tous les coeurs


In [975]:
lasso_cv.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [976]:
print("meilleurs parametres : ", lasso_cv.best_params_)
lasso_cv.best_estimator_

meilleurs parametres :  {'alpha': 1}


### nouveau model avec le meilleur alpha
il sera le même que le premier, car alpha par défaut=1

In [977]:
lasso2 = Lasso(alpha=1)
lasso2.fit(X_train, y_train)

In [978]:
metriques(model=lasso2, X_test=X_test, y_test=y_test)


MSE : 25131323.68988347
RMSE : 5013.115168224591
MAE : 3496.14252871994
R2 : 0.8265798093933763
MedAE : 2064.292800037346


(25131323.68988347,
 3496.14252871994,
 0.8265798093933763,
 np.float64(2064.292800037346))

In [979]:
coefficients(model=lasso2, X=X)

coefficients des variables
          variables          coef
0               age    260.030579
1          sex_male   -191.236625
2               bmi    334.943427
3          children    536.852657
4            smoker  23816.140418
5  region_northeast   1049.729829
6  region_northwest    632.830398
7  region_southeast   -139.298269
8  region_southwest    -98.614536


## feature engineering

#### création de polynomes 
PolynomialFeatures

utiliser un modele qui fixe les coefficients à zero lorsque les poids ne sont pas utiles (il fait du feature selection)


In [980]:
from sklearn.preprocessing import PolynomialFeatures


# poly = PolynomialFeatures(degree=2, include_bias=False)
poly = PolynomialFeatures(degree=2)

X_fe = poly.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.85, random_state=42, stratify=X["smoker"])
type(X_fe)


numpy.ndarray

#### création de nouvelles variables 

In [981]:
# X_fe = X.copy()
# y_fe = y.copy()

# X_fe["age_carre"] = X_fe["age"]**2  
# X_fe["bmi_carre"] = X_fe["bmi"]**2  
# X_fe["age_bmi"] = X_fe["age"] * X_fe["bmi"]
# X_fe["smoker_bmi"] = X_fe["smoker"] * X_fe["bmi"]
# X_fe["smoker_age"] = X_fe["smoker"] * X_fe["age"]
# X_fe["children_bmi"] = X_fe["children"] * X_fe["bmi"]

en se basant sur une logique métier dans le domaine de l'assurance :

    age_squared :

    L'impact de l'âge sur les prix n'est pas forcément constant
    Le risque santé peut augmenter plus rapidement avec l'âge
    Par exemple : la différence de risque entre 60-61 ans peut être plus importante qu'entre 20-21 ans

    bmi_squared :

    L'impact du BMI sur la santé n'est pas linéaire
    Les risques de santé augmentent plus rapidement quand le BMI devient très élevé
    Par exemple : passer d'un BMI de 35 à 36 est plus risqué que passer de 20 à 21

    smoker_bmi :

    Les fumeurs avec un BMI élevé peuvent présenter des risques plus importants
    Cette interaction est particulièrement pertinente pour les risques cardio-vasculaires

J'ai choisi ces interactions spécifiques car elles ont un sens médical et assurantiel, plutôt que de créer toutes les interactions possibles qui n'auraient pas forcément de justification métier.

In [982]:
# X.shape
X_fe.shape

(1337, 55)

In [983]:
X_train, X_test, y_train, y_test = train_test_split(X_fe, y_fe, shuffle=True, train_size=0.85, random_state=42, stratify=X["smoker"])

### normalisation du nouveau X_fe 

In [984]:
scaler = RobustScaler()
X_train_stand = scaler.fit_transform(X_train)
X_test_stand = scaler.transform(X_test)

In [985]:
lasso3 = Lasso()
lasso3.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


In [986]:
metriques(model=lasso3, X_test=X_test, y_test=y_test)


MSE : 14591951.297023077
RMSE : 3819.941268792372
MAE : 2500.1770729225523
R2 : 0.8993073740771175
MedAE : 1671.673997720989


(14591951.297023077,
 2500.1770729225523,
 0.8993073740771175,
 np.float64(1671.673997720989))

In [987]:
coefficients(model=lasso3, X=X_fe)

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

### recherche du meilleur alpha

In [895]:
params2 = {
    "alpha" : [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
}

lasso_cv2 = GridSearchCV(lasso3, param_grid=params2, cv=5, n_jobs=-1)


In [896]:
lasso_cv2.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


In [897]:
print("meilleur score : ", lasso_cv2.best_score_)
print("meilleurs parametres : ", lasso_cv2.best_params_)
lasso_cv2.best_estimator_

meilleur score :  0.8271576130040416
meilleurs parametres :  {'alpha': 10}


In [898]:
lasso4 = Lasso(alpha=10)
lasso4.fit(X_train, y_train)

metriques(model=lasso_cv2, X_test=X_test, y_test=y_test)

MSE : 14427749.320060663
RMSE : 3798.3877263992763
MAE : 2476.8054038794558
R2 : 0.9004404595641453
MedAE : 1623.9290108415103


(14427749.320060663,
 2476.8054038794558,
 0.9004404595641453,
 np.float64(1623.9290108415103))

In [899]:
coefficients(model=lasso4, X=X_fe)

coefficients des variables
           variables          coef
0                age   -110.519494
1           sex_male   -580.092460
2                bmi    540.374743
3           children    593.493687
4             smoker -19452.764441
5   region_northeast   1109.276646
6   region_northwest    414.874507
7   region_southeast    -12.281771
8   region_southwest   -288.671121
9          age_carre      4.157666
10         bmi_carre     -9.187205
11           age_bmi      1.418723
12        smoker_bmi   1438.313632
13        smoker_age    -21.016907
14      children_bmi      4.818067


    R² est passé de 0.83 à 0.90 
    RMSE a baissé de 5013 à 3798 
    MAE a baissé de 3496 à 2476 

    points intéressants :

    smoker_bmi a un fort impact positif (+1438), confirmant que l'effet du tabagisme augmente avec le BMI
    age_squared est positif (+4.16), montrant un effet accéléré avec l'âge
    bmi_squared est négatif (-9.19), suggérant un effet qui ralentit pour les BMI très élevés
    Les interactions créées ont permis de capturer des relations plus complexes

    Changements notables :

    Le coefficient de smoker est devenu négatif car son effet est maintenant capturé via les interactions
    L'effet direct de l'âge est devenu négatif mais est compensé par age_squared

    Ces résultats montrent que l'ajout des interactions était pertinent et a significativement amélioré le modèle tout en gardant son interprétabilité.

# 3. model Ridge

In [900]:
X_train, X_test, y_train, y_test = X_train, X_test, y_train, y_test = train_test_split(X_fe, y_fe, shuffle=True, train_size=0.85, random_state=42, stratify=X["smoker"])


In [None]:
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train


array([[-0.84      , -1.        ,  0.09040892, ...,  0.        ,
         0.        ,  1.63796633],
       [ 1.        , -1.        ,  0.08446097, ..., 31.3       ,
        64.        ,  0.58854493],
       [-0.8       ,  0.        , -0.32773234, ...,  0.        ,
         0.        , -0.45753436],
       ...,
       [-0.28      ,  0.        ,  0.80237918, ...,  0.        ,
         0.        ,  0.16635334],
       [ 0.8       , -1.        , -0.36758364, ...,  0.        ,
         0.        , -0.45753436],
       [-0.64      ,  0.        , -0.72327138, ...,  0.        ,
         0.        , -0.45753436]], shape=(1136, 15))

In [902]:
from sklearn.linear_model import Ridge

ridge1 = Ridge()
ridge1.fit(X_train, y_train)


In [903]:
metriques(model=ridge1, X_test=X_test, y_test=y_test)


MSE : 14329046.725303615
RMSE : 3785.3727326781986
MAE : 2459.3238972891168
R2 : 0.9011215626770318
MedAE : 1610.6290871952233


(14329046.725303615,
 2459.3238972891168,
 0.9011215626770318,
 np.float64(1610.6290871952233))

In [904]:
X.columns

Index(['age', 'sex_male', 'bmi', 'children', 'smoker', 'region_northeast',
       'region_northwest', 'region_southeast', 'region_southwest'],
      dtype='object')

In [905]:
ridge1.coef_

array([ -1634.75761607,   -612.61502207,   4012.16543516,   1371.79672533,
       -18353.68669022,    835.60530245,    159.55845385,   -368.83068374,
         -626.33307256,   7028.50859596,  -4088.50579246,   1095.24229862,
         1412.61859983,    -28.00943974,     75.96349283])

## amélioration

recherche du meilleur nombre de cv

In [906]:
params = {
    "alpha" : [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
}

ridge_cv = GridSearchCV(estimator=ridge1, param_grid=params, cv=4, n_jobs=-1)
ridge_cv.fit(X_train, y_train)


In [907]:
metriques(model=ridge_cv, X_test=X_test, y_test=y_test)

MSE : 14587796.584997686
RMSE : 3819.3974112414235
MAE : 2495.9468930542125
R2 : 0.899336043914021
MedAE : 1640.8449561017997


(14587796.584997686,
 2495.9468930542125,
 0.899336043914021,
 np.float64(1640.8449561017997))

#### création de nouvelles variables 

In [908]:
X_fe = X.copy()
y_fe = y.copy()

X_fe["age_carre"] = X_fe["age"]**2  
X_fe["bmi_carre"] = X_fe["bmi"]**2  
X_fe["age_bmi"] = X_fe["age"] * X_fe["bmi"]
X_fe["smoker_bmi"] = X_fe["smoker"] * X_fe["bmi"]
X_fe["smoker_age"] = X_fe["smoker"] * X_fe["age"]
X_fe["children_bmi"] = X_fe["children"] * X_fe["bmi"]

In [909]:
X_train, X_test, y_train, y_test = train_test_split(X_fe, y_fe, shuffle=True, train_size=0.85, random_state=42, stratify=X["smoker"])


In [910]:
scaler = RobustScaler()
X_train_stand = scaler.fit_transform(X_train)
X_test_stand = scaler.transform(X_test)

In [None]:
ridge3 = Ridge(alpha=1)
ridge3.fit(X_train, y_train)
metriques(model=ridge3, X_test=X_test, y_test=y_test)
ridge3.score(X_test, y_test)

MSE : 14427977.833823793
RMSE : 3798.4178066431546
MAE : 2465.920009876347
R2 : 0.9004388826913611
MedAE : 1618.8103614039028


0.9004388826913611

# 4. model ElasticNet

In [None]:
from sklearn.linear_model import ElasticNet

elastic = ElasticNet()

params = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
}
elastic_cv = GridSearchCV(elastic, param_grid=params, cv=5, n_jobs=-1)
elastic_cv.fit(X_train, y_train)

best_alpha = elastic_cv.best_params_['alpha']
best_l1_ratio = elastic_cv.best_params_['l1_ratio']
final_elastic = ElasticNet(alpha=best_alpha, l1_ratio=best_l1_ratio)
final_elastic.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [913]:
metriques(model=final_elastic, X_test=X_test, y_test=y_test)

MSE : 14495609.066179153
RMSE : 3807.3099514196574
MAE : 2479.023534514491
R2 : 0.8999721893587393
MedAE : 1611.5897753053614


(14495609.066179153,
 2479.023534514491,
 0.8999721893587393,
 np.float64(1611.5897753053614))

In [914]:
final_elastic.coef_

array([-1.15022619e+02, -6.20049621e+02,  5.61737342e+02,  7.89639785e+02,
       -1.95665327e+04,  8.45812845e+02,  1.55265387e+02, -3.58872704e+02,
       -6.33965130e+02,  4.19969737e+00, -9.41851747e+00,  1.44930032e+00,
        1.44249861e+03, -2.12660107e+01, -1.29385117e+00])

In [915]:
from sklearn.linear_model import ElasticNetCV



# 5. model SVR (Support Vector Regression)

In [916]:
from sklearn.svm import SVR

