In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
donnees = pd.read_csv("dataset_assurance_cleaned.csv")
donnees_originales = donnees.copy()
donnees.head()

In [None]:
donnees.info()

#### trainset et testset

In [4]:
X = donnees.drop("charges", axis=1)     # features
y = donnees["charges"]                  # target


# 1. dummy Model : LinearRegression

In [5]:
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder

encodeur_smoker = LabelBinarizer(pos_label=1, neg_label=0)
encodeur_smoker.classes_ = ["yes", "non"]

encodeur_sex = LabelBinarizer(pos_label=1, neg_label=0)
encodeur_sex.classes_ = np.array(['male', 'female'])  # on définit l'ordre

X["smoker"] = encodeur_smoker.fit_transform(X["smoker"])
X["sex"] = encodeur_sex.fit_transform(X["sex"])
X.rename(columns={"sex" : "sex_male"}, inplace=True)
X

encodeur_region = OneHotEncoder(sparse_output=False)
region_encodee = encodeur_region.fit_transform(X[["region"]])
region_encodee

df_region_encodee = pd.DataFrame(region_encodee, columns=encodeur_region.get_feature_names_out(["region"]), index = X.index)
df_region_encodee

X = pd.concat([X, df_region_encodee], axis = 1)

# suppression de la colonne region
X.drop("region", axis = 1, inplace=True)


In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.85, random_state=42, stratify=X["smoker"])

In [None]:
from sklearn.linear_model import LinearRegression

dummy_model = LinearRegression()
dummy_model.fit(X_train, y_train)

In [None]:
dummy_model.score(X_test, y_test)

#### test 

In [9]:
# data = pd.read_csv("dataset_assurance.csv")
# data.drop_duplicates(inplace=True)

# X = data.drop("charges", axis = 1)
# y = data["charges"]
# print(data.shape)
# data.head()


# pre-processing

## encodage 


In [10]:
# from sklearn.preprocessing import LabelBinarizer, OneHotEncoder

# encodeur_smoker = LabelBinarizer(pos_label=1, neg_label=0)
# encodeur_smoker.classes_ = ["yes", "non"]

# encodeur_sex = LabelBinarizer(pos_label=1, neg_label=0)
# encodeur_sex.classes_ = np.array(['male', 'female'])  # on définit l'ordre

# X["smoker"] = encodeur_smoker.fit_transform(X["smoker"])
# X["sex"] = encodeur_sex.fit_transform(X["sex"])
# X.rename(columns={"sex" : "sex_male"}, inplace=True)
# X

# encodeur_region = OneHotEncoder(sparse_output=False)
# region_encodee = encodeur_region.fit_transform(X[["region"]])
# region_encodee

# df_region_encodee = pd.DataFrame(region_encodee, columns=encodeur_region.get_feature_names_out(["region"]), index = X.index)
# df_region_encodee

# X = pd.concat([X, df_region_encodee], axis = 1)

# # suppression de la colonne region
# X.drop("region", axis = 1, inplace=True)


# 2. model Lasso

split testset et dataset

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.85, random_state=42, stratify=X["smoker"])

# X_train

## normalisation

Robuste scaler qui est peu sensible aux valeurs aberrantes car il soustrait les données à la médiane qu'il divise par le IQR

In [12]:
from sklearn.preprocessing import RobustScaler, StandardScaler

scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


### premier model lasso

In [None]:
from sklearn.linear_model import Lasso

lasso1 = Lasso()
lasso1.fit(X_train, y_train)

In [14]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, median_absolute_error

def metriques(model, X_test, y_test) : 
    pred_y = model.predict(X_test)
    mse = mean_squared_error(y_test, pred_y)
    mae = mean_absolute_error(y_test, pred_y)
    r2 = r2_score(y_test, pred_y)
    med_ae = median_absolute_error(y_test, pred_y)
    
    
    print(f"MSE : {mse}")
    print(f"RMSE : {np.sqrt(mse)}")
    print(f"MAE : {mae}")
    print(f"R2 : {r2}")
    print(f"MedAE : {med_ae}")
    
    return mse, mae, r2, med_ae


In [15]:
def coefficients(model, X) : 
    coefs = model.coef_
    noms_cols = X.columns
    coefs_df = pd.DataFrame({"variables" : noms_cols, "coef" : coefs})
    print("coefficients des variables")
    print(coefs_df)


In [None]:
metriques(model=lasso1, X_test=X_test, y_test=y_test)

In [None]:
coefficients(model=lasso1, X=X)

## Sélection du meilleur alpha

avec grid search cv, model lassoCV


In [18]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

params = {
    "alpha" : [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
}

lasso_cv = GridSearchCV(lasso1, param_grid=params, cv=5, n_jobs=-1)     

In [None]:
lasso_cv.fit(X_train, y_train)

In [None]:
print("meilleurs parametres : ", lasso_cv.best_params_)
lasso_cv.best_estimator_

### nouveau model avec le meilleur alpha
il sera le même que le premier, car alpha par défaut=1

In [None]:
lasso2 = Lasso(alpha=1)
lasso2.fit(X_train, y_train)

In [None]:
metriques(model=lasso2, X_test=X_test, y_test=y_test)


In [None]:
coefficients(model=lasso2, X=X)

## feature engineering

#### création de polynomes 
PolynomialFeatures

utiliser un modele qui fixe les coefficients à zero lorsque les poids ne sont pas utiles (il fait du feature selection)


In [None]:
from sklearn.preprocessing import PolynomialFeatures


# poly = PolynomialFeatures(degree=2, include_bias=False)
poly = PolynomialFeatures(degree=2)

X_fe = poly.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.85, random_state=42, stratify=X["smoker"])
type(X_fe)


In [25]:
X_train, X_test, y_train, y_test = train_test_split(X_fe, y, shuffle=True, train_size=0.85, random_state=42, stratify=X["smoker"])

### normalisation du nouveau X_fe 

In [26]:
scaler = RobustScaler()
X_train_stand = scaler.fit_transform(X_train)
X_test_stand = scaler.transform(X_test)

In [None]:
lasso3 = Lasso()
lasso3.fit(X_train, y_train)

In [None]:
metriques(model=lasso3, X_test=X_test, y_test=y_test)


### recherche du meilleur alpha

In [29]:
params2 = {
    "alpha" : [i for i in range(10,100)]
}

lasso_cv2 = GridSearchCV(lasso3, param_grid=params2, cv=5, n_jobs=-1)

In [None]:
lasso_cv2.fit(X_train, y_train)

In [None]:
print("meilleur score : ", lasso_cv2.best_score_)
best_aplha = lasso_cv2.best_params_["alpha"]
print("meilleur alpha : ", best_aplha)
lasso_cv2.best_estimator_

In [None]:
lasso4 = Lasso(alpha=best_aplha)
lasso4.fit(X_train, y_train)

metriques(model=lasso4, X_test=X_test, y_test=y_test)

    R² est passé de 0.83 à 0.90 
    RMSE a baissé de 5013 à 3798 
    MAE a baissé de 3496 à 2476 

    points intéressants :

    smoker_bmi a un fort impact positif (+1438), confirmant que l'effet du tabagisme augmente avec le BMI
    age_squared est positif (+4.16), montrant un effet accéléré avec l'âge
    bmi_squared est négatif (-9.19), suggérant un effet qui ralentit pour les BMI très élevés
    Les interactions créées ont permis de capturer des relations plus complexes

    Changements notables :

    Le coefficient de smoker est devenu négatif car son effet est maintenant capturé via les interactions
    L'effet direct de l'âge est devenu négatif mais est compensé par age_squared

    Ces résultats montrent que l'ajout des interactions était pertinent et a significativement amélioré le modèle tout en gardant son interprétabilité.

# 3. model Ridge

In [33]:
X_train, X_test, y_train, y_test = X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.85, random_state=42, stratify=X["smoker"])


In [None]:
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train


In [None]:
from sklearn.linear_model import Ridge

ridge1 = Ridge()
ridge1.fit(X_train, y_train)


In [None]:
metriques(model=ridge1, X_test=X_test, y_test=y_test)


In [None]:
X.columns

In [None]:
ridge1.coef_
coefficients(model=ridge1, X=X)

## amélioration

recherche du meilleur nombre de cv

In [None]:
params = {
    "alpha" : np.arange(1, 20)
}

ridge_cv = GridSearchCV(estimator=ridge1, param_grid=params, cv=5, n_jobs=-1)
ridge_cv.fit(X_train, y_train)


In [None]:
metriques(model=ridge_cv, X_test=X_test, y_test=y_test)

#### création de nouvelles variables 

In [41]:
X_fe = X.copy()
y_fe = y.copy()

X_fe["age_carre"] = X_fe["age"]**2  
X_fe["bmi_carre"] = X_fe["bmi"]**2  
X_fe["age_bmi"] = X_fe["age"] * X_fe["bmi"]
X_fe["smoker_bmi"] = X_fe["smoker"] * X_fe["bmi"]
X_fe["smoker_age"] = X_fe["smoker"] * X_fe["age"]
X_fe["children_bmi"] = X_fe["children"] * X_fe["bmi"]

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X_fe, y_fe, shuffle=True, train_size=0.85, random_state=42, stratify=X["smoker"])


In [43]:
scaler = RobustScaler()
X_train_stand = scaler.fit_transform(X_train)
X_test_stand = scaler.transform(X_test)

In [None]:
ridge3 = Ridge(alpha=1)
ridge3.fit(X_train, y_train)
metriques(model=ridge3, X_test=X_test, y_test=y_test)
ridge3.score(X_test, y_test)

# 4. model ElasticNet

In [None]:
from sklearn.linear_model import ElasticNet

elastic1 = ElasticNet(random_state=42)

elastic1.fit(X_train, y_train)

metriques(model=elastic1, X_test=X_test, y_test=y_test)


In [None]:
params = {
    'alpha': [0.0001, 0.0005, 0.001, 0.005, 0.01],
    'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.8, 0.9, 0.95]
}
elastic_cv = GridSearchCV(elastic1, param_grid=params, cv=5, n_jobs=-1, scoring='r2')
elastic_cv.fit(X_train, y_train)

best_alpha = elastic_cv.best_params_['alpha']
best_l1_ratio = elastic_cv.best_params_['l1_ratio']
final_elastic = ElasticNet(alpha=best_alpha, l1_ratio=best_l1_ratio)
final_elastic.fit(X_train, y_train)

In [None]:
metriques(model=final_elastic, X_test=X_test, y_test=y_test)

In [None]:
X = X.copy()
y = y.copy()
y_fe = np.log(y)


poly = PolynomialFeatures(degree=2, include_bias=False)

X_fe = poly.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_fe, y_fe, shuffle=True, train_size=0.85, random_state=42, stratify=X["smoker"])

X_fe

In [None]:
# elastic_cv2 = ElasticNet(alpha=best_alpha, l1_ratio=best_l1_ratio, random_state=42)
elastic_cv2 = ElasticNet(alpha=0.001, l1_ratio=0.9, random_state=42)

elastic_cv2.fit(X_train, y_train)


In [None]:
metriques(model=elastic_cv2, X_test=X_test, y_test=y_test)

# groupement des ages, children et bmi en catégories

In [52]:
def categorie_age(age) :
    if age < 25:
        return "24 et moins"
    elif age < 35:
        return "25-34"
    elif age < 45:
        return "35-44"
    elif age < 55:
        return "45-54"
    else:
        return "55 et plus"

def categorie_bmi(bmi):
    if bmi < 18.5 :
        return "insuffisance pondérale"
    elif bmi < 24.5:
        return "normal"
    elif bmi < 30:
        return "surpoids"
    elif bmi < 35:
        return "obésité I"
    elif bmi < 40:
        return "obésité II"
    else:
        return "obésité III"
    
def est_obese(bmi): 
    return bmi >= 30

def a_des_enfants(enfant): 
    return enfant > 0

def region_west(region): 
    return region in ["northwest", "southwest"]



In [None]:
X3 = X.copy()
y3 = np.log(y.copy())
X3['bmi_cat'] = X3['bmi'].apply(categorie_bmi)

X3 = pd.get_dummies(X3, columns=['bmi_cat'], drop_first=True)
X3["bmi"] = np.log(X3["bmi"])

X3

In [54]:
poly = PolynomialFeatures(degree=2, include_bias=False)

X3 = poly.fit_transform(X3)
X_train, X_test, y_train, y_test = train_test_split(X3, y3, shuffle=True, train_size=0.85, random_state=42, stratify=X["smoker"])

scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
lasso5 = Lasso(alpha=0.001, random_state=42)
lasso5.fit(X_train, y_train)

metriques(model=lasso5, X_test=X_test, y_test=y_test)

In [None]:
elasticnet5 = ElasticNet(random_state=42, alpha=0.001, l1_ratio=0.9)
elasticnet5.fit(X_train, y_train)

metriques(model=elasticnet5, X_test=X_test, y_test=y_test)


# meilleur résultat obtenu

In [57]:
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder

X = donnees.drop("charges", axis=1)     # features
y = donnees["charges"]                  # target

encodeur_smoker = LabelBinarizer(pos_label=1, neg_label=0)
encodeur_smoker.classes_ = ["yes", "non"]

encodeur_sex = LabelBinarizer(pos_label=1, neg_label=0)
encodeur_sex.classes_ = np.array(['male', 'female'])  # on définit l'ordre

X["smoker"] = encodeur_smoker.fit_transform(X["smoker"])
X["sex"] = encodeur_sex.fit_transform(X["sex"])
X.rename(columns={"sex" : "sex_male"}, inplace=True)

encodeur_region = OneHotEncoder(sparse_output=False)
region_encodee = encodeur_region.fit_transform(X[["region"]])
region_encodee

df_region_encodee = pd.DataFrame(region_encodee, columns=encodeur_region.get_feature_names_out(["region"]), index = X.index)
df_region_encodee

X = pd.concat([X, df_region_encodee], axis = 1)

# suppression de la colonne region
X.drop("region", axis = 1, inplace=True)


In [58]:
X_fe = X.copy()
y_fe = y.copy()

X_fe["bmi_cat"] = X_fe["bmi"].apply(categorie_bmi)
X_fe["age_cat"] = X_fe["age"].apply(categorie_bmi)
# X_fe.drop("bmi", inplace=True, axis = 1)
# X_fe.drop("age", inplace=True, axis = 1)

# encodage
X_fe = pd.get_dummies(X_fe, columns=["bmi_cat"], drop_first=True)
X_fe = pd.get_dummies(X_fe, columns=["age_cat"], drop_first=True)
# print(X_fe.head())

# polynomes
poly = PolynomialFeatures(degree=2, include_bias=False)
X_fe = poly.fit_transform(X_fe)

# testset - dataset
X_train, X_test, y_train, y_test = train_test_split(X_fe, y_fe, shuffle=True, train_size=0.85, random_state=42, stratify=X["smoker"])

# mise à l'echelle
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
lasso_model_final = Lasso(alpha=238, random_state=42)
lasso_model_final.fit(X_train, y_train)
metriques(model=lasso_model_final, X_test=X_test, y_test=y_test)


# pipeline

In [60]:
# residus distribution des residus, QQ plot +++

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression


X = donnees.drop("charges", axis=1)
y = donnees["charges"]

def preprocessing_0(X, y) :
    
    encodeur_smoker = LabelBinarizer(pos_label=1, neg_label=0)
    encodeur_smoker.classes_ = ["yes", "non"]

    encodeur_sex = LabelBinarizer(pos_label=1, neg_label=0)
    encodeur_sex.classes_ = np.array(['male', 'female'])

    X["smoker"] = encodeur_smoker.fit_transform(X["smoker"])
    X["sex"] = encodeur_sex.fit_transform(X["sex"])
    X.rename(columns={"sex": "sex_male"}, inplace=True)

    # encodage de region
    encodeur_region = OneHotEncoder(sparse_output=False)
    region_encodee = encodeur_region.fit_transform(X[["region"]])
    df_region_encodee = pd.DataFrame(region_encodee, columns=encodeur_region.get_feature_names_out(["region"]), index=X.index)
    
    X = pd.concat([X, df_region_encodee], axis=1)
    X.drop("region", axis=1, inplace=True)

    X["bmi_cat"] = X["bmi"].apply(categorie_bmi)
    X["age_cat"] = X["age"].apply(categorie_bmi)
    X = pd.get_dummies(X, columns=["bmi_cat"], drop_first=True)
    X = pd.get_dummies(X, columns=["age_cat"], drop_first=True)

    return X, y

X, y = preprocessing_0(X, y)

X_train, X_test, y_train, y_test = train_test_split( X, y, shuffle=True, train_size=0.85, random_state=42, stratify=X["smoker"])

final_pipeline = Pipeline([
    ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ('scaler', StandardScaler()),
    ("feature_selection", SelectKBest(score_func=f_regression, k=55)),
    ('regressor', Lasso(alpha=238, random_state=42))
])

final_pipeline.fit(X_train, y_train)

metriques(model=final_pipeline, X_test=X_test, y_test=y_test)
