# Modélisation

## Importation des modules et du dataset

In [1]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler

# Load the cleaned dataset into a DataFrame; the relative path is resolved
# against the notebook's current working directory.
data = pd.read_csv("dataset_cleaned.csv")

## Transformation des variables 

In [2]:
# BMI categories expressed as half-open intervals [lower, upper): the upper
# bound of one class is exactly the lower bound of the next, so no real BMI
# value can fall into a gap between two classes.
bmi_scale = [
    {"category": "Underweight", "range": [float("-inf"), 18.5]},
    {"category": "Healthy weight", "range": [18.5, 25]},
    {"category": "Overweight", "range": [25, 30]},
    {"category": "Obesity class I", "range": [30, 35]},
    {"category": "Obesity class II", "range": [35, 40]},
    {"category": "Obesity class III", "range": [40, float("inf")]},
]

# Bin edges and labels derived from the scale above so there is a single
# source of truth for the category boundaries.
bmi_edges = [bmi_scale[0]["range"][0]] + [group["range"][1] for group in bmi_scale]
bmi_labels = [group["category"] for group in bmi_scale]

# Only bin while "bmi" is still numeric: this keeps the cell idempotent, so
# re-running it after the column has been replaced by labels is a no-op
# instead of an error.
if pd.api.types.is_numeric_dtype(data["bmi"]):
    # right=False makes every bin closed on the left and open on the right.
    # Missing BMI values stay missing rather than silently shifting the rows.
    # The result is cast to object so that the dtype-based column selection
    # used for preprocessing treats it as a categorical (string) feature.
    bmi_count = pd.cut(
        data["bmi"], bins=bmi_edges, labels=bmi_labels, right=False
    ).astype(object)

    # Drop then re-add so the categorical "bmi" ends up as the last column,
    # as it did before.
    data = data.drop("bmi", axis=1)
    data["bmi"] = bmi_count

## Création des train set et test set

In [3]:
# Separate the target ("charges") from the explanatory variables.
y = data["charges"]
X = data.drop(columns="charges")

# Shuffled 85 % / 15 % split with a fixed seed; stratifying on "smoker"
# keeps the smoker proportion the same in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.85,
    shuffle=True,
    stratify=X["smoker"],
    random_state=42,
)

## Etapes de preprocessing 

In [4]:
# Split the feature columns by dtype: numeric columns are standardised,
# object (string) columns are one-hot encoded.
num_col = list(X.select_dtypes(include=[float, int]).columns)
cat_col = list(X.select_dtypes(include=[object]).columns)

num_pipe = make_pipeline(StandardScaler())

preprocessing = ColumnTransformer([
    # handle_unknown="ignore": a category that was not present when the
    # encoder was fitted (possible for rare categories inside a
    # cross-validation fold or in the test set) is encoded as all zeros
    # instead of raising an error at transform time.
    ("one_hot", OneHotEncoder(handle_unknown="ignore"), cat_col),
    ("scaling", num_pipe, num_col),
])

## Regression linéaire

In [5]:
# Baseline: ordinary least squares on the degree-2 polynomial expansion of
# the preprocessed features.
linear_model = LinearRegression()

pipe_lr = make_pipeline(preprocessing, PolynomialFeatures(2), linear_model)

pipe_lr.fit(X_train, y_train)

# Show the fitted pipeline first, then leave the test-set score (R² for
# regressors) as the last expression so the notebook actually displays it;
# a bare expression in the middle of a cell is evaluated and discarded.
display(pipe_lr)
pipe_lr.score(X_test, y_test)

## ElasticNet

In [207]:
# ElasticNet regression on the degree-2 polynomial expansion of the
# preprocessed features.
elastic_model = ElasticNet(l1_ratio=0.9, alpha=0.1)
pipe_en = make_pipeline(
    preprocessing,
    PolynomialFeatures(degree=2),
    elastic_model,
)

# fit() returns the pipeline itself, so the test-set score can be chained
# and left as the cell output.
pipe_en.fit(X_train, y_train).score(X_test, y_test)

0.9174823660544655

### Recherche des paramètres avec grid search

In [208]:
# Candidate values 0.1, 0.2, ..., 0.9 for both ElasticNet hyperparameters;
# the "elasticnet__" prefix targets the ElasticNet step of the pipeline.
params = dict(
    elasticnet__alpha=np.arange(0.1, 1, 0.1),
    elasticnet__l1_ratio=np.arange(0.1, 1, 0.1),
)

# Exhaustive search over the grid with 5-fold cross-validation on the
# training set.
grid = GridSearchCV(estimator=pipe_en, param_grid=params, cv=5)
grid.fit(X_train, y_train)

# Best mean cross-validated score, then the score of the refitted best
# model on the held-out test set.
display(grid.best_score_, grid.score(X_test, y_test))
grid.best_params_

0.8414091204207244

0.9174823660544655

{'elasticnet__alpha': 0.1, 'elasticnet__l1_ratio': 0.9}

## Ridge

In [209]:
# Ridge regression on the degree-2 polynomial expansion of the preprocessed
# features.
ridge_model = Ridge(alpha=1.9)
pipe_r = make_pipeline(
    preprocessing,
    PolynomialFeatures(degree=2),
    ridge_model,
)

# fit() returns the pipeline itself, so the test-set score can be chained
# and left as the cell output.
pipe_r.fit(X_train, y_train).score(X_test, y_test)

0.9210229258570934

### Recherche des paramètres avec grid search

In [211]:
# Candidate regularisation strengths 0.5, 0.75, ..., 1.75 for the Ridge step
# of the pipeline.
params = dict(ridge__alpha=np.arange(0.5, 2, 0.25))

# Exhaustive search over the grid with 5-fold cross-validation on the
# training set.
grid = GridSearchCV(estimator=pipe_r, param_grid=params, cv=5)
grid.fit(X_train, y_train)

# Best mean cross-validated score, then the score of the refitted best
# model on the held-out test set.
display(grid.best_score_, grid.score(X_test, y_test))
grid.best_params_

0.8456389806068112

0.9203439595296002

{'ridge__alpha': 0.5}

## Lasso

In [219]:
# Lasso regression on the degree-2 polynomial expansion of the preprocessed
# features.
lasso_model = Lasso(alpha=40)
pipe_l = make_pipeline(
    preprocessing,
    PolynomialFeatures(degree=2),
    lasso_model,
)

# fit() returns the pipeline itself, so the test-set score can be chained
# and left as the cell output.
pipe_l.fit(X_train, y_train).score(X_test, y_test)

0.9229672159033699

Idées pour améliorer le modèle en amont :

- Créer des polynomial features (quelles variables combiner ensemble ?)
- Transformer certaines variables dont la distribution n'est pas normale (transformation logarithmique)
- Faire passer des variables quantitatives continues en d'autres types (ex. bmi en catégories ?)

Améliorer le modèle en aval :

- Effectuer un grid search pour les modèles Lasso et Elastic Net (pas d'hyperparamètre de régularisation alpha à fixer pour la régression linéaire)
- En utilisant Elastic Net : si alpha = 0, on retrouve la régression linéaire
- Analyser les observations qui ont une grosse influence sur l'entraînement
