# Modélisation

## Importation des modules et du dataset

In [11]:
import pandas as pd
import pickle
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler

# Location of the CSV that the rest of the notebook works from.
DATASET_PATH = "dataset_cleaned.csv"

data = pd.read_csv(DATASET_PATH)

## Transformation des variables 

In [12]:
# Weight-status classes used to turn the numeric BMI into a categorical feature.
# Each "range" is half-open: lower <= bmi < upper. Consecutive classes share
# their boundary, so there is no gap a value can fall into.
bmi_scale = [
    {"category": "Underweight", "range": [float("-inf"), 18.5]},
    {"category": "Healthy weight", "range": [18.5, 25]},
    {"category": "Overweight", "range": [25, 30]},
    {"category": "Obesity class I", "range": [30, 35]},
    {"category": "Obesity class II", "range": [35, 40]},
    {"category": "Obesity class III", "range": [40, float("inf")]},
]

# Bin edges: the first lower bound followed by every upper bound.
bmi_edges = [bmi_scale[0]["range"][0]] + [group["range"][1] for group in bmi_scale]
bmi_labels = [group["category"] for group in bmi_scale]

# right=False makes the bins [lower, upper), matching the table above.
# pd.cut yields exactly one label per row (a missing BMI stays missing), so the
# result always has the same length as `data`.
# The categorical result is cast back to plain Python strings in a list so the
# new column has the same dtype as a column built from a list of labels.
bmi_count = (
    pd.cut(data["bmi"], bins=bmi_edges, labels=bmi_labels, right=False)
    .astype(object)
    .tolist()
)

# Drop and re-add so the categorical "bmi" ends up as the last column.
data = data.drop("bmi", axis=1)
data["bmi"] = bmi_count

## Création des train set et test set

In [13]:
# Features are every column except the target, "charges".
X = data.drop(columns="charges")
y = data["charges"]

# 85% of the rows go to training and the remainder to test. The split is
# shuffled with a fixed seed and stratified on the "smoker" column.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.85,
    shuffle=True,
    stratify=X["smoker"],
    random_state=42,
)

## Étapes de preprocessing

In [14]:
# Partition the feature names by dtype: float/int columns are scaled,
# object (string) columns are one-hot encoded.
num_col = X.select_dtypes(include=[float, int]).columns.tolist()
cat_col = X.select_dtypes(include=[object]).columns.tolist()

# Columns that match neither selection are not listed in any transformer.
preprocessing = ColumnTransformer(
    transformers=[
        ("one_hot", OneHotEncoder(), cat_col),
        ("scaling", StandardScaler(), num_col),
    ]
)

## Lasso

In [20]:
# Regularisation strength passed to Lasso.
# NOTE(review): how 37.3 was selected is not shown in this notebook — confirm.
LASSO_ALPHA = 37.3

lasso_model = Lasso(alpha=LASSO_ALPHA)

# Preprocess, expand to degree-2 polynomial features, then fit the Lasso.
pipe_lasso = make_pipeline(
    preprocessing,
    PolynomialFeatures(degree=2),
    lasso_model,
)

pipe_lasso.fit(X_train, y_train)

0.922984950453674

## Sérialisation du modèle dans un fichier pkl

In [16]:
# Persist the fitted pipeline (preprocessing + polynomial features + Lasso).
# The `with` block closes the file on exit, so no explicit close() is needed.
# Pickle files can execute arbitrary code when loaded: only unpickle model.pkl
# in a trusted environment.
with open("model.pkl", "wb") as file:
    pickle.dump(pipe_lasso, file)