Comment utiliser le pipeline de sk learn

In [11]:
from sklearn.model_selection import train_test_split

from sklearn.pipeline import make_pipeline, make_union
from sklearn.compose import make_column_transformer

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, RobustScaler, OrdinalEncoder, OneHotEncoder, Binarizer, PolynomialFeatures,MinMaxScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LassoCV

import seaborn as sns

import pandas as pd 
import numpy as np 

dataframe = pd.read_csv("dataset.csv", sep=",", index_col=False)
dataframe.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [12]:
# bins = [0, 18.5, 25, 30, 35, 40, float('inf')]
# labels = ['Poids insuffisant', 'Poids normal', 'Surpoids', 'Obésité grade I', 'Obésité grade II', 'Obésité grade III']

# # Remplacer le bmi_encoder par la catégorisation d'IMC
# dataframe['bmi_category'] = pd.cut(dataframe['bmi'], bins=bins, labels=labels, right=False)

# # Encodage numérique des catégories d'IMC
# category_mapping = {
#     'Poids insuffisant': 0,
#     'Poids normal': 1,
#     'Surpoids': 2,
#     'Obésité grade I': 3,
#     'Obésité grade II': 4,
#     'Obésité grade III': 5
# }
# dataframe['bmi_encoded'] = dataframe['bmi_category'].map(category_mapping)

In [13]:
#no age encoder
sex_encoder = OrdinalEncoder(categories=[['female', 'male']])

#bmi_encoder = Binarizer(threshold=30)

#bmi_encoder = KBinsDiscretizer(n_bins = 30, encode='ordinal')
# bmi_encoder.bin_edges_ = bmi_thresholds

# from bmi_transformer import BmiDigitizer

# bmi_thresholds = np.array([0, 18.5, 25, 30, 35, 40])
# bmi_encoder = BmiDigitizer(bins = bmi_thresholds)

bmi_encoder = make_union(Binarizer(threshold=29.99), StandardScaler())

#no children encoder
smoker_encoder = OrdinalEncoder(categories=[['no', 'yes']])
region_encoder = OneHotEncoder()

#dataframe[["bmi"]].head()
result = bmi_encoder.fit_transform(dataframe[["bmi"]])
result


array([[ 0.        , -0.45332   ],
       [ 1.        ,  0.5096211 ],
       [ 1.        ,  0.38330685],
       ...,
       [ 1.        ,  1.0148781 ],
       [ 0.        , -0.79781341],
       [ 0.        , -0.26138796]], shape=(1338, 2))

In [14]:
dataframe = dataframe[ dataframe['charges'] <= 60000]

y = dataframe['charges']
X = dataframe.drop('charges', axis=1)

# pour pouvoir comparer le score
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.85, random_state=42, stratify=X['smoker'])


In [15]:
age_pipeline = make_pipeline( StandardScaler()) #, PolynomialFeatures(degree=2))
sex_pipeline = make_pipeline( sex_encoder)
bmi_pipeline = make_pipeline( bmi_encoder) #, OneHotEncoder())
children_pipeline = make_pipeline( StandardScaler())
smoker_pipeline = make_pipeline( smoker_encoder)
region_pipeline = make_pipeline( region_encoder)

In [16]:
# first_pipeline = make_column_transformer(
#     (age_pipeline, ['age']),  
#     (bmi_pipeline, ['bmi']),
#     (smoker_pipeline, ['smoker']))
    
# polynomial_pipeline = make_pipeline(first_pipeline, PolynomialFeatures(degree=2, include_bias=False))

# preprocessor = make_column_transformer( 
#     (polynomial_pipeline, ['age', 'bmi', 'smoker']),
#     (sex_pipeline, ['sex']), 
#     (children_pipeline, ['children']),
#     (region_pipeline, ['region'])) 

preprocessor = make_pipeline( 
    make_column_transformer(
        (OneHotEncoder(), ['region']),
        (StandardScaler(), ['age']),  
        (OrdinalEncoder(categories=[['female', 'male']]), ['sex']),
        (bmi_pipeline, ['bmi']),
        (StandardScaler(), ['children']), 
        (OrdinalEncoder(categories=[['no', 'yes']]), ['smoker']),
        (OneHotEncoder(), ['region'])),      
    PolynomialFeatures(degree=2))
    

In [17]:
model = make_pipeline(preprocessor, LassoCV(cv=5))
#model = make_pipeline(preprocessor, Lasso(alpha=39))
model

In [18]:

model.fit(X_train, y_train)
score = model.score(X_test, y_test)
print(f"  score  = {score} ")

  score  = 0.8631441041683244 


Avec le meilleur alpha :

In [19]:
from sklearn.model_selection import GridSearchCV

dataframe = dataframe [dataframe['charges']<=60000]

X = dataframe.drop(columns=['charges'])  
y = dataframe['charges']   

#X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, train_size=0.85, random_state=42, stratify=X['smoker'])
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle = True, train_size=0.85, random_state=42)

model = make_pipeline(
    preprocessor, 
    #PolynomialFeatures(degree=2, include_bias=False),  
    Lasso()  # Modèle Lasso
)

param_grid = {'lasso__alpha':np.arange(20, 40, 1)}
grid_search = GridSearchCV(model, param_grid, cv=4, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
print("Meilleurs paramètres :", grid_search.best_params_)

best_model = grid_search.best_estimator_
score = best_model.score(X_test, y_test)
print(f"  score  = {score} ")

Meilleurs paramètres : {'lasso__alpha': np.int64(39)}
  score  = 0.921814061762936 


In [20]:
y_predicted = model.predict(X_test)

import sklearn.metrics as skl_metrics

rmse = skl_metrics.root_mean_squared_error(y_test, y_predicted)
Rsquare = skl_metrics.r2_score(y_test, y_predicted)

print(f"r2_score = {Rsquare} ")
print(f"sklearn.metrics.mean_squared_error = {rmse}")


NotFittedError: Pipeline is not fitted yet.

In [None]:
import matplotlib.pyplot as plt

#Charges = f ( age )
plt.xlabel("age")
plt.ylabel("charges")
plt.plot(X_test["age"], y_predicted, "r.")
plt.plot(X_test["age"], y_test, "g.")
plt.show()

In [None]:
columns_to_check = ['bmi', 'charges']

for col in columns_to_check:
    plt.figure(figsize=(8, 4))
    sns.boxplot(x=dataframe[col])
    plt.title(f"Boxplot pour {col}")
    plt.show()

In [None]:
# Fonction pour repérer les valeurs aberrantes basées sur l'IQR
def detect_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] < lower_bound) | (df[column] > upper_bound)]

# Repérer les valeurs aberrantes dans `bmi` et `charges`
outliers_bmi = detect_outliers_iqr(dataframe, 'bmi')
outliers_charges = detect_outliers_iqr(dataframe, 'charges')

print("Valeurs aberrantes dans bmi :")
print(outliers_bmi)

print("\nValeurs aberrantes dans charges :")
print(outliers_charges)


In [None]:
sns.scatterplot(data=dataframe, x='bmi', y='charges')
plt.title("Scatter plot de BMI vs Charges")
plt.show()


In [94]:
dataframe_cleaned = dataframe [dataframe['charges']<=60000]