## Exemple de Preprocessing Simple (Encoding + Normalisation)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OrdinalEncoder, OneHotEncoder, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn import set_config
set_config(transform_output='pandas')

## Data Diamonds

In [2]:
df = sns.load_dataset('diamonds')
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


# Train test Split

In [3]:
train_set, test_set = train_test_split(df, test_size=0.2, random_state=0)

print("train set shape = ", train_set.shape)
print("test set shape = ", test_set.shape)

train set shape =  (43152, 10)
test set shape =  (10788, 10)


# Encoding train set

In [4]:
df['cut'].unique()

['Ideal', 'Premium', 'Good', 'Very Good', 'Fair']
Categories (5, object): ['Ideal', 'Premium', 'Very Good', 'Good', 'Fair']

In [5]:
# ordre d'encodage des variables
cut_order = ['Fair','Good','Very Good','Premium', 'Ideal']
color_order = ['J','I','H','G','F','E','D']
clarity_order = ['I1', 'SI2', 'SI1','VS2', 'VS1', 'VVS2', 'VVS1', 'IF']

encoder = OrdinalEncoder(categories= [cut_order, color_order, clarity_order])
train_set[['cut','color','clarity']] = encoder.fit_transform(train_set[['cut','color', 'clarity']])

In [6]:
train_set.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
26250,1.63,4.0,3.0,4.0,61.7,55.0,15697,7.56,7.6,4.68
31510,0.34,4.0,3.0,3.0,62.2,57.0,765,4.47,4.44,2.77
40698,0.4,4.0,5.0,5.0,61.7,56.0,1158,4.73,4.77,2.93
42634,0.58,3.0,2.0,2.0,62.1,55.0,1332,5.38,5.35,3.33
47714,0.63,2.0,6.0,2.0,62.8,57.0,1885,5.4,5.46,3.41


# Normalisation

In [7]:
normaliser = MinMaxScaler()
train_set = pd.DataFrame(normaliser.fit_transform(train_set), columns=df.columns)
train_set.head(3)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
26250,0.297297,1.0,0.5,0.571429,0.519444,0.333333,0.83099,0.703911,0.129032,0.14717
31510,0.029106,1.0,0.5,0.428571,0.533333,0.388889,0.023681,0.416201,0.075382,0.087107
40698,0.04158,1.0,0.833333,0.714286,0.519444,0.361111,0.044929,0.44041,0.080985,0.092138


In [8]:
test_set.head(3)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
10176,1.1,Ideal,H,SI2,62.0,55.0,4733,6.61,6.65,4.11
16083,1.29,Ideal,H,SI1,62.6,56.0,6424,6.96,6.93,4.35
13420,1.2,Premium,I,SI1,61.1,58.0,5510,6.88,6.8,4.18


In [9]:
test_set_encoding = encoder.transform(df[['cut', 'color', 'clarity']])
test_set[ ['cut','color', 'clarity']] = pd.DataFrame(test_set_encoding)
test_set.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
10176,1.1,4.0,2.0,1.0,62.0,55.0,4733,6.61,6.65,4.11
16083,1.29,4.0,2.0,2.0,62.6,56.0,6424,6.96,6.93,4.35
13420,1.2,3.0,1.0,2.0,61.1,58.0,5510,6.88,6.8,4.18
20407,1.5,4.0,4.0,2.0,60.9,56.0,8770,7.43,7.36,4.5
8909,0.9,2.0,4.0,3.0,61.7,57.0,4493,6.17,6.21,3.82


In [10]:
test_set = normaliser.transform(test_set)

# Pipeline Sklearn

In [11]:
from sklearn.pipeline import Pipeline, make_pipeline

In [12]:
df = sns.load_dataset('diamonds')
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [13]:
train_set, test_set = train_test_split(df, test_size=0.2, random_state=0)

In [14]:
pipeline = make_pipeline(OrdinalEncoder(), MinMaxScaler())
# Ou Pipeline(steps = [ ( 'Encoder', OrdinalEncoder() ), ( 'Scaler', MinMaxScaler()) ] )

In [15]:
pipeline

# Pipeline compose

In [16]:
from sklearn.compose import ColumnTransformer

In [17]:
categorical_cols = ['cut', 'color', 'clarity']
Encoder = ColumnTransformer(transformers=[("Encoder" , OrdinalEncoder(), categorical_cols )], remainder='passthrough')

In [18]:
pipeline = make_pipeline(Encoder, MinMaxScaler())
pipeline

In [19]:
pipeline.fit_transform(train_set).head(3)

Unnamed: 0,Encoder__cut,Encoder__color,Encoder__clarity,remainder__carat,remainder__depth,remainder__table,remainder__price,remainder__x,remainder__y,remainder__z
26250,0.5,0.5,0.571429,0.297297,0.519444,0.333333,0.83099,0.703911,0.129032,0.14717
31510,0.5,0.5,0.714286,0.029106,0.533333,0.388889,0.023681,0.416201,0.075382,0.087107
40698,0.5,0.166667,1.0,0.04158,0.519444,0.361111,0.044929,0.44041,0.080985,0.092138


In [20]:
pipeline.transform(test_set).head(3)

Unnamed: 0,Encoder__cut,Encoder__color,Encoder__clarity,remainder__carat,remainder__depth,remainder__table,remainder__price,remainder__x,remainder__y,remainder__z
10176,0.5,0.666667,0.428571,0.18711,0.527778,0.333333,0.238214,0.615456,0.112903,0.129245
16083,0.5,0.666667,0.285714,0.226611,0.544444,0.361111,0.329639,0.648045,0.117657,0.136792
13420,0.75,0.833333,0.285714,0.2079,0.502778,0.416667,0.280223,0.640596,0.11545,0.131447


# Pipeline Niveau 1

In [21]:
pipeline1 = Pipeline(steps=[
                             ('Normaliser', StandardScaler()), 
                             ('poly', PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)),
                             ('SelectFeature', SelectKBest(score_func = f_regression, k = 4)) 
                            ])
pipeline1

# Pipeline Niveau 2

In [22]:
from sklearn.compose import  make_column_selector as selector

In [23]:
Encoder2 = ColumnTransformer(transformers = [('Encoder',
                                               OneHotEncoder(sparse_output = False, drop='first', handle_unknown='ignore'),
                                               selector(dtype_include='category'))],
                            remainder = 'passthrough')

pipeline2 = Pipeline(steps = [
                                ('Encoder', Encoder2),
                                ('Poly', PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)),
                                ('Normaliser', MinMaxScaler()) ] )
pipeline2

# Pipeline Niveau 3

In [24]:
# Pipeline pour selectionné meilleurs variable categorielle après encodage
pipeline_cat = Pipeline(steps= [
    ('OneHot', OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore')),
    ('SelectBest', SelectKBest(score_func=f_regression, k=1))
])

pipeline_cat

In [25]:
Encoder3 = ColumnTransformer(transformers = [('cat_pipeline', 
                                              pipeline_cat, 
                                              selector(dtype_include='category') ) ],
                             remainder= 'passthrough' )

In [26]:
pipeline3 = Pipeline(steps=[
    ('Encoder', Encoder3),
    ('Poly', PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)),
    ('Normaliser', MinMaxScaler())
])

pipeline3

 # Pipeline Niveau 4

In [27]:
columntransform = ColumnTransformer(remainder = 'passthrough',
                                    transformers = [
                                        ('categorical col transform',
                                        Pipeline(steps=[
                                            ('OneHot', OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore') ),
                                            ('SelectBest', SelectKBest(score_func=f_regression, k=2))
                                        ]),
                                        selector(dtype_include='category')),
                                        ('Standar_Sacling', StandardScaler(), ['height', 'weight']),
                                        ('MinMax_scaling', MinMaxScaler(), ['age', 'floor']),
                                    ])

columntransform

In [28]:
pipeline4 = Pipeline(steps=[
                ('Encoder',columntransform),
                ('poly', PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)),
                ('normaliser', MinMaxScaler())
              ])

pipeline4

In [None]:
# niveau quatre autre manière

column_transform = ColumnTransformer(transformers = [('select_cat', 
                                                      Pipeline( steps = [('OnehotEncoder', OneHotEncoder(sparse_output = False, drop='first', handle_unknown='ignore')),
                                                                         ('SelectFeature', SelectKBest(score_func=f_regression, k=5))]), 
                                                      selector(dtype_include='category')),
                                                     ('StandardScaler', StandardScaler(), ['height', 'weight']),
                                                     ('MinMaxScaler', MinMaxScaler(), ['age', 'floor'])
                                                     ],
                                    remainder = 'passthrough')

pipeline2 = Pipeline(steps=[('ColumnTransformer', column_transform),
                            ('Poly', PolynomialFeatures(include_bias=True, interaction_only=True)),
                            ('MinMax', MinMaxScaler())])
pipeline2