In [None]:
# default_exp Pipeline

# Pipeline
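> Preprocessing transformer, train/test split helper and grid-search wrapper used to classify houses with a scikit-learn pipeline (custom transformer → scaling → PCA → SVM).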

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [None]:
#export
def get_data(path):
    """Read the CSV file located at `path` and return it as a pandas DataFrame."""
    return pd.read_csv(path)

In [None]:
#hide
# Load the house dataset from a CSV path relative to the working directory.
data = get_data("data/new_maisons-nan.csv")

In [None]:
#hide
data.head()

Unnamed: 0,surface,nb_chambre,date_creation,couleur,prix,classe
0,300.0,3.0,10/02/2010,bleu,100,N
1,200.0,2.0,03/09/2012,vert,90,N
2,250.0,3.0,21/08/2011,bleu,80,N
3,280.0,3.0,21/08/2010,bleu,85,N
4,200.0,,01/10/2012,bleu,82,N


In [None]:
#hide
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   surface        10 non-null     float64
 1   nb_chambre     9 non-null      float64
 2   date_creation  11 non-null     object 
 3   couleur        11 non-null     object 
 4   prix           11 non-null     int64  
 5   classe         11 non-null     object 
dtypes: float64(2), int64(1), object(3)
memory usage: 656.0+ bytes


### The `surface` and `nb_chambre` columns contain missing values (1 and 2 of the 11 rows, respectively)

In [None]:
#hide
data.describe()

Unnamed: 0,surface,nb_chambre,prix
count,10.0,9.0,11.0
mean,368.0,4.333333,327.0
std,141.090987,1.581139,231.350384
min,200.0,2.0,80.0
25%,257.5,3.0,87.5
50%,350.0,5.0,480.0
75%,487.5,6.0,520.0
max,600.0,6.0,600.0


In [None]:
#hide
# Features: every column except the last two (`prix` and `classe` in this dataset).
X = data.iloc[:,:-2]
# Target: the `classe` label column.
y = data["classe"]

In [None]:
#hide
# Quick visual check of the first rows of the feature table and of the target.
print("top 5 from X")
print(X.head())
print("----------")
print("top 5 from y")
print(y.head())

top 5 from X
   surface  nb_chambre date_creation couleur
0    300.0         3.0    10/02/2010    bleu
1    200.0         2.0    03/09/2012    vert
2    250.0         3.0    21/08/2011    bleu
3    280.0         3.0    21/08/2010    bleu
4    200.0         NaN    01/10/2012    bleu
----------
top 5 from y
0    N
1    N
2    N
3    N
4    N
Name: classe, dtype: object


In [None]:
#export
class TransformeeMaison(BaseEstimator, TransformerMixin):
    """Feature-engineering transformer for the house dataset.

    Works on a DataFrame holding the columns ``surface``, ``nb_chambre``,
    ``date_creation`` and ``couleur`` and returns a new DataFrame in which:

    * missing ``surface`` / ``nb_chambre`` values are replaced by the mean
      learned in ``fit`` (the ``nb_chambre`` mean is rounded);
    * ``date_creation`` is replaced by an ``annee`` (year) or an ``age``
      (current year minus creation year) column, depending on ``dateTo``;
    * ``couleur`` is replaced by one-hot columns named ``couleur_<value>``.

    Parameters
    ----------
    dateTo : {'annee', 'age'}, default='annee'
        Which feature is derived from ``date_creation``.

    Raises
    ------
    ValueError
        From ``fit`` / ``transform`` when ``dateTo`` is not a supported value.
    """

    # Supported values for the `dateTo` hyperparameter.
    _DATE_MODES = ('annee', 'age')

    def __init__(self,dateTo='annee'):
        # handle_unknown='ignore': a colour that was absent from the data seen
        # in fit (easy to hit with small cross-validation folds) is encoded as
        # all zeros instead of raising during transform.
        self.ohe = OneHotEncoder(handle_unknown='ignore')
        self.dateTo = dateTo

    def _check_date_mode(self):
        """Fail loudly instead of silently dropping the date feature."""
        if self.dateTo not in self._DATE_MODES:
            raise ValueError(
                "dateTo must be one of %r, got %r" % (self._DATE_MODES, self.dateTo)
            )

    def fit(self,X,y=None):
        """Learn the colour categories and the imputation values from `X`.

        Returns `self` so calls can be chained.
        """
        self._check_date_mode()
        self.ohe.fit(X.loc[:,['couleur']])
        self.surface_mean = X.surface.mean()
        # Room counts are whole numbers, so the imputed value is rounded.
        self.nb_chambre_mean = np.round(X.nb_chambre.mean())
        return self

    def transform(self,X,y=None):
        """Return a transformed copy of `X`; the input frame is not modified."""
        self._check_date_mode()

        # Replace the NaN values (fillna returns a new frame).
        X_ = X.fillna({'surface':self.surface_mean,
                       'nb_chambre':self.nb_chambre_mean})

        # The dataset stores dates as dd/mm/yyyy: without dayfirst=True pandas
        # assumes month-first, which swaps day and month on ambiguous rows and
        # can fail on rows such as 21/08/2011.
        dates = pd.to_datetime(X_['date_creation'], dayfirst=True)

        if self.dateTo == 'annee':
            X_['annee'] = dates.dt.year
        else:
            X_['age'] = datetime.now().year - dates.dt.year

        # Replace the colour by its one-hot encoding.
        couleur_encoded = self.ohe.transform(X_.loc[:,['couleur']]).toarray()
        couleur_columns = ['couleur_' + str(c) for c in self.ohe.categories_[0]]
        df_couleur_encoded = pd.DataFrame(couleur_encoded,
                                          columns=couleur_columns,
                                          index=X_.index)

        X_ = X_.drop(columns=['date_creation', 'couleur'])
        return pd.concat([X_, df_couleur_encoded], axis=1)

In [None]:
#hide
# Try the transformer on the full feature table, using the 'age' encoding of the date.
trsf = TransformeeMaison(dateTo='age')
new_data = trsf.fit(X).transform(X)
new_data.head()

Unnamed: 0,surface,nb_chambre,age,couleur_bleu,couleur_vert
0,300.0,3.0,12,1.0,0.0
1,200.0,2.0,10,0.0,1.0
2,250.0,3.0,11,1.0,0.0
3,280.0,3.0,12,1.0,0.0
4,200.0,4.0,10,1.0,0.0


In [None]:
#hide
new_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   surface       11 non-null     float64
 1   nb_chambre    11 non-null     float64
 2   age           11 non-null     int64  
 3   couleur_bleu  11 non-null     float64
 4   couleur_vert  11 non-null     float64
dtypes: float64(4), int64(1)
memory usage: 568.0 bytes


#### Diviser les données en train et test

In [None]:
#export
def split_data(X,y,test_size,random_state=None):
    """Split features and target into train and test subsets.

    Parameters
    ----------
    X, y : features and target, forwarded to `train_test_split`.
    test_size : size of the test subset, forwarded to `train_test_split`.
    random_state : optional seed forwarded to `train_test_split`; pass an int
        to get the same split on every run. The default (None) keeps the
        previous unseeded behaviour.

    Returns
    -------
    tuple
        (X_train, X_test, y_train, y_test)
    """
    X_train,X_test,y_train,y_test = train_test_split(
        X,y,test_size=test_size,random_state=random_state
    )
    return X_train,X_test,y_train,y_test

In [None]:
# Hold out 20% of the rows as a test set; no seed is passed, so the split differs between runs.
X_train,X_test,y_train,y_test = split_data(X,y,0.2)

## Appliquer la normalisation sur les données

In [None]:
#hide
# Refit the transformer (the dateTo='age' instance created earlier) on the training
# split only, then standardise the transformed training features.
X_trsf=trsf.fit(X_train).transform(X_train)
ss = StandardScaler()
ss_x_train = ss.fit_transform(X_trsf)

## Appliquer l'ACP sur les données

In [None]:
#hide
# Fit a PCA with its default number of components on the standardised training
# features, then print the explained-variance ratio per axis and its cumulative
# sum in percent.
pca = PCA()
pca.fit(ss_x_train)
print("Les variances par axe de pca")
print(pca.explained_variance_ratio_)
print("La somme cumulée par axe pca")
print(np.cumsum(pca.explained_variance_ratio_)*100)


Les variances par axe de pca
[6.67620901e-01 2.85684629e-01 4.28963110e-02 3.79815974e-03
 2.82888683e-34]
La somme cumulée par axe pca
[ 66.76209007  95.33055293  99.62018403 100.         100.        ]


## La mise en place du Pipeline + GridSearch + Modelling

##### Le pipeline commence par appliquer les transformations, puis la normalisation et l'ACP.
##### Enfin, il applique le SVM pour la prédiction.

In [None]:
#export
class Pipy :
    """Small wrapper that runs a grid search over a pipeline and keeps the result.

    Parameters
    ----------
    Pipe : estimator (typically a scikit-learn Pipeline) handed to GridSearchCV.
    Params : parameter grid handed to GridSearchCV.

    Attributes
    ----------
    gs : the GridSearchCV object created by `gridSearchy`, or None before the
        first call.
    """
    def __init__(self,Pipe,Params):
        self.Pipe = Pipe
        self.Params = Params
        self.gs = None

    def gridSearchy(self,X,y,cv=3,n_jobs=-1):
        """Fit a GridSearchCV on (X, y), store it in `self.gs` and return it.

        `cv` and `n_jobs` are forwarded to GridSearchCV; their defaults are the
        values that were previously hard-coded.
        """
        self.gs = GridSearchCV(self.Pipe,self.Params,cv=cv,n_jobs=n_jobs)
        self.gs.fit(X,y)
        return self.gs

    def gridBestEstimator(self):
        """Return `best_estimator_` of the stored grid search.

        Raises
        ------
        AttributeError
            If `gridSearchy` has not been called yet.
        """
        if self.gs is None:
            # Same exception type as the former NoneType attribute failure,
            # but with a message that tells the caller what to do.
            raise AttributeError(
                "No grid search has been run yet: call gridSearchy(X, y) "
                "before gridBestEstimator()."
            )
        return self.gs.best_estimator_

In [None]:
# Steps, in order: custom house transformer, standard scaling, PCA, SVM classifier.
Pipe = Pipeline([
                 ('trsf',TransformeeMaison()),
                 ('ss',StandardScaler()),
                 ('pca',PCA()),
                 ('svm',SVC())
                ])
## Parameter grid used by the grid search to find the best combination
## (keys follow the <step name>__<parameter> convention).
Params ={
    'trsf__dateTo':('age','annee'),
    'pca__n_components' : (2,3),
    'svm__kernel':('linear','rbf')
}

In [None]:
# Bundle the pipeline and its parameter grid in the grid-search helper.
p =Pipy(Pipe,Params)

In [None]:
# Run the cross-validated grid search on the training split only.
p.gridSearchy(X_train,y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('trsf', TransformeeMaison()),
                                       ('ss', StandardScaler()), ('pca', PCA()),
                                       ('svm', SVC())]),
             n_jobs=-1,
             param_grid={'pca__n_components': (2, 3),
                         'svm__kernel': ('linear', 'rbf'),
                         'trsf__dateTo': ('age', 'annee')})

In [None]:
# Show the best pipeline found by the grid search.
p.gridBestEstimator()

Pipeline(steps=[('trsf', TransformeeMaison(dateTo='age')),
                ('ss', StandardScaler()), ('pca', PCA(n_components=2)),
                ('svm', SVC())])