<a href="https://colab.research.google.com/github/Ghalia671/MODULE1/blob/master/Ghalia_1_6_1_pipeline_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocessing et modèle

## Mauvaise approche

In [1]:
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

cancer = load_breast_cancer()

X,y=cancer.data, cancer.target

scaler = MinMaxScaler().fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X,y, random_state=5)


In [2]:
svm = SVC()

svm.fit(X_train, y_train)

print("Train score: {:.2f}".format(svm.score(X_train, y_train)))
print("Test score: {:.2f}".format(svm.score(X_test, y_test)))

Train score: 0.91
Test score: 0.95


## Bonne approche

éviter le data leakage

In [3]:
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

cancer = load_breast_cancer()
X,y=cancer.data, cancer.target

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=5)

scaler = MinMaxScaler().fit(X_train)

In [4]:

X_train_scaled = scaler.transform(X_train)
#X_train_scaled = scaler.fit_transform(X_train)

svm = SVC()

svm.fit(X_train_scaled, y_train)

X_test_scaled = scaler.transform(X_test)

print("Train score: {:.2f}".format(svm.score(X_train_scaled, y_train)))
print("Test score: {:.2f}".format(svm.score(X_test_scaled, y_test)))

Train score: 0.99
Test score: 0.98


# Construction de pipeline

## Utilisation de  `Pipeline`

In [5]:
from sklearn.svm import SVC
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

cancer = load_breast_cancer()
X,y=cancer.data, cancer.target

X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=0)


In [6]:
from sklearn.pipeline import Pipeline

pipe = Pipeline([("scaler", MinMaxScaler()), ("svm", SVC())])

In [7]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('scaler', MinMaxScaler()), ('svm', SVC())])

In [8]:
print("Test score: {:.2f}".format(pipe.score(X_test, y_test)))

Test score: 0.97


## Utilisation de ``make_pipeline``

In [8]:
from sklearn.pipeline import make_pipeline

pipe_long = Pipeline([("scaler", MinMaxScaler()), ("svm", SVC(C=100))])

pipe_short = make_pipeline(MinMaxScaler(), SVC(C=100))

In [9]:
pipe_short.steps

[('minmaxscaler', MinMaxScaler()), ('svc', SVC(C=100))]

In [10]:
pipe_long.steps

[('scaler', MinMaxScaler()), ('svm', SVC(C=100))]

## Pipeline à trois étapes

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

pipe = make_pipeline(StandardScaler(), PCA(n_components=2), SVC(C=100))

pipe.steps

[('standardscaler', StandardScaler()),
 ('pca', PCA(n_components=2)),
 ('svc', SVC(C=100))]

In [12]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('pca', PCA(n_components=2)), ('svc', SVC(C=100))])

In [13]:
print("Test score: {:.2f}".format(pipe.score(X_test, y_test)))

Test score: 0.92


## Diagramme

In [14]:
from sklearn import set_config
set_config(display='diagram')
pipe

In [15]:
import sklearn
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The scikit-learn version is 0.23.2.


In [18]:
#!pip install --upgrade scikit-learn

Collecting scikit-learn
[?25l  Downloading https://files.pythonhosted.org/packages/5c/a1/273def87037a7fb010512bbc5901c31cfddfca8080bc63b42b26e3cc55b3/scikit_learn-0.23.2-cp36-cp36m-manylinux1_x86_64.whl (6.8MB)
[K     |████████████████████████████████| 6.8MB 2.8MB/s 
Collecting threadpoolctl>=2.0.0
  Downloading https://files.pythonhosted.org/packages/f7/12/ec3f2e203afa394a149911729357aa48affc59c20e2c1c8297a60f33f133/threadpoolctl-2.1.0-py3-none-any.whl
Installing collected packages: threadpoolctl, scikit-learn
  Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-0.23.2 threadpoolctl-2.1.0


# Etapes dans un pipeline

In [16]:
pipe.steps

[('standardscaler', StandardScaler()),
 ('pca', PCA(n_components=2)),
 ('svc', SVC(C=100))]

In [17]:
pipe.named_steps["svc"].support_

array([  4,  14,  36,  40,  56,  79, 102, 105, 109, 111, 136, 148, 157,
       179, 185, 194, 195, 213, 218, 237, 269, 286, 292, 307, 310, 312,
       341, 370, 377, 378, 384, 392, 407,   8,   9,  42,  82,  87, 112,
       117, 124, 166, 181, 184, 188, 204, 208, 226, 259, 283, 294, 306,
       316, 324, 355, 366, 367, 368, 381, 393, 405, 408, 412, 419],
      dtype=int32)

In [18]:

pipe.named_steps["pca"].components_


array([[ 0.22136524,  0.10000219,  0.22951811,  0.22352098,  0.14302288,
         0.24211071,  0.26026925,  0.26425272,  0.1342154 ,  0.058505  ,
         0.20686479,  0.00729622,  0.20987422,  0.20223841,  0.01725187,
         0.16639025,  0.13855921,  0.17994092,  0.02943904,  0.10192967,
         0.23041956,  0.100572  ,  0.23779661,  0.22751009,  0.13135979,
         0.21077884,  0.2301419 ,  0.25334406,  0.11911651,  0.13088259],
       [-0.2301732 , -0.05721755, -0.21335503, -0.22693534,  0.17877041,
         0.14744861,  0.06557463, -0.03134067,  0.19050711,  0.36396122,
        -0.10501365,  0.0939736 , -0.0974744 , -0.14961032,  0.21204003,
         0.235435  ,  0.21050921,  0.15228014,  0.1810749 ,  0.27867942,
        -0.2159829 , -0.04249497, -0.20003599, -0.21518192,  0.17146856,
         0.13883173,  0.10503362,  0.00064033,  0.14065767,  0.27318654]])

In [19]:
pipe.named_steps["standardscaler"].mean_

array([1.41591714e+01, 1.92330047e+01, 9.21438967e+01, 6.58415023e+02,
       9.63659859e-02, 1.03669601e-01, 8.86501308e-02, 4.91440610e-02,
       1.80473239e-01, 6.26169953e-02, 4.04795070e-01, 1.21222723e+00,
       2.84097934e+00, 4.06956737e+01, 6.98667371e-03, 2.50779930e-02,
       3.16993207e-02, 1.17015352e-02, 2.04367559e-02, 3.71274953e-03,
       1.63168169e+01, 2.56379812e+01, 1.07459131e+02, 8.87647887e+02,
       1.32503404e-01, 2.52836338e-01, 2.69481120e-01, 1.15279345e-01,
       2.89649296e-01, 8.35402582e-02])

# Exercice

**Question** Construire un modèle de classification avec KNN sur les données iris, sans et avec la normalisation (min max et centrer-réduction). Comparer les performances des deux modèles.

In [20]:
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris

iris = load_iris()

X,y=iris.data, iris.target
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y,
            test_size=0.4, random_state=0)
clf = KNeighborsClassifier(n_neighbors = 5)
clf = clf.fit(X_train, y_train)
print("train score : ",clf.score(X_train, y_train))
print("test score : ",clf.score(X_test, y_test))

train score :  1.0
test score :  0.95


In [21]:
scaler = MinMaxScaler().fit(X_train)
clf = clf.fit(X_train, y_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("train score : ",clf.score(X_train_scaled, y_train))
print("test score : ",clf.score(X_test_scaled, y_test))

train score :  0.37777777777777777
test score :  0.26666666666666666


In [22]:
scaler = StandardScaler().fit(X_train)
clf = clf.fit(X_train, y_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("train score : ",clf.score(X_train_scaled, y_train))
print("test score : ",clf.score(X_test_scaled, y_test))

train score :  0.37777777777777777
test score :  0.26666666666666666
