# Notebook pour les pipelines : les exemples

Attention : le modèle LinearSVC n'a pas de `predict_proba`. Il propose une `decision_function(X)` qui joue normalement le même rôle (elle renvoie des scores de décision plutôt que des probabilités), mais elle n'est pas compatible avec GridSearchCV : il faudra récupérer les meilleurs paramètres du grid search et relancer le modèle autrement (repasser par un `pipeline.fit()` avec ces paramètres). Note personnelle (Julie) :)

In [2]:
from typing import List
from lxml import etree
from preTraitements.xml import get_X_Y_from_root
from preTraitements.xml import get_tree_root_from_file

In [3]:
# Locations of the DEFT09 parliament corpus files (the `_fr` training and test XML).
train_xml_path = "./corpus/train_deft09_parlement_appr.xml/deft09_parlement_appr_fr.xml"
test_xml_path = "./corpus/deft09_parlement_test.xml/deft09_parlement_test_fr.xml"

# Parse the training file, then derive the model inputs and targets from its root.
# NOTE(review): X_* are presumably the raw texts and y_* their labels -- confirm
# against preTraitements.xml.
tree_train, root_train = get_tree_root_from_file(train_xml_path)
X_train, y_train = get_X_Y_from_root(root_train)

# Same two steps for the test file.
tree_test, root_test = get_tree_root_from_file(test_xml_path)
X_test, y_test = get_X_Y_from_root(root_test)

In [4]:
# Inspect the test targets; in the recorded run this displays an empty list.
y_test

[]

In [5]:
# Number of training examples (19370 in the recorded run).
len(X_train)

19370

In [6]:
# Keep only the first `sample_size` training examples (inputs and targets
# sliced identically) -- presumably to keep the grid searches below fast.
sample_size = 500
X_train_sample = X_train[:sample_size]
y_train_sample = y_train[:sample_size]

In [7]:
# Sanity check on the sample size (500 in the recorded run).
len(X_train_sample)

500

In [8]:
## Package
import glob
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [9]:
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import StandardScaler

## Exemple d'une pipeline simple : CountVectorizer, TfidfTransformer et LinearSVC

In [27]:
# Simple text-classification pipeline:
# token counts -> TF-IDF weighting -> linear SVM classifier.
pipeline = Pipeline([
    ('ngram_tf_idf', Pipeline([
        ('counts', CountVectorizer()),
        ('tf_idf', TfidfTransformer()),
    ])),
    # ('standard', StandardScaler(with_mean=False)),
    # random_state is fixed so that the data shuffling done by the dual solver
    # (the dual=True candidates of the grid) is repeatable between runs.
    ('clf', LinearSVC(random_state=42)),
])

# Hyper-parameter grid explored by GridSearchCV. Each key is the chain of step
# names of the nested pipeline above, joined by "__", followed by the parameter.
param_grid = {
    "ngram_tf_idf__counts__max_df": (0.2, 0.4, 0.6, 0.8, 1.0),
    "ngram_tf_idf__counts__min_df": (1, 5, 10),
    # unigrams only, unigrams + bigrams, or bigrams only
    "ngram_tf_idf__counts__ngram_range": ((1, 1), (1, 2), (2, 2)),
    # NOTE(review): 'english' is the built-in stop-word list; the corpus files
    # are named *_fr.xml, so a French list (passed as a Python list) may be
    # more appropriate -- confirm.
    "ngram_tf_idf__counts__stop_words": ('english', None),
    "ngram_tf_idf__tf_idf__use_idf": (True, False),
    # "clf__penalty": ('l1', 'l2'),
    "clf__dual": (True, False),
    "clf__C": (0.1, 0.5, 1.0, 1.5),
}

In [28]:
# Display the pipeline object to check its steps.
pipeline

In [29]:
# Exhaustive search over every combination in param_grid; verbose=10 logs each
# individual fit (the recorded run reports 5 folds x 1440 candidates = 7200 fits).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    verbose=10,
)
# Fit on the 500-example sample only. Being the last expression of the cell,
# the fitted search object is also displayed.
grid_search.fit(X_train_sample, y_train_sample)

Fitting 5 folds for each of 1440 candidates, totalling 7200 fits
[CV 1/5; 1/1440] START clf__C=0.1, clf__dual=True, ngram_tf_idf__counts__max_df=0.2, ngram_tf_idf__counts__min_df=1, ngram_tf_idf__counts__ngram_range=(1, 1), ngram_tf_idf__counts__stop_words=english, ngram_tf_idf__tf_idf__use_idf=True
[CV 1/5; 1/1440] END clf__C=0.1, clf__dual=True, ngram_tf_idf__counts__max_df=0.2, ngram_tf_idf__counts__min_df=1, ngram_tf_idf__counts__ngram_range=(1, 1), ngram_tf_idf__counts__stop_words=english, ngram_tf_idf__tf_idf__use_idf=True;, score=0.380 total time=   0.3s
[CV 2/5; 1/1440] START clf__C=0.1, clf__dual=True, ngram_tf_idf__counts__max_df=0.2, ngram_tf_idf__counts__min_df=1, ngram_tf_idf__counts__ngram_range=(1, 1), ngram_tf_idf__counts__stop_words=english, ngram_tf_idf__tf_idf__use_idf=True
[CV 2/5; 1/1440] END clf__C=0.1, clf__dual=True, ngram_tf_idf__counts__max_df=0.2, ngram_tf_idf__counts__min_df=1, ngram_tf_idf__counts__ngram_range=(1, 1), ngram_tf_idf__counts__stop_words=englis

In [30]:
# Keep the best parameter combination found by the search, in case we want to save it.
best_param = grid_search.best_params_
print(best_param) # a dict, so it is easy to reuse at prediction time

{'clf__C': 0.1, 'clf__dual': True, 'ngram_tf_idf__counts__max_df': 0.2, 'ngram_tf_idf__counts__min_df': 1, 'ngram_tf_idf__counts__ngram_range': (1, 1), 'ngram_tf_idf__counts__stop_words': 'english', 'ngram_tf_idf__tf_idf__use_idf': False}


In [68]:
from sklearn.metrics import classification_report
# grid_search.predict uses the best parameters found by the search directly.
y_pred = grid_search.predict(X_test)


In [45]:
# To be run once y_test is no longer empty.
#print("Classification report:\n\n{}".format(classification_report(y_test, y_pred)))

[]


In [33]:
#sauvegarde model
from joblib import dump, load
dump(grid_search, 'test_model_save.joblib') 




['test_model_save.joblib']

## Pipeline avec un transformer personnalisé : exemple de StatText (fonction `text_stats`)

In [54]:
from sklearn.feature_extraction import DictVectorizer

In [52]:
# Helper that extracts the features we want; its output must be made of
# dictionaries. If you ever build the features with Counter(), you can
# `return dict(counter)` to get just the plain dictionary.
def text_stats(posts):
    """Compute simple statistics for each text.

    Args:
        posts: iterable of strings.

    Returns:
        A list with one dict per text, in input order, with the keys
        "length" (``len(text)``) and "num_sentences" (the number of "."
        characters in the text).
    """
    stats = []
    for text in posts:
        features = {"length": len(text), "num_sentences": text.count(".")}
        stats.append(features)
    return stats
# The function can be wrapped in a FunctionTransformer to obtain a transformer
# (an object that pipelines accept).
text_stats_transformer = FunctionTransformer(text_stats)

In [55]:
# Pipeline built around a FeatureUnion: the text is vectorised and, in
# parallel, the number of sentences and the text length are extracted.
# There are therefore two feature extractors:
# - the n-gram vectoriser (CountVectorizer + TfidfTransformer)
# - the text statistics (text_stats)


# random_state is fixed so that the data shuffling done by the dual solver is
# repeatable between runs of the notebook.
clf = LinearSVC(random_state=42)

pipeline = Pipeline([
    # The feature extractors are united: the result is one large matrix.
    ('features', FeatureUnion([
        # First sub-pipeline (ngram_tf_idf): vectorise, then apply TF-IDF.
        ('ngram_tf_idf', Pipeline([
            ('counts', CountVectorizer()),
            ('tf_idf', TfidfTransformer()),
        ])),
        # Second sub-pipeline: extract the text statistics, then vectorise the
        # resulting dicts with DictVectorizer() to get a format the model accepts.
        ('stats_pipe', Pipeline([
            ('stats', FunctionTransformer(text_stats)),
            ('vect', DictVectorizer()),
        ])),
    ])),
    # If needed: standardise the data, otherwise the solver converges poorly
    # (warning). with_mean=False because centering is not supported on sparse
    # matrices (the vectorisers produce sparse output by default).
    ('standard', StandardScaler(with_mean=False)),
    # The classifier.
    ('clf', clf),
])

In [56]:
# Display the pipeline object to check its nested steps.
pipeline

In [57]:
param_grid = {
    # Careful: every key must carry the full chain of step-name prefixes
    # (features -> ngram_tf_idf -> counts / tf_idf), joined by "__".
    "features__ngram_tf_idf__counts__max_df": (0.2, 0.4, 0.6, 0.8, 1.0),
    "features__ngram_tf_idf__counts__min_df": (1,5, 10),
    "features__ngram_tf_idf__counts__ngram_range": ((1, 1), (1, 2)),  # unigrams only, or unigrams + bigrams
    # NOTE(review): 'english' stop words on corpus files named *_fr.xml --
    # a French stop-word list may be more appropriate; confirm.
    "features__ngram_tf_idf__counts__stop_words":('english',None),
    "features__ngram_tf_idf__tf_idf__use_idf":(True,False)
}

In [58]:
# Exhaustive search over every combination in param_grid; verbose=10 logs each
# individual fit (the recorded run reports 5 folds x 120 candidates = 600 fits).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    verbose=10,
)
# Fit on the 500-example sample only. Being the last expression of the cell,
# the fitted search object is also displayed.
grid_search.fit(X_train_sample, y_train_sample)

Fitting 5 folds for each of 120 candidates, totalling 600 fits
[CV 1/5; 1/120] START features__ngram_tf_idf__counts__max_df=0.2, features__ngram_tf_idf__counts__min_df=1, features__ngram_tf_idf__counts__ngram_range=(1, 1), features__ngram_tf_idf__counts__stop_words=english, features__ngram_tf_idf__tf_idf__use_idf=True
[CV 1/5; 1/120] END features__ngram_tf_idf__counts__max_df=0.2, features__ngram_tf_idf__counts__min_df=1, features__ngram_tf_idf__counts__ngram_range=(1, 1), features__ngram_tf_idf__counts__stop_words=english, features__ngram_tf_idf__tf_idf__use_idf=True;, score=0.330 total time=   0.1s
[CV 2/5; 1/120] START features__ngram_tf_idf__counts__max_df=0.2, features__ngram_tf_idf__counts__min_df=1, features__ngram_tf_idf__counts__ngram_range=(1, 1), features__ngram_tf_idf__counts__stop_words=english, features__ngram_tf_idf__tf_idf__use_idf=True
[CV 2/5; 1/120] END features__ngram_tf_idf__counts__max_df=0.2, features__ngram_tf_idf__counts__min_df=1, features__ngram_tf_idf__count

In [59]:
# Best parameter combination found by the search.
best_param = grid_search.best_params_
print(best_param) # a dict, so it is easy to reuse at prediction time

{'features__ngram_tf_idf__counts__max_df': 1.0, 'features__ngram_tf_idf__counts__min_df': 5, 'features__ngram_tf_idf__counts__ngram_range': (1, 1), 'features__ngram_tf_idf__counts__stop_words': None, 'features__ngram_tf_idf__tf_idf__use_idf': False}


In [63]:
# predict (normally) uses the best parameters found by the search.
y_pred = grid_search.predict(X_test)
# The report stays disabled; the disabled call now names y_test (as in the
# earlier report cell) because data_y_test is not defined in the cells shown.
#print("Classification report:\n\n{}".format(classification_report(y_test, y_pred)))