# Importation des modules

In [None]:
###### modules pour le chargement des données depuis le XML ######
import glob
from lxml import etree
from preTraitements.xml import get_X_Y_from_root
from preTraitements.xml import get_tree_root_from_file

###### modules pour la classification ######

# modèles
from sklearn.svm import LinearSVC, SVC

# vectorisation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import StandardScaler

# création de nos transformers
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.feature_extraction import DictVectorizer # créer nos propres transformer
#%pip install transformers[sentencepiece]
#%pip install torch
from transformers import pipeline

# recherche des meilleurs hyperparamètres
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# résultats
from sklearn.metrics import classification_report

# sauvegarde des modèles
from joblib import dump, load

###### modules pour la visualisation ######
import matplotlib.pyplot as plt
import pandas as pd

###### miscellaneous ######
from typing import List # typage des fonctions
import numpy as np
import re
from collections import namedtuple

# Pré-traitement des données

## Chargement des données dans le bon format

In [None]:
tree_train, root_train = get_tree_root_from_file("./corpus/train_deft09_parlement_appr.xml/deft09_parlement_appr_fr.xml")
X_train, y_train = get_X_Y_from_root(root_train)

tree_test, root_test = get_tree_root_from_file("./corpus/deft09_parlement_test.xml/deft09_parlement_test_fr.xml")
X_test, y_test = get_X_Y_from_root(root_test) # y_test est vide : pas accès aux résultats

In [None]:
import re
pattern = re.compile(r'\d+\t(\w+(-\w+)?)')
y_test = []
folder =  "/content/drive/My Drive/Colab Notebooks/Apprentissage_Artificiel/corpus"
files_ref = glb.glob(folder+"/deft09_parlement_ref/deft09_parlement_ref_fr.txt")
for file in files_ref:
  input = file
with open(input,'r') as file:
    line = file.readline()
    while line:
        m= re.match(pattern,line)
        if m:
            y_test.append(m.group(1))
        else:
            y_test.append('PSE')
        line = file.readline()

In [None]:
from sklearn.preprocessing import LabelBinarizer, LabelEncoder

# Créer un objet LabelBinarizer
lb = LabelEncoder()

# Convertir les étiquettes de classe en un tableau binaire
y_train_bin = lb.fit_transform(y_train)
y_test_bin = lb.fit_transform(y_test)


## Nettoyage des données

In [None]:
pattern_clean = re.compile(r"[^ \w]") # pattern à utiliser pour nettoyer les données

def clean(data:list)->list: 
    """
    Cette fonction renvoie une liste de string nettoyé. 
    Les caractères sont transformés en minuscule et on ne garde que le token.
    Toute la ponctuation est retirée.

    Args:
        data (_type_): liste de string

    Returns:
        _type_: liste de string
    """
    global pattern_clean
    return re.sub(pattern_clean, "", data).lower()

X_train_clean = [clean(x) for x in X_train]
X_test_clean = [clean(x) for x in X_test]


# Extraction des features

## GloVe

In [None]:
import glob
import os
#root folder
root_folder='.'
glove_filename='vectors/vectors.txt'

# Variable for data directory
glove_path = os.path.abspath(glove_filename)

'/content/drive/My Drive/Colab Notebooks/Apprentissage_Artificiel/vectors.txt'

In [None]:
from gensim.models import KeyedVectors
word2vec_output_file = glove_filename+'.word2vec'
glove = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

In [None]:
#remplace un CountVectorizer()
def GloveVectorizer(X_train):
    model = glove
    n=0
    m = glove.get_vector('président')
    docs = np.zeros((len(X_train),m.shape[0]))
    for document in X_train:
        word_vectors=[]
        for token in document.split():
            if token in model:
                word_vectors.append(model[token])
        if len(word_vectors)>0:
            word_vectors = np.array(word_vectors)
            docs[n]=word_vectors.mean(axis=0)
        else:
            docs[n]=np.zeros(m.shape[0])
        n=n+1
    return docs

## Word2vec

In [None]:
w2v_filename='vectors/text8-vector.bin'

# Variable for data directory
w2v_path = os.path.abspath(w2v_filename)

wv_from_bin = KeyedVectors.load_word2vec_format(w2v_path, binary=True)
def w2vVectorizer(X_train):
    model = wv_from_bin
    n=0
    m = wv_from_bin.get_vector('président')
    docs = np.zeros((len(X_train),m.shape[0]))
    for document in X_train:
        word_vectors=[]
        for token in document.split():
            if token in model:
                word_vectors.append(model[token])
        if len(word_vectors)>0:
            word_vectors = np.array(word_vectors)
            docs[n]=word_vectors.mean(axis=0)
        else:
            docs[n]=np.zeros(m.shape[0])       
        n=n+1
    return docs

## FastText

In [None]:
#########pas possible ! cf le github fasttext pour le faire en ligne de commande
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
from gensim.models.fasttext import load_facebook_vectors
fde = '../cc.fr.300.bin' #modèle disponible sur le site de fasttext
ft_path = os.path.abspath(fde)
print(ft_path)
model = load_facebook_vectors(fde)

ImportError: ignored

In [None]:
def FTVectorizer(X_train):
    global model
    n=0
    m = model.get_vector('président')
    docs = np.zeros((len(X_train),m.shape[0]))
    for document in X_train:
        word_vectors=[]
        for token in document.split():
            if token in model:
                word_vectors.append(model[token])
        if len(word_vectors)>0:
            word_vectors = np.array(word_vectors)
            docs[n]=word_vectors.mean(axis=0)
        n=n+1
    return docs

## Transformers personnalisés

Les transformers personnalisés sont disponibles dans les fichiers `add_features.ipynb`

# Création des PipeLine

on crée d'abord toutes les pipelines qu'on veut

on aura une param_grid par pipeline

In [None]:
liste_clfs = []

In [None]:
# named tuple Classifieur : 
# - pipeline
# - param_grid pour le grid_search
Classifieur = namedtuple('Classifieur', ["pipeline", "param_grid"]) #TODO : instancier le nom du classifieur pour que ça soit facile à récupérer après

## `LinearSVC()`

In [None]:
liste_svc = []

### Avec CountVectorizer()

In [None]:
# liste de tuples nommés Classifieur(pipeline, param_grid)
from sklearn.svm import LinearSVC
pipeline_svm = Pipeline([
        ('ngram_tf_idf', Pipeline([
          ('counts', CountVectorizer(min_df=1, ngram_range=(1,1))),
          ('tf_idf', TfidfTransformer(sublinear_tf=True))
        ])),
  ('standard', StandardScaler(with_mean=False)),
  #('svm', SVC(probability=True,kernel = 'linear',class_weight='balanced'))])
  ('svm', LinearSVC(class_weight='balanced',max_iter=10000))])
param_grid_svm = {
    "ngram_tf_idf__counts__stop_words":('french',None), #TODO: pas de french : utiliser la liste de spacy ou nltk
    "ngram_tf_idf__tf_idf__use_idf":(True,False),
    "ngram_tf_idf__tf_idf__sublinear_tf":(True,False),
    "svm__kernel":('linear', 'poly', 'rbf'),
    "svm__class_weight":('balanced',None),
    "svm__C":(0.1,0.5) # complexité du modèle
    }

svm = Classifieur(pipeline_svm, param_grid_svm)
liste_svc.append(svm)

### Avec GloveVectorizer()

In [None]:
# liste de tuples nommés Classifieur(pipeline, param_grid)
pipeline_svm = Pipeline([
        ('ngram_tf_idf', Pipeline([
          ('counts', FunctionTransformer(GloveVectorizer)),
          #('tf_idf', TfidfTransformer())
        ])),
  #('standard', StandardScaler(with_mean=False)),
  #('svm', SVC(probability=True,kernel = 'linear',class_weight='balanced'))])
  ('svm', OneVsOneClassifier(LinearSVC(class_weight='balanced',max_iter=10000,C=0.5)))])

param_grid_svm = {
    #"ngram_tf_idf__counts__stop_words":('french',None), #TODO: pas de french : utiliser la liste de spacy ou nltk
    #"ngram_tf_idf__tf_idf__use_idf":(True,False),
    #"ngram_tf_idf__tf_idf__sublinear_tf":(True,False),
    #"svm__kernel":('linear', 'poly', 'rbf'),
    #svm__class_weight":('balanced',None),
    #"svm__C":(0.1,0.5) # complexité du modèle
    }


liste_svc.append(pipeline_svm)

### Avec Word2Vec

In [None]:
# liste de tuples nommés Classifieur(pipeline, param_grid)
pipeline_svm = Pipeline([
        ('ngram_tf_idf', Pipeline([
          ('counts', FunctionTransformer(w2vVectorizer)),
          #('tf_idf', TfidfTransformer())
        ])),
  #('standard', StandardScaler(with_mean=False)),
  #('svm', SVC(probability=True,kernel = 'linear',class_weight='balanced'))])
  ('svm', OneVsOneClassifier(LinearSVC(class_weight='balanced',max_iter=10000,C=0.1)))])

param_grid_svm = {
    #"ngram_tf_idf__counts__stop_words":('french',None), #TODO: pas de french : utiliser la liste de spacy ou nltk
    #"ngram_tf_idf__tf_idf__use_idf":(True,False),
    #"ngram_tf_idf__tf_idf__sublinear_tf":(True,False),
    #"svm__kernel":('linear', 'poly', 'rbf'),
    #svm__class_weight":('balanced',None),
    #"svm__C":(0.1,0.5) # complexité du modèle
    }

liste_svc.append(pipeline_svm)

### Test

In [None]:
best_models_w2v = []
from sklearn.preprocessing import LabelEncoder

# Créer un objet LabelBinarizer
lb = LabelEncoder()

# Convertir les étiquettes de classe en un tableau binaire
y_train_bin = lb.fit_transform(y_train)
y_test_bin = lb.fit_transform(y_test)
for (pipeline, param_grid) in liste_svc:
    print("="*80)
    grid_search = GridSearchCV(pipeline, param_grid=param_grid,verbose=10)
    estimator = grid_search.fit(X_train_clean, y_train_bin)
    best_models_w2v.append(estimator)
    print("="*80,'\n')

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5; 1/2] START svm__C=0.1..................................................
[CV 1/5; 1/2] END ...................svm__C=0.1;, score=0.344 total time=  22.3s
[CV 2/5; 1/2] START svm__C=0.1..................................................
[CV 2/5; 1/2] END ...................svm__C=0.1;, score=0.341 total time=  16.3s
[CV 3/5; 1/2] START svm__C=0.1..................................................
[CV 3/5; 1/2] END ...................svm__C=0.1;, score=0.354 total time=  16.6s
[CV 4/5; 1/2] START svm__C=0.1..................................................
[CV 4/5; 1/2] END ...................svm__C=0.1;, score=0.336 total time=  17.0s
[CV 5/5; 1/2] START svm__C=0.1..................................................
[CV 5/5; 1/2] END ...................svm__C=0.1;, score=0.352 total time=  19.3s
[CV 1/5; 2/2] START svm__C=0.5..................................................
[CV 1/5; 2/2] END ...................svm__C=0.5;,

In [None]:
for m in liste_svc:
  model = m
  m.fit(X_train_clean,y_train_bin)
  y_pred = m.predict(X_test_clean)
  print(classification_report(y_test_bin, y_pred))
  print("="*80)

              precision    recall  f1-score   support

           0       0.17      0.31      0.22      1339
           1       0.33      0.58      0.42      1793
           2       0.45      0.21      0.29      4571
           3       0.38      0.28      0.32      3629
           4       0.18      0.27      0.22      1585

    accuracy                           0.30     12917
   macro avg       0.30      0.33      0.29     12917
weighted avg       0.35      0.30      0.30     12917

              precision    recall  f1-score   support

           0       0.21      0.43      0.28      1339
           1       0.45      0.60      0.51      1793
           2       0.53      0.29      0.38      4571
           3       0.42      0.31      0.36      3629
           4       0.25      0.40      0.31      1585

    accuracy                           0.37     12917
   macro avg       0.37      0.41      0.37     12917
weighted avg       0.42      0.37      0.37     12917



          precision    recall  f1-score   support

           0       0.17      0.31      0.22      1339
           1       0.33      0.58      0.42      1793
           2       0.45      0.21      0.29      4571
           3       0.38      0.28      0.32      3629
           4       0.18      0.27      0.22      1585

    accuracy                           0.30     12917
   macro avg       0.30      0.33      0.29     12917
weighted avg       0.35      0.30      0.30     12917

================================================================================
              precision    recall  f1-score   support

           0       0.21      0.43      0.28      1339
           1       0.45      0.60      0.51      1793
           2       0.53      0.29      0.38      4571
           3       0.42      0.31      0.36      3629
           4       0.25      0.40      0.31      1585

    accuracy                           0.37     12917
   macro avg       0.37      0.41      0.37     12917
weighted avg       0.42      0.37      0.37     12917

================================================================================

In [None]:
for clf, estimator in zip(['CountVec','glove','word2vec'],best_models_w2v):
  pd_estimator = pd.DataFrame.from_dict(estimator.cv_results_)
  pd_estimator = pd_estimator.sort_values(by="rank_test_score", ascending=True)
  pd_estimator.to_csv(f"{clf}_cvresults_all.csv")


## `KNN`

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.impute import SimpleImputer

In [None]:
liste_knn = []

### Avec CountVectorizer

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# liste de tuples nommés Classifieur(pipeline, param_grid)
pipeline_knn = Pipeline([
        ('ngram_tf_idf', Pipeline([
          ('counts', CountVectorizer()),
          ('tf_idf', TfidfTransformer(use_idf=False,sublinear_tf=True))
        ])),
  ('in',SimpleImputer(strategy='mean')),
  ('select',SelectKBest()),
  #('standard', StandardScaler(with_mean=False)),
  ('knn', KNeighborsClassifier())])

param_grid_knn = {
    "ngram_tf_idf__counts__ngram_range": ((1, 1), (2,2)),  # unigrams or bigrams
    #"ngram_tf_idf__counts__stop_words":('french',None), #TODO: pas de french : utiliser la liste de spacy ou nltk
    #"ngram_tf_idf__tf_idf__use_idf":(True,False),
    #"ngram_tf_idf__tf_idf__sublinear_tf":(True,False),
    "knn__algorithm":('auto', 'brute'),
    "knn__n_neighbors":(1,2,3,4,5),
    "select__k":(100,200,500,1000)
    }

knn = Classifieur(pipeline_knn, param_grid_knn)
liste_knn.append(knn)

### Avec GloveVectorizer

In [None]:
# liste de tuples nommés Classifieur(pipeline, param_grid)
pipeline_knn = Pipeline([
        ('ngram_tf_idf', Pipeline([
          ('counts', FunctionTransformer(GloveVectorizer)),
          ('tf_idf', TfidfTransformer()),
        ])),
  #('in',SimpleImputer(strategy='mean')),
  ('select',SelectKBest()),
  #("min",MinMaxScaler()),
  ('knn', KNeighborsClassifier())])

param_grid_knn = {
    #"ngram_tf_idf__counts__stop_words":('french',None), #TODO: pas de french : utiliser la liste de spacy ou nltk
    "ngram_tf_idf__tf_idf__use_idf":(True,False),
    #"ngram_tf_idf__tf_idf__sublinear_tf":(True,False),
    "knn__algorithm":('auto', 'brute'),
    "select__k":(10,20,50),
    "knn__n_neighbors":(2,5)
    }

knn = Classifieur(pipeline_knn, param_grid_knn)
liste_knn.append(knn)

### Avec Word2Vec

In [None]:
# liste de tuples nommés Classifieur(pipeline, param_grid)
pipeline_knn = Pipeline([
        ('ngram_tf_idf', Pipeline([
          ('counts', FunctionTransformer(w2vVectorizer)),
          ('tf_idf', TfidfTransformer())
        ])),
  #('standard', StandardScaler(with_mean=False)),
    ('select',SelectKBest()),
  ('knn', KNeighborsClassifier())])

param_grid_knn = {
    #"ngram_tf_idf__counts__stop_words":('french',None), #TODO: pas de french : utiliser la liste de spacy ou nltk
    "ngram_tf_idf__tf_idf__use_idf":(True,False),
   # "ngram_tf_idf__tf_idf__sublinear_tf":(True,False),
    "knn__algorithm":('auto', 'brute'),
    "select__k":(10,50,100,200),
    "knn__n_neighbors":(2,5)
    }

knn = Classifieur(pipeline_knn, param_grid_knn)
liste_knn.append(knn)

### TEST

In [None]:
best_models_knn = []

for (pipeline, param_grid) in liste_knn:
    print("="*80)
    grid_search = GridSearchCV(pipeline, param_grid=param_grid,verbose=10)
    estimator = grid_search.fit(X_train_clean, y_train_bin)
    best_models_knn.append(estimator)
    print("="*80,'\n')

Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV 1/5; 1/24] START knn__algorithm=auto, knn__n_neighbors=2, ngram_tf_idf__tf_idf__use_idf=True, select__k=10
[CV 1/5; 1/24] END knn__algorithm=auto, knn__n_neighbors=2, ngram_tf_idf__tf_idf__use_idf=True, select__k=10;, score=0.273 total time=  17.3s
[CV 2/5; 1/24] START knn__algorithm=auto, knn__n_neighbors=2, ngram_tf_idf__tf_idf__use_idf=True, select__k=10
[CV 2/5; 1/24] END knn__algorithm=auto, knn__n_neighbors=2, ngram_tf_idf__tf_idf__use_idf=True, select__k=10;, score=0.284 total time=  17.8s
[CV 3/5; 1/24] START knn__algorithm=auto, knn__n_neighbors=2, ngram_tf_idf__tf_idf__use_idf=True, select__k=10
[CV 3/5; 1/24] END knn__algorithm=auto, knn__n_neighbors=2, ngram_tf_idf__tf_idf__use_idf=True, select__k=10;, score=0.297 total time=  17.9s
[CV 4/5; 1/24] START knn__algorithm=auto, knn__n_neighbors=2, ngram_tf_idf__tf_idf__use_idf=True, select__k=10
[CV 4/5; 1/24] END knn__algorithm=auto, knn__n_neighbors=2, ngram_tf

In [None]:
for m in best_models_knn:
  y_pred = m.predict(X_test_clean)
  print(classification_report(y_test_bin, y_pred))
  print("="*80)

              precision    recall  f1-score   support

           0       0.36      0.26      0.30      1339
           1       0.53      0.44      0.48      1793
           2       0.47      0.69      0.56      4571
           3       0.46      0.43      0.44      3629
           4       0.41      0.12      0.18      1585

    accuracy                           0.47     12917
   macro avg       0.45      0.39      0.39     12917
weighted avg       0.46      0.47      0.44     12917

              precision    recall  f1-score   support

           0       0.43      0.31      0.36      1339
           1       0.58      0.55      0.56      1793
           2       0.52      0.72      0.60      4571
           3       0.51      0.48      0.49      3629
           4       0.52      0.15      0.23      1585

    accuracy                           0.52     12917
   macro avg       0.51      0.44      0.45     12917
weighted avg       0.51      0.52      0.50     12917



In [None]:
for clf, estimator in zip(['glove','word2vec'],best_models_knn):
  pd_estimator = pd.DataFrame.from_dict(estimator.cv_results_)
  pd_estimator = pd_estimator.sort_values(by="rank_test_score", ascending=True)
  pd_estimator.to_csv(f"{clf}_cvresults_all_knn.csv")
  

## `Régression Logistique`

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
liste_reglog = []

### Avec CountVectorizer

In [None]:
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.multiclass import OneVsOneClassifier
from sklearn.preprocessing import LabelBinarizer 

# liste de tuples nommés Classifieur(pipeline, param_grid)
pipeline_reglog = Pipeline([
        ('ngram_tf_idf', Pipeline([
          ('counts', CountVectorizer(min_df=1)),
          ('tf_idf', TfidfTransformer())
        ])),
  ('standard', StandardScaler(with_mean=False)),
  ('rl', OneVsOneClassifier(LogisticRegression(max_iter=10000,multi_class='ovr',solver='liblinear')))])

param_grid_reglog = {
    #"ngram_tf_idf__counts__ngram_range": ((1, 1),(2,2)),  # unigrams or bigrams
    #"ngram_tf_idf__counts__stop_words":('french',None), #TODO: pas de french : utiliser la liste de spacy ou nltk
    "ngram_tf_idf__tf_idf__use_idf":(True,False),
    #"ngram_tf_idf__tf_idf__sublinear_tf":(True,False),
    #"rl__class_weight":('balanced',None),
    #"rl__penalty": ('l1','l2'),
    #"rl__solver":('newton-cg', 'newton-cholesky', 'sag', 'saga'),
    #"rl__C":(0.1,0.5,1.0,1.5) # complexité du modèle
    }

# Créer un objet LabelBinarizer
lb = LabelBinarizer()
y_test_bin = lb.fit_transform(y_test)

pipeline_reglog.fit(X_train_clean, y_train_bin)
y_pred = pipeline_reglog.predict(X_test_clean)
print(pipeline_reglog.score(X_test_clean,y_test_bin))
print(pipeline_reglog.score(X_train_clean,y_train_bin))


# Convertir les étiquettes de classe en un tableau binaire
y_pred_bin = lb.fit_transform(y_pred)

print(classification_report(y_test, y_pred_bin.classes_))

reglog = Classifieur(pipeline_reglog, param_grid_reglog)
liste_reglog.append(reglog)

KeyboardInterrupt: ignored

In [None]:
print(pipeline_reglog.score(X_test_clean,y_test_bin.cl))

### Avec GloveVectorizer()

In [None]:
# liste de tuples nommés Classifieur(pipeline, param_grid)
from sklearn.preprocessing import MinMaxScaler
pipeline_reglog = Pipeline([
        ('ngram_tf_idf', Pipeline([
          ('counts', FunctionTransformer(GloveVectorizer)),
          #('tf_idf', TfidfTransformer()),
         # ("zero",FunctionTransformer(replace_zero_values))
        ])),
        #('min',MinMaxScaler()),
   # ('in',SimpleImputer(strategy='mean')),
 # ('standard', StandardScaler(with_mean=False)),
  ('rl', LogisticRegression(max_iter=100000, penalty='l2'))])

param_grid_reglog = {
    #"ngram_tf_idf__counts__stop_words":('french',None), #TODO: pas de french : utiliser la liste de spacy ou nltk
    #"ngram_tf_idf__tf_idf__use_idf":(True,False),
    #"ngram_tf_idf__tf_idf__sublinear_tf":(True,False),
    "rl__class_weight":('balanced',None),
    #"rl__penalty": ('l1','l2'),
    "rl__multi_class":('auto','ovr','multinomial'),
    "rl__solver":('newton-cg', 'sag', 'saga','lbfgs','liblinear'), # 'newton-cholesky' -> à tester avec OneVs Rest
    "rl__C":(0.1,0.5,1.0) # complexité du modèle
    }

reglog = Classifieur(pipeline_reglog, param_grid_reglog)
liste_reglog.append(reglog)

### Avec Word2VecVectorizer()

In [None]:
# liste de tuples nommés Classifieur(pipeline, param_grid)
liste_reglog = []
pipeline_reglog = Pipeline([
        ('ngram_tf_idf', Pipeline([
          ('counts', FunctionTransformer(w2vVectorizer)),
          #('tf_idf', TfidfTransformer())
        ])),
  #('standard', StandardScaler(with_mean=False)),
  ('rl',LogisticRegression(max_iter=100000 , penalty='l2'))])

param_grid_reglog = {
    #"ngram_tf_idf__counts__stop_words":('french',None), #TODO: pas de french : utiliser la liste de spacy ou nltk
    #"ngram_tf_idf__tf_idf__use_idf":(True,False),
    #"ngram_tf_idf__tf_idf__sublinear_tf":(True,False),
    #"rl__class_weight":('balanced',None),
    #"rl__penalty": ('l1','l2'),
    "rl__multi_class":('auto','ovr','multinomial'),
    "rl__solver":('newton-cg', 'sag', 'saga','lbfgs','liblinear'), # 'newton-cholesky' -> à tester avec OneVs Rest
    "rl__C":(0.1,0.5) # complexité du modèle
    }

reglog = Classifieur(pipeline_reglog, param_grid_reglog)
liste_reglog.append(reglog)

### TEST

In [None]:
best_models_reglog = []

for (pipeline, param_grid) in liste_reglog:
    print("="*80)
    grid_search = GridSearchCV(pipeline, param_grid=param_grid,verbose=10)
    estimator = grid_search.fit(X_train_clean, y_train_bin)
    best_models_reglog.append(estimator)
    print("="*80,'\n')

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5; 1/30] START rl__C=0.1, rl__multi_class=auto, rl__solver=newton-cg......
[CV 1/5; 1/30] END rl__C=0.1, rl__multi_class=auto, rl__solver=newton-cg;, score=0.439 total time=  25.7s
[CV 2/5; 1/30] START rl__C=0.1, rl__multi_class=auto, rl__solver=newton-cg......
[CV 2/5; 1/30] END rl__C=0.1, rl__multi_class=auto, rl__solver=newton-cg;, score=0.422 total time=  26.1s
[CV 3/5; 1/30] START rl__C=0.1, rl__multi_class=auto, rl__solver=newton-cg......
[CV 3/5; 1/30] END rl__C=0.1, rl__multi_class=auto, rl__solver=newton-cg;, score=0.425 total time=  26.5s
[CV 4/5; 1/30] START rl__C=0.1, rl__multi_class=auto, rl__solver=newton-cg......
[CV 4/5; 1/30] END rl__C=0.1, rl__multi_class=auto, rl__solver=newton-cg;, score=0.426 total time=  28.0s
[CV 5/5; 1/30] START rl__C=0.1, rl__multi_class=auto, rl__solver=newton-cg......
[CV 5/5; 1/30] END rl__C=0.1, rl__multi_class=auto, rl__solver=newton-cg;, score=0.430 total time=  29.2s
[CV

10 fits failed out of a total of 150.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py", line 1519, in fit
    multi_class = _check_multi_class(self.multi_class, solver, len(self.classes_))
  File "/usr/local/lib/python3.8/dist-packages/sklearn/linear_model/_logistic.py", line 




In [None]:
from sklearn.metrics import classification_report
for model in best_models_reglog:
  clf = model
  y_pred = clf.predict(X_test_clean)
  print(classification_report(y_test_bin, y_pred))
  print("="*80)

              precision    recall  f1-score   support

           0       0.40      0.02      0.04      1339
           1       0.58      0.49      0.53      1793
           2       0.44      0.73      0.55      4571
           3       0.39      0.36      0.38      3629
           4       0.34      0.07      0.12      1585

    accuracy                           0.44     12917
   macro avg       0.43      0.33      0.32     12917
weighted avg       0.43      0.44      0.39     12917



[0m[01;34mdrive[0m/  [01;34msample_data[0m/
/content/drive/MyDrive/saladier_vf
glove_cvresults_all_knn.csv     w2v_cvresults_all_svm.gsheet
glove_cvresults_all_knn.gsheet  word2vec_cvresults_all_knn.csv
glove_cvresults_all_rdf.csv     word2vec_cvresults_all_knn.gsheet
glove_cvresults_all_rdf.gsheet  word2vec_cvresults_all_rdf.csv
glove_cvresults_all_svm.csv     word2vec_cvresults_all_rdf.gsheet
w2v_cvresults_all_svm.csv


In [None]:
for clf, estimator in zip(['glove','word2vec'],best_models_reglog):
  pd_estimator = pd.DataFrame.from_dict(estimator.cv_results_)
  pd_estimator = pd_estimator.sort_values(by="rank_test_score",ascending=True)
  pd_estimator.to_csv(f"{clf}_cvresults_1000_logreg.csv")

In [None]:
print(len(y_test),len(y_pred))

12917 12917


## `Naives Bayes`

On ne peut pas utiliser NB avec Glove ou w2v. En plus, pas d'interet car les post traitement font perdre de la précision... tant pis...

In [None]:
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler #fixed import
from sklearn.decomposition import NMF
from sklearn.naive_bayes import CategoricalNB,MultinomialNB,GaussianNB, ComplementNB

### Multinomial 
Seulement avec countvectorizer()


In [None]:
liste_mnb = []

#### Avec CountVectorizer()

In [None]:


# liste de tuples nommés Classifieur(pipeline, param_grid)
pipeline_mnb = Pipeline([
        ('ngram_tf_idf', Pipeline([
          ('counts', CountVectorizer()),
          ('tf_idf', TfidfTransformer()),
        ])),
  ('standard', StandardScaler(with_mean=False)),
  ('mnb', MultinomialNB())])

param_grid_mnb = {
    #"ngram_tf_idf__counts__stop_words":('french',None), #TODO: pas de french : utiliser la liste de spacy ou nltk
    "ngram_tf_idf__tf_idf__use_idf":(True,False),
    "ngram_tf_idf__tf_idf__sublinear_tf":(True,False),
    "mnb__fit_prior":(True,False),
    "mnb__alpha":[0.1, 1.0, 10.0] # complexité du modèle
    }

mnb = Classifieur(pipeline_mnb, param_grid_mnb)
liste_mnb.append(mnb)

#### Avec Glove -> solution ?

In [None]:
# liste de tuples nommés Classifieur(pipeline, param_grid)
pipeline_mnb = Pipeline([
        ('ngram_tf_idf', Pipeline([
          ('counts', FunctionTransformer(GloveVectorizer)),
          #('tf_idf', TfidfTransformer())
        ])),
  ("min",MinMaxScaler()),
  #('decomp', NMF()),
  ('mnb', MultinomialNB())])

param_grid_mnb = {
    #"ngram_tf_idf__counts__stop_words":('french',None), #TODO: pas de french : utiliser la liste de spacy ou nltk
    #"ngram_tf_idf__tf_idf__use_idf":(True,False),
    #"ngram_tf_idf__tf_idf__sublinear_tf":(True,False),
    "mnb__fit_prior":(True,False),
    "mnb__alpha":np.linspace(0.5, 1.5, 6) # complexité du modèle
    }

mnb = Classifieur(pipeline_mnb, param_grid_mnb)
liste_mnb.append(mnb)

#### Word2Vec

In [None]:
# liste de tuples nommés Classifieur(pipeline, param_grid)
pipeline_mnb = Pipeline([
        ('ngram_tf_idf', Pipeline([
          ('counts', FunctionTransformer(w2vVectorizer)),
          #('tf_idf', TfidfTransformer())
        ])),
  ("min",MinMaxScaler()),
  #('decomp', NMF()),
  ('mnb', MultinomialNB())])

param_grid_mnb = {
    #"ngram_tf_idf__counts__stop_words":('french',None), #TODO: pas de french : utiliser la liste de spacy ou nltk
    #"ngram_tf_idf__tf_idf__use_idf":(True,False),
    #"ngram_tf_idf__tf_idf__sublinear_tf":(True,False),
    "mnb__fit_prior":(True,False),
    "mnb__alpha":np.linspace(0.5, 1.5, 6) # complexité du modèle
    }

mnb = Classifieur(pipeline_mnb, param_grid_mnb)
liste_mnb.append(mnb)

### GaussianNB
#### CountVectorizer()

In [None]:
liste_mnb = []

In [None]:
# liste de tuples nommés Classifieur(pipeline, param_grid)
pipeline_mnb = Pipeline([
          ('counts', CountVectorizer()),
          #('tf_idf', TfidfTransformer()),
  ('standard', StandardScaler(with_mean=False)),
  ('mnb', GaussianNB())])

param_grid_mnb = {
    #"ngram_tf_idf__counts__stop_words":('french',None), #TODO: pas de french : utiliser la liste de spacy ou nltk
    #"tf_idf__use_idf":(True,False),
    #"tf_idf__sublinear_tf":(True,False),
    #"mnb__priors":([0.25, 0.75],None)
    "mnb__var_smoothing":(1e-9,1e-3)
    }


mnb = Classifieur(pipeline_mnb, param_grid_mnb)
liste_mnb.append(mnb)

vectorizer = CountVectorizer()
X = vectorizer.transform(X_train_clean)
clf = GaussianNB()
clf.fit(X.toarray())



In [None]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X_train_clean)
clf = GaussianNB()
clf.fit(X.toarray(),y_train_bin)


NameError: ignored

In [None]:
Xc = vectorizer.fit_transform(X_test_clean)
y_pred = clf.predict(Xc)

#### Avec GloveVectorizer()

In [None]:
# liste de tuples nommés Classifieur(pipeline, param_grid)
pipeline_mnb = Pipeline([
        ('ngram_tf_idf', Pipeline([
          ('counts', FunctionTransformer(GloveVectorizer)),
          #('tf_idf', TfidfTransformer())
        ])),
  #('decomp', NMF()),
  ('mnb', GaussianNB())])

param_grid_mnb = {
    #"ngram_tf_idf__counts__stop_words":('french',None), #TODO: pas de french : utiliser la liste de spacy ou nltk
    #"ngram_tf_idf__tf_idf__use_idf":(True,False),
    #"ngram_tf_idf__tf_idf__sublinear_tf":(True,False),
    #"mnb__priors":([0.25, 0.75],None)
    "mnb__var_smoothing":(1e-9,1e-3)
    }

mnb = Classifieur(pipeline_mnb, param_grid_mnb)
liste_mnb.append(mnb)

#### Avec w2vVectorizer

In [None]:
# liste de tuples nommés Classifieur(pipeline, param_grid)
pipeline_mnb = Pipeline([
        ('ngram_tf_idf', Pipeline([
          ('counts', FunctionTransformer(w2vVectorizer)),
          #('tf_idf', TfidfTransformer())
        ])),
  #('decomp', NMF()),
  ('mnb', GaussianNB())])

param_grid_mnb = {
    #"ngram_tf_idf__counts__stop_words":('french',None), #TODO: pas de french : utiliser la liste de spacy ou nltk
    #"ngram_tf_idf__tf_idf__use_idf":(True,False),
    #"ngram_tf_idf__tf_idf__sublinear_tf":(True,False),
    #"mnb__priors":([0.25, 0.75],None)
    "mnb__var_smoothing":(1e-9,1e-3)
    }

mnb = Classifieur(pipeline_mnb, param_grid_mnb)
liste_mnb.append(mnb)

### TEST

In [None]:
best_models_mnb = []

for (pipeline, param_grid) in liste_mnb:
    print("="*80)
    grid_search = GridSearchCV(pipeline, param_grid=param_grid,verbose=10)
    estimator = grid_search.fit(X_train_clean[:100], y_train_bin[:100])
    best_models_mnb.append(estimator)
    print("="*80,'\n')

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5; 1/2] START mnb__var_smoothing=1e-09....................................
[CV 1/5; 1/2] END .......mnb__var_smoothing=1e-09;, score=nan total time=   0.0s
[CV 2/5; 1/2] START mnb__var_smoothing=1e-09....................................
[CV 2/5; 1/2] END .......mnb__var_smoothing=1e-09;, score=nan total time=   0.0s
[CV 3/5; 1/2] START mnb__var_smoothing=1e-09....................................
[CV 3/5; 1/2] END .......mnb__var_smoothing=1e-09;, score=nan total time=   0.0s
[CV 4/5; 1/2] START mnb__var_smoothing=1e-09....................................
[CV 4/5; 1/2] END .......mnb__var_smoothing=1e-09;, score=nan total time=   0.0s
[CV 5/5; 1/2] START mnb__var_smoothing=1e-09....................................
[CV 5/5; 1/2] END .......mnb__var_smoothing=1e-09;, score=nan total time=   0.0s
[CV 1/5; 2/2] START mnb__var_smoothing=0.001....................................
[CV 1/5; 2/2] END .......mnb__var_smoothing=0.001

10 fits failed out of a total of 10.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/pipeline.py", line 394, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/naive_bayes.py", line 245, in fit
    return self._partial_fit(
  File "/usr/local/lib/python3.8/dist-packages/sklearn/naive_bayes.py", line 402, in _partial_fit
    X, y = self._validate_data(X, y, reset=first_call)
 

TypeError: ignored

In [None]:
for model in best_models_mnb:
  clf = model
  y_pred = clf.predict(X_test_clean)
  print(classification_report(y_test_bin, y_pred))
  print("="*80)

              precision    recall  f1-score   support

           0       0.12      0.07      0.09      1339
           1       0.28      0.48      0.36      1793
           2       0.39      0.23      0.29      4571
           3       0.31      0.28      0.29      3629
           4       0.14      0.28      0.18      1585

    accuracy                           0.27     12917
   macro avg       0.25      0.27      0.24     12917
weighted avg       0.29      0.27      0.26     12917

              precision    recall  f1-score   support

           0       0.13      0.02      0.04      1339
           1       0.29      0.51      0.37      1793
           2       0.41      0.25      0.31      4571
           3       0.31      0.30      0.31      3629
           4       0.14      0.28      0.18      1585

    accuracy                           0.28     12917
   macro avg       0.26      0.27      0.24     12917
weighted avg       0.30      0.28      0.28     12917



In [None]:
for clf, estimator in zip(['glove','word2vec'],best_models_mnb):
  pd_estimator = pd.DataFrame.from_dict(estimator.cv_results_)
  pd_estimator =pd_estimator.sort_values(by="rank_test_score", ascending=True)
  pd_estimator.to_csv(f"{clf}GNB_cvresults_all.csv")


## `Random Forest Classifier`

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
liste_rfc = []

### Avec CountVectorizer

In [None]:

# liste de tuples nommés Classifieur(pipeline, param_grid)
pipeline_r = Pipeline([
        ('ngram_tf_idf', Pipeline([
          ('counts', CountVectorizer(min_df=1)),
          ('tf_idf', TfidfTransformer())
        ])),
  ('standard', StandardScaler(with_mean=False)),
  ('r', RandomForestClassifier())])

param_grid_r = {
    "ngram_tf_idf__counts__ngram_range": ((1, 1),(2,2)),  # unigrams or bigrams
    #"ngram_tf_idf__counts__stop_words":('french',None), #TODO: pas de french : utiliser la liste de spacy ou nltk
    "ngram_tf_idf__tf_idf__use_idf":(True,False),
    "ngram_tf_idf__tf_idf__use_idf":(True,False),
    "r__criterion":('gini', 'entropy'),
    "r__max_features":('sqrt', 'log2')
    }

r = Classifieur(pipeline_r, param_grid_r)
liste_rfc.append(r)

### Avec GloveVectorizer

In [None]:
# liste de tuples nommés Classifieur(pipeline, param_grid)
pipeline_r = Pipeline([
        ('ngram_tf_idf', Pipeline([
          ('counts', FunctionTransformer(GloveVectorizer)),
          ('tf_idf', TfidfTransformer())
        ])),
  ('standard', StandardScaler(with_mean=False)),
  ('r', RandomForestClassifier())])

param_grid_r = {
    #"ngram_tf_idf__counts__stop_words":('french',None), #TODO: pas de french : utiliser la liste de spacy ou nltk
    "ngram_tf_idf__tf_idf__use_idf":(True,False),
    "ngram_tf_idf__tf_idf__use_idf":(True,False),
    "r__criterion":('gini', 'entropy'),
    "r__max_features":('sqrt', 'log2')
    }

r = Classifieur(pipeline_r, param_grid_r)
liste_rfc.append(r)

### Avec w2vVectorizer

In [None]:
# liste de tuples nommés Classifieur(pipeline, param_grid)
pipeline_r = Pipeline([
        ('ngram_tf_idf', Pipeline([
          ('counts', FunctionTransformer(w2vVectorizer)),
          ('tf_idf', TfidfTransformer())
        ])),
  ('standard', StandardScaler(with_mean=False)),
  ('r',RandomForestClassifier())])

param_grid_reglog = {
    #"ngram_tf_idf__counts__stop_words":('french',None), #TODO: pas de french : utiliser la liste de spacy ou nltk
    "ngram_tf_idf__tf_idf__use_idf":(True,False),
    "ngram_tf_idf__tf_idf__use_idf":(True,False),
    "r__criterion":('gini', 'entropy'),
    "r__max_features":('sqrt', 'log2')
    }

r = Classifieur(pipeline_r, param_grid_r)
liste_rfc.append(r)

### TEST

In [None]:
best_models_rtf = []

for (pipeline, param_grid) in liste_rfc:
    print("="*80)
    grid_search = GridSearchCV(pipeline, param_grid=param_grid,verbose=10)
    estimator = grid_search.fit(X_train_clean, y_train_bin)
    best_models_rtf.append(estimator)
    print("="*80,'\n')

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5; 1/8] START ngram_tf_idf__tf_idf__use_idf=True, r__criterion=gini, r__max_features=sqrt
[CV 1/5; 1/8] END ngram_tf_idf__tf_idf__use_idf=True, r__criterion=gini, r__max_features=sqrt;, score=0.394 total time=  59.5s
[CV 2/5; 1/8] START ngram_tf_idf__tf_idf__use_idf=True, r__criterion=gini, r__max_features=sqrt
[CV 2/5; 1/8] END ngram_tf_idf__tf_idf__use_idf=True, r__criterion=gini, r__max_features=sqrt;, score=0.390 total time=  59.2s
[CV 3/5; 1/8] START ngram_tf_idf__tf_idf__use_idf=True, r__criterion=gini, r__max_features=sqrt
[CV 3/5; 1/8] END ngram_tf_idf__tf_idf__use_idf=True, r__criterion=gini, r__max_features=sqrt;, score=0.390 total time= 1.2min
[CV 4/5; 1/8] START ngram_tf_idf__tf_idf__use_idf=True, r__criterion=gini, r__max_features=sqrt
[CV 4/5; 1/8] END ngram_tf_idf__tf_idf__use_idf=True, r__criterion=gini, r__max_features=sqrt;, score=0.390 total time=  59.3s
[CV 5/5; 1/8] START ngram_tf_idf__tf_idf__use_id

In [None]:
for m in best_models_rtf:
  y_pred = m.predict(X_test_clean)
  print(classification_report(y_test_bin, y_pred))
  print("="*80)

              precision    recall  f1-score   support

           0       1.00      0.61      0.76      1339
           1       0.85      0.73      0.78      1793
           2       0.66      0.89      0.76      4571
           3       0.76      0.71      0.74      3629
           4       0.99      0.61      0.76      1585

    accuracy                           0.75     12917
   macro avg       0.85      0.71      0.76     12917
weighted avg       0.79      0.75      0.75     12917

              precision    recall  f1-score   support

           0       1.00      0.61      0.76      1339
           1       0.87      0.75      0.81      1793
           2       0.67      0.91      0.77      4571
           3       0.78      0.72      0.75      3629
           4       0.99      0.62      0.76      1585

    accuracy                           0.77     12917
   macro avg       0.86      0.72      0.77     12917
weighted avg       0.80      0.77      0.77     12917



In [None]:
for clf, estimator in zip(['glove','word2vec'],best_models_rtf):
  pd_estimator = pd.DataFrame.from_dict(estimator.cv_results_)
  pd_estimator = pd_estimator.sort_values(by="rank_test_score",ascending=True)
  pd_estimator.to_csv(f"{clf}_cvresults_all_rdf.csv")

# Visualisation des résultats

- `predict_proba` donne une liste de probabilité
- `clf.classes_` donne les étiquettes des classes (pour qu'on récupère l'ordre)

Les colonnes qu'on veut dans notre dataframe récapitulatif : 

- le nom du classifieur
- `estimator.best_params_` = les paramètres sous forme de dictionnaire du meilleur modèle trouvé avec le gridsearch
- `estimator.best_estimator_` = la pipeline du meilleur modèle
- `estimator.best_score_` = le mean_test_score du meilleur modèle trouvé avec le gridsearch


## Sortie de figure / plot

On sort les figures du meilleur classifieur

In [1]:
from sklearn.pipeline import make_pipeline

In [None]:
pipe = make_pipeline(CountVectorizer(min_df=1, ngram_range=(1,1)),TfidfTransformer(),LogisticRegression(max_iter=10000,multi_class="ovr"))
pipe.fit(X_train_clean, y_train_bin)
y_pred = clf.predict(X_test_clean)
print(classification_report(y_test_bin,y_pred))

### Learning Curve

In [None]:
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

# Utilisez la fonction learning_curve pour obtenir les scores de formation et de validation pour différentes tailles d'ensemble de données
train_sizes, train_scores, validation_scores = learning_curve(pipe, X_train_clean, y_train_bin, cv=5)

In [None]:
test_scores = validation_scores

# Calcul des moyennes et écart-types pour chaque point de la courbe
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

# Tracé de la courbe
plt.figure()
plt.title("Regression Logistique")
plt.xlabel("Taille de l'ensemble d'entraînement")
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Score de l'ensemble d'entraînement")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Score de l'ensemble de test")

plt.legend(loc="best")
plt.savefig("PlusjoliLearningCurve.png")
plt.show()




### Matrice de confusion

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
clf = LogisticRegression()
fig, ax = plt.subplots(figsize=(10, 5))
ConfusionMatrixDisplay.from_predictions(y_test_bin, y_pred, ax=ax, normalize='true')
ax.xaxis.set_ticklabels(np.array(['ELDR', 'GUE-NGL', 'PPE-DE', 'PSE', 'Verts-ALE']))
ax.yaxis.set_ticklabels(np.array(['ELDR', 'GUE-NGL', 'PPE-DE', 'PSE', 'Verts-ALE']))
_ = ax.set_title(
    f"Confusion Matrix for {clf.__class__.__name__}\non the original documents"
)

# Expérience avec FastText

Quelques test effectués avec FastText. On ne l'a utilisé que sur les meilleurs modèles car le temps de chargement des vecteurs est relativement long. 

In [None]:
pipeline_svm = Pipeline([
          ('counts', FunctionTransformer(FTVectorizer)),
          ('svm', LinearSVC(class_weight='balanced',max_iter=10000,C=0.1))
                        ])

pipeline_svm.fit(X_train_clean,y_train_bin)

In [None]:
pipeline_r = Pipeline([
        ('ngram_tf_idf', Pipeline([
          ('counts', FunctionTransformer(FTVectorizer)),
          ('tf_idf', TfidfTransformer(use_idf=False))
        ])),
  ('standard', StandardScaler(with_mean=False)),
  ('r',RandomForestClassifier(criterion='gini',max_features='sqrt'))])

pipeline_r.fit(X_train_clean,y_train_bin)
y_pred = pipeline_r.predict(X_test_clean)
print(classification_report(y_test_bin, y_pred))

In [None]:
pipeline_knn = Pipeline([
        ('ngram_tf_idf', Pipeline([
          ('counts', FunctionTransformer(FTVectorizer)),
          ('tf_idf', TfidfTransformer(use_idf = False))
        ])),
    ('select',SelectKBest(k=20)),
  ('knn', KNeighborsClassifier(algorithm='auto',n_neighbors=5,))])

pipeline_knn.fit(X_train_clean,y_train_bin)
ypred = pipeline_knn.predict(X_test_clean)
print(classification_report(y_test_bin, y_pred))

In [None]:
pipeline_reglog = Pipeline([
  ('counts', FunctionTransformer(FTVectorizer)),
  ('standard', StandardScaler(with_mean=False)),
  ('rl',LogisticRegression(max_iter=100000 , penalty='l2',solver="liblinear",multi_class="ovr",C=0.1))

])

pipeline_reglog.fit(X_train_clean,y_train_bin)
y_pred = pipeline_reglog.predict(X_test_clean)
print(classification_report(y_test_bin,y_pred))