In [38]:
import pandas as pd
import pickle
import spacy
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import xgboost as xgb

import nltk

from time import time

from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation, PCA
from sklearn.metrics import f1_score, classification_report, multilabel_confusion_matrix
from sklearn.model_selection import GridSearchCV, cross_val_predict, cross_validate, cross_val_score, learning_curve
from sklearn.manifold import TSNE

from bayes_opt import BayesianOptimization

from skmultilearn.problem_transform import BinaryRelevance

from nltk.stem.snowball import SnowballStemmer

# Ignorance de certains avertissements
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# Fonctions utiles

In [49]:
def clean_text(texte):
    
    char_gardes = r'[^a-z#+.\s]'
    return re.sub(char_gardes, '', texte)  

def remove_sw(texte, return_verbs, return_token):
    
    doc = nlp(texte)
    
    if return_verbs :
    
        tokens = [token for token in doc if \
                  token.is_stop == False and token.is_punct == False\
                  and token.is_space == False
                 ]
    
    else :
        
        tokens = [token for token in doc if \
                  token.is_stop == False and token.is_punct == False\
                  and token.pos_ != "VERB"\
                  and token.is_space == False
                 ]
    
    if return_token :
        
        return tokens
    
    txt = ""
    
    for t in tokens :
        
        txt = txt + " " + t.text
        txt = txt.lstrip()
        
    return txt


def pipe_clsw(texte, return_verbs = True, return_token = True):
    """
    Nettoye et enlève les SW d'un text/str et retourne une liste de tokens.
    Par défaut : 
    - On garde les verbe. Si False on les élimine du résultat.
    - On retourne une liste de token. Si False on retourne une str des tokens.
    """
    
    res = remove_sw(clean_text(texte), return_verbs, return_token)
    
    return res

# **Approche 100% SUPERVISEE**

## 1-) Approche PCA directe

On repart de nos données vectorisées - normalisée

In [44]:
feat_train_vect = pickle.load(open("Data/feat_train_vect.pickle", "rb"))
feat_test_vect = pickle.load(open("Data/feat_test_vect.pickle", "rb"))

y_train = pickle.load(open("Data/y_train.pickle", "rb"))
y_test = pickle.load(open("Data/y_test.pickle", "rb"))

In [8]:
feat_train_vect.toarray().shape

(20000, 4789)

### **Réductions dimensionnelles**

In [15]:
std = StandardScaler().fit(feat_train_vect.toarray())
feat_train_scaled = std.transform(feat_train_vect.toarray())
feat_test_scaled = std.transform(feat_test_vect.toarray())

### **1-) PCA**

In [36]:
for n in [5, 10, 100, 500, 1000]:
    pca = PCA(n_components = n)
    pca.fit(feat_train_scaled)
    print(f"Pour n = {n}, la PCA explique {pca.explained_variance_ratio_.sum()} % de la variance des données")    

Pour n = 5, la PCA explique 0.015277060867347151 % de la variance des données
Pour n = 10, la PCA explique 0.024608497457983547 % de la variance des données
Pour n = 100, la PCA explique 0.10618675848159564 % de la variance des données
Pour n = 500, la PCA explique 0.2975371731613061 % de la variance des données
Pour n = 1000, la PCA explique 0.4690885327636175 % de la variance des données


Une réduction en **1000** composants principaux n'explique que **47%** de la variance des données.<br>Manifestement la PCA ne va pas nous permettre de réduire suffisamment le nombre de dimension de nos données. Mais on va quand même faire un essai pour **n = 1000**.

In [39]:
pca_1000 = PCA(n_components = 1000).fit(feat_train_scaled)
X_train = pca_1000.transform(feat_train_scaled)
X_test = pca_1000.transform(feat_test_scaled)

In [43]:
X_train.shape

(20000, 1000)

In [45]:
def bo_tune_m_clas3(max_depth, gamma, n_estimators ,learning_rate, eta):
    params = {'max_depth': int(max_depth),
              'gamma': gamma,
              'n_estimators': int(n_estimators),
              'learning_rate': learning_rate,
              'subsample': 0.8,
              'eta': eta, 
              'verbosity': 0, 
              'objective': "reg:logistic",
              'random_state' : 47#, 'eval_metric': 'f1'
             }
    
    # création du model qu'on va utiliser
    xgb_model = xgb.XGBClassifier(**params)
    # création du modèle multilabels
    m_clas = BinaryRelevance(classifier = xgb_model)
    
    # Cross validating with the specified parameters in 5 folds and 50 iterations
    cv_result = cross_validate(m_clas, X_train, y_train, scoring = "f1_weighted")
    #Return the negative the error
    #print(cv_result["test_score"].mean())
    return cv_result["test_score"].mean()

In [46]:
m_clas_bo3 = BayesianOptimization(bo_tune_m_clas3, {'max_depth': (3, 10),
                                                  'gamma': (0, 1),
                                                  'eta' : (0.01, 0.1),
                                                  'learning_rate':(0, 1),
                                                  'n_estimators':(50, 250)
                                                 })

In [47]:
start = time()
m_clas_bo3.maximize(n_iter=10, init_points=5, acq='ucb')
print(f"L'optimization a mis : {time() - start:.0f} secondes.")

|   iter    |  target   |    eta    |   gamma   | learni... | max_depth | n_esti... |
-------------------------------------------------------------------------------------


KeyboardInterrupt: 

Trop long on abandonne...

## **Word Embedding Spacy**

In [51]:
feat = pickle.load(open("Data/X.pickle", "rb"))
targ = pickle.load(open("Data/y.pickle", "rb"))

feat_train = feat[:20000]
feat_test = feat[20000:]

targ_train = targ[:20000]
targ_test = targ[20000:]

In [57]:
# on garde les verbes, mais on veut le résultat en "string"
now = time()
feat_train_clean = [pipe_clsw(t, True, False) for t in feat_train]
feat_test_clean = [pipe_clsw(t, True, False) for t in feat_test]
print(f"Ce travail de nettoyage a pris {time() - now:.0f} secondes.")

Ce travail de nettoyage a pris 422 secondes.


In [58]:
feat_train_clean[1]

'spring boot controller content negotiation simple rest controller written springboot application sure implement content negotiation return json xml based contenttype parameter request header explain wrong controller method json calling method specify contenttype applicationxml textxml implement methods different mapping different content type able xml xml work specify mediatypes single method like provided example like message endpoint receive xml contenttype request set applicationxmljson contenttype applicationjson help appreciated editi updated controller accept media types'

In [59]:
pickle_out = open("Data/feat_train_clean.pickle", "wb")
pickle.dump(feat_train_clean, pickle_out)
pickle_out.close()

pickle_out = open("Data/feat_test_clean.pickle", "wb")
pickle.dump(feat_test_clean, pickle_out)
pickle_out.close()

In [60]:
feat_train_clean = pickle.load(open("Data/feat_train_clean.pickle", "rb"))
feat_test_clean = pickle.load(open("Data/feat_test_clean.pickle", "rb"))

In [48]:
# chargement du modèle
nlp = spacy.load('en_core_web_lg')

In [61]:
# transformation du corpus
now = time()
X_train_spacy = np.array([nlp(w).vector for w in feat_train_clean])
X_test_spacy = np.array([nlp(w).vector for w in feat_test_clean])
print(f"Cette transformation Word-Embedding avec Spacy a pris {time() - now:.0f} secondes.")

Cette transformation Word-Embedding avec Spacy a pris 291 secondes.


In [65]:
pickle_out = open("Data/X_train_spacy.pickle", "wb")
pickle.dump(X_train_spacy, pickle_out)
pickle_out.close()

pickle_out = open("Data/X_test_spacy.pickle", "wb")
pickle.dump(X_test_spacy, pickle_out)
pickle_out.close()

In [94]:
X_train_spacy = pickle.load(open("Data/X_train_spacy.pickle", "rb"))
X_test_spacy = pickle.load(open("Data/X_test_spacy.pickle", "rb"))

y_train = pickle.load(open("Data/y_train.pickle", "rb"))
y_test = pickle.load(open("Data/y_test.pickle", "rb"))

In [68]:
X_test_spacy.shape

(5555, 300)

**PCA**

300 dimensions c'est encore beaucoup, nous allons réutiliser la **PCA** pour réduire autant que possible ce nombre.

In [95]:
std = StandardScaler().fit(X_train_spacy)
X_train_spacy_scaled = std.transform(X_train_spacy)
X_test_spacy_scaled = std.transform(X_test_spacy)

In [96]:
for n in [5, 10, 50, 100, 150, 200, 250, 300]:
    pca = PCA(n_components = n)
    pca.fit(X_train_spacy_scaled)
    print(f"Pour n = {n}, la PCA explique {pca.explained_variance_ratio_.sum()*100:.2f}\
    % de la variance des données")    

Pour n = 5, la PCA explique 25.85 % de la variance des données
Pour n = 10, la PCA explique 37.20 % de la variance des données
Pour n = 50, la PCA explique 69.11 % de la variance des données
Pour n = 100, la PCA explique 83.68 % de la variance des données
Pour n = 150, la PCA explique 91.34 % de la variance des données
Pour n = 200, la PCA explique 95.91 % de la variance des données
Pour n = 250, la PCA explique 98.74 % de la variance des données
Pour n = 300, la PCA explique 100.00 % de la variance des données


Essayons maintenant des modélisations en fonction de **n**.

In [100]:
def modelize(n, X_train, y_train, X_test, y_test):
    
    start = time()
    
    pca = PCA(n_components = n)
    pca.fit(X_train_spacy_scaled)
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    
    xgb_model = xgb.XGBClassifier()
    m_class = BinaryRelevance(classifier = xgb_model).fit(X_train_pca, y_train)
    pred_train = m_class.predict(X_train_pca)
    pred_test = m_class.predict(X_test_pca)
    
    score_train = f1_score(y_train, pred_train, average = "weighted")
    score_test = f1_score(y_test, pred_test, average = "weighted")
    
    temps = (time() - start)
    
    return (score_train, score_test, temps)

In [101]:
results = []

for n in [5, 10, 50, 100, 150, 200, 250, 300]:
    
    results.append(modelize(n, X_train_spacy_scaled, y_train, X_test_spacy_scaled, y_test))

In [102]:
results

[(0.49258234078897284, 0.11604894114651894, 23.865284204483032),
 (0.7788768095214799, 0.20266677765162033, 59.51931405067444),
 (0.9872378668581084, 0.4545490488435735, 172.51770997047424),
 (0.9959518214265013, 0.4911362155057302, 198.9031629562378),
 (0.9979583756542356, 0.4983300010230284, 264.8279039859772),
 (0.9992428273383103, 0.490155068485608, 331.8358862400055),
 (0.9996602350529881, 0.4867507452259863, 400.6845922470093),
 (0.9997925033389263, 0.4821342713785624, 480.8770890235901)]

**Optimisation bayesiène pour N=150**

In [71]:
pca = PCA(n_components = 150)
pca.fit(X_train_spacy_scaled)
X_train_spacy_pca = pca.transform(X_train_spacy_scaled)
X_test_spacy_pca = pca.transform(X_test_spacy_scaled)

In [77]:
now = time()
xgb_model = xgb.XGBClassifier()
m_class_test = BinaryRelevance(classifier = xgb_model).fit(X_train_spacy_pca, y_train)
print(f"l'entrainement a pris {time() - now:.0f} secondes")

l'entrainement a pris 248 secondes


In [78]:
preds = m_class_test.predict(X_test_spacy_pca)

f1_score(y_test, 
         preds, 
         average = "weighted")

0.4959637586670044

248 secondes pour une modélisation... Pour un résultat bien meilleure que ce qu'on a eu avant. On va tenter une recherche de meilleurs hyper-paramètres avec la méthode BO, et **sans** validation croisée.

In [103]:
def bo_tune_m_clas3(max_depth, gamma, n_estimators ,learning_rate, eta):
    params = {'max_depth': int(max_depth),
              'gamma': gamma,
              'n_estimators': int(n_estimators),
              'learning_rate': learning_rate,
              'subsample': 0.8,
              'eta': eta, 
              'verbosity': 0, 
              'objective': "reg:logistic",
              'random_state' : 47#, 'eval_metric': 'f1'
             }
    
    # création du model qu'on va utiliser
    xgb_model = xgb.XGBClassifier(**params)
    # création du modèle multilabels
    m_clas = BinaryRelevance(classifier = xgb_model)
    
    # Cross validating with the specified parameters in 5 folds and 50 iterations
    # cv_result = cross_validate(m_clas, X_train, y_train, scoring = "f1_weighted")
    #Return the negative the error
    #print(cv_result["test_score"].mean())
    
    m_clas.fit(X_train_spacy_pca, y_train)
    preds = m_clas.predict(X_test_spacy_pca)

    score = f1_score(y_test, preds, average = "weighted")
    
    
    # return cv_result["test_score"].mean()
    return score

In [104]:
m_clas_bo3 = BayesianOptimization(bo_tune_m_clas3, 
                                  {'max_depth': (3, 10),
                                   'gamma': (0, 1),
                                   'eta' : (0.01, 0.1),
                                   'learning_rate':(0, 1),
                                   'n_estimators':(50, 250)}, 
                                  random_state = 47)

In [105]:
start = time()
m_clas_bo3.maximize(n_iter=10, init_points=5, acq='ucb')
print(f"L'optimization a mis : {time() - start:.0f} secondes.")

|   iter    |  target   |    eta    |   gamma   | learni... | max_depth | n_esti... |
-------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.5258  [0m | [0m 0.02021 [0m | [0m 0.9745  [0m | [0m 0.7287  [0m | [0m 5.46    [0m | [0m 191.5   [0m |
| [0m 2       [0m | [0m 0.5034  [0m | [0m 0.08196 [0m | [0m 0.6456  [0m | [0m 0.4146  [0m | [0m 7.942   [0m | [0m 99.33   [0m |
| [0m 3       [0m | [0m 0.4946  [0m | [0m 0.03304 [0m | [0m 0.02401 [0m | [0m 0.09873 [0m | [0m 5.103   [0m | [0m 178.2   [0m |
| [95m 4       [0m | [95m 0.5275  [0m | [95m 0.039   [0m | [95m 0.1855  [0m | [95m 0.9172  [0m | [95m 4.896   [0m | [95m 104.7   [0m |
| [95m 5       [0m | [95m 0.5465  [0m | [95m 0.0959  [0m | [95m 0.1271  [0m | [95m 0.7473  [0m | [95m 3.037   [0m | [95m 221.4   [0m |
| [0m 6       [0m | [0m 0.4971  [0m | [0m 0.07494 [0m | [0m 0.831   [0m | [0m 0.2529  [0m | [

In [109]:
best_params_03 = m_clas_bo3.max['params']
best_params_03

{'eta': 0.04988373441772504,
 'gamma': 0.38774660406222805,
 'learning_rate': 0.3779282680211212,
 'max_depth': 3.125741502716014,
 'n_estimators': 209.3536693510256}

In [110]:
best_params_03['max_depth']= int(best_params_03['max_depth'])
best_params_03['n_estimators']= int(best_params_03['n_estimators'])

In [111]:
xgb_model3 = xgb.XGBClassifier(objective = "reg:logistic", random_state = 47, **best_params_03)
# création du modèle multilabels
m_clas3 = BinaryRelevance(classifier = xgb_model3)

In [112]:
start = time()
m_clas3.fit(X_train_spacy_pca, y_train)
print(f"L'entrainement a pris {time() - start:.0f} secondes")

L'entrainement a pris 437 secondes


In [116]:
pred_03 = m_clas3.predict(X_test_spacy_pca).toarray()
f1_score(y_test, 
         pred_03, 
         average = "weighted")

0.5473448009607736