In [8]:
import pandas as pd
import numpy as np
from collections import defaultdict
from unidecode import unidecode
from spacy.lang.fr import French
from spacy.lang.fr import stop_words as spacy_stopwords
import string
from tqdm import tqdm
import matplotlib.pyplot as plt
tqdm.pandas()

In [4]:
mails = pd.read_csv('mails.csv', index_col = 0, encoding='UTF-16', header=0, skipinitialspace=True, skip_blank_lines = True)
mails['text'] = mails[['header', 'body']].astype(str).agg(' '.join, axis=1)
print(mails.shape)
mails.head()

(3407, 7)


Unnamed: 0,body,header,date,from,to,label,text
0,yohan quand tu as le temps dis moi si c est cl...,1ère relecture gt conso,"Fri, 29 May 2020 16:53:04 +0200",plonquet nadège,,0,1ère relecture gt conso yohan quand tu as le t...
1,pour accepter la demande cliquez simplement su...,accepter,"Mon, 16 Sep 2019 17:09:37 +0200",plonquet nadège,,0,accepter pour accepter la demande cliquez simp...
2,bonjour pouvez vous donner les accès à decibel...,acces decibel,"Tue, 23 Mar 2021 13:44:07 +0100",guillaume veronique,,1,acces decibel bonjour pouvez vous donner les a...
3,comme ça ne fonctionne toujours pas sur mon po...,accès rec4 decibel sdw rec4 hm dm ad restituti...,"Mon, 24 Feb 2020 14:28:57 +0100",gueniot bernard,,1,accès rec4 decibel sdw rec4 hm dm ad restituti...
4,tu sais ce que c est que ces actes qui n ont p...,actes indemnités hospitalières,"Mon, 25 Mar 2019 11:25:36 +0100",levisse xavier,,1,actes indemnités hospitalières tu sais ce que ...


In [5]:
nlp = French() ##charger le modèle ici
stop_words = spacy_stopwords.STOP_WORDS
punctuations = string.punctuation

In [None]:
bonjour = nlp("bonjour")
bonjour.vector.shape

In [11]:
def tokenize(sentence):
    # sentence = nlp(sentence)
    # lemmatizing
    sentence = sentence.split(' ') #[unidecode(word.lemma_.lower()).strip() if word.lemma_ != "-PRON-" else word.lower_ for word in sentence ]
    # removing stop words
    sentence = [ word for word in sentence if word not in stop_words and word not in punctuations]        
    return sentence

In [None]:
sentences = mails.text.progress_apply(tokenize)
sentences

In [None]:
vocab = set()
for s in sentences:
    vocab.update(set(s))
print(len(vocab))

In [None]:
vectors=[]
for token in tqdm(vocab):
    vectors.append(nlp(token).vector)

## Représentation des mots

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
embeddings_2d = pca.fit_transform(vectors)

In [None]:
fig = plt.figure(figsize=(16, 9))

x_axis = embeddings_2d[:, 0]
y_axis = embeddings_2d[:, 1]

#plt.scatter(x_axis, y_axis, s=5, alpha=0.5) # alpha for transparency
#plt.show()

## Apprentissage semi-supervisé

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.base import TransformerMixin
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

plt.style.use('ggplot')

In [13]:
class SpacyEmbeddings(TransformerMixin): # it inherits the sklearn's base class for transformers
    def transform(self, X, **transform_params):
        # Cleaning Text
        return [sentence for sentence in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}
bow_vector = CountVectorizer(tokenizer=tokenize, ngram_range=(1,1))

In [104]:
X_unlabeled = mails[mails['label'] == 2]
X, y = mails[mails['label'] != 2].text, mails[mails['label'] != 2].label

In [96]:
from sklearn.tree import DecisionTreeClassifier
pipe = Pipeline([("embedder", SpacyEmbeddings()),
                 ('vectorizer', bow_vector),
                 ('classifier', DecisionTreeClassifier())])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test, y_pred))

0.9283489096573209
              precision    recall  f1-score   support

           0       0.97      0.95      0.96       830
           1       0.71      0.81      0.76       133

    accuracy                           0.93       963
   macro avg       0.84      0.88      0.86       963
weighted avg       0.93      0.93      0.93       963



In [92]:
from sklearn.cluster import KMeans
pipe = Pipeline([("embedder", SpacyEmbeddings()),
                 ('vectorizer', bow_vector),
                 ('classifier', KMeans(n_clusters=2))])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test, y_pred))

0.7518172377985463
              precision    recall  f1-score   support

           0       0.85      0.87      0.86       829
           1       0.04      0.04      0.04       134

    accuracy                           0.75       963
   macro avg       0.45      0.45      0.45       963
weighted avg       0.74      0.75      0.74       963



In [97]:
from sklearn.svm import SVC
pipe = Pipeline([("embedder", SpacyEmbeddings()),
                 ('vectorizer', bow_vector),
                 ('classifier', SVC(C=0.1325, kernel = 'linear'))])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test, y_pred))

0.9210799584631361
              precision    recall  f1-score   support

           0       0.95      0.96      0.95       833
           1       0.71      0.70      0.71       130

    accuracy                           0.92       963
   macro avg       0.83      0.83      0.83       963
weighted avg       0.92      0.92      0.92       963



In [94]:
from sklearn.linear_model import LogisticRegression
pipe = Pipeline([("embedder", SpacyEmbeddings()),
                 ('vectorizer', bow_vector),
                 ('classifier', LogisticRegression(C=1, solver = 'newton-cg'))])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test, y_pred))

0.9086188992731049
              precision    recall  f1-score   support

           0       0.95      0.94      0.95       831
           1       0.66      0.68      0.67       132

    accuracy                           0.91       963
   macro avg       0.81      0.81      0.81       963
weighted avg       0.91      0.91      0.91       963



### Prédiction des labels manquants

In [140]:
estimator = Pipeline([("embedder", SpacyEmbeddings()),
                 ('vectorizer', bow_vector),
                 ('classifier', DecisionTreeClassifier())])  #SVC ou arbre

X_unlabeled_copy = X_unlabeled.copy()
newX = X
newY = y
threshold = 0.9
print(newX.shape, X_unlabeled_copy.shape)
while not X_unlabeled_copy.empty:

    estimator.fit(newX.text, newY)
    y_pred = estimator.predict(X_unlabeled_copy.text)
    y_prob = estimator.predict_proba(X_unlabeled_copy.text)
    X_unlabeled_copy['_pred_'] = y_pred
    X_unlabeled_copy['_prob_'] = [y_prob[k,y_pred[k]] for k in range(len(y_pred))]
    confident_pred = X_unlabeled_copy[X_unlabeled_copy['_prob_'] >= threshold]
    X_unlabeled_copy = X_unlabeled_copy[X_unlabeled_copy['_prob_'] < threshold].drop(['_pred_', '_prob_'], axis = 1)
    newX = pd.concat([newX, confident_pred.drop(['_pred_', '_prob_'], axis = 1)], ignore_index = True)
    newY = pd.concat([newY, confident_pred['_pred_']], ignore_index = True)
    print(newX.shape, X_unlabeled_copy.shape, threshold)

    if len(confident_pred)==0:
        break

newX = newX.text

(3210, 7) (197, 7)
(3407, 7) (0, 7) 0.9


# Catégorisation des mails

In [151]:
from sklearn.tree import DecisionTreeClassifier
pipe = Pipeline([("embedder", SpacyEmbeddings()),
                 ('vectorizer', bow_vector),
                 ('classifier', DecisionTreeClassifier())])

X_train, X_test, y_train, y_test = train_test_split(newX, newY, test_size=0.3)
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test, y_pred))

0.9286412512218963
              precision    recall  f1-score   support

           0       0.96      0.96      0.96       874
           1       0.76      0.74      0.75       149

    accuracy                           0.93      1023
   macro avg       0.86      0.85      0.86      1023
weighted avg       0.93      0.93      0.93      1023



In [150]:
X_train, X_test, y_train, y_test = train_test_split(newX, newY, test_size=0.3)
pipe = Pipeline([("embedder", SpacyEmbeddings()),
                 ('vectorizer', bow_vector),
                 ('classifier', KMeans(n_clusters=2))])
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test, y_pred))

0.7341153470185728
              precision    recall  f1-score   support

           0       0.82      0.87      0.85       858
           1       0.03      0.02      0.03       165

    accuracy                           0.73      1023
   macro avg       0.43      0.45      0.44      1023
weighted avg       0.70      0.73      0.71      1023



In [147]:
X_train, X_test, y_train, y_test = train_test_split(newX, newY, test_size=0.3)
pipe = Pipeline([("embedder", SpacyEmbeddings()),
                 ('vectorizer', bow_vector),
                 ('classifier', SVC(C=0.1325, kernel = 'linear'))])
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test, y_pred))

0.8983382209188661
              precision    recall  f1-score   support

           0       0.95      0.93      0.94       859
           1       0.67      0.73      0.70       164

    accuracy                           0.90      1023
   macro avg       0.81      0.83      0.82      1023
weighted avg       0.90      0.90      0.90      1023



In [148]:
X_train, X_test, y_train, y_test = train_test_split(newX, newY, test_size=0.3)
pipe = Pipeline([("embedder", SpacyEmbeddings()),
                 ('vectorizer', bow_vector),
                 ('classifier', LogisticRegression(C=1, solver = 'newton-cg'))])
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test, y_pred))

0.9198435972629521
              precision    recall  f1-score   support

           0       0.94      0.97      0.95       869
           1       0.78      0.65      0.71       154

    accuracy                           0.92      1023
   macro avg       0.86      0.81      0.83      1023
weighted avg       0.92      0.92      0.92      1023



## Optimisation des modèles

In [48]:
import pickle
from sklearn.model_selection import GridSearchCV
X_train, X_test, y_train, y_test = train_test_split(newX, newY, test_size=0.3)

gridCV = GridSearchCV(SVC(),
    n_jobs=-1,
    param_grid = {
        'C' : [0.1325],
        'kernel' : ['linear'],
        #'degree' : [1, 2, 3, 4],
        #'gamma' : ['scale', 'auto'] + list(np.linspace(0,2,3)),
        #'coef0' : np.linspace(-2,2,5),
        'class_weight' : ['balanced', None],
        #'decision_function_shape' : ['ovo', 'ovr']
    }
)

gridCV.fit(X_train, y_train)
print(gridCV.best_params_)
print(gridCV.best_score_)
#{'C': 0.1325, 'class_weight': None, 'kernel': 'linear'}

{'C': 0.1325, 'class_weight': None, 'kernel': 'linear'}
0.9192010235179116


In [105]:
model_svc = SVC(C=0.1325, kernel = 'linear')
model_svc.fit(X_train, y_train)
y_pred = model_svc.predict(X_test)
accuracy_score(y_test, y_pred)

0.9105691056910569

In [None]:
filename = 'svc_c01325_lin.sav'
with open(filename, 'wb') as file:
    pickle.dump(model_svc, file)

In [None]:
from sklearn.model_selection import GridSearchCV
X_train, X_test, y_train, y_test = train_test_split(newX, newY, test_size=0.3)

gridCV2 = GridSearchCV(LogisticRegression(),
    n_jobs=-1,
    param_grid = {
'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
'dual' : [False, True],
'C' : np.linspace(0.0001,3, 10),
'fit_intercept' : [True, False],
'class_weight' : [None, 'balanced'],
'solver' : ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
'warm_start' : [True],
'l1_ratio' : [0, 0.5, 1]
    }
)

gridCV2.fit(X_train, y_train)
print(gridCV2.best_params_)
print(gridCV2.best_score_)
#{'C': 0.3334222222222222, 'class_weight': None, 'dual': False, 'fit_intercept': False, 'l1_ratio': 0, 'penalty': 'l1', 'solver': 'liblinear', 'warm_start': True}
#0.91

In [24]:
X_train, X_test, y_train, y_test = train_test_split(newX, newY, test_size=0.3)
LR = LogisticRegression(C = 0.3334222222222222, class_weight = None, dual = True, fit_intercept = False, 
                        solver = 'liblinear', warm_start = True)
LR.fit(X_train, y_train)
y_pred = LR.predict(X_test)
accuracy_score(y_test, y_pred)



0.9196337741607324

In [116]:
filename = 'lr_c033_liblinear_dual.sav'
with open(filename, 'wb') as file:
    pickle.dump(LR, file)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
LR = LogisticRegression(C = 0.3334222222222222, class_weight = None, dual = True, fit_intercept = False, 
                        solver = 'liblinear', warm_start = True)
LR.fit(X_train, y_train)
y_pred = LR.predict(X_test)
accuracy_score(y_test, y_pred)



0.9116080937167199

Sur ces méthodes de machine learning classique, l'extension du dataset à partir des données non-labellisées ne produit pas une grande amélioration des performances, ce sera plus flagrant lorsqu'on passera à des méthodes de Deep Learning

## Prédiction des probabilités

In [117]:
np.round(LR.predict_proba(newX),2)

array([[0.99, 0.01],
       [0.78, 0.22],
       [0.14, 0.86],
       ...,
       [0.94, 0.06],
       [0.98, 0.02],
       [0.66, 0.34]])

## Exemple de pipeline

In [15]:
from lemmatization import lemmatizeText, initSpacy
lemmatizer = initSpacy('fr_core_news_md')

In [33]:
input_mail = """
    DRS - 
    Bonjour
    j'ai un souci avec le décisionnel DECIBEL, il y a un problème de référenciel.

    Cordialement,
    Martin
    """

In [29]:
def pipeline(mail, model_file, schema, lemmatizer):
    features = defaultdict(int)
    features.update(lemmatizeText(mail, lemmatizer))
    X = np.array([features[key] for key in schema]).reshape(1, -1)    
    with open(model_file, 'rb') as file:
        model = pickle.load(file)

    try:
        probas = model.predict_proba(X)[0]
    except AttributeError:
        probas = [None, None]
    
    y = model.predict(X)[0]
    return {'predicted_class' : y, 'confidence' : probas[y], 'model' : model_file}

In [34]:
model_file = 'lr_c1_newtoncg.sav'
schema = X.columns.to_list()
predicted_class = pipeline(input_mail, model_file, schema, lemmatizer)
predicted_class

array([1])

to-do :

* vote majoritaire entre plusieurs modèles
* refit les modèles en infériorité numérique / refit périodiquement les moins bons modèles (p.r à un score de prédiction calculé sur la période)
* gerer les cas douteux grace à la proba
* LSTM / biLSTM
* Transformers