In [1]:
from melusine.nlp_tools.embedding import Embedding
import pandas as pd
from melusine.models.neural_architectures import cnn_model, rnn_model, transformers_model, bert_model
from melusine.models.train import NeuralModel
from sklearn.metrics import classification_report
from sklearn.model_selection import ShuffleSplit, cross_validate
import numpy as np
from collections import Counter
from gensim.models.keyedvectors import KeyedVectors
import warnings
import joblib
# Suppress all warnings, and seed NumPy's global random generator so that
# draws made through `np.random` are repeatable.
warnings.simplefilter(action='ignore')
np.random.seed(seed=42)

In [2]:
# Load the pre-processed mails from JSON into a DataFrame and preview the
# first five rows.
data = pd.read_json(path_or_buf='data/mails_preprocessing.json')
data.head(n=5)

Unnamed: 0,from,label,text,_questionmark_count_,text_lem,_AJD_count_,_ADP_count_,_ADV_count_,_AUX_count_,_CCONJ_count_,...,_NOUN_count_,_NUM_count_,_PRON_count_,_PROPN_count_,_PUNCT_count_,_SCONJ_count_,_SYM_count_,_VERB_count_,_X_count_,unique_words_count
0,=?iso-8859-1?q?guillaume_v=e9ronique?=\r\n\t<v...,1,acces decibel bonjour pouvez vous donner les a...,2,acce decibel bonjour pouvoir donner acces deci...,0,7,1,1,0,...,18,2,3,0,2,0,0,3,1,233
1,levisse xavier <xavier.levisse@harmonie-mutuel...,1,actes indemnités hospitalières tu sais ce que ...,2,acte indemnite hospitalier savoir acte frais r...,0,3,3,0,2,...,12,2,5,0,2,1,0,6,0,177
2,courtais yohan <yohan.courtais@harmonie-mutuel...,1,analyse des obsèques naissances appareils audi...,1,analyse obseque naissance appareil auditif bon...,0,6,3,0,2,...,23,1,8,0,4,3,0,8,0,274
3,levisse xavier <xavier.levisse@harmonie-mutuel...,1,ano ihm bonjour j ai un multivalue filtres eta...,1,ano ihm bonjour multivalu filtre etablissement...,0,2,1,0,1,...,11,1,1,0,1,0,0,2,3,172
4,=?iso-8859-1?q?pernot_val=e9rie?= <valerie.per...,1,ano alimentation réseau sur dcb bonjour je m i...,1,ano alimentation reseau dob bonjour metre inte...,0,38,6,6,2,...,56,3,14,1,10,1,0,20,0,659


In [3]:
# Configure a 'word2vec_sg' embedding of size 500 that reads the `text_lem`
# column (min_count=2 presumably drops words seen fewer than twice — confirm
# against the melusine Embedding documentation).
embedding = Embedding(
    input_column='text_lem',
    method='word2vec_sg',
    size=500,
    min_count=2,
)
# Train on a copy of the data in which `text_lem` has been cast to str.
embedding.train(data.astype(dtype={'text_lem': str}))
# Disabled alternative kept from the original cell (loads a pickled object
# instead of training):
# embedding = pd.read_pickle('data/mails_embedded_doc2vec_bigrams.pkl')

In [14]:
# Save the embedding object under 'models/MELUSINE_EMBEDDING'.
# NOTE(review): this cell's execution count (14) is out of sequence with its
# neighbours (3 and 4) — confirm the notebook still works under
# Restart Kernel -> Run All.
embedding.save('models/MELUSINE_EMBEDDING')

In [4]:
# Metadata features fed to the models alongside the text: every column except
# the sender, the target and the two text columns. Selecting by name rather
# than by position (`columns[3]`, `columns[5:]`) keeps the list correct if the
# column order of the input file ever changes; for the layout shown in the
# preview above the resulting list and its order are unchanged.
non_feature_columns = ['from', 'label', 'text', 'text_lem']
features = [column for column in data.columns if column not in non_feature_columns]

# Class distribution of the full dataset (displayed as the cell output).
Counter(data.label)

Counter({1: 4759, 0: 590, 2: 3215})

In [5]:
# Balanced subsample of classes 0 and 1.
# The per-class sample size is derived from the smaller of the two classes
# instead of being hard-coded to 590, so the cell keeps working (and stays
# balanced) if the class counts change. With the counts shown above
# (590 rows of label 0) the result is the same as before.
np.random.seed(42)  # DataFrame.sample draws from NumPy's global state when no random_state is given
n_per_class = int(min((data.label == 0).sum(), (data.label == 1).sum()))
df1 = data[data.label == 0].sample(n_per_class)
df2 = data[data.label == 1].sample(n_per_class)
# Stack both class samples and renumber the rows from 0.
df_subsampled = pd.concat([df1, df2], ignore_index=True)

In [7]:
# Without subsampling: keep every row whose label is not 2.
df = data.loc[data['label'] != 2]

In [11]:
# One random hold-out split with 20% of the rows in the test set; the fixed
# random_state makes the split repeatable.
shuffler = ShuffleSplit(test_size=0.2, n_splits=1, random_state=42)
# Only one split is generated, so take the first (and only) pair of
# positional index arrays.
train_indexes, test_indexes = next(shuffler.split(df.index))
df_train = df.iloc[train_indexes]
df_test = df.iloc[test_indexes]
# Class distribution of the training split (displayed as the cell output).
Counter(df_train['label'])

Counter({1: 3794, 0: 485})

# CNN

In [15]:
# Re-seed NumPy's global generator, then wrap the CNN architecture in a
# NeuralModel that uses the trained embedding, the `text_lem` column as text
# input and the metadata columns in `features`.
# NOTE(review): embedding_dim=500 mirrors the embedding size used above;
# seq_size=518 is presumably the maximum sequence length — confirm.
np.random.seed(42)
cnn_nn = NeuralModel(architecture_function=cnn_model,
                     pretrained_embedding=embedding,
                     embedding_dim=500,
                     text_input_column='text_lem',
                     meta_input_list=features,
                     seq_size=518,
                     batch_size=32,
                     n_epochs=200)

In [16]:
# Fit the CNN on the training split, using its `label` column as target.
# The remaining keyword arguments are presumably forwarded to the underlying
# training call — confirm against the melusine NeuralModel documentation.
cnn_nn.fit(
    df_train,
    df_train['label'],
    use_multiprocessing=True,
    workers=6,
    verbose=0,
)

In [17]:
# Predict on the held-out split and print the per-class precision, recall
# and F1 report against the true labels.
y_pred = cnn_nn.predict(df_test)
print(classification_report(y_true=df_test['label'], y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.93      0.94      0.94       105
           1       0.99      0.99      0.99       965

    accuracy                           0.99      1070
   macro avg       0.96      0.97      0.97      1070
weighted avg       0.99      0.99      0.99      1070



In [18]:
# Save the CNN in two steps: the network via save_nn_model, then the
# NeuralModel wrapper as a compressed joblib pickle. joblib.dump is the last
# expression, so the list of written file names is the cell output.
cnn_nn.save_nn_model('models/CNN_518features_input_text_lem')
joblib.dump(
    value=cnn_nn,
    filename='models/CNN_518features_input_text_lem.pkl',
    compress=True,
)

['models/CNN_518features_input_text_lem.pkl']

# RNN

In [19]:
# Re-seed NumPy's global generator, then wrap the RNN architecture in a
# NeuralModel that uses the trained embedding, the `text_lem` column as text
# input and the metadata columns in `features`.
# NOTE(review): embedding_dim=500 mirrors the embedding size used above;
# seq_size=518 is presumably the maximum sequence length — confirm.
np.random.seed(42)
rnn_nn = NeuralModel(
    architecture_function=rnn_model,
    pretrained_embedding=embedding,
    embedding_dim=500,
    text_input_column='text_lem',
    meta_input_list=features,
    seq_size=518,
    batch_size=32,
    n_epochs=200,
)

In [20]:
# Fit the RNN on the training split, using its `label` column as target.
# The remaining keyword arguments are presumably forwarded to the underlying
# training call — confirm against the melusine NeuralModel documentation.
rnn_nn.fit(
    df_train,
    df_train['label'],
    use_multiprocessing=True,
    workers=6,
    verbose=0,
)

In [21]:
# Predict on the held-out split and print the per-class precision, recall
# and F1 report against the true labels.
y_pred = rnn_nn.predict(df_test)
print(classification_report(y_true=df_test['label'], y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.94      0.91      0.93       105
           1       0.99      0.99      0.99       965

    accuracy                           0.99      1070
   macro avg       0.97      0.95      0.96      1070
weighted avg       0.99      0.99      0.99      1070



In [22]:
# Save the RNN in two steps: the network via save_nn_model, then the
# NeuralModel wrapper as a compressed joblib pickle. joblib.dump is the last
# expression, so the list of written file names is the cell output.
rnn_nn.save_nn_model('models/RNN_518features_input_text_lem')
joblib.dump(
    value=rnn_nn,
    filename='models/RNN_518features_input_text_lem.pkl',
    compress=True,
)

['models/RNN_518features_input_text_lem.pkl']

# Transformer

In [23]:
# Re-seed NumPy's global generator, then wrap the Transformer architecture in
# a NeuralModel that uses the trained embedding, the `text_lem` column as text
# input and the metadata columns in `features`.
# NOTE(review): embedding_dim=500 mirrors the embedding size used above;
# seq_size=518 is presumably the maximum sequence length — confirm.
np.random.seed(42)
transf_nn = NeuralModel(
    architecture_function=transformers_model,
    pretrained_embedding=embedding,
    embedding_dim=500,
    text_input_column='text_lem',
    meta_input_list=features,
    seq_size=518,
    batch_size=32,
    n_epochs=200,
)

In [24]:
# Fit the Transformer on the training split, using its `label` column as
# target. The remaining keyword arguments are presumably forwarded to the
# underlying training call — confirm against the melusine NeuralModel
# documentation.
transf_nn.fit(
    df_train,
    df_train['label'],
    use_multiprocessing=True,
    workers=6,
    verbose=0,
)

In [27]:
# Predict on the held-out split and print the per-class precision, recall
# and F1 report against the true labels.
y_pred = transf_nn.predict(df_test)
print(classification_report(y_true=df_test['label'], y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.91      0.82      0.86       105
           1       0.98      0.99      0.99       965

    accuracy                           0.97      1070
   macro avg       0.95      0.91      0.93      1070
weighted avg       0.97      0.97      0.97      1070



In [28]:
# Save the Transformer in two steps: the network via save_nn_model, then the
# NeuralModel wrapper as a compressed joblib pickle. joblib.dump is the last
# expression, so the list of written file names is the cell output.
transf_nn.save_nn_model('models/TRANSF_518features_input_text_lem')
joblib.dump(
    value=transf_nn,
    filename='models/TRANSF_518features_input_text_lem.pkl',
    compress=True,
)

['models/TRANSF_518features_input_text_lem.pkl']