In [2]:
from melusine.nlp_tools.embedding import Embedding
import pandas as pd
from melusine.nlp_tools.phraser import Phraser, phraser_on_body
from melusine.nlp_tools.tokenizer import Tokenizer
from sklearn.pipeline import Pipeline
from melusine.utils.transformer_scheduler import TransformerScheduler
from melusine.models.neural_architectures import cnn_model, rnn_model, transformers_model, bert_model
from melusine.models.train import NeuralModel
from sklearn import metrics
from sklearn.model_selection import train_test_split, ShuffleSplit
import numpy as np
import tqdm
tqdm.pandas()

In [4]:
data = pd.read_pickle('data/mails_lemmatized.pkl')
df_email = pd.DataFrame()
df_email['text'] = [' '.join(words) for words in data]
df_email['label'] = pd.read_pickle('data/mail_labels.pkl')
df_email = df_email[df_email['label']!=2]
df_email.head()

Unnamed: 0,text,label
0,1ere relecture gt consommation yohan temps c c...,0
1,accepter accepter demande cliquez simplement b...,0
2,acce decibel bonjour pouvoir donner acces deci...,1
3,acces rec4 decibel sdw rec4 hm dm ad restituti...,1
4,acte indemnite hospitalier savoir c acte n fra...,1


In [41]:
shuffler = ShuffleSplit(n_splits=1,test_size=0.15)
train_indexes, test_indexes = list(shuffler.split(df_email.index))[0]

In [None]:
def tokenize(row):
    tokens = row.split(' ')
    return tokens 

tknz = Tokenizer('text')
df_email2 = tknz.fit_transform(df_email)

In [51]:
df_train = df_email.iloc[train_indexes]
df_test = df_email.iloc[test_indexes]

In [52]:
embedding = Embedding(input_column='text', min_count=5)
embedding.train(df_train)

# CNN

In [64]:
cnn_nn = NeuralModel(architecture_function=cnn_model,
                       pretrained_embedding=embedding,
                       text_input_column='text')

In [55]:
cnn_nn.fit(df_test.drop('label', axis = 1), df_train.label)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["tokens"] = apply_func(X, self.tokenize)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X["tokens"] = apply_func(X, lambda x: x["tokens"][0])


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [60]:
y_pred = cnn_nn.predict(df_test.drop('label', axis = 1))
CM = metrics.confusion_matrix(df_test.label, y_pred)
print(CM)
FN = CM[1,0]
print(FN/np.sum(CM))

[[409   0]
 [ 73   0]]
0.15145228215767634


# RNN

In [68]:
rnn_nn = NeuralModel(architecture_function=rnn_model,
                       pretrained_embedding=embedding,
                       text_input_column='text')

In [70]:
rnn_nn.fit(df_train.drop('label', axis = 1), df_train.label)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [71]:
y_pred = rnn_nn.predict(df_test.drop('label', axis = 1))
CM = metrics.confusion_matrix(df_test.label, y_pred)
print(CM)
FN = CM[1,0]
print(FN/np.sum(CM))

[[409   0]
 [ 73   0]]
0.15145228215767634


# Transformer

In [72]:
transf_nn = NeuralModel(architecture_function=transformers_model,
                       pretrained_embedding=embedding,
                       text_input_column='text')

In [73]:
transf_nn.fit(df_train.drop('label', axis = 1), df_train.label)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [74]:
y_pred = transf_nn.predict(df_test.drop('label', axis = 1))
CM = metrics.confusion_matrix(df_test.label, y_pred)
print(CM)
FN = CM[1,0]
print(FN/np.sum(CM))

[[409   0]
 [ 73   0]]
0.15145228215767634


# Bert

In [None]:
bert_nn = NeuralModel(architecture_function=bert_model,
                       pretrained_embedding=embedding,
                       text_input_column='text')

In [None]:
bert_nn.fit(df_train.drop('label', axis = 1), df_train.label)

In [None]:
y_pred = bert_nn.predict(df_test.drop('label', axis = 1))
CM = metrics.confusion_matrix(df_test.label, y_pred)
print(CM)
FN = CM[1,0]
print(FN/np.sum(CM))