In [80]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.utils import compute_class_weight
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Embedding, LSTM, Conv1D, MaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
from sklearn import metrics

In [3]:
# Load the pre-processed mails (presumably lemmatized, per the file name).
# The next cell iterates over `data` and joins each item's words into one string.
# NOTE(review): read_pickle can execute arbitrary code while unpickling —
# only load this file if it comes from a trusted source.
data = pd.read_pickle('mails_lemmatized.pkl')

In [4]:
# Assemble the working frame: one row per mail, with the mail's words
# re-joined into a single space-separated string and its label next to it.
df = pd.DataFrame()
df['text'] = list(map(' '.join, data))
# Labels are stored in a separate pickle and attached as a column.
df['label'] = pd.read_pickle('mail_labels.pkl')

In [5]:
# Drop every row labelled 2 so that only labels 0 and 1 remain
# (the models below are binary classifiers), then preview the result.
df = df.loc[df['label'].ne(2)]
df.head()

Unnamed: 0,text,label
0,1ere relecture gt consommation yohan temps c c...,0
1,accepter accepter demande cliquez simplement b...,0
2,acce decibel bonjour pouvoir donner acces deci...,1
3,acces rec4 decibel sdw rec4 hm dm ad restituti...,1
4,acte indemnite hospitalier savoir c acte n fra...,1


# LSTM

![Image](http://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-chain.png)

In [6]:
# Hold out 15% of the mails for testing.
# - stratify keeps the 0/1 label ratio identical in both splits; the classes
#   are imbalanced (see the class weights computed below), so an unstratified
#   draw can leave very few positives in the test set.
# - a fixed random_state makes the split, and every metric derived from it,
#   reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    df.text,
    df.label,
    test_size=0.15,
    stratify=df.label,
    random_state=42,
)

In [42]:
# Text-encoding hyper-parameters shared by both models below.
embedding_dim = 300   # size of each learned word vector
max_len = 200         # every mail is padded / truncated to this many tokens
max_words = 3000      # vocabulary cap given to the Tokenizer and the Embedding layer

# Fit the vocabulary on the training texts only, so nothing from the
# test split influences the word index.
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)

# Words -> integer ids, then a fixed-width matrix with max_len columns.
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences, maxlen=max_len)

In [90]:
# Per-class loss weights that compensate for the label imbalance.
# `classes` and `y` are passed by keyword: recent scikit-learn releases make
# them keyword-only, so the positional form raises a TypeError there.
class_labels = np.array([0, 1])
class_weight = dict(zip(
    class_labels.tolist(),  # plain Python ints as dict keys
    compute_class_weight(class_weight='balanced', classes=class_labels, y=Y_train.values),
))
class_weight

{0: 0.5824081981212639, 1: 3.533678756476684}

In [37]:
# Stacked-LSTM binary classifier:
# embedding -> LSTM(128, full sequence) -> LSTM(64) -> dense head -> sigmoid.
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=max_len),
    LSTM(128, return_sequences=True),  # emit every timestep so the next LSTM receives a sequence
    LSTM(64),                          # final state only
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),    # single probability for class 1
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# would append a stray "None" line to the output.
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 200, 300)          900000    
_________________________________________________________________
lstm_14 (LSTM)               (None, 200, 128)          219648    
_________________________________________________________________
lstm_15 (LSTM)               (None, 64)                49408     
_________________________________________________________________
dense_17 (Dense)             (None, 256)               16640     
_________________________________________________________________
dropout_5 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_18 (Dense)             (None, 8)                 2056      
_________________________________________________________________
dense_19 (Dense)             (None, 1)                

In [91]:
# Train the stacked-LSTM model with class weighting; 20% of the training
# matrix is held back for validation.
# With the default patience, training stops after the first epoch whose
# val_loss fails to improve by min_delta — i.e. right after a *worse* epoch.
# restore_best_weights=True rolls the model back to the best val_loss epoch
# when that happens, so the evaluation below does not use degraded weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0.0001,
    restore_best_weights=True,
)
# Keep the History object so the learning curves can be inspected later.
history_lstm = model.fit(
    sequences_matrix,
    Y_train,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
    callbacks=[early_stopping],
    class_weight=class_weight,
)

Epoch 1/10


<keras.callbacks.History at 0x19d7ba6a220>

In [92]:
# Encode the held-out mails with the tokenizer fitted on the training set,
# pad them to the same width as the training matrix, and score the model.
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(
    test_sequences,
    maxlen=max_len,
)
# Displays [loss, accuracy], matching the metrics passed to compile().
model.evaluate(x=test_sequences_matrix, y=Y_test)



[0.635083794593811, 0.589211642742157]

In [93]:
# Threshold the sigmoid outputs at 0.5 in one vectorized step and flatten the
# result to a 1-D label array (instead of a Python list of 1-element arrays).
y_prob = model.predict(test_sequences_matrix)
y_pred = (y_prob >= 0.5).astype(int).ravel()

# labels=[0, 1] pins the matrix to 2x2 (rows = true class, columns = predicted
# class), so the [1, 0] lookup below stays valid even if a class is missing.
CM = metrics.confusion_matrix(Y_test, y_pred, labels=[0, 1])
print(CM)

FN = CM[1, 0]  # true class 1 predicted as class 0
# Share of ALL test samples that are false negatives
# (not the miss rate within class 1).
print(FN / np.sum(CM))

[[240 173]
 [ 25  44]]
0.05186721991701245


# CNN + LSTM

In [43]:
# CNN + LSTM binary classifier: a 1-D convolution extracts local n-gram
# features, max-pooling halves the sequence length, and a single LSTM
# summarises the shortened sequence before the dense head.
model2 = Sequential([
    Embedding(max_words, embedding_dim, input_length=max_len),
    Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'),  # 'same' keeps the length
    MaxPooling1D(pool_size=2),
    LSTM(100),
    Dropout(0.5),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),  # single probability for class 1
])
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# would append a stray "None" line to the output.
model2.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 200, 300)          900000    
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 200, 32)           28832     
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 100, 32)           0         
_________________________________________________________________
lstm_17 (LSTM)               (None, 100)               53200     
_________________________________________________________________
dropout_7 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_22 (Dense)             (None, 8)                 808       
_________________________________________________________________
dense_23 (Dense)             (None, 1)               

In [89]:
# Per-class loss weights that compensate for the label imbalance
# (same computation as for the LSTM model; repeated so this section can be
# run on its own).
# `classes` and `y` are passed by keyword: recent scikit-learn releases make
# them keyword-only, so the positional form raises a TypeError there.
class_labels = np.array([0, 1])
class_weight = dict(zip(
    class_labels.tolist(),  # plain Python ints as dict keys
    compute_class_weight(class_weight='balanced', classes=class_labels, y=Y_train.values),
))
class_weight

{0: 0.5824081981212639, 1: 3.533678756476684}

In [94]:
# Train the CNN + LSTM model with class weighting; 20% of the training
# matrix is held back for validation.
# With the default patience, training stops after the first epoch whose
# val_loss fails to improve by min_delta — i.e. right after a *worse* epoch.
# restore_best_weights=True rolls the model back to the best val_loss epoch
# when that happens, so the evaluation below does not use degraded weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    min_delta=0.0001,
    restore_best_weights=True,
)
# Keep the History object so the learning curves can be inspected later.
history_cnn_lstm = model2.fit(
    sequences_matrix,
    Y_train,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
    callbacks=[early_stopping],
    class_weight=class_weight,
)

Epoch 1/10


<keras.callbacks.History at 0x19d276f2070>

In [95]:
# Re-encode the held-out mails with the training tokenizer and pad them to
# max_len, then score the CNN + LSTM model on that matrix.
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(
    test_sequences,
    maxlen=max_len,
)
# Displays [loss, accuracy], matching the metrics passed to compile().
model2.evaluate(x=test_sequences_matrix, y=Y_test)



[0.6755795478820801, 0.7697095274925232]

In [96]:
# Threshold the sigmoid outputs at 0.5 in one vectorized step and flatten the
# result to a 1-D label array (instead of a Python list of 1-element arrays).
y_prob = model2.predict(test_sequences_matrix)
y_pred = (y_prob >= 0.5).astype(int).ravel()

# labels=[0, 1] pins the matrix to 2x2 (rows = true class, columns = predicted
# class), so the [1, 0] lookup below stays valid even if a class is missing.
CM = metrics.confusion_matrix(Y_test, y_pred, labels=[0, 1])
print(CM)

FN = CM[1, 0]  # true class 1 predicted as class 0
# Share of ALL test samples that are false negatives
# (not the miss rate within class 1).
print(FN / np.sum(CM))

[[358  55]
 [ 56  13]]
0.11618257261410789
