In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import compute_class_weight
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Embedding, LSTM, Conv1D, MaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
import keras.backend as K
np.random.seed(42)

In [2]:
data = pd.read_json('data/mails_preprocessing.json').drop('from', axis = 1)

In [3]:
df = data[data['label']!=2]
df.head()

Unnamed: 0,label,text,_questionmark_count_,text_lem,_AJD_count_,_ADP_count_,_ADV_count_,_AUX_count_,_CCONJ_count_,_DET_count_,...,_NOUN_count_,_NUM_count_,_PRON_count_,_PROPN_count_,_PUNCT_count_,_SCONJ_count_,_SYM_count_,_VERB_count_,_X_count_,unique_words_count
0,1,acces decibel bonjour pouvez vous donner les a...,2,acce decibel bonjour pouvoir donner acces deci...,0,7,1,1,0,3,...,18,2,3,0,2,0,0,3,1,233
1,1,actes indemnités hospitalières tu sais ce que ...,2,acte indemnite hospitalier savoir acte frais r...,0,3,3,0,2,5,...,12,2,5,0,2,1,0,6,0,177
2,1,analyse des obsèques naissances appareils audi...,1,analyse obseque naissance appareil auditif bon...,0,6,3,0,2,6,...,23,1,8,0,4,3,0,8,0,274
3,1,ano ihm bonjour j ai un multivalue filtres eta...,1,ano ihm bonjour multivalu filtre etablissement...,0,2,1,0,1,2,...,11,1,1,0,1,0,0,2,3,172
4,1,ano alimentation réseau sur dcb bonjour je m i...,1,ano alimentation reseau dob bonjour metre inte...,0,38,6,6,2,31,...,56,3,14,1,10,1,0,20,0,659


# CNN + LSTM on word sequences

![Image](http://colah.github.io/posts/2015-08-Understanding-LSTMs/img/LSTM3-chain.png)

In [4]:
def custom_f1(y_true, y_pred):    
    def recall_m(y_true, y_pred):
        TP = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        Positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        
        recall = TP / (Positives+K.epsilon())    
        return recall 
    
    
    def precision_m(y_true, y_pred):
        TP = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        Pred_Positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    
        precision = TP / (Pred_Positives+K.epsilon())
        return precision 
    
    precision, recall = precision_m(y_true, y_pred), recall_m(y_true, y_pred)
    
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [5]:
X_train, X_test, y_train, y_test = train_test_split(df.text_lem, df.label, test_size=0.15, stratify=df.label, random_state = 42)
X_train, X_val, y_train, y_val = train_test_split(X_train.values, y_train.values, test_size=0.15, stratify=y_train.values, random_state = 42)

In [6]:
max_words = 3000
max_len = 200
embedding_dim = 400
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)
sequences = tok.texts_to_sequences(X_train)
sequences_matrix_tfidf = tok.sequences_to_matrix(sequences, mode = 'tfidf')
sequences_matrix = sequence.pad_sequences(sequences, maxlen=max_len)

In [7]:
class_weight = compute_class_weight('balanced', classes = [0,1], y = y_train)
class_weight = {0:class_weight[0], 1:class_weight[1]}
class_weight

{0: 4.535211267605634, 1: 0.5619546247818499}

In [19]:
model = Sequential()
model.add(Embedding(max_words,embedding_dim, input_length=max_len))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(100))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[custom_f1])
print(model.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 200, 400)          1200000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 200, 32)           38432     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 100, 32)           0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 101       
Total params: 1,291,733
Trainable params: 1,291,733
Non-trainable params: 0
____________________________________________

In [20]:
class_weight = compute_class_weight('balanced', classes = [0,1], y = y_train)
class_weight = {0:class_weight[0], 1:class_weight[1]}
class_weight

{0: 4.535211267605634, 1: 0.5619546247818499}

In [21]:
model.fit(sequences_matrix,y_train,batch_size=256,epochs=10,
          validation_split=0.2,callbacks=[EarlyStopping(monitor='val_custom_f1',min_delta=0.00001)], class_weight=class_weight,
          workers = 6, use_multiprocessing = True)

Epoch 1/10


<keras.callbacks.History at 0x161858ea400>

In [22]:
test_sequences = tok.texts_to_sequences(X_val)
test_sequences_matrix = sequence.pad_sequences(test_sequences, maxlen=max_len)

In [23]:
model.evaluate(test_sequences_matrix, y_val)



[0.2999166250228882, 0.9694898128509521]

In [25]:
y_pred = model.predict(test_sequences_matrix)
y_pred = [1*(pred>=0.5) for pred in y_pred]
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.63      0.71        75
           1       0.96      0.98      0.97       607

    accuracy                           0.94       682
   macro avg       0.89      0.81      0.84       682
weighted avg       0.94      0.94      0.94       682

