# Bigrams + Word embedding + RNN

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import compute_class_weight
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, LSTM, Conv1D, MaxPooling1D
from keras.callbacks import EarlyStopping
from sklearn.metrics import confusion_matrix, classification_report, f1_score
import keras.backend as K
np.random.seed(42)

In [2]:
# Read the pickled feature table; per the preview below it holds count columns,
# numbered columns (presumably the doc2vec embedding - confirm) and a `_label_` column.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
data = pd.read_pickle('data/mails_embedded_doc2vec_bigrams.pkl')
data.head(n=5)

Unnamed: 0,_questionmark_count_,_AJD_count_,_ADP_count_,_ADV_count_,_AUX_count_,_CCONJ_count_,_DET_count_,_INTJ_count_,_NOUN_count_,_NUM_count_,...,491,492,493,494,495,496,497,498,499,_label_
0,2,0,7,1,1,0,3,0,18,2,...,7.552959,-7.75819,-17.432704,7.485336,-6.11666,5.172562,7.584496,5.250462,-1.320692,1
1,2,0,3,3,0,2,5,0,12,2,...,-7.110094,-5.106038,-6.896137,-6.444591,4.211719,10.020246,-0.814034,-3.377685,1.531344,1
2,1,0,6,3,0,2,6,0,23,1,...,-12.163507,5.605024,-18.668275,-20.377143,-3.610264,0.414532,-11.114808,-3.477304,-4.977881,1
3,1,0,2,1,0,1,2,0,11,1,...,2.105754,-1.29743,-9.423036,-1.073635,8.888151,6.058524,4.917495,7.490066,-1.712665,1
4,1,0,38,6,6,2,31,0,56,3,...,-19.045707,-35.856213,-24.074799,-23.920559,16.694527,10.680893,-16.323111,40.139483,-10.349179,1


In [3]:
def custom_f1(y_true, y_pred):
    """F1 score computed with Keras backend ops, for use in `metrics=[...]`.

    Every count is obtained by clipping a tensor into [0, 1], rounding it and
    summing all of its entries, so the value is computed over whatever tensors
    Keras passes in.

    Args:
        y_true: tensor of ground-truth labels.
        y_pred: tensor of predicted scores.

    Returns:
        Scalar tensor 2 * P * R / (P + R + eps), where P and R are the
        precision and recall derived from the rounded counts and eps is
        `K.epsilon()` (also added to both denominators to avoid division by zero).
    """
    def _rounded_count(tensor):
        # Clip into [0, 1] and round so each entry contributes either 0 or 1.
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    true_positives = _rounded_count(y_true * y_pred)
    actual_positives = _rounded_count(y_true)
    predicted_positives = _rounded_count(y_pred)

    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (actual_positives + K.epsilon())

    return 2 * ((precision * recall) / (precision + recall + K.epsilon()))

In [7]:
# Rows with `_label_ == 2` are set aside as X_unlabeled; only the remaining rows
# are used for the supervised splits below.
X_unlabeled = data[data._label_==2].drop('_label_', axis = 1)
labeled = data[data['_label_'] != 2]
X, y = labeled.drop('_label_', axis = 1), labeled._label_

# Two successive 85/15 splits: first carve out the test set, then the validation set.
X, X_test, y, y_test = train_test_split(X.values, y.values, test_size=0.15, random_state = 42)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state = 42)

# Fit the scaler on the training split only and reuse its statistics elsewhere.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
# Fix: X_test used to be built from X_val, which replaced the test features with
# a copy of the validation set and left them inconsistent with y_test.
X_test = scaler.transform(X_test)

# Insert a length-1 middle axis: (samples, features) -> (samples, 1, features),
# matching the `input_shape = (1, 518)` used by the models in this notebook.
X_train = np.reshape(X_train, (X_train.shape[0], 1, X_train.shape[1]))
X_val = np.reshape(X_val, (X_val.shape[0], 1, X_val.shape[1]))
X_test = np.reshape(X_test, (X_test.shape[0], 1, X_test.shape[1]))

In [8]:
# 'balanced' class weights for the training labels, stored as {class_label: weight}
# so they can be passed to `fit(..., class_weight=...)`.
class_weight = dict(enumerate(
    compute_class_weight('balanced', classes = [0,1], y = y_train)
))
class_weight

{0: 4.3710407239819, 1: 0.5645821157218002}

## LSTM

In [9]:
# Stacked-LSTM binary classifier. Both LSTM layers use return_sequences=True, so the
# length-1 middle axis is kept and the sigmoid output has shape (batch, 1, 1).
model1 = Sequential([
    LSTM(128, input_shape=(1, 518), return_sequences=True),
    LSTM(64, return_sequences=True),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=[custom_f1])
print(model1.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 1, 128)            331264    
_________________________________________________________________
lstm_1 (LSTM)                (None, 1, 64)             49408     
_________________________________________________________________
dense (Dense)                (None, 1, 256)            16640     
_________________________________________________________________
dropout (Dropout)            (None, 1, 256)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 1, 8)              2056      
_________________________________________________________________
dense_2 (Dense)              (None, 1, 1)              9         
Total params: 399,377
Trainable params: 399,377
Non-trainable params: 0
__________________________________________________

In [10]:
# Train with class weighting and early stopping on the validation F1 (20% of the
# training data is held out via validation_split).
# F1 is a "higher is better" score, so the direction is given explicitly: the
# callback's automatic mode does not recognise the custom metric name and would
# otherwise treat a *decreasing* F1 as the improvement to wait for.
model1.fit(X_train,y_train,batch_size=16,epochs=10,
          validation_split=0.2,
          callbacks=[EarlyStopping(monitor='val_custom_f1',mode='max',min_delta=0.000001)],
          class_weight=class_weight,
          workers = 6, use_multiprocessing = True)

Epoch 1/10


<keras.callbacks.History at 0x186a954b5b0>

In [11]:
# Score the validation split; model1 returns one sigmoid value per sample with two
# trailing singleton axes, which are dropped before thresholding at 0.5.
y_prob = model1.predict(X_val)
y_pred = (y_prob[:, 0, 0] >= 0.5).astype(int)

CM = confusion_matrix(y_val, y_pred)
print(CM)
print(classification_report(y_val, y_pred))

[[ 54  10]
 [ 27 591]]
              precision    recall  f1-score   support

           0       0.67      0.84      0.74        64
           1       0.98      0.96      0.97       618

    accuracy                           0.95       682
   macro avg       0.83      0.90      0.86       682
weighted avg       0.95      0.95      0.95       682



## CNN + LSTM

In [12]:
model3 = Sequential()
model3.add(Conv1D(filters=32, kernel_size=3, input_shape = (1, 518), padding='same', activation='relu'))
model3.add(MaxPooling1D(pool_size=2, padding = 'same'))
model3.add(LSTM(100))
model3.add(Dropout(0.5))
model3.add(Dense(8,activation='relu'))
model3.add(Dense(1, activation='sigmoid'))
model3.compile(loss='binary_crossentropy', optimizer='adam', metrics=[custom_f1])
print(model3.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d (Conv1D)              (None, 1, 32)             49760     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 1, 32)             0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 8)                 808       
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 9         
Total params: 103,777
Trainable params: 103,777
Non-trainable params: 0
________________________________________________

In [13]:
# Recompute the 'balanced' class weights for y_train as a {class_label: weight}
# mapping, ready to be handed to `fit(..., class_weight=...)`.
class_weight = dict(enumerate(
    compute_class_weight('balanced', classes = [0,1], y = y_train)
))
class_weight

{0: 4.3710407239819, 1: 0.5645821157218002}

In [14]:
# Class-weighted training of the CNN+LSTM model; stops early based on the
# validation loss (20% of the training data is held out via validation_split).
model3.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
    callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001)],
    class_weight=class_weight,
    workers=6,
    use_multiprocessing=True,
)

Epoch 1/10


<keras.callbacks.History at 0x186b623c190>

In [15]:
# Score the validation split; model3 returns shape (samples, 1), so the single
# output column is taken and thresholded at 0.5.
y_pred = (model3.predict(X_val)[:, 0] >= 0.5).astype(int)

CM = confusion_matrix(y_val, y_pred)
print(CM)
print(classification_report(y_val, y_pred))

[[ 52  12]
 [ 37 581]]
              precision    recall  f1-score   support

           0       0.58      0.81      0.68        64
           1       0.98      0.94      0.96       618

    accuracy                           0.93       682
   macro avg       0.78      0.88      0.82       682
weighted avg       0.94      0.93      0.93       682



# Subsampling

In [16]:
# Build a balanced data set: 590 randomly drawn rows from each of the classes 0 and 1.
x1 = data[data._label_==0].sample(590, random_state = 42)
y1 = x1._label_
x1 = x1.drop('_label_', axis = 1)
x2 = data[data._label_==1].sample(590, random_state = 42)
y2 = x2._label_
x2 = x2.drop('_label_', axis = 1)
Xt = np.concatenate([x1,x2])
yt = np.concatenate([y1, y2])

# Two successive 85/15 splits: test set first, then validation set.
Xt_train, Xt_test, yt_train, yt_test = train_test_split(Xt, yt, test_size=0.15, random_state=42)
Xt_train, Xt_val,yt_train, yt_val = train_test_split(Xt_train, yt_train, test_size=0.15, random_state=42)

# Fit the scaler on the training split only and reuse its statistics elsewhere.
scaler = StandardScaler()
Xt_train = scaler.fit_transform(Xt_train)
Xt_val = scaler.transform(Xt_val)
# Fix: Xt_test used to be built from Xt_val, which replaced the test features with
# a copy of the validation set and left them inconsistent with yt_test.
Xt_test = scaler.transform(Xt_test)

# Insert a length-1 middle axis: (samples, features) -> (samples, 1, features),
# matching the `input_shape = (1, 518)` used by the models in this notebook.
Xt_train = np.reshape(Xt_train, (Xt_train.shape[0], 1, Xt_train.shape[1]))
Xt_val = np.reshape(Xt_val, (Xt_val.shape[0], 1, Xt_val.shape[1]))
Xt_test = np.reshape(Xt_test, (Xt_test.shape[0], 1, Xt_test.shape[1]))

In [17]:
# Fresh stacked-LSTM classifier for the balanced data. Both LSTM layers use
# return_sequences=True, so the sigmoid output has shape (batch, 1, 1).
model1 = Sequential([
    LSTM(128, input_shape=(1, 518), return_sequences=True),
    LSTM(64, return_sequences=True),
    Dense(256, activation='relu'),
    Dropout(0.5),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=[custom_f1])
print(model1.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_3 (LSTM)                (None, 1, 128)            331264    
_________________________________________________________________
lstm_4 (LSTM)                (None, 1, 64)             49408     
_________________________________________________________________
dense_5 (Dense)              (None, 1, 256)            16640     
_________________________________________________________________
dropout_2 (Dropout)          (None, 1, 256)            0         
_________________________________________________________________
dense_6 (Dense)              (None, 1, 8)              2056      
_________________________________________________________________
dense_7 (Dense)              (None, 1, 1)              9         
Total params: 399,377
Trainable params: 399,377
Non-trainable params: 0
________________________________________________

In [18]:
# Train on the balanced subsample for the full 100 epochs (no early stopping and
# no class weights here); 20% of the training data is held out for validation.
model1.fit(
    Xt_train,
    yt_train,
    batch_size=16,
    epochs=100,
    validation_split=0.2,
    verbose=0,
    workers=6,
    use_multiprocessing=True,
)

<keras.callbacks.History at 0x186b61c9520>

In [19]:
# Score the balanced validation split; the two trailing singleton axes of the
# model output are dropped before thresholding at 0.5.
y_prob = model1.predict(Xt_val)
y_pred = (y_prob[:, 0, 0] >= 0.5).astype(int)

CM = confusion_matrix(yt_val, y_pred)
print(CM)
print(classification_report(yt_val, y_pred))

[[71  8]
 [ 6 66]]
              precision    recall  f1-score   support

           0       0.92      0.90      0.91        79
           1       0.89      0.92      0.90        72

    accuracy                           0.91       151
   macro avg       0.91      0.91      0.91       151
weighted avg       0.91      0.91      0.91       151



In [20]:
# Fresh Conv1D -> max-pool -> LSTM classifier for the balanced data; the sigmoid
# output has shape (batch, 1).
model3 = Sequential([
    Conv1D(filters=32, kernel_size=3, input_shape=(1, 518), padding='same', activation='relu'),
    MaxPooling1D(pool_size=2, padding='same'),
    LSTM(100),
    Dropout(0.5),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model3.compile(optimizer='adam', loss='binary_crossentropy', metrics=[custom_f1])
print(model3.summary())

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 1, 32)             49760     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 1, 32)             0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 100)               53200     
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 8)                 808       
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 9         
Total params: 103,777
Trainable params: 103,777
Non-trainable params: 0
________________________________________________

In [21]:
# Train the CNN+LSTM model on the balanced subsample for the full 100 epochs;
# 20% of the training data is held out for validation.
model3.fit(
    Xt_train,
    yt_train,
    batch_size=16,
    epochs=100,
    validation_split=0.2,
    verbose=0,
    workers=6,
    use_multiprocessing=True,
)

<keras.callbacks.History at 0x186baf85af0>

In [22]:
# Score the balanced validation split; the single output column is taken and
# thresholded at 0.5.
y_pred = (model3.predict(Xt_val)[:, 0] >= 0.5).astype(int)

CM = confusion_matrix(yt_val, y_pred)
print(CM)
print(classification_report(yt_val, y_pred))

[[70  9]
 [ 5 67]]
              precision    recall  f1-score   support

           0       0.93      0.89      0.91        79
           1       0.88      0.93      0.91        72

    accuracy                           0.91       151
   macro avg       0.91      0.91      0.91       151
weighted avg       0.91      0.91      0.91       151



In [23]:
def create_model():
    """Build and compile a new Conv1D -> max-pool -> LSTM binary classifier.

    The network takes inputs of shape (1, 518) and ends in a single sigmoid
    unit. It is compiled with binary cross-entropy, the Adam optimizer, and
    the 'accuracy' and `custom_f1` metrics.

    Returns:
        The compiled, untrained `Sequential` model.
    """
    layers = [
        Conv1D(filters=32, kernel_size=3, input_shape=(1, 518), padding='same', activation='relu'),
        MaxPooling1D(pool_size=2, padding='same'),
        LSTM(100),
        Dropout(0.5),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layers)
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', custom_f1])
    return network

In [25]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Wrap the Keras model factory so scikit-learn can clone and refit it per fold.
nn_model = KerasClassifier(build_fn=create_model, epochs = 15, batch_size = 32, verbose = 0)

# Seed the shuffled folds: without random_state the fold assignment changes on
# every run, so the mean F1 shown below could not be reproduced.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
# NOTE(review): Xt_train was standardised with a scaler fitted on all of Xt_train,
# so each validation fold has influenced the scaling statistics - confirm this is acceptable.
results = cross_val_score(nn_model, Xt_train, yt_train, cv=kfold, scoring = 'f1', verbose = 0)

results.mean()

0.9165855901955509