In [1]:
import os
from collections import Counter

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import ShuffleSplit, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.utils import compute_class_weight
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical
import keras.backend as K
np.random.seed(42)

In [2]:
# Load the pre-computed feature table and preview its first rows.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
data = pd.read_pickle('data/mails_embedded_doc2vec_bigrams.pkl')
data.head()

Unnamed: 0,_questionmark_count_,_AJD_count_,_ADP_count_,_ADV_count_,_AUX_count_,_CCONJ_count_,_DET_count_,_INTJ_count_,_NOUN_count_,_NUM_count_,...,491,492,493,494,495,496,497,498,499,_label_
0,2,0,7,1,1,0,3,0,18,2,...,7.552959,-7.75819,-17.432704,7.485336,-6.11666,5.172562,7.584496,5.250462,-1.320692,1
1,2,0,3,3,0,2,5,0,12,2,...,-7.110094,-5.106038,-6.896137,-6.444591,4.211719,10.020246,-0.814034,-3.377685,1.531344,1
2,1,0,6,3,0,2,6,0,23,1,...,-12.163507,5.605024,-18.668275,-20.377143,-3.610264,0.414532,-11.114808,-3.477304,-4.977881,1
3,1,0,2,1,0,1,2,0,11,1,...,2.105754,-1.29743,-9.423036,-1.073635,8.888151,6.058524,4.917495,7.490066,-1.712665,1
4,1,0,38,6,6,2,31,0,56,3,...,-19.045707,-35.856213,-24.074799,-23.920559,16.694527,10.680893,-16.323111,40.139483,-10.349179,1


In [3]:
# no subsampling: keep every row except those labelled 2
df = data.loc[data['_label_'] != 2]

In [4]:
# Single shuffled 70/30 hold-out split with a fixed seed.
shuffler = ShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
train_indexes, test_indexes = next(shuffler.split(df.index))

# Slice each partition once, then separate the target column from the predictors.
train_part = df.iloc[train_indexes]
test_part = df.iloc[test_indexes]
y_train, y_test = train_part['_label_'], test_part['_label_']
df_train = train_part.drop(columns='_label_')
df_test = test_part.drop(columns='_label_')
print(Counter(y_train))

# Positional column layout: the first 18 columns are the meta-data,
# everything after them the word embeddings.
df_features_train, df_embeddings_train = df_train.iloc[:, :18], df_train.iloc[:, 18:]
df_features_test, df_embeddings_test = df_test.iloc[:, :18], df_test.iloc[:, 18:]

Counter({1: 3324, 0: 420})


In [5]:
# "Balanced" class weights computed from the training labels.
# `classes` is passed as an ndarray: compute_class_weight documents it as one,
# and recent scikit-learn releases validate the parameter and reject a list.
# NOTE(review): none of the fit calls visible in this notebook pass
# `class_weight`, so this dictionary is currently informational only.
class_labels = np.array([0, 1])
balanced_weights = compute_class_weight('balanced', classes=class_labels, y=y_train)
# Plain int keys / float values keep the displayed dict readable.
class_weight = {int(label): float(weight) for label, weight in zip(class_labels, balanced_weights)}
class_weight

{0: 4.457142857142857, 1: 0.5631768953068592}

In [6]:
# Standardise meta-data and embeddings independently; each scaler is fitted on
# the training partition only and then re-used for the test partition.
features_scaler = StandardScaler()
embedding_scaler = StandardScaler()
X_features_train = features_scaler.fit_transform(df_features_train)
X_embeddings_train = embedding_scaler.fit_transform(df_embeddings_train)
X_features_test = features_scaler.transform(df_features_test)
X_embeddings_test = embedding_scaler.transform(df_embeddings_test)

# Insert a length-1 middle axis: (samples, n) -> (samples, 1, n).
X_features_train = X_features_train[:, np.newaxis, :]
X_features_test = X_features_test[:, np.newaxis, :]
X_embeddings_train = X_embeddings_train[:, np.newaxis, :]
X_embeddings_test = X_embeddings_test[:, np.newaxis, :]

In [7]:
def custom_f1(y_true, y_pred):
    """F1 score written with Keras backend ops, usable as a ``metrics`` entry.

    Each tensor is clipped to [0, 1] and rounded before counting, and the
    counts are summed over every element of the tensors passed in.

    Args:
        y_true: ground-truth tensor.
        y_pred: predicted tensor, multiplied element-wise with ``y_true``.

    Returns:
        ``2 * P * R / (P + R + eps)`` where ``P`` is precision, ``R`` is recall
        and ``eps`` (``K.epsilon()``) guards every division against a zero
        denominator.
    """
    eps = K.epsilon()

    # The true-positive count is shared by precision and recall.
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision = true_positives / (predicted_positives + eps)
    recall = true_positives / (actual_positives + eps)

    return 2 * ((precision * recall) / (precision + recall + eps))

# LSTM

In [19]:
# Two-input network: the embedding branch runs through two stacked LSTMs and a
# dense layer with dropout, then is concatenated with the meta-data input
# before the sigmoid classifier head.
text_in = keras.Input(shape=(1, 500))
meta_in = keras.Input(shape=(1, 18))

lstm1 = layers.LSTM(units=128, return_sequences=True)(text_in)
lstm2 = layers.LSTM(units=64, return_sequences=True)(lstm1)
dense1 = layers.Dense(units=256, activation='relu')(lstm2)
drop = layers.Dropout(rate=0.5)(dense1)

merged = layers.concatenate([drop, meta_in])
dense2 = layers.Dense(units=8, activation='relu')(merged)
text_class = layers.Dense(units=1, activation='sigmoid')(dense2)

model1 = keras.Model(inputs=[text_in, meta_in], outputs=text_class)
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=[custom_f1])

In [20]:
# Re-seed NumPy right before training.
np.random.seed(42)
# `workers` / `use_multiprocessing` are intentionally not passed: they only
# apply to generator or keras.utils.Sequence inputs (the inputs here are
# in-memory arrays), and newer Keras releases no longer accept them in fit().
# validation_split=0.2 holds out a fraction of the training data for validation.
model1.fit([X_embeddings_train, X_features_train], y_train, batch_size=16,
    epochs=200, validation_split=0.2, verbose=0)

<keras.callbacks.History at 0x2642660aa90>

In [15]:
# Threshold the sigmoid outputs at 0.5 in one vectorised step, then drop the
# two trailing axes (indexed with [:, 0, 0]) to obtain one label per sample.
y_prob = model1.predict([X_embeddings_test, X_features_test])
y_pred = (y_prob >= 0.5).astype(int)[:, 0, 0]

CM = confusion_matrix(y_test, y_pred)
print(CM)
print(classification_report(y_test, y_pred))

[[ 140   30]
 [  21 1414]]
              precision    recall  f1-score   support

           0       0.87      0.82      0.85       170
           1       0.98      0.99      0.98      1435

    accuracy                           0.97      1605
   macro avg       0.92      0.90      0.91      1605
weighted avg       0.97      0.97      0.97      1605



# LSTM + CNN

In [35]:
# CNN + LSTM variant: a Conv1D / max-pooling front end feeds the same stacked
# LSTM branch; the result is concatenated with the meta-data input and passed
# through a two-unit softmax head.
text_in = keras.Input(shape=(1, 500))
meta_in = keras.Input(shape=(1, 18))

# No `input_shape=` on Conv1D: in the functional API the shape comes from
# `text_in`, and newer Keras warns when the argument is passed to a layer.
conv = layers.Conv1D(filters=32, kernel_size=3, padding='same', activation='relu')(text_in)
max_pool = layers.MaxPooling1D(pool_size=2, padding='same')(conv)
lstm1 = layers.LSTM(128, return_sequences=True)(max_pool)
lstm2 = layers.LSTM(64, return_sequences=True)(lstm1)
dense0 = layers.Dense(256, activation='relu')(lstm2)
drop = layers.Dropout(0.5)(dense0)

merged = layers.concatenate([drop, meta_in])
dense1 = layers.Dense(64, activation='relu')(merged)
dense2 = layers.Dense(8, activation='relu')(dense1)
text_class = layers.Dense(2, activation='softmax')(dense2)
model2 = keras.Model([text_in, meta_in], text_class)

# NOTE(review): custom_f1 sums its counts over every element it receives, so
# with two-column one-hot targets it pools both class columns rather than
# scoring a single class; confirm this is the metric intended here.
model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[custom_f1])

In [36]:
# One-hot encode the training labels, then insert a length-1 middle axis so the
# targets have the same rank as the 3-D inputs fed to model2.
# (`to_categorical` is imported in the notebook's top import cell.)
y_train_probas = to_categorical(y_train)
y_train_probas = y_train_probas[:, np.newaxis, :]

In [37]:
# `workers` / `use_multiprocessing` are intentionally not passed: they only
# apply to generator or keras.utils.Sequence inputs (the inputs here are
# in-memory arrays), and newer Keras releases no longer accept them in fit().
# validation_split=0.2 holds out a fraction of the training data for validation.
model2.fit([X_embeddings_train, X_features_train], y_train_probas, batch_size=16,
    epochs=200, validation_split=0.2, verbose=0)

<keras.callbacks.History at 0x1b12f0a6c40>

In [57]:
y_prob = model2.predict([X_embeddings_test, X_features_test])
# argmax over the last (class) axis picks the predicted label. The [:, 0]
# slice is assigned back (it used to be a bare expression whose result was
# thrown away) so the metrics below receive a flat, one-label-per-sample vector.
y_pred = y_prob.argmax(axis=-1)[:, 0]

CM = confusion_matrix(y_test, y_pred)
print(CM)
print(classification_report(y_test, y_pred))

[[ 146   24]
 [  16 1419]]
              precision    recall  f1-score   support

           0       0.90      0.86      0.88       170
           1       0.98      0.99      0.99      1435

    accuracy                           0.98      1605
   macro avg       0.94      0.92      0.93      1605
weighted avg       0.97      0.98      0.97      1605



In [60]:
# Persist both fitted scalers together with the CNN+LSTM model.
# (`joblib` and `os` are imported in the notebook's top import cell.)
# Create the target directory first so a fresh checkout does not fail on a
# missing folder.
os.makedirs('models', exist_ok=True)
joblib.dump(features_scaler, 'models/features_scaler')
joblib.dump(embedding_scaler, 'models/embedding_scaler')
# model1.save('models/LSTM_518features_input_emb_meta')
# NOTE(review): model2 was compiled with the custom metric `custom_f1`;
# reloading it will presumably need `custom_objects` or `compile=False` — confirm.
model2.save('models/CNN_LSTM_518features_input_emb_meta')



INFO:tensorflow:Assets written to: models/CNN_LSTM_518features_input_emb_meta\assets


INFO:tensorflow:Assets written to: models/CNN_LSTM_518features_input_emb_meta\assets
