In [2]:
import numpy as np
import pandas as pd
import pickle
from matplotlib import pyplot as plt
import seaborn as sns

import string
from urllib.parse import urlparse
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras import preprocessing as tfkp
from tensorflow.keras.utils import to_categorical

import gensim.downloader as api
from imblearn.over_sampling import RandomOverSampler

# If running on Google Colab, put the DL_models module files in the root directory
from DL_models import lstm_keras, cnn_keras, blstm, blstm_att

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Mount Google Drive into the Colab filesystem; the dataset and the output
# locations used later in this notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
def indiv_eval(model, X_test, y_test, target_names=['Normal', 'Cyberbullying']):
    """Report test-set metrics for one model and return its predicted labels.

    Prints a classification report, shows a row-normalised confusion-matrix
    heatmap and returns the predicted class indices.

    Args:
        model: object exposing ``predict(X_test)``; every row of its output is
            reduced to a class index with ``np.argmax``.
        X_test: inputs handed to ``model.predict``.
        y_test: true labels the predictions are compared against.
        target_names: tick labels for both heatmap axes (the printed
            classification report does not use them).

    Returns:
        list: one predicted class index per row of the prediction output.
    """
    class_scores = model.predict(X_test)

    # Pick the highest-scoring class for each sample.
    preds = []
    for row_scores in class_scores:
        preds.append(np.argmax(row_scores))

    report = classification_report(y_test, preds)
    print(report)

    plt.figure()
    # normalize='true' scales every row by the number of true samples of that class.
    normalised_cm = confusion_matrix(y_test, preds, normalize='true')
    sns.heatmap(
        normalised_cm,
        fmt='.2%',
        annot=True,
        linewidths=.5,
        cmap='YlOrRd',
        annot_kws={"fontsize": 10},
        yticklabels=target_names,
        xticklabels=target_names,
    )
    plt.xticks(rotation=45, ha='right')

    plt.show()

    return preds

In [5]:
from mlxtend.evaluate import cochrans_q
from mlxtend.evaluate import mcnemar
from mlxtend.evaluate import mcnemar_tables
from prettytable import PrettyTable

def model_compare(key, model_list):
    """Translate a pairwise key such as ``'model_0 vs model_3'`` into two entries of ``model_list``.

    Args:
        key: string of the form ``'<label>_<i> vs <label>_<j>'``; the integer
            after the last underscore on each side is used as a list position.
        model_list: sequence indexed by those positions.

    Returns:
        tuple: ``(model_list[i], model_list[j])``.
    """
    sides = key.split(' vs ')
    # The position is whatever follows the final underscore on each side.
    positions = [int(sides[side].rsplit('_', 1)[-1]) for side in (0, 1)]

    return model_list[positions[0]], model_list[positions[1]]

def multiple_mcnemar(model_name_list, preds_array, y, sig_level):
    """Run pairwise McNemar tests between models and print a summary table.

    Each model's predictions are converted into a 0/1 "was this sample
    classified correctly" vector. These vectors are passed to
    ``mcnemar_tables`` together with an all-ones target, so the resulting
    2x2 tables count where each pair of models agrees or disagrees in
    correctness. Every table is then tested with the continuity-corrected
    McNemar test.

    Args:
        model_name_list: display names, positionally aligned with ``preds_array``.
        preds_array: sequence of per-model predicted labels, each with the
            same length as ``y``.
        y: true labels (list, NumPy array or pandas Series).
        sig_level: significance threshold; "Reject?" is True when p < sig_level.

    Returns:
        None. The table is printed.
    """
    # Compare as plain NumPy arrays: the comparison is then strictly
    # positional (no pandas index involved) and it also works when predictions
    # and labels arrive as Python lists, where ``list == list`` would produce
    # a single bool that has no ``.astype``.
    y_arr = np.asarray(y)
    converted_pred_array = [
        (np.asarray(model_preds) == y_arr).astype(int) for model_preds in preds_array
    ]

    # creates the 'correct' value for use in the McNemar table: a correctness
    # vector equals 1 wherever the model was right, so the target is all ones
    y_true = np.ones(len(y_arr), dtype=int)

    # An omnibus Cochran's Q test (``cochrans_q(y_true, *converted_pred_array)``)
    # could precede the pairwise tests; it is not run here.

    mctable = mcnemar_tables(y_true, *converted_pred_array)

    pairwise_table = PrettyTable()
    pairwise_table.field_names = ['Model 1', 'Model 2', 'Chi²', 'p-Value', 'Reject?']
    # Column formats are table-level settings, so set them once instead of on every row.
    pairwise_table.float_format["Chi²"] = '.2E'
    pairwise_table.float_format["p-Value"] = '.2E'

    for key, value in mctable.items():
        chi2, p = mcnemar(ary=value, corrected=True)
        # key identifies the two compared models by position; map it back to names
        first_model, second_model = model_compare(key, model_name_list)
        reject_null = p < sig_level

        pairwise_table.add_row([first_model, second_model, chi2, p, reject_null])

    print(pairwise_table)

In [6]:
# embedding stuff
def map_weights(embed_dict, vocab, embed_size):
    """Build an embedding matrix whose rows line up with a vocabulary's indices.

    Args:
        embed_dict: mapping from word to its pretrained vector; must raise
            ``KeyError`` for unknown words (a dict does; so do gensim
            KeyedVectors).
        vocab: mapping from word to integer row index.
        embed_size: embedding dimensionality, i.e. the length of every vector.

    Returns:
        tuple: ``(weights, words_missed)`` where ``weights`` has shape
        ``(len(vocab) + 1, embed_size)`` and ``words_missed`` lists the
        vocabulary words without a pretrained vector (their rows stay zero).

    Raises:
        ValueError: if a pretrained vector does not have length ``embed_size``.
            (Previously a bare ``except`` silently counted this as a miss.)
    """
    # One extra row: indices are used as-is, so the matrix must reach index len(vocab).
    # Presumably row 0 is the padding index of the tokenizer - confirm with caller.
    vocab_size = len(vocab) + 1
    weights = np.zeros((vocab_size, embed_size))

    words_missed = []
    for word, row in vocab.items():
        try:
            weights[row] = embed_dict[word]
        except KeyError:
            # No pretrained vector for this word: leave its row at zero.
            words_missed.append(word)
    print(f"{len(words_missed)} embeddings missed of {vocab_size}")
    return weights, words_missed

### Loading Data

In [9]:
# Load the pre-processed Wikipedia comments: 'comment' holds the text,
# 'attack' the label.
comments = pd.read_csv('/content/drive/MyDrive/Uni/Experiment Design/Ass2/preprocessing/WikiPages/wikipedia_preprocessed.csv')

x_text = list(comments['comment'])

# Encode the labels as integer codes. sort=True ties the codes to the sorted
# label values rather than to their order of first appearance, so the encoding
# no longer depends on which class happens to come first in the CSV.
# An ndarray is passed because factorizing a plain list is deprecated in
# recent pandas versions.
labels, uniques = pd.factorize(comments['attack'].to_numpy(), sort=True)

### Initial Twitter-Specific Pre-processing

In [10]:
import re
comments = pd.DataFrame({'comment': x_text, 'attack': labels})

# drop nulls (.copy() so the column assignments below write to an independent
# frame instead of a filtered view, avoiding SettingWithCopyWarning)
comments = comments[comments['comment'].notna()].copy()

# lower case
comments['comment'] = comments['comment'].str.lower()

# remove stop words
# The stop-word list is built once and stored as a set: evaluating
# stopwords.words('english') inside the lambda rebuilt the list for every
# single word, and list membership is a linear scan.
english_stopwords = set(stopwords.words('english'))
comments['comment'] = comments['comment'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in english_stopwords)
)

# tokenize (disabled)
#comments['comment'] = comments.apply(lambda row: nltk.word_tokenize(row['comment']), axis=1)

# remove punctuation (disabled)
#comments['comment'] = [list(filter(lambda x: x not in string.punctuation, sentence)) for sentence in comments['comment']]

In [11]:
# Preview the first rows of the cleaned frame.
comments.head()

Unnamed: 0,comment,attack
0,creative dictionary definitions terms insuranc...,0
1,term standard model less npov think wed prefer...,0
2,true false situation march 2002 saudi proposal...,0
3,next maybe could work less condescending sugge...,0
4,page need disambiguation,0


### Train-Test Split and Tokenization

In [12]:
# Keep 80% of the comments for training. The remaining 20% ("midway") is
# divided again further down into the test and validation sets.
X_train_pretoken, X_midway_pretoken, y_train, y_midway = train_test_split(
    comments['comment'],
    comments['attack'],
    test_size=0.2,
    random_state=42,
)

In [13]:
# Length of every comment in whitespace-separated words.
comments['number_of_words'] = [len(text.split()) for text in comments['comment']]

In [18]:
# 95th percentile of the comment length in words; used as the reference for
# the padded sequence length chosen in the next cell.
comments['number_of_words'].quantile(0.95)

107.0

In [19]:
# Sequence length every comment is padded/truncated to. Set manually to a
# round number near the 95th-percentile comment length (107 words), i.e.
# slightly below that percentile.
dim = 100
n_classes = len(np.unique(y_train.values))

# The filter string contains no '#', so that character is kept inside tokens.
tokenizer = tfkp.text.Tokenizer(oov_token="<UNK>", filters='!"$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',)
# The vocabulary is fitted on the training texts only.
tokenizer.fit_on_texts(X_train_pretoken)


def convert(texts):
    """Turn raw texts into integer sequences of length ``dim``.

    Sequences shorter than ``dim`` are padded at the end; longer ones are cut
    at the end.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    return tfkp.sequence.pad_sequences(sequences,
                                       maxlen=dim,
                                       padding='post', truncating='post')


X_train = convert(X_train_pretoken)
X_midway = convert(X_midway_pretoken)

In [20]:
# Halve the held-out portion: one half becomes the test set, the other the
# validation set.
X_test, X_val, y_test, y_val = train_test_split(
    X_midway,
    y_midway,
    test_size=0.5,
    random_state=42,
)

### Oversampling

In [21]:
# Balance the classes with random oversampling. Only the training split is
# resampled; the validation and test data are left untouched.
oversample = RandomOverSampler(random_state=42)
X_train_over, y_train_over = oversample.fit_resample(X_train, y_train)

### Reshaping for input

In [22]:
# One-hot encode the integer labels of every split (n_classes columns each),
# including the oversampled training labels.
y_train_onehot = to_categorical(y_train, n_classes)
y_val_onehot = to_categorical(y_val, n_classes)
y_test_onehot = to_categorical(y_test, n_classes)
y_train_over_onehot = to_categorical(y_train_over, n_classes)

### Glove Embedding Weights

In [23]:
# Load the pretrained GloVe vectors through the gensim downloader and build
# the embedding matrix for the tokenizer vocabulary. The size passed to
# map_weights (200) has to match the dimensionality of the loaded vectors.
glove_dict = api.load("glove-wiki-gigaword-200")
glove_weights, glove_words_missed = map_weights(glove_dict, tokenizer.word_index, 200)

83056 embeddings missed of 154017


### Word2vec Weights

In [24]:
# Load the pretrained word2vec vectors through the gensim downloader and build
# the embedding matrix for the tokenizer vocabulary. The size passed to
# map_weights (300) has to match the dimensionality of the loaded vectors.
word_dict = api.load("word2vec-google-news-300")
word2vec_weights, word2vec_words_missed = map_weights(word_dict, tokenizer.word_index, 300)

100606 embeddings missed of 154017


#### Checks

In [25]:
# check number of oversampling
from collections import Counter
# Class counts of the training labels before and after oversampling.
print(Counter(y_train))
print(Counter(y_train_over))

Counter({0: 81751, 1: 10922})
Counter({0: 81751, 1: 81751})


### Write Embedders

In [26]:
from keras.layers import Embedding
from tensorflow.keras import initializers

# One more row than there are words, matching the row count of the weight
# matrices built by map_weights.
vocab_size = len(tokenizer.word_index)+1

# Trainable 200-dimensional embedding with the default initialiser.
# NOTE(review): a Keras layer instance owns its weights, so any models built
# from this same object would share and jointly update one embedding matrix.
# Confirm how the DL_models builders use the layer they are given.
random_embedder = Embedding(vocab_size, 200, input_length=dim, trainable=True)
# Frozen embeddings initialised from the GloVe (200-d) and word2vec (300-d)
# weight matrices.
glove_embedding = Embedding(vocab_size, 200, input_length=dim, embeddings_initializer=initializers.Constant(glove_weights),
                            trainable=False)
word2vec_embedding = Embedding(vocab_size, 300, input_length=dim, embeddings_initializer=initializers.Constant(word2vec_weights),
                              trainable=False)

### Load DL Models and Run Them

In [27]:
# LSTM models: one per embedding, each in a plain and an oversampled variant.
#
# Every randomly initialised model gets its own copy of the trainable
# embedding layer. A Keras layer instance owns its weights, so passing the
# same `random_embedder` object to several builders would make those models
# share (and jointly train) one embedding matrix, leaking training from one
# experiment into the next. Rebuilding the layer from its config yields an
# identically configured layer with fresh weights.
# (Assumes the DL_models builders use the layer object they receive - TODO confirm.)
# The GloVe / word2vec layers are frozen, so sharing them does not couple training.
lstm_random = lstm_keras(dim,n_classes,Embedding.from_config(random_embedder.get_config()))
lstm_random_over = lstm_keras(dim,n_classes,Embedding.from_config(random_embedder.get_config()))
lstm_glove = lstm_keras(dim,n_classes,glove_embedding)
lstm_glove_over = lstm_keras(dim,n_classes,glove_embedding)
lstm_word2vec = lstm_keras(dim,n_classes,word2vec_embedding)
lstm_word2vec_over = lstm_keras(dim,n_classes,word2vec_embedding)

In [28]:
# Bidirectional LSTM models: one per embedding, plain and oversampled.
#
# The trainable embedding is rebuilt from its config for every model instead
# of reusing the `random_embedder` instance: a Keras layer instance owns its
# weights, so models built from the same object would share and jointly train
# one embedding matrix.
# (Assumes the DL_models builders use the layer object they receive - TODO confirm.)
# The GloVe / word2vec layers are frozen, so sharing them does not couple training.
blstm_random = blstm(dim,n_classes,Embedding.from_config(random_embedder.get_config()))
blstm_random_over = blstm(dim,n_classes,Embedding.from_config(random_embedder.get_config()))
blstm_glove = blstm(dim,n_classes,glove_embedding)
blstm_glove_over = blstm(dim,n_classes,glove_embedding)
blstm_word2vec = blstm(dim,n_classes,word2vec_embedding)
blstm_word2vec_over = blstm(dim,n_classes,word2vec_embedding)

In [29]:
# Bidirectional LSTM + attention models: one per embedding, plain and oversampled.
#
# The trainable embedding is rebuilt from its config for every model instead
# of reusing the `random_embedder` instance: a Keras layer instance owns its
# weights, so models built from the same object would share and jointly train
# one embedding matrix.
# (Assumes the DL_models builders use the layer object they receive - TODO confirm.)
# The GloVe / word2vec layers are frozen, so sharing them does not couple training.
blstm_att_random = blstm_att(dim,n_classes,Embedding.from_config(random_embedder.get_config()))
blstm_att_random_over = blstm_att(dim,n_classes,Embedding.from_config(random_embedder.get_config()))
blstm_att_glove = blstm_att(dim,n_classes,glove_embedding)
blstm_att_glove_over = blstm_att(dim,n_classes,glove_embedding)
blstm_att_word2vec = blstm_att(dim,n_classes,word2vec_embedding)
blstm_att_word2vec_over = blstm_att(dim,n_classes,word2vec_embedding)

In [30]:
# CNN models: one per embedding, plain and oversampled.
#
# The trainable embedding is rebuilt from its config for every model instead
# of reusing the `random_embedder` instance: a Keras layer instance owns its
# weights, so models built from the same object would share and jointly train
# one embedding matrix.
# (Assumes the DL_models builders use the layer object they receive - TODO confirm.)
# The GloVe / word2vec layers are frozen, so sharing them does not couple training.
cnn_random = cnn_keras(dim,n_classes,Embedding.from_config(random_embedder.get_config()))
cnn_random_over = cnn_keras(dim,n_classes,Embedding.from_config(random_embedder.get_config()))
cnn_glove = cnn_keras(dim,n_classes,glove_embedding)
cnn_glove_over = cnn_keras(dim,n_classes,glove_embedding)
cnn_word2vec = cnn_keras(dim,n_classes,word2vec_embedding)
cnn_word2vec_over = cnn_keras(dim,n_classes,word2vec_embedding)

### Big Blob of Fits

In [None]:
# Every model is trained with the same epoch budget and validated on the same
# validation split; only the training data differs between the plain and the
# oversampled variant of a model.
N_EPOCHS = 30
validation_data = (X_val, y_val_onehot)

# (plain, oversampled) pairs, listed in the order they are trained.
model_pairs = [
    (lstm_random, lstm_random_over),
    (lstm_glove, lstm_glove_over),
    (lstm_word2vec, lstm_word2vec_over),
    (blstm_random, blstm_random_over),
    (blstm_glove, blstm_glove_over),
    (blstm_word2vec, blstm_word2vec_over),
    (blstm_att_random, blstm_att_random_over),
    (blstm_att_glove, blstm_att_glove_over),
    (blstm_att_word2vec, blstm_att_word2vec_over),
    (cnn_random, cnn_random_over),
    (cnn_glove, cnn_glove_over),
    (cnn_word2vec, cnn_word2vec_over),
]

for plain_model, oversampled_model in model_pairs:
    plain_model.fit(X_train, y_train_onehot, epochs=N_EPOCHS, validation_data=validation_data)
    oversampled_model.fit(X_train_over, y_train_over_onehot, epochs=N_EPOCHS, validation_data=validation_data)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30

In [None]:
# All fitted models. The order must stay in sync with model_name_list:
# list positions are used to pair each model with its display name.
model_list = [lstm_random,
lstm_random_over,
lstm_glove,
lstm_glove_over,
lstm_word2vec,
lstm_word2vec_over,
blstm_random,
blstm_random_over,
blstm_glove,
blstm_glove_over,
blstm_word2vec,
blstm_word2vec_over,
blstm_att_random,
blstm_att_random_over,
blstm_att_glove,
blstm_att_glove_over,
blstm_att_word2vec,
blstm_att_word2vec_over,
cnn_random,
cnn_random_over,
cnn_glove,
cnn_glove_over,
cnn_word2vec,
cnn_word2vec_over]

In [None]:
# Display names in the same order as model_list: for every architecture, each
# embedding appears first without and then with the '_over' suffix.
model_name_list = [
    f'{architecture}_{embedding}{suffix}'
    for architecture in ('lstm', 'blstm', 'blstm_att', 'cnn')
    for embedding in ('random', 'glove', 'word2vec')
    for suffix in ('', '_over')
]

In [None]:
# facility to save the fitted model
import shutil
from google.colab import files

# Save every fitted model under its display name, zip the saved output and
# move the archive to Google Drive. Models and names are paired positionally
# with zip instead of looking each model up again with list.index.
for model, model_name in zip(model_list, model_name_list):
    model.save(model_name)

    # Archive the saved model (root directory = the path written by model.save).
    shutil.make_archive(model_name, 'zip', model_name)
    shutil.move(f"{model_name}.zip", f"/content/drive/MyDrive/Uni/Experiment Design/Ass2/Deep Learning/models_wiki/{model_name}.zip")

In [None]:
# Evaluate every model on the test set and collect the predicted labels, in
# model_list order. Models and names are paired positionally with zip instead
# of looking each model up again with list.index.
predictions = []

for model, model_name in zip(model_list, model_name_list):
    print(model_name)
    predictions.append(indiv_eval(model, X_test, y_test))

In [None]:
import joblib
# Persist the collected predictions to Google Drive. Despite the '.df' suffix,
# the dumped object is the `predictions` list (one entry per model), not a
# DataFrame.
joblib.dump(predictions, '/content/drive/MyDrive/Uni/Experiment Design/Ass2/Deep Learning/wiki_dl_preds.df')

In [None]:
# Pairwise McNemar tests between all models on the test set, using a
# significance level of 0.01.
multiple_mcnemar(model_name_list, predictions, y_test, 0.01)