In [1]:
import numpy as np
import pandas as pd
import pickle
from matplotlib import pyplot as plt
import seaborn as sns

import string
import re
from urllib.parse import urlparse
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras import preprocessing as tfkp
from tensorflow.keras.utils import to_categorical

import gensim.downloader as api
from imblearn.over_sampling import RandomOverSampler

from DL_models import lstm_keras, cnn_keras, blstm, blstm_att

[nltk_data] Downloading package stopwords to C:\Users\Ben
[nltk_data]     Lee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
def load_data(filename="../data/formspring_data_fixed.pkl"):
    """Load the pickled Formspring records and return their texts and labels.

    Parameters
    ----------
    filename : str, optional
        Path of the pickle file to read. Defaults to the location this
        notebook has always used, so ``load_data()`` behaves as before.

    Returns
    -------
    x_text : list of bytes
        The ``'text'`` entry of every record, UTF-8 encoded. The text is
        returned untouched; punctuation is NOT stripped here.
    labels : list
        The ``'label'`` entry of every record, in the same order as ``x_text``.
    """
    print("Loading data from file: " + filename)
    # SECURITY: pickle.load can execute arbitrary code; only open trusted files.
    # The context manager closes the handle, which the bare open() never did.
    with open(filename, 'rb') as handle:
        data = pickle.load(handle)

    x_text = []
    labels = []
    # Records are addressed by position (data[i]) exactly as before, so any
    # container indexable by 0..len(data)-1 keeps working.
    for i in range(len(data)):
        record = data[i]
        x_text.append(record['text'].encode('utf-8'))
        labels.append(record['label'])
    return x_text, labels

In [3]:
def is_url(url):
    """Tell whether ``url`` parses with both a scheme and a network location.

    Returns False when ``urlparse`` rejects the input with a ValueError.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        return False
    return bool(parsed.scheme) and bool(parsed.netloc)

In [4]:
# embedding stuff
def map_weights(embed_dict, vocab, embed_size):
    """Build an embedding matrix whose rows follow the indices in ``vocab``.

    Parameters
    ----------
    embed_dict : mapping
        Word -> vector lookup used as ``embed_dict[word]``. It must raise
        ``KeyError`` for unknown words (a plain dict does).
    vocab : dict
        Word -> integer row index. ``len(vocab) + 1`` rows are allocated, so
        indices are expected to lie in ``0..len(vocab)``.
    embed_size : int
        Width of every embedding vector (the embedding dimension).

    Returns
    -------
    weights : numpy.ndarray of shape (len(vocab) + 1, embed_size)
        Row ``vocab[word]`` holds that word's vector; rows for words without
        a vector stay all-zero.
    words_missed : list
        The words that had no entry in ``embed_dict``.

    Raises
    ------
    ValueError
        If a vector's length does not match ``embed_size``.
    """
    vocab_size = len(vocab) + 1
    weights = np.zeros((vocab_size, embed_size))

    words_missed = []
    for word, index in vocab.items():
        try:
            # Row `index` of the matrix receives the vector stored for `word`.
            weights[index] = embed_dict[word]
        except KeyError:
            # Only a genuinely absent word counts as missed. Other failures
            # (e.g. a wrong embed_size) now propagate instead of being
            # silently reported as missing embeddings.
            words_missed.append(word)

    n_missed = len(words_missed)
    print(f"{n_missed} embeddings missed of {vocab_size}")
    return weights, words_missed

### Loading Data

In [7]:
# Load the comment texts (UTF-8 encoded bytes) together with their original labels.
x_text, labels_og = load_data()
# Encode the labels as integer codes; `uniques` keeps the original label values.
labels, uniques = pd.factorize(labels_og)

Loading data from file: ../data/formspring_data_fixed.pkl


### Initial Twitter-Specific Pre-processing

In [6]:
comments = pd.DataFrame({'comment': x_text, 'attack': labels})

# Compile each pattern once instead of re-parsing it for every row.
punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))
# Raw string: '\w' and '\d' inside a plain literal are invalid escape sequences
# that newer Python versions warn about. The pattern itself is unchanged.
numeric_word_re = re.compile(r'\w*\d\w*')
space_run_re = re.compile(' +')
# Build the stop-word set once. The previous code re-read the NLTK list for
# every single word, and a set gives constant-time membership tests.
english_stopwords = set(stopwords.words('english'))

# decode to UTF-8
comments['comment'] = comments['comment'].str.decode("utf-8")

# lower case everything
comments['comment'] = comments['comment'].str.lower()

# Remove punctuation
comments['comment'] = comments['comment'].apply(lambda x: punctuation_re.sub('', x))

# Remove words containing numbers
comments['comment'] = comments['comment'].apply(lambda x: numeric_word_re.sub('', x))

# Collapse runs of spaces inside a sentence into a single space
comments['comment'] = comments['comment'].apply(lambda x: space_run_re.sub(' ', x))

# Remove whitespace at the beginning and end of the sentence
comments['comment'] = comments['comment'].str.strip()

# Remove stop words.
# NOTE(review): punctuation is stripped before this step, so contractions in
# the NLTK list (e.g. "don't") are compared against apostrophe-free text and
# will not match. Confirm this ordering is intended.
comments['comment'] = comments['comment'].apply(
    lambda x: ' '.join(word for word in x.split() if word not in english_stopwords)
)

# Remove rows whose comment became an empty string
comments = comments[comments['comment'] != ""]

In [23]:
# Persist the preprocessed comments DataFrame as a pickle file.
comments.to_pickle('../data/formspring_final_preprocess.pkl')

In [26]:
# Read the pickled DataFrame back and display it (the result is not assigned).
pd.read_pickle('../data/formspring_final_preprocess.pkl')

Unnamed: 0,comment,attack
0,favorite song like many songs favorite,0
1,haha jk,0
2,hey angel duh sexy really thanks haha,0
4,meowww rawr,0
5,makeup tips suck makeup lol sure like tell wht...,0
...,...,...
12768,youre party friend drove drunk wo give keys wo...,0
12769,awesome give compliment deserve thank awesome,0
12770,yu play yurself time sometimes day,0
12771,yukk beer disgusting drink im already drunk br...,0


### Train-Test Split and Tokenization

In [7]:
# Keep 80% of the cleaned comments for training; the remaining 20% ("midway")
# is held back as raw text alongside its labels.
X_train_pretoken, X_midway_pretoken, y_train, y_midway = train_test_split(
    comments['comment'],
    comments['attack'],
    test_size=0.2,
    random_state=42,
)

In [8]:
# Every comment is padded or truncated to `dim` token ids.
dim = 25
n_classes = len(np.unique(y_train.values))

# The vocabulary is fitted on the training texts only; words not seen there
# are mapped to the "<UNK>" token.
tokenizer = tfkp.text.Tokenizer(
    oov_token="<UNK>",
    filters='!"$%#&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
)
tokenizer.fit_on_texts(X_train_pretoken)


def convert(x):
    """Turn texts into integer sequences of length ``dim`` (post-padded, post-truncated)."""
    sequences = tokenizer.texts_to_sequences(x)
    return tfkp.sequence.pad_sequences(
        sequences,
        maxlen=dim,
        padding='post',
        truncating='post',
    )


X_train = convert(X_train_pretoken)
X_midway = convert(X_midway_pretoken)

In [9]:
# Cut the held-back "midway" portion in half: train_test_split returns the
# first part and then the second part, so the first half becomes the test
# set and the second half the validation set.
X_test, X_val, y_test, y_val = train_test_split(
    X_midway,
    y_midway,
    test_size=0.5,
    random_state=42,
)

### Oversampling

In [10]:
# Balance the classes by random oversampling. Only the training split is
# resampled; the validation and test splits are left as they are.
oversample = RandomOverSampler(random_state=42)
X_train_over, y_train_over = oversample.fit_resample(X_train, y_train)

### Reshaping for input

In [11]:
# One-hot encode every label vector with the same number of classes.
y_train_onehot, y_val_onehot, y_test_onehot, y_train_over_onehot = (
    to_categorical(label_vector, n_classes)
    for label_vector in (y_train, y_val, y_test, y_train_over)
)

### Glove Embedding Weights

In [12]:
# Fetch the pretrained "glove-twitter-200" vectors through gensim's downloader
# and copy them into a matrix whose rows follow the tokenizer's word indices.
# The 200 passed to map_weights is the width of that matrix.
glove_dict = api.load("glove-twitter-200")
glove_weights, glove_words_missed = map_weights(glove_dict, tokenizer.word_index, 200)

2629 embeddings missed of 15475


In [13]:
# def replace(string, char):
#     pattern = char + '{3,}'
#     string = re.sub(pattern, char, string)
#     return string

In [14]:
# replace('hiii', collections.Counter('hiii').most_common(1)[0][0])

In [15]:
# import collections
# print(collections.Counter(glove_words_missed[1]).most_common(1)[0][0])


### Word2vec Weights

In [16]:
# Fetch the pretrained "word2vec-google-news-300" vectors through gensim's
# downloader and copy them into a matrix whose rows follow the tokenizer's
# word indices. The 300 passed to map_weights is the width of that matrix.
word_dict = api.load("word2vec-google-news-300")
word2vec_weights, word2vec_words_missed = map_weights(word_dict, tokenizer.word_index, 300)

4852 embeddings missed of 15475


#### Checks

In [17]:
# check number of oversampling
from collections import Counter
print(Counter(y_train))
print(Counter(y_train_over))

Counter({0: 9580, 1: 608})
Counter({0: 9580, 1: 9580})


### Write Embedders

In [18]:
# Take Embedding from tensorflow.keras like every other Keras import in this
# notebook; mixing the standalone `keras` package with `tensorflow.keras`
# objects can break on installations where the two are different versions.
# NOTE(review): assumes DL_models builds its models with tensorflow.keras. Confirm.
from tensorflow.keras import initializers
from tensorflow.keras.layers import Embedding

# One row per tokenizer index plus one extra row (len(word_index) + 1),
# the same row count map_weights allocates for the pretrained matrices.
vocab_size = len(tokenizer.word_index)+1

# Trainable embedding with Keras' default initialisation.
random_embedder = Embedding(vocab_size, 200, input_length=dim, trainable=True)

# Frozen embeddings initialised from the pretrained matrices. The width is read
# from the matrix itself so the layer always matches the loaded vectors.
glove_embedding = Embedding(
    vocab_size,
    glove_weights.shape[1],
    input_length=dim,
    embeddings_initializer=initializers.Constant(glove_weights),
    trainable=False,
)
word2vec_embedding = Embedding(
    vocab_size,
    word2vec_weights.shape[1],
    input_length=dim,
    embeddings_initializer=initializers.Constant(word2vec_weights),
    trainable=False,
)

### Load DL Models and Run Them

In [None]:
# Build the LSTM models: one pair per embedding (random, GloVe, word2vec).
# The "_over" copies are presumably meant for the oversampled training data.
# NOTE(review): both models of a pair receive the SAME Embedding layer object.
# If lstm_keras adds that object to the model as-is, the trainable
# random_embedder weights are shared, so training one model would change the
# other. Confirm in DL_models.
lstm_random = lstm_keras(dim,n_classes,random_embedder)
lstm_random_over = lstm_keras(dim,n_classes,random_embedder)
lstm_glove = lstm_keras(dim,n_classes,glove_embedding)
lstm_glove_over = lstm_keras(dim,n_classes,glove_embedding)
lstm_word2vec = lstm_keras(dim,n_classes,word2vec_embedding)
lstm_word2vec_over = lstm_keras(dim,n_classes,word2vec_embedding)

In [None]:
# Build the bidirectional-LSTM models for the random and GloVe embeddings.
# The "_over" copies are presumably meant for the oversampled training data.
# NOTE(review): both models of a pair receive the SAME Embedding layer object.
# If blstm adds that object to the model as-is, the trainable random_embedder
# weights are shared between models. Confirm in DL_models.
blstm_random = blstm(dim,n_classes,random_embedder)
blstm_random_over = blstm(dim,n_classes,random_embedder)
blstm_glove = blstm(dim,n_classes,glove_embedding)
blstm_glove_over = blstm(dim,n_classes,glove_embedding)

In [None]:
# Build the blstm_att models for the random and GloVe embeddings.
# The "_over" copies are presumably meant for the oversampled training data.
# NOTE(review): both models of a pair receive the SAME Embedding layer object.
# If blstm_att adds that object to the model as-is, the trainable
# random_embedder weights are shared between models. Confirm in DL_models.
blstm_att_random = blstm_att(dim,n_classes,random_embedder)
blstm_att_random_over = blstm_att(dim,n_classes,random_embedder)
blstm_att_glove = blstm_att(dim,n_classes,glove_embedding)
blstm_att_glove_over = blstm_att(dim,n_classes,glove_embedding)

In [None]:
# Build the CNN models for the random and GloVe embeddings.
# The "_over" copies are presumably meant for the oversampled training data.
# NOTE(review): both models of a pair receive the SAME Embedding layer object.
# If cnn_keras adds that object to the model as-is, the trainable
# random_embedder weights are shared between models. Confirm in DL_models.
cnn_random = cnn_keras(dim,n_classes,random_embedder)
cnn_random_over = cnn_keras(dim,n_classes,random_embedder)
cnn_glove = cnn_keras(dim,n_classes,glove_embedding)
cnn_glove_over = cnn_keras(dim,n_classes,glove_embedding)

In [None]:
# Train the CNN built on random_embedder for 3 epochs on the original
# (non-oversampled) training split, validating on the validation split.
cnn_random.fit(X_train, y_train_onehot, epochs=3, validation_data=(X_val, y_val_onehot))

In [None]:
# Train the LSTM built on random_embedder for 3 epochs on the original
# (non-oversampled) training split, validating on the validation split.
lstm_random.fit(X_train,  y_train_onehot, epochs=3, validation_data=(X_val, y_val_onehot))

In [None]:
# Train the LSTM built on the frozen GloVe embedding for 3 epochs on the
# original (non-oversampled) training split, validating on the validation split.
lstm_glove.fit(X_train,  y_train_onehot, epochs=3, validation_data=(X_val, y_val_onehot))

In [None]:
# Train the LSTM built on the frozen word2vec embedding for 3 epochs on the
# original (non-oversampled) training split, validating on the validation split.
lstm_word2vec.fit(X_train,  y_train_onehot, epochs=3, validation_data=(X_val, y_val_onehot))

In [None]:
# Train the BLSTM built on random_embedder for 3 epochs on the oversampled
# training data; validation still uses the untouched validation split.
blstm_random_over.fit(X_train_over,  y_train_over_onehot, epochs=3, validation_data=(X_val, y_val_onehot))

In [None]:
# Train the blstm_att model built on random_embedder for 3 epochs on the
# oversampled training data; validation still uses the untouched validation split.
blstm_att_random_over.fit(X_train_over,  y_train_over_onehot, epochs=3, validation_data=(X_val, y_val_onehot))

In [None]:
# Evaluate the oversampled blstm_att model on the held-out test split.
preds_probs = blstm_att_random_over.predict(X_test)
# One vectorised argmax over the class axis replaces the per-row Python loop.
# Assumes predict returns an array of shape (n_samples, n_classes), matching
# the one-hot targets the model was fitted on. TODO confirm.
preds = np.argmax(preds_probs, axis=1)

print(classification_report(y_test, preds))

# Row-normalised confusion matrix: each row shows how the samples of one true
# class were distributed over the predicted classes.
fig, ax = plt.subplots()
sns.heatmap(
    confusion_matrix(y_test, preds, normalize='true'),
    fmt='.2%',
    annot=True,
    linewidths=.5,
    ax=ax,
)
ax.set(
    xlabel='Predicted label',
    ylabel='True label',
    title='blstm_att_random_over: normalised confusion matrix (test set)',
)
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()