In [1]:
import pandas as pd
import numpy as np
import gensim
from gensim.models import Word2Vec
from tensorflow.keras import preprocessing as tfkp
import pickle
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

# define training data

In [2]:
import gensim.models.keyedvectors as word2vec
model = word2vec.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [3]:
import string

def load_data():
    filename = "../data/formspring_data_fixed.pkl"
    print("Loading data from file: " + filename)
    data = pickle.load(open(filename, 'rb'))
    x_text = []
    labels = []
    ids = []
    for i in range(len(data)):
        text = "".join(l for l in data[i]['text'] if l not in string.punctuation)
        x_text.append((data[i]['text']).encode('utf-8'))
        labels.append(data[i]['label'])
    return x_text,labels

In [4]:
def map_weights(embed_dict, vocab, embed_size): # embed size is embedding dim
    vocab_size = len(vocab)
    weights = np.zeros((vocab_size, embed_size))

    n_missed = 0
    words_missed = []
    for k,v in vocab.items():
        try:
            weights[v] = embed_dict[k]  # weights[v] is an index, embed_dict[k] is the list of weights
        except:
            n_missed += 1
            words_missed.append(k)
    print(f"{n_missed} embeddings missed of {vocab_size}")
    return weights, words_missed

In [5]:
x_text, labels_og = load_data()
labels, uniques = pd.factorize(labels_og)

Loading data from file: ../data/formspring_data_fixed.pkl


In [6]:
comments = pd.DataFrame({'comment': x_text, 'attack': labels})

comments['comment'] = comments['comment'].str.decode("utf-8")

#Replace empty comments with np.nan
comments['comment'].replace('', np.NaN, inplace=True)

#Drop nan(empty) comments
comments = comments[comments.comment.notna()]

# lower case everything
comments['comment'] = comments['comment'].str.lower()

#Remove stop words
comments['comment'] = comments['comment'].apply(lambda x: ' '.join([word for word in x.split() if word not in stopwords.words('english')]))

#Tokenize
comments['comment'] = comments.apply(lambda row: nltk.word_tokenize(row['comment']), axis=1)

# Remove punctuations

comments['comment'] = [list(filter(lambda x: x not in string.punctuation, sentence)) for sentence in comments['comment']]

comments['comment'] = comments['comment'].apply(lambda x: ' '.join([str(elem) for elem in x]))

In [7]:
X_train_pretoken, X_midway_pretoken, y_train, y_midway = train_test_split(comments['comment'], comments['attack'], random_state = 42, test_size=0.2)

In [8]:
tokenizer = tfkp.text.Tokenizer(oov_token="<UNK>", filters='!"$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',)
tokenizer.fit_on_texts(comments['comment'])

convert = lambda x: tfkp.sequence.pad_sequences(tokenizer.texts_to_sequences(x),                                               
                                                    padding='post', truncating='post')

X_train = convert(X_train_pretoken)
X_midway = convert(X_midway_pretoken)

In [9]:
X_test, X_val, y_test, y_val = train_test_split(X_midway, y_midway, random_state = 42, test_size=0.5)

In [10]:
oversample = RandomOverSampler(random_state=42)
X_train_over, y_train_over = oversample.fit_resample(X_train, y_train)

In [11]:
word2vec_weights, wrod2vec_words_missed = map_weights(model, tokenizer.word_index, 300)

6711 embeddings missed of 18526


In [12]:
print(np.shape(X_train))
print(np.shape(X_train_over))
print(np.shape(X_val))
print(np.shape(X_test))
print(np.shape(y_train))
print(np.shape(y_train_over))
print(np.shape(y_val))
print(np.shape(y_test))
print(np.shape(word2vec_weights))

(10208, 549)
(19186, 549)
(1276, 309)
(1276, 309)
(10208,)
(19186,)
(1276,)
(1276,)
(18526, 300)
