In [1]:
# inspired by:
# https://towardsdatascience.com/natural-language-processing-classification-using-deep-learning-and-word2vec-50cbadd3bd6a

print("Begin importing")
# Seed every random number generator used in this notebook (Python's `random`,
# NumPy and TensorFlow), each one immediately after its module is imported.
# NOTE(review): presumably done this early so that none of the later imports
# can draw random numbers before the seeds are in place -- confirm.
SEED = 0
import random
random.seed(SEED)

import numpy as np
np.random.seed(SEED)

import tensorflow as tf
# Start from an empty default graph, then set that graph's random seed.
tf.reset_default_graph()
tf.set_random_seed(SEED)

# rest of the imports.
# native packages
import multiprocessing
import os
import pickle
import re
from time import time

# third party.
import gensim
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec

import keras
from keras import layers
from keras.layers import concatenate
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, Input, Embedding
from keras.layers.merge import Concatenate
from keras.models import Sequential, Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from nltk.tokenize import RegexpTokenizer

import numpy as np

import pandas as pd

from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import scale
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split

print("Done importing")

# Load W2V
# The pretrained vectors are read from a pickle cache when one is available;
# otherwise they are loaded from the original binary and the cache is written.
W2V_Pickle = "../Data/Cached/w2v.p"
print("loading w2v")
try:
    # SECURITY: unpickling executes arbitrary code embedded in the file, so
    # this cache must only ever be a file written by this notebook itself.
    with open(W2V_Pickle, "rb") as cache_file:
        w2v_model = pickle.load(cache_file)
    print("loaded from pickle")
except Exception as cache_error:
    # Any failure to read the cache (missing file, truncated or incompatible
    # pickle) falls back to the original binary. Unlike a bare `except:`, this
    # lets KeyboardInterrupt / SystemExit propagate instead of silently
    # starting the slow fallback load, and the reason is reported.
    print("pickle cache not usable (%r)" % (cache_error,))
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format('../Data/GoogleNews-vectors-negative300.bin', binary=True)
    # Create the cache directory if needed so that writing the cache does not
    # fail on a missing directory after the expensive load above.
    os.makedirs(os.path.dirname(W2V_Pickle), exist_ok=True)
    with open(W2V_Pickle, "wb") as cache_file:
        pickle.dump(w2v_model, cache_file)
    print("loaded from model file")

print("Done loading w2v")

print("Loading training / testing data from pickles")

# Locations of the pickled train / test DataFrames.
TRAIN_DATA = "../Data/Generated/RC_2016-10_Train.pkl"
TEST_DATA = "../Data/Generated/RC_2016-10_Test.pkl"

postsTrain, postsTest = (pd.read_pickle(_path) for _path in (TRAIN_DATA, TEST_DATA))

print("Loaded.")

# Token count of the first training example, reported as the per-example length.
SEQ_LEN = len(postsTrain["tokens"].values[0])
print("Training data consists of %d words per training example." % SEQ_LEN)

# Show the first rows of both frames.
for _frame in (postsTrain, postsTest):
    print(_frame.head())

# Model inputs come from the "tokens" column, labels from the "banned" column.
X_train, y_train = (postsTrain[_column].values for _column in ("tokens", "banned"))
X_test, y_test = (postsTest[_column].values for _column in ("tokens", "banned"))

# Corpus statistics over the training tokens.
all_words = [word for tokens in X_train for word in tokens]
# Measure every training example rather than assuming they all share the
# length of the first one (SEQ_LEN); MAX_SEQUENCE_LENGTH is derived from this
# list, so an unrepresentative first example would otherwise truncate inputs.
all_sentence_lengths = [len(tokens) for tokens in X_train]
# Sorted list of the distinct (case-sensitive) training tokens.
ALL_VOCAB = sorted(set(all_words))
print("%s words total, with a vocabulary size of %s" % (len(all_words), len(ALL_VOCAB)))
print("Max sentence length is %s" % max(all_sentence_lengths))


####################### CHANGE THE PARAMETERS HERE #####################################
EMBEDDING_DIM = 300 # how big is each word vector (the word2vec file loaded above is the "negative300" model)
MAX_VOCAB_SIZE = len(ALL_VOCAB) # how many unique words to use
MAX_SEQUENCE_LENGTH = max(all_sentence_lengths) # max number of words in a comment to use


# Keras' Tokenizer only emits word indices strictly below `num_words`, and
# indices start at 1 (0 is reserved). Passing MAX_VOCAB_SIZE itself would
# therefore silently drop the least frequent word from every sequence, so the
# limit is set one higher to keep the full vocabulary.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE + 1, lower=True, char_level=False)
# Convert the object array of token lists to a plain list once and reuse it.
train_texts = X_train.tolist()
tokenizer.fit_on_texts(train_texts)
training_sequences = tokenizer.texts_to_sequences(train_texts)
train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

# Embedding table with one row per tokenizer index, plus one extra leading row
# (index 0) that is left as zeros unless word_index assigns it.
embedding_rows = len(train_word_index) + 1
train_embedding_weights = np.zeros((embedding_rows, EMBEDDING_DIM))
for word, index in train_word_index.items():
    if word in w2v_model:
        # Word known to word2vec: copy its pretrained vector.
        train_embedding_weights[index, :] = w2v_model[word]
    else:
        # Unknown word: fill the row with values drawn by np.random.rand.
        train_embedding_weights[index, :] = np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)


######################## TRAIN AND TEST SET #################################
# Encode the test tokens with the tokenizer fitted on the training data, then
# pad both splits to MAX_SEQUENCE_LENGTH entries per example.
test_sequences = tokenizer.texts_to_sequences(X_test.tolist())
train_cnn_data, test_cnn_data = (
    pad_sequences(_sequences, maxlen=MAX_SEQUENCE_LENGTH)
    for _sequences in (training_sequences, test_sequences)
)


# Shape of a single padded training example.
print(train_cnn_data[0].shape)

Begin importing


Using TensorFlow backend.


Done importing
loading w2v
loaded from pickle
Done loading w2v
Loading training / testing data from pickles
Loaded.
Training data consists of 200 words per training example.
   banned                                             tokens
0       1  [was, performed, automatically, please, contac...
1       0  [levi, and, amira, hass, have, written, so, mu...
2       0  [he, isnt, dead, maybe, hes, just, in, another...
3       1  [benghazi, or, the, clinton, foundation, takin...
4       0  [hilton, mike, evans, jordan, matthews, davont...
   banned                                             tokens
0       1  [tonight, we, wuz, peruvian, 10, s, a, bigger,...
1       1  [but, this, to, my, knowledge, is, why, well, ...
2       0  [than, the, wrx, they, come, out, to, the, eve...
3       1  [but, i, m, talking, only, 1, 2, per, test, th...
4       0  [immensely, beautiful, you, definitely, have, ...
18000000 words total, with a vocabulary size of 263457
Max sentence length is 200
Found 263457

In [5]:
# Training hyperparameters; both are passed to model.fit when the network is trained.
# Number of passes over the training data.
EPOCHS = 10
# Number of examples per gradient update.
BATCH_SIZE = 64

def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, trainable=False, extra_conv=True):
    """Build and compile a 1D-convolutional binary text classifier.

    Token ids are looked up in an Embedding layer initialised from
    ``embeddings``, passed through Conv1D + MaxPooling1D feature extraction,
    and then through Dropout -> Flatten -> Dense(128, relu) -> Dense(1, sigmoid).

    Args:
        embeddings: Initial weight matrix for the Embedding layer; it has to
            match the (num_words, embedding_dim) table that layer creates.
        max_sequence_length: Number of token ids in each input example.
        num_words: Number of rows in the embedding table.
        embedding_dim: Width of each embedding vector.
        trainable: Whether the embedding weights are updated during training.
        extra_conv: If truthy, use several parallel convolution branches (one
            per entry of ``filter_sizes``) whose pooled outputs are
            concatenated along the sequence axis. Otherwise use a single
            kernel-size-3 convolution branch.

    Returns:
        The compiled keras ``Model`` (binary cross-entropy loss, Adadelta
        optimizer, accuracy metric). Its summary is printed as a side effect.
    """
    embedding_layer = Embedding(num_words,
                                embedding_dim,
                                weights=[embeddings],
                                input_length=max_sequence_length,
                                trainable=trainable)

    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    # Only the branch that is actually used is built; layers that never reach
    # the output would not be part of the Model anyway.
    if extra_conv:
        # Parallel convolutions, based on Yoon Kim model (https://arxiv.org/abs/1408.5882)
        filter_sizes = [3, 3, 3, 4, 5]
        convs = []
        for filter_size in filter_sizes:
            l_conv = Conv1D(filters=128, kernel_size=filter_size, activation='relu')(embedded_sequences)
            l_pool = MaxPooling1D(pool_size=3)(l_conv)
            convs.append(l_pool)
        # Concatenate every branch, however many filter sizes are configured.
        features = concatenate(convs, axis=1)
    else:
        # A single 1D convnet with max pooling, instead of the multi-branch model.
        conv = Conv1D(filters=128, kernel_size=3, activation='relu')(embedded_sequences)
        features = MaxPooling1D(pool_size=3)(conv)

    x = Dropout(0.5)(features)
    x = Flatten()(x)
    x = Dense(128, activation='relu')(x)
    # Finally, we feed the output into a single sigmoid unit: this is a binary
    # classification (1, 0), and the sigmoid squashes the output between the
    # bounds of 0 and 1.
    preds = Dense(1,activation='sigmoid')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
                      optimizer='adadelta',
                      metrics=['acc'])
    model.summary()
    return model




In [6]:

# Build the classifier on top of the pretrained embedding matrix, keeping the
# embedding weights frozen (trainable=False).
model = ConvNet(
    embeddings=train_embedding_weights,
    max_sequence_length=MAX_SEQUENCE_LENGTH,
    num_words=len(train_word_index) + 1,
    embedding_dim=EMBEDDING_DIM,
    trainable=False,
)

# NOTE(review): the test split is also used as the validation data during training.
history = model.fit(
    train_cnn_data,
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(test_cnn_data, y_test),
)

# Report the final accuracy on each split; `loss` and `accuracy` end up
# holding the values for the test split.
for _message, _inputs, _labels in (
    ("Training Accuracy: {:.4f}", train_cnn_data, y_train),
    ("Testing Accuracy:  {:.4f}", test_cnn_data, y_test),
):
    loss, accuracy = model.evaluate(_inputs, _labels, verbose=False)
    print(_message.format(accuracy))

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 200, 300)     79037400    input_3[0][0]                    
__________________________________________________________________________________________________
conv1d_13 (Conv1D)              (None, 198, 128)     115328      embedding_3[0][0]                
__________________________________________________________________________________________________
conv1d_14 (Conv1D)              (None, 198, 128)     115328      embedding_3[0][0]                
__________________________________________________________________________________________________
conv1d_15 

In [None]:
classes = [0,1]

# (title, model inputs, true labels) for every split, in reporting order.
report_splits = (
    ("TEST DATA", test_cnn_data, y_test),
    ("TRAIN DATA", train_cnn_data, y_train),
)

# Print a normalized and a raw confusion matrix for each split. After the loop
# y_pred, con_mat, con_mat_norm, con_mat_df and con_mat_df_norm hold the values
# of the last split (the training data).
for split_position, (split_title, split_inputs, split_labels) in enumerate(report_splits):
    if split_position > 0:
        print("")
    print(split_title)

    # The model emits one sigmoid score per example; scores above 0.5 count as class 1.
    y_pred = (model.predict(split_inputs) > 0.5).astype(int).ravel()

    # scikit-learn's confusion_matrix (already imported at the top of the
    # notebook) needs no TensorFlow session. Passing `labels=classes` pins the
    # matrix to one row/column per class even if a class never occurs.
    # Rows are the true labels, columns the predictions.
    con_mat = confusion_matrix(split_labels, y_pred, labels=classes)
    # Divide each row by its total so every row shows per-class fractions.
    con_mat_norm = np.around(con_mat.astype('float') / con_mat.sum(axis=1)[:, np.newaxis], decimals=2)

    con_mat_df = pd.DataFrame(con_mat,
                              index=classes,
                              columns=classes)
    con_mat_df_norm = pd.DataFrame(con_mat_norm,
                              index=classes,
                              columns=classes)

    print("Normalized values:")
    print(con_mat_df_norm)
    print("\nRaw values:")
    print(con_mat_df)

    print("row: what should have been predicted")
    print("column: what was predicted")