In [1]:
print("Begin importing")
# Imports + random seeds.
# Each library's global RNG is seeded with the same SEED immediately after that
# library is imported, before any of the remaining packages are loaded.
SEED = 0
import random
random.seed(SEED)

import numpy as np
np.random.seed(SEED)

import tensorflow as tf
# NOTE(review): tf.reset_default_graph / tf.set_random_seed are TensorFlow 1.x
# top-level names (under 2.x they live in tf.compat.v1) -- confirm the pinned
# TensorFlow version before upgrading.
tf.reset_default_graph()
tf.set_random_seed(SEED)

# rest of the imports.
# native packages
import multiprocessing
import os
import pickle
import re
from time import time

# third party.
import gensim
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec

import keras
from keras import layers
from keras.layers import concatenate
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, Input, Embedding
from keras.layers.merge import Concatenate
from keras.models import Sequential, Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from nltk.tokenize import RegexpTokenizer

# NOTE(review): numpy was already imported (and seeded) above; this second
# import is redundant but harmless.
import numpy as np

import pandas as pd

from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import scale
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split

print("Done importing")

# Load W2V
# The pre-trained word2vec vectors are cached as a pickle so later runs can skip
# parsing the original binary model file.
W2V_Pickle = "../Data/Cached/w2v.p"
print("loading w2v")
try:
    # NOTE(review): pickle.load can execute arbitrary code; only point this at a
    # cache file that this notebook wrote itself.
    with open(W2V_Pickle, "rb") as cache_file:
        w2v_model = pickle.load(cache_file)
    print("loaded from pickle")
except (OSError, EOFError, pickle.UnpicklingError, AttributeError, ImportError) as cache_error:
    # Cache missing, truncated, corrupt, or written by an incompatible library
    # version: rebuild it from the original model file. Anything else (e.g.
    # KeyboardInterrupt, MemoryError) is allowed to propagate.
    print("pickle cache unusable (%r), rebuilding" % (cache_error,))
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format('../Data/GoogleNews-vectors-negative300.bin', binary=True)
    with open(W2V_Pickle, "wb") as cache_file:
        pickle.dump(w2v_model, cache_file)
    print("loaded from model file")

print("Done loading w2v")

print("Loading training / testing data from pickles")

# Pickled DataFrames; each one carries a "tokens" column (list of words per
# post) and a "banned" column (the label).
TRAIN_DATA = "../Data/Generated/RC_2016-10_Train.pkl"
TEST_DATA = "../Data/Generated/RC_2016-10_Test.pkl"

postsTrain, postsTest = (pd.read_pickle(path) for path in (TRAIN_DATA, TEST_DATA))

print("Loaded.")

# The length of the first training example is taken as the sequence length
# for the rest of the notebook.
X_train = postsTrain["tokens"].values
SEQ_LEN = len(X_train[0])
print("Training data consists of %d words per training example." % SEQ_LEN)

print(postsTrain.head())
print(postsTest.head())

# Remaining features / labels as plain numpy arrays.
X_test = postsTest["tokens"].values
y_train, y_test = postsTrain["banned"].values, postsTest["banned"].values

# Flatten every training example into one long list of words.
all_words = []
for tokens in X_train:
    all_words.extend(tokens)

# Only one length is recorded: the fixed per-example length SEQ_LEN.
all_sentence_lengths = [SEQ_LEN]

# Sorted list of the distinct words seen in the training data.
ALL_VOCAB = sorted(set(all_words))

word_total, vocab_size = len(all_words), len(ALL_VOCAB)
print("%s words total, with a vocabulary size of %s" % (word_total, vocab_size))
print("Max sentence length is %s" % max(all_sentence_lengths))


####################### CHANGE THE PARAMETERS HERE #####################################
EMBEDDING_DIM = 300 # how big is each word vector
MAX_VOCAB_SIZE = len(ALL_VOCAB) # how many unique words to use (i.e num rows in embedding vector)
MAX_SEQUENCE_LENGTH = max(all_sentence_lengths) # max number of words in a comment to use


# Keras' Tokenizer numbers words from 1 and, when converting texts to
# sequences, keeps only indices strictly below `num_words`. Passing the bare
# vocabulary size would therefore silently drop the least frequent word, so
# one extra slot is requested to keep all MAX_VOCAB_SIZE words.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE + 1, lower=True, char_level=False)

# Convert the numpy object array to a list once and reuse it for both calls.
train_texts = X_train.tolist()
tokenizer.fit_on_texts(train_texts)
training_sequences = tokenizer.texts_to_sequences(train_texts)

# Maps each word to its integer index (1-based, most frequent word first).
train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

# One row per tokenizer index; row 0 stays all-zero because the word index
# starts at 1.
embedding_rows = len(train_word_index) + 1
train_embedding_weights = np.zeros((embedding_rows, EMBEDDING_DIM))
for word, index in train_word_index.items():
    if word in w2v_model:
        vector = w2v_model[word]
    else:
        # Words unknown to word2vec get a random vector drawn from [0, 1).
        vector = np.random.rand(EMBEDDING_DIM)
    train_embedding_weights[index, :] = vector
print(train_embedding_weights.shape)


######################## TRAIN AND TEST SET #################################
# Encode the test posts with the tokenizer that was fitted on the training
# posts, then pad both splits to the same fixed length.
test_sequences = tokenizer.texts_to_sequences(X_test.tolist())
train_cnn_data, test_cnn_data = (
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    for sequences in (training_sequences, test_sequences)
)

print(train_cnn_data[0].shape)

Begin importing


Using TensorFlow backend.


Done importing
loading w2v
loaded from pickle
Done loading w2v
Loading training / testing data from pickles
Loaded.
Training data consists of 200 words per training example.
   banned                                             tokens
0       0  [top, tier, isn, t, that, small, there, are, a...
1       1  [m, actually, checking, out, the, thread, now,...
2       1  [still, requires, input, deleted, just, got, t...
3       0  [min, in, his, jungle, c9, will, punish, the, ...
4       1  [well, ahead, of, the, curve, in, the, 1980s, ...
   banned                                             tokens
0       0  [wan, you, will, never, find, a, more, wretche...
1       0  [re, right, i, ll, probably, delete, the, orig...
2       0  [lose, ng, konting, weight, but, then, nakita,...
3       0  [s, a, bad, thing, and, definitely, not, an, o...
4       1  [and, isn, t, feasible, for, anything, over, l...
18000000 words total, with a vocabulary size of 263939
Max sentence length is 200
Found 263939

In [2]:
# Cache of randomly generated vectors for words that word2vec does not know,
# so that a given unknown word always maps to the same vector.
w2v_model_tmp = {}

def get_word_vec(word):
    """Return the embedding vector for `word`.

    Words present in `w2v_model` use their pre-trained vector. Any other word
    gets a random vector of length EMBEDDING_DIM, created on first request and
    stored in `w2v_model_tmp` for every later lookup.
    """
    if word in w2v_model:
        return w2v_model[word]

    if word not in w2v_model_tmp:
        w2v_model_tmp[word] = np.random.rand(EMBEDDING_DIM)
    return w2v_model_tmp[word]
        
def _embed_examples(examples, label):
    """Look up a vector for every token and return them as one dense array.

    Parameters
    ----------
    examples : sequence of token sequences
        Each example must contain at least SEQ_LEN tokens; only the first
        SEQ_LEN are used.
    label : str
        Tag printed in front of the progress lines ("train" / "test").

    Returns
    -------
    numpy.ndarray of shape (len(examples), SEQ_LEN, EMBEDDING_DIM)
    """
    # float32 halves the memory of this very large array compared with numpy's
    # float64 default. Keras' default float type is float32, so the network
    # would downcast the values anyway.
    embedded = np.zeros((len(examples), SEQ_LEN, EMBEDDING_DIM), dtype=np.float32)
    for i in range(embedded.shape[0]):
        # Progress line: split, examples done so far, distinct w2v misses so far.
        if i % 1000 == 0: print(label, i, len(w2v_model_tmp))
        for j in range(SEQ_LEN):
            embedded[i, j, :] = get_word_vec(examples[i][j])
    return embedded


print("Generating training examples.")
print("type, num loaded, w2v misses")
# Train is embedded before test so that random vectors for unknown words are
# drawn in the same order as before.
X_train_cnn = _embed_examples(X_train, "train")
X_test_cnn = _embed_examples(X_test, "test")

Generating training examples.
type, num loaded, w2v misses
train 0 0
train 1000 4663
train 2000 8383
train 3000 11940
train 4000 15460
train 5000 18684
train 6000 21993
train 7000 25148
train 8000 28451
train 9000 31157
train 10000 34097
train 11000 37016
train 12000 39967
train 13000 42698
train 14000 45136
train 15000 47648
train 16000 50164
train 17000 52455
train 18000 55158
train 19000 57525
train 20000 60040
train 21000 62372
train 22000 64929
train 23000 67281
train 24000 69446
train 25000 71494
train 26000 73647
train 27000 75809
train 28000 78101
train 29000 80191
train 30000 82170
train 31000 84115
train 32000 86478
train 33000 88536
train 34000 90849
train 35000 92831
train 36000 94846
train 37000 96707
train 38000 98857
train 39000 100978
train 40000 102832
train 41000 105023
train 42000 106844
train 43000 108801
train 44000 110839
train 45000 112858
train 46000 114824
train 47000 116707
train 48000 118777
train 49000 120761
train 50000 122842
train 51000 125203
train 52000

In [4]:
EPOCHS = 4
BATCH_SIZE = 128

# Kernel width of each Conv1D -> MaxPooling1D -> Dropout stage, in order.
# Conv1D's default padding shortens the sequence at every stage, so
# SEQ_LEN must be long enough to survive all four stages.
CONV_KERNEL_SIZES = (5, 7, 15, 15)

model = Sequential()
for stage, kernel_size in enumerate(CONV_KERNEL_SIZES):
    # Only the first layer of a Sequential model needs input_shape; later
    # layers take their input shape from the layer before them.
    first_layer_args = {"input_shape": (SEQ_LEN, EMBEDDING_DIM)} if stage == 0 else {}
    model.add(Conv1D(filters=128, kernel_size=kernel_size, activation="relu", **first_layer_args))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.25))

# Classifier head: one hidden dense layer, then a single sigmoid unit for the
# binary "banned" label.
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adadelta',
              metrics=['acc'])

model.summary()

# NOTE(review): the test split doubles as validation data here, so the
# "Testing Accuracy" below is measured on data that was already monitored
# during training -- consider a separate validation split.
history = model.fit(X_train_cnn, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE,
                    validation_data=(X_test_cnn, y_test))

loss, accuracy = model.evaluate(X_train_cnn, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))

loss, accuracy = model.evaluate(X_test_cnn, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_5 (Conv1D)            (None, 196, 128)          192128    
_________________________________________________________________
max_pooling1d_5 (MaxPooling1 (None, 98, 128)           0         
_________________________________________________________________
dropout_6 (Dropout)          (None, 98, 128)           0         
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 92, 128)           114816    
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 46, 128)           0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 46, 128)           0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 32, 128)           245888    
__________

In [5]:
classes = [0,1]


def _confusion_report(features, labels):
    """Predict on `features`, print the confusion matrices and return them.

    Rows are the true labels and columns the predicted labels, both ordered as
    in `classes`. Two tables are printed: one normalised per row (rounded to
    two decimals) and one with raw counts.

    Returns
    -------
    (y_pred, con_mat, con_mat_norm, con_mat_df, con_mat_df_norm)
    """
    # Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
    y_pred = (model.predict(features) > 0.5).astype(int).ravel()

    # sklearn counts the table directly, so no TensorFlow session is needed.
    # Passing `labels` keeps the result 2x2 even if one class never shows up.
    con_mat = confusion_matrix(labels, y_pred, labels=classes)
    con_mat_norm = np.around(con_mat.astype('float') / con_mat.sum(axis=1)[:, np.newaxis], decimals=2)

    con_mat_df = pd.DataFrame(con_mat, index=classes, columns=classes)
    con_mat_df_norm = pd.DataFrame(con_mat_norm, index=classes, columns=classes)

    print("Normalized values:")
    print(con_mat_df_norm)
    print("\nRaw values:")
    print(con_mat_df)
    print("row: what should have been predicted")
    print("column: what was predicted")

    return y_pred, con_mat, con_mat_norm, con_mat_df, con_mat_df_norm


print("TEST DATA")
y_pred, con_mat, con_mat_norm, con_mat_df, con_mat_df_norm = _confusion_report(X_test_cnn, y_test)

print("")
print("TRAIN DATA")
# As before, the module-level result names end up holding the TRAIN results.
y_pred, con_mat, con_mat_norm, con_mat_df, con_mat_df_norm = _confusion_report(X_train_cnn, y_train)

TEST DATA
Normalized values:
      0     1
0  0.89  0.11
1  0.10  0.90

Raw values:
      0     1
0  4433   554
1   494  4519
row: what should have been predicted
column: what was predicted

TRAIN DATA
Normalized values:
      0     1
0  0.92  0.08
1  0.07  0.93

Raw values:
       0      1
0  41277   3736
1   3046  41941
row: what should have been predicted
column: what was predicted
