In [12]:
import pickle

import gensim
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec

import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.datasets
import sklearn.linear_model
from sklearn.svm import SVC
import pandas as pd
from string import punctuation
import os
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_predict
from sklearn import metrics

from keras.models import Sequential
from keras import layers
from keras.layers import Reshape, Conv1D, Embedding, MaxPooling1D, Dropout, Flatten, Dense, Bidirectional, LSTM
from keras import metrics
from keras import backend as K
from keras.utils import multi_gpu_model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import tensorflow as tf
tf.ConfigProto().gpu_options.allow_growth = True
np.random.seed(1) # this sets the seed so that the runs are consistent

%matplotlib inline


# from keras.backend.tensorflow_backend import set_session
# import tensorflow as tf
# config = tf.ConfigProto()
# config.gpu_options.allow_growth = True
# config.log_device_placement = True
# sess = tf.Session(config=config)
# set_session(sess)

In [13]:
# Load W2V
W2V_Pickle = "../Data/Cached/w2v.p"
print("loading w2v")
try:
    w2v_model = pickle.load(open(W2V_Pickle, "rb"))
    print("loaded from pickle")
except:
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format('../Data/GoogleNews-vectors-negative300.bin', binary=True)
    pickle.dump(w2v_model, open(W2V_Pickle, "wb"))
    print("loaded from model file")

print("Done loading w2v")

loading w2v
loaded from pickle
Done loading w2v


In [14]:
def cross_val(clf,X,y,name):
    print(name)
    y_pred = cross_val_predict(clf, X, y, cv=10)
    print(metrics.classification_report(y, y_pred))
    conf = np.array(metrics.confusion_matrix(y, y_pred))
    print(conf)
    y_probas = clf.predict_proba(X)
#     skplt.metrics.plot_roc_curve(y, y_probas, title=name+' ROC Curves', curves='each_class')
    return metrics.f1_score(y,y_pred,pos_label=1, average='binary')

def recall_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

def precision_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))


def binary_focal_loss(gamma=2., alpha=.25):
    """
    Binary form of focal loss.
      FL(p_t) = -alpha * (1 - p_t)**gamma * log(p_t)
      where p = sigmoid(x), p_t = p or 1 - p depending on if the label is 1 or 0, respectively.
    References:
        https://arxiv.org/pdf/1708.02002.pdf
    Usage:
     model.compile(loss=[binary_focal_loss(alpha=.25, gamma=2)], metrics=["accuracy"], optimizer=adam)
    """
    def binary_focal_loss_fixed(y_true, y_pred):
        """
        :param y_true: A tensor of the same shape as `y_pred`
        :param y_pred:  A tensor resulting from a sigmoid
        :return: Output tensor.
        """
        pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
        pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))

        epsilon = K.epsilon()
        # clip to prevent NaN's and Inf's
        pt_1 = K.clip(pt_1, epsilon, 1. - epsilon)
        pt_0 = K.clip(pt_0, epsilon, 1. - epsilon)

        return -K.sum(alpha * K.pow(1. - pt_1, gamma) * K.log(pt_1)) \
               -K.sum((1 - alpha) * K.pow(pt_0, gamma) * K.log(1. - pt_0))

    return binary_focal_loss_fixed

In [15]:
SEQ_LEN = 200

In [16]:
dftrain_banned = pd.read_csv("../Data/Generated/200_words_10M_banned.csv", delimiter=',')
dftrain_banned.insert(0, "banned", 1)

dftrain_notbanned = pd.read_csv("../Data/Generated/200_words_10M_notbanned.csv", delimiter=',')
dftrain_notbanned.insert(0, "banned", 0)

dfTest = pd.read_csv("../Data/Generated/200_words_10M_test.csv", delimiter=',')
dfTest = dfTest.sample(frac=1)

dfTest["split"] = dfTest["words"].map(lambda x: x.split(" "), na_action='ignore')
dfTest["word_cnt"] = dfTest["split"].map(lambda x: len(x), na_action='ignore')
print("Test percent lost: %.2f" % (100*len(dfTest[dfTest["word_cnt"] != SEQ_LEN])/ len(dfTest)))
dfTest = dfTest[dfTest["word_cnt"] == SEQ_LEN]

dfTest_banned = dfTest[dfTest["banned"]]
dfTest_notbanned = dfTest[dfTest["banned"] == False]


Test percent lost: 5.63


In [30]:
TRAIN_BALANCE_RATIO = 40
TEST_BALANCE_RATIO = 1
TRAIN_N_COMMENTS = int(len(dftrain_banned)/1)
TEST_N_COMMENTS = int(len(dfTest_banned)/1)

In [31]:
dfTest_balanced = pd.concat([dfTest_banned.head(n=TEST_N_COMMENTS), dfTest_notbanned.head(n=TEST_BALANCE_RATIO*TEST_N_COMMENTS)]).sample(frac=1)
dfTrain = pd.concat([dftrain_banned.head(n=TRAIN_N_COMMENTS), dftrain_notbanned.head(n=TRAIN_BALANCE_RATIO*TRAIN_N_COMMENTS)])
dfTrain[dfTrain["banned"]==1].shape, dfTrain[dfTrain["banned"]==0].shape

((7500, 2), (300000, 2))

In [32]:
dfTrain = dfTrain.sample(frac=1)
dfTrain.head(n=100)

Unnamed: 0,banned,words
271582,0,its inaugural edition of the report 'Poverty a...
72781,0,cool '' will suddenly make you unique and inte...
197561,0,from him . If he is in your class and sits nex...
70760,0,is now just a safe space nonsense zone when it...
51467,0,is awesome ! Canada was there too . [ removed ...
49464,0,Cyberface/Medic what if they win more than 8 g...
144111,0,`` Stop Crossposting My Stuff ! '' ) ] ( https...
249539,0,being built in Europe for whatever that 's wor...
61102,0,As a BB you can kill in one shot if enemy has ...
1953,1,never a real competitor and probably was never...


In [33]:
y_train = dfTrain["banned"].values
y_test = dfTest_balanced["banned"].values
X_train = dfTrain["words"].values
X_test = dfTest_balanced["words"].values

all_words = [word for tokens in X_train for word in tokens]
all_sentence_lengths = [SEQ_LEN]
ALL_VOCAB = sorted(list(set(all_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_words), len(ALL_VOCAB)))
print("Max sentence length is %s" % max(all_sentence_lengths))


####################### CHANGE THE PARAMETERS HERE #####################################
EMBEDDING_DIM = 300 # how big is each word vector
MAX_VOCAB_SIZE = len(ALL_VOCAB) # how many unique words to use (i.e num rows in embedding vector)
MAX_SEQUENCE_LENGTH = max(all_sentence_lengths) # max number of words in a comment to use


tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, lower=True, char_level=False)
tokenizer.fit_on_texts(X_train.tolist())
training_sequences = tokenizer.texts_to_sequences(X_train.tolist())
train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word,index in train_word_index.items():
    train_embedding_weights[index,:] = w2v_model[word] if word in w2v_model else np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)


######################## TRAIN AND TEST SET #################################
train_cnn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_sequences = tokenizer.texts_to_sequences(X_test.tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)


print(train_cnn_data[0].shape)

303813572 words total, with a vocabulary size of 5364
Max sentence length is 200
Found 609126 unique tokens.
(609127, 300)
(200,)


In [34]:
NUM_WORDS = len(train_word_index)+1
TRAINABLE_EMBEDDINGS=True

model = Sequential()
model.add(Embedding(NUM_WORDS,
          EMBEDDING_DIM,
          weights=[train_embedding_weights],
          input_length=MAX_SEQUENCE_LENGTH,
          trainable=TRAINABLE_EMBEDDINGS))



model.add(Conv1D(filters=128, kernel_size=3, activation="relu"))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.5))

model.add(Conv1D(filters=128, kernel_size=3, activation="relu"))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.5))

model.add(Conv1D(filters=128, kernel_size=3, activation="relu"))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.5))

model.add(Conv1D(filters=128, kernel_size=3, activation="relu"))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.5))
model.add(Flatten())

model.add(Dense(256, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(#loss='binary_crossentropy',
              loss=[binary_focal_loss(alpha=.25, gamma=2)],
              optimizer='adam',
              metrics=['acc',f1_m,precision_m, recall_m] )

model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 200, 300)          182738100 
_________________________________________________________________
conv1d_21 (Conv1D)           (None, 198, 128)          115328    
_________________________________________________________________
max_pooling1d_21 (MaxPooling (None, 99, 128)           0         
_________________________________________________________________
dropout_32 (Dropout)         (None, 99, 128)           0         
_________________________________________________________________
conv1d_22 (Conv1D)           (None, 97, 128)           49280     
_________________________________________________________________
max_pooling1d_22 (MaxPooling (None, 48, 128)           0         
_________________________________________________________________
dropout_33 (Dropout)         (None, 48, 128)           0         
__________

In [None]:
history = model.fit(train_cnn_data, y_train, epochs=100, batch_size=512,
                       validation_data=(test_cnn_data, y_test),
                   verbose=1)

Train on 307500 samples, validate on 3714 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
 59904/307500 [====>.........................] - ETA: 28s - loss: 0.4936 - acc: 0.9964 - f1_m: 0.9192 - precision_m: 0.9787 - recall_m: 0.8753