In [119]:
##### Always import all needed libraries in the first cell
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.datasets
import sklearn.linear_model
from sklearn.svm import SVC
import pandas as pd
from string import punctuation
import os
import sys
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_predict
from sklearn import metrics

from keras.models import Sequential, load_model
from keras import layers
from keras.layers import Reshape
from keras import metrics
from keras import backend as K
from keras.utils import multi_gpu_model, CustomObjectScope
import random

np.random.seed(1) # this sets the seed so that the runs are consistent

%matplotlib inline

In [115]:
def cross_val(clf,X,y,name):
    print(name)
    y_pred = cross_val_predict(clf, X, y, cv=10)
    print(metrics.classification_report(y, y_pred))
    conf = np.array(metrics.confusion_matrix(y, y_pred))
    print(conf)
    y_probas = clf.predict_proba(X)
#     skplt.metrics.plot_roc_curve(y, y_probas, title=name+' ROC Curves', curves='each_class')
    return metrics.f1_score(y,y_pred,pos_label=1, average='binary')

def recall_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

def precision_m(y_true, y_pred):
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))
def flatten(l):
    return [item for sublist in l for item in sublist]

In [3]:
dftrain_banned = pd.read_csv("../Data/Generated/200_words_10M_banned.csv", delimiter=',')
dftrain_banned.insert(0, "banned", 1)

dftrain_notbanned = pd.read_csv("../Data/Generated/200_words_10M_notbanned.csv", delimiter=',')
dftrain_notbanned.insert(0, "banned", 0)

dfTest = pd.read_csv("../Data/Generated/200_words_10M_test.csv", delimiter=',')
dfTest = dfTest.sample(frac=1)

dfTest["split"] = dfTest["words"].map(lambda x: x.split(" "), na_action='ignore')
dfTest["word_cnt"] = dfTest["split"].map(lambda x: len(x), na_action='ignore')
print("Test percent lost: %.2f" % (100*len(dfTest[dfTest["word_cnt"] != 200])/ len(dfTest)))
dfTest = dfTest[dfTest["word_cnt"] == 200]



Test percent lost: 5.63


In [47]:
def group_by_subreddit(df):
    subreddit_to_comments = {}
    for index, row in df.iterrows():
        if row["subreddit"] in subreddit_to_comments:
            subreddit_to_comments[row["subreddit"]] += [row["words"]]
        else:
            subreddit_to_comments[row["subreddit"]] = [row["words"]]
    return subreddit_to_comments
    
random.seed(42)
unique_subreddits = list(dfTest["subreddit"].value_counts().keys())
random.shuffle(unique_subreddits)
SUBREDDIT_CNT = len(unique_subreddits)

s1 = int (1/3 * SUBREDDIT_CNT)
s2 = int (2/3 * SUBREDDIT_CNT)

validation_test = unique_subreddits[:s1]

threshold_test = unique_subreddits[s1:s2]

testing_test = unique_subreddits[s2:]


dfVal = dfTest[dfTest["subreddit"].isin(validation_test)]
print(dfVal["banned"].sum()/ dfVal.shape[0])


dfThresh = dfTest[dfTest["subreddit"].isin(threshold_test)]
print(dfThresh["banned"].sum()/ dfThresh.shape[0])
threshSubreddits  = group_by_subreddit(dfThresh)

dfTestingTest = dfTest[dfTest["subreddit"].isin(testing_test)]
print(dfTestingTest["banned"].sum()/ dfTestingTest.shape[0])
testingTestSubreddits  = group_by_subreddit(dfTestingTest)


0.006299239776679007
0.004335934894269896
0.006116000902730761


In [48]:
TRAIN_BALANCE_RATIO = 20
TEST_BALANCE_RATIO = 185
TRAIN_N_COMMENTS = int(len(dftrain_banned)/3)
TEST_N_COMMENTS = int(len(dfTest_banned)/10)

In [49]:
dfTest_banned = dfVal[dfVal["banned"]]
dfTest_notbanned = dfVal[dfVal["banned"] == False]

dfTest_balanced = pd.concat([dfTest_banned.head(n=TEST_N_COMMENTS), dfTest_notbanned.head(n=TEST_BALANCE_RATIO*TEST_N_COMMENTS)]).sample(frac=1)

In [50]:
dfTrain_banned_tmp = pd.concat([dftrain_banned.head(n=TRAIN_N_COMMENTS)]*int(TRAIN_BALANCE_RATIO))
dfTrain = pd.concat([dfTrain_banned_tmp, dftrain_notbanned.head(n=TRAIN_BALANCE_RATIO*TRAIN_N_COMMENTS)])

In [51]:
dfTrain[dfTrain["banned"]==1].shape, dfTrain[dfTrain["banned"]==0].shape

((50000, 2), (50000, 2))

In [52]:
dfTrain["split"] = dfTrain["words"].apply(lambda x: x.split(" "))
dfTrain["word_cnt"] = dfTrain["split"].apply(lambda x: len(x))
print("Train percent lost: %.2f" % (100*len(dfTrain[dfTrain["word_cnt"] != 200])/ len(dfTrain)))
dfTrain = dfTrain[dfTrain["word_cnt"]== 200]

Train percent lost: 1.00


In [53]:
dfTrain = dfTrain.sample(frac=1)
dfTrain.head(n=10)

Unnamed: 0,banned,words,split,word_cnt
34777,0,in some instances . There is a pretty famous c...,"[in, some, instances, ., There, is, a, pretty,...",200
38630,0,] f this suggestion of personhood is establish...,"[], f, this, suggestion, of, personhood, is, e...",200
2305,1,"bro to hang with . I met a few my last trip , ...","[bro, to, hang, with, ., I, met, a, few, my, l...",200
228,1,"was the most popular answer to that query , so...","[was, the, most, popular, answer, to, that, qu...",200
14913,0,) ** but we know that 1+1=322 - & gt ; equatio...,"[), **, but, we, know, that, 1+1=322, -, &, gt...",200
33164,0,thousands of HS like nothing ! Acknowledge my ...,"[thousands, of, HS, like, nothing, !, Acknowle...",200
1677,1,for me at points . They do n't work if you did...,"[for, me, at, points, ., They, do, n't, work, ...",200
2072,1,"if you do n't yet have much to offer , put wha...","[if, you, do, n't, yet, have, much, to, offer,...",200
16252,0,I 've only listened Dan Carlin 's Hardcore His...,"[I, 've, only, listened, Dan, Carlin, 's, Hard...",200
692,1,and psychological warfare is a pretty valid po...,"[and, psychological, warfare, is, a, pretty, v...",200


In [54]:
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(dfTrain["words"])
y_train = dfTrain["banned"]

X_test = vectorizer.transform(dfTest_balanced["words"])
y_test = dfTest_balanced["banned"]

In [58]:
input_dim = X_train.shape[1]  # Number of features
print(X_train.shape)
print(input_dim)

model = Sequential()


model.add(layers.Dense(100, input_dim=input_dim, activation='relu'))
model.add(layers.Dropout(0.5))

model.add(layers.Dense(100, input_dim=input_dim, activation='relu'))
model.add(layers.Dropout(0.5))

model.add(layers.Dense(100, input_dim=input_dim, activation='relu'))
model.add(layers.Dropout(0.5))

# model.add(layers.Dense(100, input_dim=input_dim, activation='relu'))
# model.add(layers.Dropout(0.5))

# model.add(layers.Dense(100, input_dim=input_dim, activation='relu'))
# model.add(layers.Dropout(0.5))


model.add(layers.Dense(10, input_dim=input_dim, activation='relu'))
# model.add(layers.Dense(100, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

(98995, 193952)
193952


In [59]:
parallel_model = multi_gpu_model(model, gpus=8, cpu_merge=False)
parallel_model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['acc',f1_m,precision_m, recall_m])
parallel_model.summary()

# model.compile(loss='binary_crossentropy',
#               optimizer='adam',
#               metrics=['acc',f1_m,precision_m, recall_m])
# model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
dense_6_input (InputLayer)      (None, 193952)       0                                            
__________________________________________________________________________________________________
lambda_9 (Lambda)               (None, 193952)       0           dense_6_input[0][0]              
__________________________________________________________________________________________________
lambda_10 (Lambda)              (None, 193952)       0           dense_6_input[0][0]              
__________________________________________________________________________________________________
lambda_11 (Lambda)              (None, 193952)       0           dense_6_input[0][0]              
__________________________________________________________________________________________________
lambda_12 

In [60]:
history = parallel_model.fit(X_train, y_train,
                    epochs=2,
                    verbose=True,
                    validation_data=(X_test[:10000], y_test[:10000]),
                    batch_size=1024)

Train on 98995 samples, validate on 10000 samples
Epoch 1/2
Epoch 2/2


In [127]:
parallel_model.save("../Data/Cached/Model.h5")

In [129]:
with CustomObjectScope({"f1_m": f1_m, "cross_val":cross_val, "recall_m":recall_m, "precision_m":precision_m}):
    model_loaded = load_model("../Data/Cached/Model.h5")

In [None]:
# X_thresh = vectorizer.transform(flatten(threshSubreddits.values()))

def predict_batch(examples):
    X_thresh = vectorizer.transform(examples)
    return parallel_model.predict(X_thresh, batch_size=256)
    
    
predictions = Parallel(n_jobs=-1)(delayed(predict_batch)(examples) for examples in tqdm(threshSubreddits.values()))

# for subreddit, examples in threshSubreddits.items():
#     X_thresh = vectorizer.transform(examples)
#     print(len(examples), flush=True)
#     thresh_predications[subreddit] = parallel_model.predict(X_thresh, batch_size=len(examples))
    

In [None]:
parallel_model.__dict__

In [132]:
prediction = model_loaded.predict(X_thresh, use_multiprocessing=True)


TypeError: predict() got an unexpected keyword argument 'use_multiprocessing'

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def plot_history(history):
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

In [None]:
plot_history(history)