In [16]:
# Imports
SEED = 0
import random
random.seed(SEED)

import numpy as np
np.random.seed(SEED)

import copy
import pandas as pd
from pathlib import Path
from pprint import pprint
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

# Constants
# Display names of the classifiers; make_models() uses them as the keys of the dict it returns.
NB = 'naive bayes'
SVM = 'SVM'
RF = 'random forest'
KNN = 'kNN'
LG = "logReg"


def _tfidf_pipeline(classifier):
    '''
    Wrap a classifier in the text pipeline shared by every model: unigram token counts,
    TF-IDF re-weighting, then the classifier itself.

    :param classifier: unfitted scikit-learn estimator used as the final 'clf' step
    :return: Pipeline with the steps 'vect', 'tfidf' and 'clf'
    '''
    return Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 1))),
        ('tfidf', TfidfTransformer(use_idf=True)),
        ('clf', classifier),
    ])


def make_models():
    '''
    Make a variety of text-classification pipelines. Return a dict mapping from a model name
    (the LG, NB, SVM, RF and KNN constants) to an unfitted Pipeline.

    Estimators that draw random numbers receive random_state=SEED explicitly, so a fit does not
    depend on how much of the global NumPy random stream earlier cells have already consumed.

    :return: dict of model name -> Pipeline
    '''
    # Insertion order is the order in which callers iterating over the dict will train the models.
    classifiers = {
        # NOTE(review): per the scikit-learn docs n_jobs only parallelises over classes and is
        # ignored by the liblinear solver -- confirm it is still wanted here.
        LG: LogisticRegression(n_jobs=-1, random_state=SEED),
        NB: MultinomialNB(),
        # hinge loss + L2 penalty makes SGDClassifier a linear SVM.
        SVM: SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, shuffle=True,
                           max_iter=10, n_jobs=-1, random_state=SEED),
        RF: RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=SEED),
        KNN: KNeighborsClassifier(n_neighbors=2, weights='uniform', p=1, n_jobs=-1),
    }

    return {name: _tfidf_pipeline(classifier) for name, classifier in classifiers.items()}


def evaluate_model(y, pred):
    '''
    Print the classification report and the confusion matrix for a set of predictions.

    :param y: true labels
    :param pred: predicted labels, aligned with y
    :return: tuple of (classification report as a dict, confusion matrix)
    '''
    # The text form is only for display; the dict form is what gets handed back to the caller.
    text_report = classification_report(y, pred)
    print(text_report)
    report_as_dict = classification_report(y, pred, output_dict=True)

    print('Confusion matrix: row is true class, col is predicted class')
    matrix = confusion_matrix(y, pred)
    print(matrix)

    return report_as_dict, matrix


In [17]:
# # Load Data
# TRAIN_DATA = "../Data/Generated/RC_2016-10_Train.pkl"
# TEST_DATA = "../Data/Generated/RC_2016-10_Test.pkl"

# postsTrain = pd.read_pickle(TRAIN_DATA)
# postsTest = pd.read_pickle(TEST_DATA)

In [21]:
# Every sample is expected to hold exactly this many space-separated tokens.
WORDS_PER_SAMPLE = 200


def keep_exact_length(df, label, n_words=WORDS_PER_SAMPLE):
    '''
    Add "split" and "word_cnt" columns to a copy of df and keep only the rows whose "words"
    text splits into exactly n_words tokens. Rows with missing text are dropped as well.

    :param df: DataFrame with a "words" column of space-separated tokens
    :param label: name of the dataset, used as the prefix of the printed "percent lost" line
    :param n_words: required number of tokens per row
    :return: filtered DataFrame including the two new columns
    '''
    # na_action='ignore' leaves missing text as NaN instead of raising on .split / len.
    tokens = df["words"].map(lambda text: text.split(" "), na_action='ignore')
    df = df.assign(split=tokens, word_cnt=tokens.map(len, na_action='ignore'))

    has_expected_length = df["word_cnt"] == n_words
    n_lost = int((~has_expected_length).sum())
    # Guard the division so an empty frame reports 0% lost instead of raising.
    percent_lost = 100 * n_lost / len(df) if len(df) else 0.0
    print("%s percent lost: %.2f" % (label, percent_lost))
    return df[has_expected_length]


dftrain_banned = pd.read_csv("../Data/Generated/200_words_10M_banned.csv", delimiter=',')
dftrain_banned.insert(0, "banned", 1)

dftrain_notbanned = pd.read_csv("../Data/Generated/200_words_10M_notbanned.csv", delimiter=',')
dftrain_notbanned.insert(0, "banned", 0)

dfTest = pd.read_csv("../Data/Generated/200_words_10M_test.csv", delimiter=',')
# Shuffle with an explicit seed so the head() selections below do not depend on cell execution order.
dfTest = dfTest.sample(frac=1, random_state=SEED)
dfTest = keep_exact_length(dfTest, "Test")

# Boolean-mask indexing: this relies on the test CSV's "banned" column being parsed as booleans.
dfTest_banned = dfTest[dfTest["banned"]]
dfTest_notbanned = dfTest[dfTest["banned"] == False]

# Up to this many not-banned rows are taken per banned row.
TRAIN_BALANCE_RATIO = 100
TEST_BALANCE_RATIO = 100
TRAIN_N_COMMENTS = len(dftrain_banned)
TEST_N_COMMENTS = len(dfTest_banned) // 2

# NOTE(review): dfTest_balanced is not referenced by any later cell visible here -- confirm
# whether evaluation was meant to use it instead of the full dfTest.
dfTest_balanced = pd.concat([
    dfTest_banned.head(n=TEST_N_COMMENTS),
    dfTest_notbanned.head(n=TEST_BALANCE_RATIO*TEST_N_COMMENTS),
]).sample(frac=1, random_state=SEED)

dfTrain = pd.concat([
    dftrain_banned.head(n=TRAIN_N_COMMENTS),
    dftrain_notbanned.head(n=TRAIN_BALANCE_RATIO*TRAIN_N_COMMENTS),
])
dfTrain = keep_exact_length(dfTrain, "Train")

dfTrain = dfTrain.sample(frac=1, random_state=SEED)
dfTrain.head(n=10)


Test percent lost: 5.63
Train percent lost: 0.03


Unnamed: 0,banned,words,split,word_cnt
440841,0,is related with vex ... suppose ) I saw a Sapp...,"[is, related, with, vex, ..., suppose, ), I, s...",200
444289,0,"end of the week , this week he was actually pu...","[end, of, the, week, ,, this, week, he, was, a...",200
436538,0,"your credit score , however I 'm not sure if h...","[your, credit, score, ,, however, I, 'm, not, ...",200
721801,0,night but I bet the Steelers D did well . What...,"[night, but, I, bet, the, Steelers, D, did, we...",200
545623,0,worst bit though is the sound effect implies t...,"[worst, bit, though, is, the, sound, effect, i...",200
467945,0,and there 's a TON of discussion around the wh...,"[and, there, 's, a, TON, of, discussion, aroun...",200
627647,0,"to weekends for relaxing and having fun , holi...","[to, weekends, for, relaxing, and, having, fun...",200
361817,0,") , [ see IMGUR gallery with more pics here ] ...","[), ,, [, see, IMGUR, gallery, with, more, pic...",200
739472,0,385 . You increase this by maximizing your dam...,"[385, ., You, increase, this, by, maximizing, ...",200
235344,0,and cracked app vectors . It 's still dangerou...,"[and, cracked, app, vectors, ., It, 's, still,...",200


In [22]:
# Presumably the number of cross-validation folds; in the visible cells it is only stored in `results`.
kfold = 5

models = make_models()

# Container for per-model outcomes.
results = dict(kfold=kfold, trials=[])

# Raw text is the feature and the "banned" flag the label; vectorisation happens inside each pipeline.
x_train, y_train = dfTrain["words"].values, dfTrain["banned"].values

# NOTE(review): the test arrays come from the full filtered dfTest, not from dfTest_balanced -- confirm intent.
x_test, y_test = dfTest["words"].values, dfTest["banned"].values


In [None]:
# One-off run of the random-forest text pipeline with 1000 trees.
rf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', RandomForestClassifier(n_estimators=1000, n_jobs=-1)),
])

# fit() returns the fitted pipeline itself, so training and prediction can be chained.
pred_test = rf.fit(x_train, y_train).predict(x_test)

# Despite the `_train` suffix, these hold the report and confusion matrix for the test split.
cr_train, cm_train = evaluate_model(y_test, pred_test)


In [14]:

# Start from an empty list so re-running this cell does not accumulate duplicate trials.
results['trials'] = []

for model_id, pipeline in models.items():
    print('==========================================================')
    print(f'training and evaluating {model_id}')

    print(f'dataset shapes: x_train: {len(x_train)}, x_test: {len(x_test)}, y_train: {len(y_train)}, y_test: {len(y_test)}')

    # The model is fitted on the train split but scored on the test split.
    print('test set evaluation')

    # Fit a private copy so the pipelines stored in `models` stay unfitted and reusable.
    model = copy.deepcopy(pipeline)
    model.fit(x_train, y_train)

    pred_test = model.predict(x_test)
    # The names keep their `_train` suffix, but both values describe the test split.
    cr_train, cm_train = evaluate_model(y_test, pred_test)

    # Record the outcome immediately so finished trials survive an interrupted run.
    result = {'model': model_id, 'report': cr_train, 'confusion_matrix': cm_train}
    results['trials'].append(result)
    print('==========================================================')


training and evaluating logReg
dataset shapes: x_train: 44841, x_test: 339426, y_train: 44841, y_test: 339426
train set evaluation


  " = {}.".format(effective_n_jobs(self.n_jobs)))


KeyboardInterrupt: 