In [7]:
# Imports
SEED = 0
import random
random.seed(SEED)

import numpy as np
np.random.seed(SEED)

import copy
import pandas as pd
from pathlib import Path
from pprint import pprint
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

# Constants: display names that make_models() uses as the keys of its result dict
LG = 'logReg'          # logistic regression pipeline
NB = 'naive bayes'     # multinomial naive Bayes pipeline
SVM = 'SVM'            # SGDClassifier with hinge loss
RF = 'random forest'   # random forest pipeline
KNN = 'kNN'            # k-nearest-neighbours pipeline


def _tfidf_pipeline(classifier):
    '''
    Wrap a classifier in the text-featurisation steps shared by every model.

    :param classifier: unfitted scikit-learn estimator used as the final 'clf' step
    :return: Pipeline with the steps 'vect' (unigram counts), 'tfidf' and 'clf'
    '''
    return Pipeline([
        ('vect', CountVectorizer(ngram_range=(1, 1))),
        ('tfidf', TfidfTransformer(use_idf=True)),
        ('clf', classifier),
    ])


def make_models(random_state=None):
    '''
    Make a variety of unfitted model pipelines. Return a dict mapping from a model name
    (one of the LG, NB, SVM, RF, KNN constants) to its Pipeline. Every pipeline takes raw
    strings as input: CountVectorizer -> TfidfTransformer -> classifier.

    No parameter grids are returned; callers use the pipelines directly.

    :param random_state: forwarded to the estimators that accept it (logistic regression,
        SGD and random forest). The default None keeps each estimator's own default, which
        per the scikit-learn docs means drawing from numpy's global random state.
    :return: dict of model name -> sklearn.pipeline.Pipeline
    '''
    # Insertion order is the order in which callers iterate over the models.
    classifiers = {
        LG: LogisticRegression(n_jobs=-1, random_state=random_state),
        NB: MultinomialNB(),
        # Linear SVM trained with SGD; max_iter=10 caps the number of passes over the data.
        SVM: SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, shuffle=True,
                           max_iter=10, n_jobs=-1, random_state=random_state),
        RF: RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=random_state),
        KNN: KNeighborsClassifier(n_neighbors=2, weights='uniform', p=1, n_jobs=-1),
    }
    return {name: _tfidf_pipeline(clf) for name, clf in classifiers.items()}


def evaluate_model(y, pred):
    '''
    Print and return classification metrics for a set of predictions.

    Prints the text classification report followed by the confusion matrix.

    :param y: true labels
    :param pred: predicted labels, aligned with y
    :return: tuple of (classification report as a dict, confusion matrix)
    '''
    text_report = classification_report(y, pred)
    print(text_report)
    # Same report again, this time as a nested dict for programmatic use.
    report_dict = classification_report(y, pred, output_dict=True)

    print('Confusion matrix: row is true class, col is predicted class')
    matrix = confusion_matrix(y, pred)
    print(matrix)

    return report_dict, matrix


In [8]:
# Load Data: pickled train / test DataFrames
TRAIN_DATA = "../Data/Generated/RC_2016-10_Train.pkl"
TEST_DATA = "../Data/Generated/RC_2016-10_Test.pkl"

# NOTE: unpickling can execute code embedded in the file; only load pickles from a trusted source.
postsTrain, postsTest = (pd.read_pickle(pickle_path) for pickle_path in (TRAIN_DATA, TEST_DATA))

In [9]:
# Experiment setup: candidate pipelines, a results container and the model inputs.
kfold = 5
models = make_models()
results = dict(kfold=kfold, trials=[])

# Each row of the "tokens" column is joined with single spaces into one string per post,
# which is what gets passed to the pipelines; the "banned" column is the target.
x_train = list(map(' '.join, postsTrain["tokens"].values))
y_train = postsTrain["banned"].values

x_test = list(map(' '.join, postsTest["tokens"].values))
y_test = postsTest["banned"].values


In [13]:
# Random forest with 20 trees: fit on the training split, score on the test split.
rf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', RandomForestClassifier(n_estimators=20, n_jobs=-1)),
])
pred_test = rf.fit(x_train, y_train).predict(x_test)
# NOTE: despite the `_train` suffix, these hold metrics computed on the test split.
cr_train, cm_train = evaluate_model(y=y_test, pred=pred_test)


              precision    recall  f1-score   support

           0       0.79      0.82      0.81      5071
           1       0.81      0.77      0.79      4929

   micro avg       0.80      0.80      0.80     10000
   macro avg       0.80      0.80      0.80     10000
weighted avg       0.80      0.80      0.80     10000

Confusion matrix: row is true class, col is predicted class
[[4173  898]
 [1110 3819]]


In [None]:
# Random forest with 40 trees: fit on the training split, score on the test split.
rf = Pipeline(steps=[
    ('vect', CountVectorizer(ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', RandomForestClassifier(n_estimators=40, n_jobs=-1)),
])
pred_test = rf.fit(x_train, y_train).predict(x_test)
# NOTE: despite the `_train` suffix, these hold metrics computed on the test split.
cr_train, cm_train = evaluate_model(y=y_test, pred=pred_test)


In [6]:

# Fit every candidate pipeline on the training split and score it on the held-out test split.
# Each trial is recorded in results['trials'] as a dict with the model name, the
# classification report (dict form) and the confusion matrix.
results['trials'] = []  # start clean so re-running this cell does not duplicate trials
for model_id, pipeline in models.items():
    print('==========================================================')
    print(f'training and evaluating {model_id}')

    print(f'dataset shapes: x_train: {len(x_train)}, x_test: {len(x_test)}, y_train: {len(y_train)}, y_test: {len(y_test)}')

    # Fit a copy so the pipelines stored in `models` stay unfitted and the cell can be re-run.
    model = copy.deepcopy(pipeline)
    model.fit(x_train, y_train)

    # The metrics below come from the test split (the old message said "train set").
    print('test set evaluation')
    pred_test = model.predict(x_test)
    # Names kept from the original cell; they hold test-set metrics.
    cr_train, cm_train = evaluate_model(y_test, pred_test)

    result = {'model': model_id, 'report': cr_train, 'confusion_matrix': cm_train}
    results['trials'].append(result)
    print('==========================================================')


training and evaluating logReg
dataset shapes: x_train: 90000, x_test: 10000, y_train: 90000, y_test: 10000
train set evaluation




              precision    recall  f1-score   support

           0       0.92      0.93      0.92      5071
           1       0.92      0.92      0.92      4929

   micro avg       0.92      0.92      0.92     10000
   macro avg       0.92      0.92      0.92     10000
weighted avg       0.92      0.92      0.92     10000

Confusion matrix: row is true class, col is predicted class
[[4691  380]
 [ 415 4514]]
training and evaluating naive bayes
dataset shapes: x_train: 90000, x_test: 10000, y_train: 90000, y_test: 10000
train set evaluation
              precision    recall  f1-score   support

           0       0.94      0.93      0.94      5071
           1       0.93      0.94      0.93      4929

   micro avg       0.93      0.93      0.93     10000
   macro avg       0.93      0.93      0.93     10000
weighted avg       0.93      0.93      0.93     10000

Confusion matrix: row is true class, col is predicted class
[[4720  351]
 [ 304 4625]]
training and evaluating SVM
dataset sh



              precision    recall  f1-score   support

           0       0.80      0.93      0.86      5071
           1       0.91      0.76      0.83      4929

   micro avg       0.84      0.84      0.84     10000
   macro avg       0.85      0.84      0.84     10000
weighted avg       0.85      0.84      0.84     10000

Confusion matrix: row is true class, col is predicted class
[[4694  377]
 [1200 3729]]
training and evaluating random forest
dataset shapes: x_train: 90000, x_test: 10000, y_train: 90000, y_test: 10000
train set evaluation


KeyboardInterrupt: 