# Imports

In [48]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder

import pickle

import psycopg2

# Read in data

In [1]:
with open("../database/secrets", "r") as file:
    secrets = [i.strip('\n') for i in file.readlines()]


def conn_curs():
    """
    makes a connection to the database dont worry these are dummy keys
    """

    connection = psycopg2.connect(dbname=secrets[4], user=secrets[4],
                                  password=secrets[5], host=secrets[6])
    cursor = connection.cursor()
    return connection, cursor

In [4]:
conn, curs = conn_curs()
df = pd.read_sql("SELECT * FROM posts", conn)

# Inspect

In [5]:
df.head()

Unnamed: 0,id,text,subreddit
0,1,Using This Subreddit # Rules\n1. [Be excellent...,MovieSuggestions
1,2,Looking for a movie where a mother/father go i...,MovieSuggestions
2,3,The Abyss \n\nHi Everyone. I had originally p...,MovieSuggestions
3,4,"Good plot twist Hi all,\n\nAny suggestions for...",MovieSuggestions
4,5,featuring characters who have interesting reas...,MovieSuggestions


In [6]:
df.shape

(15935, 3)

In [7]:
df.drop('id', axis=1, inplace=True)

In [8]:
df.subreddit.value_counts()

AskWomen                300
learnpython             300
RedditWritesSeinfeld    300
Jokes                   300
teenagers               300
britishproblems         300
ADHD                    300
DecidingToBeBetter      300
socialskills            300
soccer                  300
dating_advice           300
philosophy              300
TalesFromRetail         300
offmychest              300
nfl                     300
gaming                  300
Showerthoughts          300
ShouldIbuythisgame      300
hockey                  300
AskMen                  300
CasualConversation      300
3amjokes                300
politics                300
explainlikeimfive       300
bestoflegaladvice       300
MovieSuggestions        300
AmItheAsshole           300
askscience              300
socialanxiety           300
talesfromtechsupport    300
tifu                    300
whowouldwin             300
HailCorporate           300
WritingPrompts          300
pcmasterrace            300
PoliticalDiscussion 

# Model

In [9]:
X_train, X_test, y_train, y_test = train_test_split(df.text, df.subreddit, test_size=0.33, random_state=42)

In [82]:
log_pipe = make_pipeline(TfidfVectorizer(max_df=.95,  min_df=80), LogisticRegression(random_state=42, n_jobs=-1))
log_pipe.fit(X_train, y_train);

In [84]:
print(f"Log train accuracy: {log_pipe.score(X_train, y_train)}")
print(f"Log val accuracy: {log_pipe.score(X_test, y_test)}")

Log train accuracy: 0.7729486699138254
Log val accuracy: 0.608290549534132


In [12]:
forest_pipe = make_pipeline(TfidfVectorizer(max_df=.95, min_df=80), LogisticRegression(random_state=42, n_jobs=-1))
forest_pipe.fit(X_train, y_train);

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(max_df=0.95, min_df=80)),
                ('logisticregression',
                 LogisticRegression(n_jobs=-1, random_state=42))])

In [14]:
print(f"Forest train accuracy: {forest_pipe.score(X_train, y_train)}")
print(f"Forest val accuracy: {forest_pipe.score(X_test, y_test)}")

Forest train accuracy: 0.7729486699138254
Forest val accuracy: 0.608290549534132


In [26]:
sgd_pipe = make_pipeline(TfidfVectorizer(max_df=.95, min_df=80), SGDClassifier(n_jobs=-1))
sgd_pipe.fit(X_train, y_train);

In [27]:
print(f"SGD train accuracy: {sgd_pipe.score(X_train, y_train)}")
print(f"SGD val accuracy: {sgd_pipe.score(X_test, y_test)}")

SGD train accuracy: 0.8768265267890596
SGD val accuracy: 0.6035367940673132


# Hyperparameter Tuning

In [35]:
sgd_tune = make_pipeline(TfidfVectorizer(), SGDClassifier(n_jobs=-1))

params = {
    'sgdclassifier__loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
    'sgdclassifier__learning_rate': ["constant","optimal","invscaling","adaptive"],
    'sgdclassifier__eta0': [.001, .0001, .01],
    'sgdclassifier__early_stopping': [True, False],
    'sgdclassifier__validation_fraction': [.1, .2, .3],
    'tfidfvectorizer__min_df': [30, 50, 80, 100, .1],
    'tfidfvectorizer__max_df': [.95, .9, .97]
}

search = RandomizedSearchCV(sgd_tune, params, random_state=42, cv=3, n_jobs=-1, n_iter=150, verbose=1)

search.fit(X_train, y_train);

Fitting 3 folds for each of 150 candidates, totalling 450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    8.6s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:   56.8s
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed:  2.7min finished


RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('tfidfvectorizer',
                                              TfidfVectorizer()),
                                             ('sgdclassifier',
                                              SGDClassifier(n_jobs=-1))]),
                   n_iter=150, n_jobs=-1,
                   param_distributions={'sgdclassifier__early_stopping': [True,
                                                                          False],
                                        'sgdclassifier__eta0': [0.001, 0.0001,
                                                                0.01],
                                        'sgdclassifier__learning_rate': ['constant',
                                                                         'optimal',
                                                                         'invscaling',
                                                                         'adaptive'],
          

In [36]:
search.best_params_

{'tfidfvectorizer__min_df': 30,
 'tfidfvectorizer__max_df': 0.97,
 'sgdclassifier__validation_fraction': 0.2,
 'sgdclassifier__loss': 'squared_hinge',
 'sgdclassifier__learning_rate': 'constant',
 'sgdclassifier__eta0': 0.01,
 'sgdclassifier__early_stopping': False}

In [37]:
search.best_score_

0.6216751552287852

In [None]:
# {'tfidfvectorizer__min_df': 30,
#  'tfidfvectorizer__max_df': 0.97,
#  'sgdclassifier__validation_fraction': 0.2,
#  'sgdclassifier__loss': 'squared_hinge',
#  'sgdclassifier__learning_rate': 'constant',
#  'sgdclassifier__eta0': 0.01,
#  'sgdclassifier__early_stopping': False}
# 0.6216751552287852

# Now forest

In [45]:
forest_tune = make_pipeline(TfidfVectorizer(), RandomForestClassifier(random_state=42, n_jobs=-1))

params = {
    'randomforestclassifier__n_estimators': range(100, 501, 100),
    'randomforestclassifier__criterion': ['gini', 'entropy'],
    'randomforestclassifier__max_depth': [None, 5, 10, 40, 100, 200],
    'randomforestclassifier__min_samples_split': range(2, 51, 2),
    'randomforestclassifier__min_samples_leaf': range(1, 51, 2),
    'randomforestclassifier__max_features': ['auto', 'sqrt', 'log2'],
    
    'tfidfvectorizer__min_df': [30, 50, 80, 100, .1],
    'tfidfvectorizer__max_df': [.95, .9, .97]
}

forest_search = RandomizedSearchCV(forest_tune, params, random_state=42, cv=3, n_jobs=-1, n_iter=150, verbose=1)

forest_search.fit(X_train, y_train);

Fitting 3 folds for each of 150 candidates, totalling 450 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed:  4.2min finished


In [46]:
forest_search.best_params_

{'tfidfvectorizer__min_df': 30,
 'tfidfvectorizer__max_df': 0.97,
 'randomforestclassifier__n_estimators': 100,
 'randomforestclassifier__min_samples_split': 10,
 'randomforestclassifier__min_samples_leaf': 3,
 'randomforestclassifier__max_features': 'auto',
 'randomforestclassifier__max_depth': 100,
 'randomforestclassifier__criterion': 'gini'}

In [47]:
forest_search.best_score_

0.5651931942195753

# MORE REDDITS NUKED OUR SCORES

# TRYING A NET BECAUSE SCORES ARE DEPRESSING

In [105]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
import tensorflow as tf
from tensorflow.keras.optimizers import Adam, Adagrad, SGD, Ftrl, RMSprop
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorboard.plugins.hparams import api as hp
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
from tensorflow.keras import regularizers
from tensorflow.keras.constraints import MaxNorm

import os
import datetime

In [50]:
le = LabelEncoder()
le.fit(df.subreddit.unique())

LabelEncoder()

In [85]:
vect = TfidfVectorizer(max_df=.95, min_df=80, stop_words='english')
vect.fit(X_train)

TfidfVectorizer(max_df=0.95, min_df=80, stop_words='english')

In [172]:
logdir = os.path.join("logs", "Sigmoid + Dropout")
tensorboard_callback = TensorBoard(logdir, histogram_freq=1)
stop = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=5)

model = Sequential()
model.add(Dense(256, activation='sigmoid', input_dim=len(vect.get_feature_names())))
model.add(Dropout(.1))
model.add(Dense(128, activation='sigmoid'))
model.add(Dropout(.1))
model.add(Dense(128, activation='sigmoid'))
model.add(Dropout(.1))
model.add(Dense(df.subreddit.nunique(), activation='softmax'))

model.compile(optimizer=Adam(.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [173]:
model.fit(np.array(vect.transform(X_train).todense()),le.transform(y_train),
          validation_data=(np.array(vect.transform(X_test).todense()), le.transform(y_test)),
          batch_size=32, epochs=200, callbacks=[tensorboard_callback, stop])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200


<tensorflow.python.keras.callbacks.History at 0x7f5e70076a20>

In [174]:
%load_ext tensorboard
%tensorboard --logdir logs

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


# Tring random search on net

In [104]:
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

In [154]:
def create_model(hidden_layers:int, largest_hidden_nueron:int, activation:str, out_activation, optimizer, learning_rate:float):
    # create model
    model = Sequential()
    # first layer is special so make out of for loop
    x = [int(i) for i in np.linspace(64, largest_hidden_nueron, hidden_layers//2)]
    y = x + list(reversed(x))
    model.add(Dense(y[0], input_dim=len(vect.get_feature_names()), activation=activation))
    if hidden_layers > 1:
        for i in range(1, hidden_layers - 1):
            model.add(Dense(y[i], activation=activation))
    model.add(Dense(df.subreddit.nunique(), activation=out_activation))
    # Compile model
    model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer(learning_rate), metrics=['accuracy'])
    return model

In [155]:
model = KerasClassifier(build_fn=create_model, verbose=1)

In [168]:
param_grid = {'batch_size': (16,32,64,512),
              'epochs': [100],
              'hidden_layers': (1,3,5,8),
              'largest_hidden_nueron': [512,256,128],
              'activation': ('relu', 'sigmoid', 'tanh'),
              'out_activation': ['softmax'],
              'optimizer': (Adam, Adagrad, SGD, Ftrl, RMSprop),
              'learning_rate': [.001]#tuple(np.linspace(.001, .01, 5))
             }

nueral_search = RandomizedSearchCV(estimator=model,param_distributions=param_grid,n_jobs=-1,cv=3,random_state=42,verbose=1,n_iter=20)
nueral_search.fit(np.array(vect.transform(X_train).todense()),le.transform(y_train),
                  validation_data=(np.array(vect.transform(X_test).todense()), le.transform(y_test)),
                  epochs=32)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   11.1s finished


Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 30/32
Epoch 31/32
Epoch 32/32


RandomizedSearchCV(cv=3,
                   estimator=<tensorflow.python.keras.wrappers.scikit_learn.KerasClassifier object at 0x7f5efc361eb8>,
                   n_iter=20, n_jobs=-1,
                   param_distributions={'activation': ('relu', 'sigmoid',
                                                       'tanh'),
                                        'batch_size': (16, 32, 64, 512),
                                        'epochs': [100],
                                        'hidden_layers': (1, 3, 5, 8),
                                        'largest_hidden_nueron': [512, 256,
                                                                  128],
                                        'learning_rate': [0.001],
                                        'optimizer': (<class...sorflow.python.keras.optimizer_v2.adam.Adam'>,
                                                      <class 'tensorflow.python.keras.optimizer_v2.adagrad.Adagrad'>,
                                  

In [169]:
nueral_search.best_params_

{'out_activation': 'softmax',
 'optimizer': tensorflow.python.keras.optimizer_v2.adam.Adam,
 'learning_rate': 0.001,
 'largest_hidden_nueron': 128,
 'hidden_layers': 5,
 'epochs': 100,
 'batch_size': 32,
 'activation': 'sigmoid'}

In [167]:
# logdir = os.path.join("logs", "Best from search")
# tensorboard_callback = TensorBoard(logdir, histogram_freq=1)
# stop = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=10)

# nn = Sequential([
#     Dense(64, activation='sigmoid', input_dim=len(vect.get_feature_names())),
#     Dense(128, activation='sigmoid'),
#     Dense(128, activation='sigmoid'),
#     Dense(64, activation='sigmoid')
# ])
# nn.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(.001), metrics=['accuracy'])

# nn.fit(np.array(vect.transform(X_train).todense()),le.transform(y_train),
#           validation_data=(np.array(vect.transform(X_test).todense()), le.transform(y_test)),
#           batch_size=32, epochs=100)