# Imports

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

import pickle

import psycopg2

# Read Data From PostgreSQL Database

In [2]:
# Load the connection settings: one entry per line of the secrets file,
# with the newline characters stripped from each entry.
with open("../database/secrets", "r") as secrets_file:
    secrets = [line.strip('\n') for line in secrets_file]


def conn_curs():
    """
    Open a database connection through psycopg2 and create a cursor on it.

    The connection settings are read from the module-level ``secrets`` list:
    entry 4 is used for both the database name and the user, entry 5 for the
    password and entry 6 for the host.

    Note: the original author states that the stored credentials are dummy
    keys.

    Returns:
        tuple: ``(connection, cursor)``. Neither object is closed here, so
        the caller is responsible for closing them.
    """
    db_settings = {
        "dbname": secrets[4],
        "user": secrets[4],  # same entry as dbname
        "password": secrets[5],
        "host": secrets[6],
    }
    connection = psycopg2.connect(**db_settings)
    return connection, connection.cursor()

In [3]:
# Open the database connection and a cursor; `conn` is what the query cell below reads from.
conn, curs = conn_curs()

In [4]:
# Pull every row and column of the `posts` table into a DataFrame.
posts_query = "SELECT * FROM posts"
df = pd.read_sql(posts_query, conn)

# Explore Data Frame

In [5]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,id,text,subreddit
0,1,Thousand Year Blood War Arc Anime Adaptation M...,bleach
1,2,Burn The Witch - Chapter 4 Discussion Thread #...,bleach
2,3,Let the journey begin.,bleach
3,4,"Since Ichigos an English Literature major, thi...",bleach
4,5,I just made these Ulquiorra customs for a clie...,bleach


In [6]:
# (number of rows, number of columns) of the loaded table.
df.shape

(29788, 3)

In [7]:
# Drop the `id` column, which is not used as a feature or label below.
# Rebinding (instead of `inplace=True`) together with `errors="ignore"` keeps
# this cell safe to re-run: a second execution is a no-op rather than a
# KeyError because the column is already gone.
df = df.drop(columns="id", errors="ignore")

In [8]:
# Number of posts per subreddit (class balance of the label column).
df["subreddit"].value_counts()

legal                 300
NameThatSong          300
datarecovery          300
whatsthatbook         300
GiftIdeas             300
                     ... 
DecidingToBeBetter    300
translator            300
needadvice            280
GetMotivated          220
Rabbits               188
Name: subreddit, Length: 100, dtype: int64

In [9]:
# Inspect the raw post text that will be fed to the vectoriser.
df['text']

0        Thousand Year Blood War Arc Anime Adaptation M...
1        Burn The Witch - Chapter 4 Discussion Thread #...
2                                  Let the journey begin. 
3        Since Ichigos an English Literature major, thi...
4        I just made these Ulquiorra customs for a clie...
                               ...                        
29783    My friendly dog has enemies, that he attacks. ...
29784    Dog won’t follow my lead on walks Hey guys \n\...
29785    Dog is afraid to go into the back yard out of ...
29786    Roommates dog needy/neglected, how to train? T...
29787    In the video, Olive (Labrador) is not comforta...
Name: text, Length: 29788, dtype: object

# Model

In [10]:
# Hold out 33% of the posts for validation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["subreddit"],
    test_size=0.33,
    random_state=42,
)

In [11]:
# Peek at the training texts; the index values show the rows were shuffled by the split.
X_train.head()

20940    Starting to study So my classes only start nex...
20946    How to deal with guilt? So as many people rece...
28725    [Request] The first 2 questions from this xkcd...
11715    WTW for background noise becoming louder (or a...
8406     SSR player from coin transfer Anyone get any s...
Name: text, dtype: object

In [12]:
# Vectorise the posts with TF-IDF. The vocabulary is learned from the training
# split only, so nothing from the validation split leaks into the features.
vect = TfidfVectorizer(max_df=.95, min_df=80)
train_tfidf = vect.fit_transform(X_train)

# `get_feature_names()` was removed in scikit-learn 1.2. Prefer its replacement
# and fall back to the old method on releases that predate it.
if hasattr(vect, "get_feature_names_out"):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()

# `.toarray()` yields a plain ndarray instead of the legacy `np.matrix` that
# `.todense()` returns. Both frames share the same column order.
train_df = pd.DataFrame(train_tfidf.toarray(), columns=feature_names)
val_df = pd.DataFrame(vect.transform(X_test).toarray(), columns=feature_names)

# Only the last expression of a cell is displayed, so show the validation frame.
val_df.head()

Unnamed: 0,000,10,100,11,12,13,14,15,16,17,...,youll,young,younger,your,youre,yourself,youtu,youtube,youve,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Each row is the TF-IDF vector of one post in X_test
# (`.todense()` returns a `numpy.matrix`, as the repr below shows).
vect.transform(X_test).todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [14]:
# End-to-end text classifier: raw post text -> TF-IDF features -> logistic regression.
tfidf_step = TfidfVectorizer(max_df=.95, min_df=80)
logreg_step = LogisticRegression(random_state=42, n_jobs=-1)
log_pipe = make_pipeline(tfidf_step, logreg_step)
log_pipe.fit(X_train, y_train);

In [15]:
# Mean accuracy of the pipeline on the training and validation splits.
log_train_acc = log_pipe.score(X_train, y_train)
log_val_acc = log_pipe.score(X_test, y_test)
print(f"Log train accuracy: {log_train_acc}")
print(f"Log val accuracy: {log_val_acc}")

Log train accuracy: 0.6404269178734279
Log val accuracy: 0.4652629437493643


In [17]:
# Hand-written sample post (an anime/manga question mentioning Bankai, Ikkaku
# and Renji) used to spot-check the fitted pipeline on unseen text.
test = """In episode 119 after Ikkaku used Bankai on Edrad it talked about
Ikkaku's backstory and he trained Renji when he didn't know how to fight. 
I looked for it in the manga but they skipped that scene.
I really want to know because they performed a mountain level feat"""

In [19]:
# The sample is wrapped in a one-element list for predict(); [0] unwraps the single predicted label.
log_pipe.predict([test])[0]

'deathnote'

# Pickle the MVP Logistic Regression Pipeline

In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.svm import SVC

In [21]:
# Serialise the fitted MVP pipeline (vectoriser + classifier) to disk.
mvp_model_path = "../models/mvp_log_pipe"
with open(mvp_model_path, "wb") as model_file:
    pickle.dump(log_pipe, model_file)

# Testing Other Models

In [22]:
# Random forest on the dense TF-IDF features.
# A random forest is stochastic (bootstrap samples, feature sub-sampling), so
# pin the seed to make the scores below reproducible on re-run. 42 matches the
# seed already used for the train/validation split and the logistic regression.
forest = RandomForestClassifier(n_jobs=-1, random_state=42)
forest.fit(train_df, y_train);

In [23]:
# Mean accuracy of the random forest on the training and validation features.
forest_train_acc = forest.score(train_df, y_train)
forest_val_acc = forest.score(val_df, y_test)
print(f"Train accuracy: {forest_train_acc}")
print(f"Val accuracy: {forest_val_acc}")

Train accuracy: 0.9643232950844315
Val accuracy: 0.4287458040891059


In [24]:
# Multinomial naive Bayes on the TF-IDF features; fit() returns the estimator itself.
mnb = MultinomialNB().fit(train_df, y_train)

In [25]:
# Mean accuracy of the naive Bayes model on the training and validation features.
mnb_train_acc = mnb.score(train_df, y_train)
mnb_val_acc = mnb.score(val_df, y_test)
print(f"Train accuracy: {mnb_train_acc}")
print(f"Val accuracy: {mnb_val_acc}")

Train accuracy: 0.5616074560304655
Val accuracy: 0.41653951785169363


In [28]:
# Linear classifier trained with stochastic gradient descent on the TF-IDF features.
# SGD shuffles the training data between epochs, so pin the seed to make the
# scores below reproducible on re-run. 42 matches the seed already used for
# the train/validation split and the logistic regression.
sgd = SGDClassifier(n_jobs=-1, random_state=42)
sgd.fit(train_df, y_train);

In [29]:
# Mean accuracy of the SGD classifier on the training and validation features.
sgd_train_acc = sgd.score(train_df, y_train)
sgd_val_acc = sgd.score(val_df, y_test)
print(f"Train accuracy: {sgd_train_acc}")
print(f"Val accuracy: {sgd_val_acc}")

Train accuracy: 0.6885303402314977
Val accuracy: 0.44359678567795746
