# Imports

In [23]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2

import pickle
import psycopg2

# Read in data

In [24]:
with open("../database/secrets", "r") as file:
    secrets = [i.strip('\n') for i in file.readlines()]


def conn_curs():
    """
    makes a connection to the database dont worry these are dummy keys
    """

    connection = psycopg2.connect(dbname=secrets[4], user=secrets[4],
                                  password=secrets[5], host=secrets[6])
    cursor = connection.cursor()
    return connection, cursor

In [25]:
conn, curs = conn_curs()
df = pd.read_sql("SELECT * FROM posts", conn)

# Exploring Data Frame

In [26]:
df.drop('id', axis=1, inplace=True)

In [27]:
df.shape

(29788, 2)

In [28]:
df.head()

Unnamed: 0,text,subreddit
0,Thousand Year Blood War Arc Anime Adaptation M...,bleach
1,Burn The Witch - Chapter 4 Discussion Thread #...,bleach
2,Let the journey begin.,bleach
3,"Since Ichigos an English Literature major, thi...",bleach
4,I just made these Ulquiorra customs for a clie...,bleach


# Vectorize

In [29]:
le = LabelEncoder()
le.fit(df.subreddit.unique())

LabelEncoder()

In [30]:
le.classes_

array(['Animesuggest', 'AquaSwap', 'AusLegal', 'BackYardChickens',
       'BeardedDragons', 'Bedbugs', 'Beekeeping', 'Berserk',
       'BokuNoHeroAcademia', 'CaptainTsubasaDT', 'CasualConversation',
       'CatAdvice', 'Clairvoyantreadings', 'DDLC', 'DarlingInTheFranxx',
       'DecidingToBeBetter', 'Dogtraining', 'Dragonballsuper',
       'GetMotivated', 'GiftIdeas', 'Gifts', 'Gundam', 'Gunpla',
       'HelpMeFind', 'HunterXHunter', 'INeedAName', 'IWantOut',
       'ImmigrationCanada', 'KissAnime', 'LegalAdviceUK', 'LifeProTips',
       'MLPLounge', 'NameThatSong', 'NoStupidQuestions', 'Petloss',
       'Pets', 'RATS', 'RBI', 'Rabbits', 'ShingekiNoKyojin',
       'StardustCrusaders', 'StopGaming', 'TooAfraidToAsk',
       'WouldYouRather', 'anime', 'answers', 'araragi', 'ask', 'asklaw',
       'ballpython', 'bettafish', 'bleach', 'cats', 'changemyview',
       'christmas', 'datarecovery', 'dbz', 'deathnote', 'declutter',
       'dogs', 'explainlikeimfive', 'fatestaynight', 'ferrets',


In [31]:
le.classes_[88]

'snakes'

In [32]:
X_train, X_test, y_train, y_test = train_test_split(df.text, le.transform(df.subreddit), test_size=0.33, random_state=42)

In [33]:
vect = TfidfVectorizer(max_df=.95, min_df=80)

In [34]:
vect.fit(X_train)

TfidfVectorizer(max_df=0.95, min_df=80)

In [35]:
train_df = pd.DataFrame(vect.transform(X_train).todense(), columns=vect.get_feature_names())
train_df.head()

Unnamed: 0,000,10,100,11,12,13,14,15,16,17,...,youll,young,younger,your,youre,yourself,youtu,youtube,youve,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
val_df = pd.DataFrame(vect.transform(X_test).todense(), columns=vect.get_feature_names())
val_df.head()

Unnamed: 0,000,10,100,11,12,13,14,15,16,17,...,youll,young,younger,your,youre,yourself,youtu,youtube,youve,zero
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Constructing First Neuronetwork Model

In [37]:

model = Sequential()
model.add(Dense(128, activation='relu', input_dim=train_df.shape[1]))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(df.subreddit.nunique(), activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [38]:
model.fit(train_df.to_numpy(), y_train, batch_size=32, epochs=30, validation_data=(val_df.to_numpy(), y_test))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x14dc34978>

In [39]:
# Recall, we used label encoder to convert subreddits into numerical representations

In [49]:
le.transform(df.subreddit)

array([51, 51, 51, ..., 16, 16, 16])

# Testing Model

In [50]:
input = vect.transform(['bleach is awesome']).todense()

In [51]:
input.shape

(1, 1858)

In [43]:
model.predict(input).shape

(1, 100)

In [44]:
predicted_label = model.predict_classes(input)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [45]:
le.inverse_transform(predicted_label)

array(['araragi'], dtype=object)

In [46]:
def neuro_predictor(text,labels):
    """returns prediction from a neuronetwork model"""
    # Text paramaters should be specified as string , ex 'This is a sentence that can be passed as text'
    # labels should be passed as a series; This represents the training vector used for the model
    # Ex of labels is df.subreddits
    le = LabelEncoder()
    le.fit(labels.unique()) 
    text = [text]
    #Convert Text to vectorize form
    input = vect.transform(text).todense()
    #predicted label in encoded formate
    predicted_label = model.predict_classes(input)
    #predicted subreddit in english text
    Subreddit = le.inverse_transform(predicted_label)
    return Subreddit
    
    

In [54]:
neuro_predictor("""In episode 119 after Ikkaku used Bankai on Edrad it talked about
Ikkaku's backstory and he trained Renji when he didn't know how to fight. 
I looked for it in the manga but they skipped that scene.
I really want to know because they performed a mountain level feat""",df.subreddit)

array(['ask'], dtype=object)

# NeuroNetwork #2

In [48]:
model = Sequential()
model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.003), input_dim=train_df.shape[1]))
model.add(Dropout(.2))
model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.003)))
model.add(Dropout(.2))
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.003)))
model.add(Dropout(.2))
model.add(Dense(df.subreddit.nunique(), activation='softmax'))

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.fit(train_df.to_numpy(), y_train, batch_size=32, epochs=30, validation_data=(val_df.to_numpy(), y_test))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x14daff668>