In [44]:

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import pickle
import psycopg2

In [45]:
from tensorflow.keras.regularizers import l2
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.layers import LSTM, SpatialDropout1D, Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.callbacks import EarlyStopping
from numpy import array


In [None]:
!pip install -U nltk

In [58]:
# Read data from PostGres SQL Data Base
with open("../database/secrets", "r") as file:
    secrets = [i.strip('\n') for i in file.readlines()]


def conn_curs():
    """
    makes a connection to the database dont worry these are dummy keys
    """

    connection = psycopg2.connect(dbname=secrets[4], user=secrets[4],
                                  password=secrets[5], host=secrets[6])
    cursor = connection.cursor()
    return connection, cursor


In [59]:
conn, curs = conn_curs()

In [60]:
df = pd.read_sql("SELECT * FROM posts", conn)

In [61]:
df.head()

Unnamed: 0,id,text,subreddit
0,1,Thousand Year Blood War Arc Anime Adaptation M...,bleach
1,2,Burn The Witch - Chapter 4 Discussion Thread #...,bleach
2,3,Let the journey begin.,bleach
3,4,"Since Ichigos an English Literature major, thi...",bleach
4,5,I just made these Ulquiorra customs for a clie...,bleach


In [89]:
#imports to clean text
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/johnrivera/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Clean Data

In [90]:
#import regular expressions to filter/clean appropiate text
import re
df = df.reset_index(drop=True)
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = text.replace('x', '')
#    text = re.sub(r'\W+', '', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text
df['text'] = df['text'].apply(clean_text)
df['text'] = df['text'].str.replace('\d+', '')


In [91]:
df.head()

Unnamed: 0,id,text,subreddit
0,1,thousand year blood war arc anime adaptation m...,bleach
1,2,burn witch chapter discussion thread #chapter ...,bleach
2,3,let journey begin,bleach
3,4,since ichigos english literature major would s...,bleach
4,5,made ulquiorra customs client mine huge bleach...,bleach


In [93]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 50000
# Max number of words in each post`
MAX_SEQUENCE_LENGTH = 250
# This is fixed.
EMBEDDING_DIM = 100
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 75901 unique tokens.


In [94]:
X = tokenizer.texts_to_sequences(df['text'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (29788, 250)


In [95]:
Y = pd.get_dummies(df['subreddit']).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (29788, 100)


In [106]:
Y

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [103]:
df['subreddit'].unique()

array(['bleach', 'BackYardChickens', 'resumes', 'ballpython',
       'TooAfraidToAsk', 'Gunpla', 'christmas', 'cats', 'DDLC',
       'immigration', 'MLPLounge', 'bettafish', 'Bedbugs', 'araragi',
       'getting_over_it', 'findapath', 'parrots', 'dbz',
       'Dragonballsuper', 'Gundam', 'selfhelp', 'snakes',
       'whatsthisplant', 'asklaw', 'Petloss', 'Berserk', 'KissAnime',
       'RATS', 'reptiles', 'HunterXHunter', 'Gifts', 'DarlingInTheFranxx',
       'LegalAdviceUK', 'ferrets', 'RBI', 'LifeProTips', 'Rabbits',
       'GiftIdeas', 'ShingekiNoKyojin', 'ask', 'leopardgeckos',
       'CaptainTsubasaDT', 'BokuNoHeroAcademia', 'anime',
       'DecidingToBeBetter', 'whatsthisbug', 'manga', 'legaladvice',
       'nosurf', 'shrimptank', 'whatisthisthing', 'deathnote',
       'WouldYouRather', 'GetMotivated', 'AusLegal', 'whatstheword',
       'CasualConversation', 'ImmigrationCanada', 'getdisciplined',
       'Beekeeping', 'BeardedDragons', 'ifyoulikeblank', 'declutter',
       'NoStupi

In [96]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.10, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(26809, 250) (26809, 100)
(2979, 250) (2979, 100)


In [97]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(100, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 5
batch_size = 64

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [98]:
model = Sequential()
model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(100, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

epochs = 30
batch_size = 64

history = model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30


In [105]:
# Testing W/ Bleach Post
post = ["""In episode 119 after Ikkaku used Bankai on Edrad it talked about
Ikkaku's backstory and he trained Renji when he didn't know how to fight. 
I looked for it in the manga but they skipped that scene.
I really want to know because they performed a mountain level feat"""]
seq = tokenizer.texts_to_sequences(post)
padded = pad_sequences(seq, maxlen=MAX_SEQUENCE_LENGTH)
pred = model.predict(padded)
labels = df['subreddit'].unique()
print(labels[np.argmax(pred)])

deathnote
