In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split

from keras.models import Sequential, Model
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense
from keras.layers import GlobalAveragePooling1D, Input, Conv1D, MaxPooling1D, Flatten
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from tqdm import tqdm_notebook

ModuleNotFoundError: No module named 'keras'

In [11]:
file_path = '../data/reddit_train.csv'
file_path2 = '../data/reddit_test.csv'
data = pd.read_csv(file_path)
data = data.drop(columns={'id'})
data.tail()
test_data = pd.read_csv(file_path2)
test_data.tail()

Unnamed: 0,id,comments
29995,29995,I have no idea what's going on this trailer an...
29996,29996,"I misread that at David Cross, and now I'm try..."
29997,29997,Well lets be reasonable next time and dont unb...
29998,29998,Jaime dumping on Jon for going off to serve in...
29999,29999,"I think he'll be on par, but more mechanic tha..."


In [7]:
def clean_data(s):
    for expr in [r"</d>",r"</s>",r"[^A-Za-z0-9(),!?\'\`]"]:
        s = re.sub(expr, " ", s)
    for expr in [r"\'s",r"\'ve",r"\'t",r"\'re",r"\'d",r"\'11",]:
        s = re.sub(expr, " "+expr[1:], s)
    for expr in [r",",r"!",r"\(",r"\)"r"\?"]:
        s = re.sub(expr, " "+expr[1:]+" ", s)
    s = re.sub(r"\s{2,}", " ", s)
    s = re.sub(r'\S*(x{2,}|X{2,})\S*', "xxx", s)
    s = re.sub(r'[^\x00-\x7F]+', "", s)
    return s.strip().lower()

In [8]:
data["comments"] = data["comments"].apply(lambda x: clean_data(x))

In [12]:
for c in data["subreddits"].unique():
    data[c] = np.zeros(len(data)).astype(int)
u = data["subreddits"].map({'hockey': 2, 'nba' : 3, 'leagueoflegends' : 4, 'soccer' : 5, 'funny': 6,
                           'movies' : 7, 'anime' : 8,
 'Overwatch': 9, 'trees' : 10, 'GlobalOffensive' : 11, 'nfl' : 12, 'AskReddit' : 13, 'gameofthrones' : 14,
 'conspiracy' : 15, 'worldnews' : 16, 'wow' : 17,
                           'europe' : 18, 'canada' : 19, 'Music' : 20, 'baseball' : 21})
for i in tqdm_notebook(range(len(data))):
    data.iloc[i,u[i]] = 1

NameError: name 'tqdm_notebook' is not defined

In [None]:
data.head()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data.comments, data[data['subreddits'].unique()],
                                                   test_size=0.25, random_state=42)

In [None]:
xLengths = [len(word_tokenize(x)) for x in X_train]
h = sorted(xLengths)
maxLength = h[len(h) - 1]
print("max input lenght is: ", maxLength)

In [None]:
maxLength = h[int(len(h) * 0.8)]
print("80% covers input sequence length up to ", maxLength)

In [None]:
max_vocab_size = 200000
input_tokenizer = Tokenizer(max_vocab_size)
input_tokenizer.fit_on_texts(X_train)
input_vocab_size = len(input_tokenizer.word_index) +1
print("input_vocab_size: ", input_vocab_size)
totalX = np.array(pad_sequences(input_tokenizer.texts_to_sequences(X_train), maxlen=maxLength))

In [None]:
num_categories = data['subreddits'].nunique()

In [None]:
EMBEDDING_DIM = 100
model = Sequential()
model.add(Embedding(input_vocab_size, EMBEDDING_DIM, input_length=maxLength))
model.add(GRU(256, dropout=0.1, return_sequences=True))
model.add(GRU(256, dropout=0.1))
model.add(Dense(32, activation='relu'))
model.add(Dense(num_categories, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:
history = model.fit(totalX, y_train.values, validation_split=0.3, batch_size=128, epochs=3)

In [None]:
max_vocab_size = 200000
test_tokenizer = Tokenizer(max_vocab_size)
test_tokenizer.fit_on_texts(X_test)
test_vocab_size = len(test_tokenizer.word_index) + 1
print("Test vocab size: ", test_vocab_size)
totalX_test = np.array(pad_sequences(input_tokenizer.texts_to_sequences(X_test), maxlen=maxLength))

In [None]:
pred = model.predict_classes(totalX_test)

In [None]:
map_class = {'hockey': 2, 'nba' : 3, 'leagueoflegends' : 4, 'soccer' : 5, 'funny': 6,
                           'movies' : 7, 'anime' : 8,
 'Overwatch': 9, 'trees' : 10, 'GlobalOffensive' : 11, 'nfl' : 12, 'AskReddit' : 13, 'gameofthrones' : 14,
 'conspiracy' : 15, 'worldnews' : 16, 'wow' : 17,
                           'europe' : 18, 'canada' : 19, 'Music' : 20, 'baseball' : 21}
inv_map = {v: k for k, v in map_class.items()}

In [None]:
np.mean(pred==data.loc[y_test.index].subreddits.map(map_class).values)