We start by importing the necessary libraries.

In [9]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [2]:
# Relative locations of the training and test CSV files.
file_path = '../data/reddit_train.csv'
file_path2 = '../data/reddit_test.csv'

# Training frame: read the CSV and discard its 'id' column in one chained step.
data = pd.read_csv(file_path).drop(columns=['id'])

# Test frame: read as-is; the trailing expression is what the cell displays.
test_data = pd.read_csv(file_path2)
test_data.tail()

Unnamed: 0,id,comments
29995,29995,I have no idea what's going on this trailer an...
29996,29996,"I misread that at David Cross, and now I'm try..."
29997,29997,Well lets be reasonable next time and dont unb...
29998,29998,Jaime dumping on Jon for going off to serve in...
29999,29999,"I think he'll be on par, but more mechanic tha..."


In [3]:
def clean_data(s):
    """Normalise one raw comment string for bag-of-words tokenisation.

    Steps, in order:
      1. Replace the ``</d>`` / ``</s>`` markers and every character outside
         ``A-Za-z0-9(),!?'`` and the backtick with a single space. This also
         removes all non-ASCII characters, so no separate pass is needed.
      2. Split common English contraction suffixes ('s, 've, 't, 're, 'd,
         'll) off the preceding word by inserting a space before them.
      3. Surround ``,``, ``!``, ``(``, ``)`` and ``?`` with spaces so each
         becomes its own whitespace-delimited token.
      4. Replace any whitespace-delimited token containing a run of two or
         more ``x`` (or two or more ``X``) with the literal ``xxx``.
      5. Collapse whitespace runs, strip both ends and lower-case the result.

    Parameters
    ----------
    s : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    for expr in (r"</d>", r"</s>", r"[^A-Za-z0-9(),!?\'\`]"):
        s = re.sub(expr, " ", s)

    # Literal suffixes, so the inserted text is exactly the matched text
    # (the last entry is the contraction 'll, not the digits '11).
    for suffix in ("'s", "'ve", "'t", "'re", "'d", "'ll"):
        s = s.replace(suffix, " " + suffix)

    # Each mark is listed separately and padded with itself; slicing a regex
    # pattern to recover the character would drop unescaped ones like ','.
    for mark in (",", "!", "(", ")", "?"):
        s = s.replace(mark, " " + mark + " ")

    s = re.sub(r'\S*(x{2,}|X{2,})\S*', "xxx", s)
    # Collapse last so the padding added above never leaves double spaces.
    s = re.sub(r"\s{2,}", " ", s)
    return s.strip().lower()

In [4]:
# Normalise the comment text of both frames with clean_data.
# NOTE: this overwrites the 'comments' column in place, so re-running the cell
# cleans already-cleaned text; reload the frames first if a fresh run is needed.
data['comments'] = data['comments'].apply(clean_data)
# The test frame must be cleaned from its own comments; sourcing them from
# `data` would replace the test text with training text.
test_data['comments'] = test_data['comments'].apply(clean_data)

We split the data in order to obtain the training and validation sets.

In [5]:
# Hold out 20% of the labelled comments; the fixed seed keeps the split repeatable.
# X_test / y_test are this held-out part of `data`, not the separate test_data frame.
X_train, X_test, y_train, y_test = train_test_split(
    data['comments'],
    data['subreddits'],
    test_size=0.2,
    random_state=42,
)

In [17]:
%%time
# Distinct class labels present in the labelled data (printed at the end of the cell).
subreddits = data.subreddits.unique()

# Raw token counts: vocabulary is learned on the training split only and then
# reused to encode the held-out split.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
test_counts = count_vect.transform(X_test)

# TF-IDF weighted features, fitted and applied the same way.
tf_idf_vectorizer = TfidfVectorizer()
vectors_train_idf = tf_idf_vectorizer.fit_transform(X_train)
vectors_test_idf = tf_idf_vectorizer.transform(X_test)

print(subreddits)

['hockey' 'nba' 'leagueoflegends' 'soccer' 'funny' 'movies' 'anime'
 'Overwatch' 'trees' 'GlobalOffensive' 'nfl' 'AskReddit' 'gameofthrones'
 'conspiracy' 'worldnews' 'wow' 'europe' 'canada' 'Music' 'baseball']
CPU times: user 3.24 s, sys: 13.2 ms, total: 3.25 s
Wall time: 3.26 s


Now we create a multinomial naive Bayes classifier using scikit-learn.

In [38]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

In [39]:
%%time
# Three-stage text classifier: token counts -> TF-IDF weighting -> multinomial
# naive Bayes. fit() returns the fitted pipeline, so it is built and trained
# in a single expression.
text_clf = Pipeline(
    [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
).fit(X_train, y_train)

CPU times: user 1.53 s, sys: 12 ms, total: 1.54 s
Wall time: 1.55 s


In [40]:
# Accuracy of the naive Bayes pipeline on the held-out split: the fraction of
# predictions that equal the true label.
predicted = text_clf.predict(X_test)
(predicted == y_test).mean()

0.5547857142857143

In [20]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with library-default hyperparameters; printing the
# estimator shows the settings actually in effect.
clf = LogisticRegression()
print(clf)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)


We first fit the logistic regression model on the raw token counts; the grid search over the naive Bayes pipeline follows further below.

In [21]:
%%time
# Train the logistic regression on the raw count features of the training split.
clf.fit(X=X_train_counts, y=y_train)

CPU times: user 17min 32s, sys: 18min 36s, total: 36min 8s
Wall time: 3min 1s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
%%time
# Predict labels for the held-out split from its count features.
lr_y_pred = clf.predict(X=test_counts)

CPU times: user 15.7 ms, sys: 1.11 ms, total: 16.8 ms
Wall time: 15.4 ms


In [24]:
from sklearn import metrics

# Accuracy of the logistic regression predictions on the held-out split.
metrics.accuracy_score(y_true=y_test, y_pred=lr_y_pred)

0.5287857142857143

In [56]:
from sklearn.model_selection import GridSearchCV
# Search space for the naive Bayes pipeline. Keys follow scikit-learn's
# '<step name>__<parameter>' convention for addressing pipeline steps.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],  # unigrams only vs. unigrams + bigrams
    'tfidf__use_idf': (True, False),        # with and without IDF re-weighting
    'clf__alpha': (0.2, 0.1),               # naive Bayes smoothing strength
}

In [57]:
%%time
# Exhaustive search over `parameters` with 3-fold cross-validation on the
# training split, using every available core. fit() returns the fitted search
# object, so construction and fitting are chained.
gs_clf = GridSearchCV(
    estimator=text_clf,
    param_grid=parameters,
    n_jobs=-1,
    cv=3,
).fit(X_train, y_train)

CPU times: user 3.03 s, sys: 594 ms, total: 3.63 s
Wall time: 25.3 s


In [58]:
# Best mean cross-validated score and the parameter combination that produced
# it, one per line.
print(gs_clf.best_score_, gs_clf.best_params_, sep="\n")

0.5520535714285715
{'clf__alpha': 0.2, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}


In [45]:
# Re-weight the training count matrix with TF-IDF using a standalone transformer.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X=X_train_counts)