In [1313]:
import pandas as pd
import numpy as np
import numbers
import decimal
import scipy.stats as ss
import matplotlib.pyplot as plt
from statistics import stdev
from statistics import mean
import time
import math
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold

In [1311]:
def getScoretWithModel(model, x_train, x_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model : object
        Any object exposing ``fit(X, y)`` and ``score(X, y)``. It is fitted
        in place, so the caller's instance is trained after this call.
    x_train, y_train
        Features and labels passed to ``model.fit``.
    x_test, y_test
        Features and labels passed to ``model.score``.

    Returns
    -------
    Whatever ``model.score(x_test, y_test)`` returns (mean accuracy for
    scikit-learn classifiers).
    """
    model.fit(x_train, y_train)
    held_out_score = model.score(x_test, y_test)
    return held_out_score

In [1453]:
# Load the Reddit comment datasets from the local data/ directory.
redditDataTrain = pd.read_csv("data/reddit_train.csv")
redditDataTest = pd.read_csv("data/reddit_test.csv")
# Preview only the first rows instead of rendering the whole frame.
redditDataTrain.head()

array(['hockey', 'nba', 'leagueoflegends', 'soccer', 'funny', 'movies',
       'anime', 'Overwatch', 'trees', 'GlobalOffensive', 'nfl',
       'AskReddit', 'gameofthrones', 'conspiracy', 'worldnews', 'wow',
       'europe', 'canada', 'Music', 'baseball'], dtype=object)

In [7]:
# Columns are selected by position: the second column holds the comment text
# and the last column of the training frame holds the subreddit label.
# NOTE(review): positions are assumed from the variable names -- confirm
# against the CSV headers.
commentsTrain, subredditsTrain = redditDataTrain.iloc[:, 1], redditDataTrain.iloc[:, -1]
commentsTest = redditDataTest.iloc[:, 1]

In [1417]:
# Text vectorizers: TF-IDF with the built-in English stop-word list, and raw token counts.
tfidf = TfidfVectorizer(stop_words='english')
cv = CountVectorizer()

# Candidate classifiers, all with default hyper-parameters.
lr, multiNB, dtc = LogisticRegression(), MultinomialNB(), tree.DecisionTreeClassifier()

# 5-fold stratified splitter for cross-validation.
kf = StratifiedKFold(n_splits=5)

### Part 2: Experimentation

In [1413]:
# 4-fold splitter for the experimentation section.
# NOTE: plain KFold does not preserve class proportions in each fold;
# StratifiedKFold(n_splits=4) is the stratified alternative.
kf = KFold(n_splits=4)

In [1340]:
# Random 80/20 hold-out split of the labelled data. The seed is fixed so the
# split (and every accuracy derived from it) is reproducible across re-runs.
GX_train, GX_test, Gy_train, Gy_test = train_test_split(
    commentsTrain, subredditsTrain, test_size=0.2, random_state=42
)

In [1403]:
# Deterministic, order-based hold-out split: rows whose index lies below the
# 80th percentile of the index go to Gtrain, all remaining rows go to Gtest.
# A single cutoff and a single boolean mask are used for both comments and
# labels so the two stay aligned, and so a row whose index equals the cutoff
# is not dropped from both splits (the original strict < / > comparison did that).
split_cutoff = np.percentile(commentsTrain.index, 80)
in_train = commentsTrain.index < split_cutoff

GX_train = commentsTrain[in_train].sort_index()
Gy_train = subredditsTrain[in_train].sort_index()
GX_test = commentsTrain[~in_train].sort_index()
Gy_test = subredditsTrain[~in_train].sort_index()

# Every labelled row must land in exactly one of the two splits.
assert len(GX_train) + len(GX_test) == len(commentsTrain)

### I split the global training data into Gtrain and Gtest, then run k-fold cross-validation on Gtrain and select the model that gives the best accuracy. That model is then used to predict on the global held-out split (Gtest). Ideally, k-fold should be used only to validate which theory about the data (i.e., which model) is correct; the validated model is then trained on the entire Gtrain and used to predict Gtest.

### get avg accuracy

In [1422]:
# Splitter consumed by get_avg_acc_for_model: 4 contiguous, unshuffled folds.
kf = KFold(n_splits=4, shuffle=False)

In [1423]:
def get_avg_acc_for_model(model):
    """Return the mean cross-validation score of a classifier factory.

    For every fold produced by the module-level splitter ``kf`` over
    ``GX_train`` / ``Gy_train``, the module-level ``tfidf`` vectorizer is
    fitted on that fold's training comments only and then applied to the
    held-out comments. A fresh ``model()`` instance is trained and scored
    through ``getScoretWithModel``.

    Parameters
    ----------
    model : callable
        Zero-argument callable (e.g. an estimator class) returning an object
        with ``fit`` and ``score`` methods.

    Returns
    -------
    float
        Arithmetic mean of the per-fold scores.
    """
    results = []
    for train_index, test_index in kf.split(GX_train, Gy_train):
        # kf.split yields *positional* indices, so rows are selected with
        # .iloc; plain [] on a Series is label-based and would pick wrong rows
        # (or fail) whenever the index is not exactly 0..n-1.
        x_train, x_test = GX_train.iloc[train_index], GX_train.iloc[test_index]
        y_train, y_test = Gy_train.iloc[train_index], Gy_train.iloc[test_index]

        # Fit the vectorizer on the training fold only, then reuse it on the
        # held-out fold. The matrices are passed on exactly as the vectorizer
        # returns them; no dense copy is made.
        redditDataTrainTF = tfidf.fit_transform(x_train)
        redditDataTestTF = tfidf.transform(x_test)

        clf = model()
        results.append(getScoretWithModel(clf, redditDataTrainTF, redditDataTestTF, y_train, y_test))
    avg_acc = sum(results) / len(results)
    return avg_acc

In [1424]:
# Classifier classes (not instances) to compare; each is instantiated per fold.
models = [
    LogisticRegression,
    MultinomialNB,
    tree.DecisionTreeClassifier,
]

In [1421]:
# Report the mean cross-validated score of every candidate classifier.
for model in models:
    print("test for model", model)
    avg_acc = get_avg_acc_for_model(model)
    print(avg_acc)

test for model <class 'sklearn.linear_model.logistic.LogisticRegression'>




0.514875
test for model <class 'sklearn.naive_bayes.MultinomialNB'>
0.5297857142857143
test for model <class 'sklearn.tree.tree.DecisionTreeClassifier'>
0.31


#### Retrain the model on the entire training split (Gtrain) and test the final accuracy on Gtest

In [1412]:
# Final evaluation: fit on all of Gtrain, then score once on the held-out Gtest.
# NOTE(review): the cross-validation output recorded in this notebook ranks
# MultinomialNB above LogisticRegression -- confirm that LogisticRegression
# is the intended final model.
lr = LogisticRegression()

# The vectorizer is refitted on the full training comments only; the test
# comments are merely transformed with it.
GX_train_idf = tfidf.fit_transform(GX_train)
GX_test_idf = tfidf.transform(GX_test)

final_acc = getScoretWithModel(lr, GX_train_idf, GX_test_idf, Gy_train, Gy_test)
print(final_acc)



0.5482857142857143


In [1473]:
# Read data/test.csv and preview its first ten rows.
df = pd.read_csv("data/test.csv")
df.head(n=10)

Unnamed: 0.1,Unnamed: 0,id,comments,subreddits
0,0,0,Trout and Bryant have both led the league in s...,baseball
1,1,1,&gt; Just like Estonians have good reasons to ...,europe
2,2,2,Will Sol_Primeval sotp being oblivious?\n\nfin...,GlobalOffensive
3,3,3,Moving Ostwald borders back to the pre 1967 bo...,canada
4,4,4,"You have to take it out of the bag, Morty!",AskReddit
5,5,5,"Don't forget the obnoxious ""*memes*"" in every ...",Overwatch
6,6,6,I say encourage local team support. Half the f...,nfl
7,7,7,"Favorite type of pasta? (not dish, pasta shape...",anime
8,8,8,"Spinal meningitis- Ween.\n\nOn mobile, so no l...",Music
9,9,9,"So what about Scandinavians, Caucasians, Asian...",funny
