In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score, hamming_loss, classification_report

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain, LabelPowerset
from skmultilearn.adapt import MLkNN

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [4]:
#Load Dataset

df = pd.concat([pd.read_csv('books.csv'), pd.read_csv('Cooking.csv'), pd.read_csv('cookingforbeginners.csv'), pd.read_csv('Games.csv'), pd.read_csv('GYM.csv'), pd.read_csv('videogames.csv'), pd.read_csv('travel.csv')])

In [5]:
df.to_csv('allHobbies.csv')

In [6]:
df = df.reset_index(drop=True)

In [7]:
df

Unnamed: 0,id,subreddit,body
0,lsz3fa,books,brief hiatus r book pleas recomm month book cl...
1,gou1a0c,books,date read schedul discuss thread discuss threa...
2,gouegbm,books,realli appreci mention thing spoiler tag there...
3,gp2fs0h,books,favorit book time read mani time got part memo...
4,gp0vwtb,books,releas end stint read book tri time get mayb t...
...,...,...,...
5276,m3lfxc,travel,
5277,m3l1g7,travel,weve watch nhk world get mini travel fix super...
5278,gqpd4mr,travel,channel anthoni bourdain part unknown regular ...
5279,gqpgeq9,travel,neighborhood japanes groceri store food court ...


In [8]:
vectorizer = CountVectorizer(stop_words='english')


In [9]:
df = df.dropna()

In [10]:
all_features = vectorizer.fit_transform(df.body)

In [11]:
all_features.shape

(5250, 10265)

In [12]:
vectorizer.vocabulary_

{'brief': 1039,
 'hiatus': 4087,
 'book': 946,
 'pleas': 6671,
 'recomm': 7233,
 'month': 5693,
 'club': 1612,
 'march': 5333,
 'read': 7179,
 'rant': 7143,
 'oral': 6204,
 'histori': 4115,
 'buster': 1162,
 'casey': 1301,
 'chuck': 1527,
 'palahniuk': 6353,
 'week': 9875,
 'discuss': 2421,
 'thread': 9028,
 'join': 4701,
 'ama': 243,
 'tuesday': 9309,
 'th': 8958,
 'goodread': 3695,
 'https': 4236,
 'www': 10080,
 'com': 1679,
 'feel': 3143,
 'free': 3399,
 'skip': 8127,
 'prefer': 6813,
 'know': 4863,
 'noth': 6010,
 'effici': 2728,
 'serial': 7893,
 'killer': 4824,
 'time': 9081,
 'high': 4092,
 'school': 7770,
 'rebel': 7200,
 'escap': 2887,
 'small': 8193,
 'town': 9190,
 'home': 4156,
 'big': 821,
 'citi': 1554,
 'becom': 742,
 'leader': 4975,
 'urban': 9550,
 'demolit': 2250,
 'derbi': 2277,
 'parti': 6414,
 'crash': 1963,
 'die': 2346,
 'spectacular': 8360,
 'highway': 4099,
 'death': 2178,
 'friend': 3421,
 'gather': 3543,
 'testimoni': 8946,
 'need': 5868,
 'build': 1115,
 's

In [13]:
X_train, X_test, y_train, y_test = train_test_split(all_features, df.subreddit, test_size=0.3, random_state=88)

In [14]:
X_train.shape

(3675, 10265)

In [15]:
X_test.shape

(1575, 10265)

In [16]:
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

MultinomialNB()

In [17]:
nr_correct = (y_test == classifier.predict(X_test)).sum()
nr_correct

1246

In [18]:
nr_incorrect = y_test.size - nr_correct
fraction_wrong = nr_incorrect / (nr_correct + nr_incorrect)
print(f'The (testing) accuracy of the model is {1-fraction_wrong:.2%}')

The (testing) accuracy of the model is 79.11%


In [19]:
 classifier.score(X_test, y_test)

0.7911111111111111

In [20]:
recall_score(y_test, classifier.predict(X_test), average=None)

array([0.82650602, 0.06666667, 0.90942029, 0.78333333, 0.08928571,
       0.8125    , 0.13333333])

In [21]:
f1_score(y_test, classifier.predict(X_test), average=None)

array([0.83760684, 0.12      , 0.83112583, 0.76422764, 0.15873016,
       0.82539683, 0.23529412])

In [22]:
example = ["Fellow sushi lover here! If y'all can't handle raw fish, we are done. I also love playing Call of Duty and Fortnite."]

In [26]:
doc_term_matrix = vectorizer.transform(example)

In [24]:
classifier.predict_proba(doc_term_matrix)

array([[8.65041326e-01, 4.14216645e-04, 9.43960809e-04, 1.05748275e-01,
        1.04013433e-02, 1.67242799e-02, 7.26598295e-04]])

In [30]:
doc_term_matrix = vectorizer.transform(df.body)

In [31]:
classifier.predict_proba(doc_term_matrix)

array([[2.61652835e-51, 4.23686950e-73, 4.85969493e-36, ...,
        6.92801110e-67, 1.83965366e-23, 2.68951771e-72],
       [2.02225812e-37, 1.77745652e-46, 6.11007448e-31, ...,
        2.20329331e-42, 3.74777454e-20, 3.81004948e-45],
       [2.65671454e-05, 1.88695617e-07, 9.25255117e-01, ...,
        6.14883206e-08, 4.25063419e-04, 7.63860125e-08],
       ...,
       [3.91230264e-08, 4.48555505e-16, 1.13810738e-07, ...,
        4.54810864e-12, 9.99999830e-01, 2.45610610e-14],
       [7.75584588e-06, 1.47845261e-11, 5.86151586e-04, ...,
        4.63157889e-09, 9.96169913e-01, 9.72938029e-12],
       [4.03028197e-03, 6.85017963e-05, 2.93263702e-01, ...,
        8.42644309e-04, 6.62450683e-01, 6.91251401e-05]])

In [32]:
classifier.classes_

array(['Cooking', 'GYM', 'Games', 'books', 'cookingforbeginners',
       'travel', 'videogames'], dtype='<U19')

In [35]:
df2 = pd.DataFrame(classifier.predict_proba(doc_term_matrix),
                   columns=classifier.classes_)

In [36]:
df2

Unnamed: 0,Cooking,GYM,Games,books,cookingforbeginners,travel,videogames
0,2.616528e-51,4.236869e-73,4.859695e-36,1.000000e+00,6.928011e-67,1.839654e-23,2.689518e-72
1,2.022258e-37,1.777457e-46,6.110074e-31,1.000000e+00,2.203293e-42,3.747775e-20,3.810049e-45
2,2.656715e-05,1.886956e-07,9.252551e-01,7.429293e-02,6.148832e-08,4.250634e-04,7.638601e-08
3,1.003784e-10,2.570686e-21,2.376721e-06,9.999976e-01,8.058590e-18,5.337643e-09,9.400412e-22
4,5.107115e-06,7.924552e-12,3.059542e-03,9.967195e-01,7.353764e-10,2.158704e-04,5.011602e-11
...,...,...,...,...,...,...,...
5245,1.139609e-01,2.962029e-03,4.141782e-01,2.115101e-01,4.258181e-03,2.531163e-01,1.429582e-05
5246,2.105934e-08,5.660870e-10,1.256772e-06,8.282606e-06,7.243394e-11,9.999904e-01,9.140044e-13
5247,3.912303e-08,4.485555e-16,1.138107e-07,1.728340e-08,4.548109e-12,9.999998e-01,2.456106e-14
5248,7.755846e-06,1.478453e-11,5.861516e-04,3.236175e-03,4.631579e-09,9.961699e-01,9.729380e-12


In [37]:
df2.to_csv('new_dataframe.csv')