In [34]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import accuracy_score, hamming_loss, classification_report

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from skmultilearn.problem_transform import BinaryRelevance, ClassifierChain, LabelPowerset
from skmultilearn.adapt import MLkNN

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [35]:
# Load dataset: every hobby frame is built from two CSV files that are stacked,
# stripped of their 'id' column and given a single subreddit label.
df_cooking = (
    pd.concat([pd.read_csv('Cooking.csv'), pd.read_csv('cookingforbeginners.csv')])
    .drop(columns=['id'])
    .assign(subreddit="Cooking")
)

df_games = (
    pd.concat([pd.read_csv('Games.csv'), pd.read_csv('videogames.csv')])
    .drop(columns=['id'])
    .assign(subreddit="Games")
)

df_fitness = (
    pd.concat([pd.read_csv('GYM.csv'), pd.read_csv('Fitness.csv')])
    .drop(columns=['id'])
    .assign(subreddit="Fitness")
)

In [36]:
# Preview the labelled fitness frame; rows with an empty body are still present at this stage.
df_fitness

Unnamed: 0,subreddit,body
0,Fitness,
1,Fitness,thank help get start week
2,Fitness,look like good start good effort plan differen...
3,Fitness,r fit fantast wiki https thefit wiki constant ...
4,Fitness,hi r gym time build workout tracker app weight...
...,...,...
2667,Fitness,whole issu feel like havent eaten someon use s...
2668,Fitness,friend dick face swell gain weight depend over...
2669,Fitness,havent heard meakin ill check nice im layman i...
2670,Fitness,oh much lift


In [37]:
# Stack the five hobby frames into one corpus. ignore_index=True gives the
# combined frame a unique 0..n-1 index instead of repeating each source
# frame's own row labels, so label-based lookups and the saved CSV are
# unambiguous.
df = pd.concat(
    [pd.read_csv('books.csv'), df_cooking, df_fitness, df_games, pd.read_csv('travel.csv')],
    ignore_index=True,
)

In [38]:
# Save the combined corpus. The 'id' column has not been dropped yet at this point, and the
# DataFrame index is written as the first, unnamed CSV column (to_csv default).
df.to_csv('allHobbies.csv')

In [39]:
# Renumber the rows 0..n-1, discarding the previous index labels.
df = df.reset_index(drop=True)

In [40]:
# Remove the 'id' column; the model below only uses 'body' (features) and
# 'subreddit' (labels). errors='ignore' keeps this cell re-runnable: without
# it a second execution raises KeyError because the column is already gone.
df = df.drop(columns=['id'], errors='ignore')
df

Unnamed: 0,subreddit,body
0,books,brief hiatus r book pleas recomm month book cl...
1,books,date read schedul discuss thread discuss threa...
2,books,realli appreci mention thing spoiler tag there...
3,books,favorit book time read mani time got part memo...
4,books,releas end stint read book tri time get mayb t...
...,...,...
7948,travel,
7949,travel,weve watch nhk world get mini travel fix super...
7950,travel,channel anthoni bourdain part unknown regular ...
7951,travel,neighborhood japanes groceri store food court ...


In [41]:
# Bag-of-words featurizer: counts token occurrences and filters out
# scikit-learn's built-in English stop-word list.
vectorizer = CountVectorizer(stop_words='english')


In [42]:
# Drop every row that has a missing value in any column (dropna defaults),
# e.g. the empty-body rows visible in the previews, before vectorizing.
df = df.dropna()

In [43]:
# Learn the vocabulary from every document and build the document-term count matrix.
all_features = vectorizer.fit_transform(df.body)

# Persist the fitted vectorizer to disk. The context manager closes the file
# even if pickle.dump raises, which the manual open()/close() pair did not
# guarantee.
with open(b'count_vectorizer.obj', 'wb') as cv_file:
    pickle.dump(vectorizer, cv_file)

In [44]:
# (number of documents, vocabulary size) of the count matrix.
all_features.shape

(7921, 11790)

In [45]:
# Mapping from each learned term to its column index in the count matrix.
vectorizer.vocabulary_

{'brief': 1214,
 'hiatus': 4654,
 'book': 1104,
 'pleas': 7663,
 'recomm': 8304,
 'month': 6519,
 'club': 1868,
 'march': 6113,
 'read': 8247,
 'rant': 8207,
 'oral': 7122,
 'histori': 4687,
 'buster': 1353,
 'casey': 1516,
 'chuck': 1770,
 'palahniuk': 7290,
 'week': 11335,
 'discuss': 2768,
 'thread': 10354,
 'join': 5375,
 'ama': 285,
 'tuesday': 10680,
 'th': 10276,
 'goodread': 4206,
 'https': 4824,
 'www': 11573,
 'com': 1948,
 'feel': 3574,
 'free': 3872,
 'skip': 9318,
 'prefer': 7830,
 'know': 5565,
 'noth': 6896,
 'effici': 3117,
 'serial': 9051,
 'killer': 5521,
 'time': 10414,
 'high': 4659,
 'school': 8917,
 'rebel': 8270,
 'escap': 3291,
 'small': 9389,
 'town': 10541,
 'home': 4737,
 'big': 951,
 'citi': 1800,
 'becom': 860,
 'leader': 5696,
 'urban': 10964,
 'demolit': 2577,
 'derbi': 2609,
 'parti': 7354,
 'crash': 2248,
 'die': 2688,
 'spectacular': 9571,
 'highway': 4667,
 'death': 2493,
 'friend': 3894,
 'gather': 4037,
 'testimoni': 10261,
 'need': 6727,
 'build': 

In [46]:
# Hold out 30% of the documents for evaluation; the fixed random_state makes
# the split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    all_features,
    df['subreddit'],
    test_size=0.3,
    random_state=88,
)

In [47]:
# Training split size: (documents, features).
X_train.shape

(5544, 11790)

In [48]:
# Test split size: (documents, features).
X_test.shape

(2377, 11790)

In [49]:
# Fit a multinomial Naive Bayes model with default hyperparameters on the
# training term counts.
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

MultinomialNB()

In [50]:
# Count how many held-out documents receive the correct subreddit label.
test_predictions = classifier.predict(X_test)
nr_correct = (y_test == test_predictions).sum()
nr_correct

1996

In [51]:
# Turn the hit count into an error rate over the whole test set, then report
# its complement as the accuracy.
nr_total = y_test.size
nr_incorrect = nr_total - nr_correct
fraction_wrong = nr_incorrect / nr_total
print(f'The (testing) accuracy of the model is {1-fraction_wrong:.2%}')

The (testing) accuracy of the model is 83.97%


In [52]:
 classifier.score(X_test, y_test)

0.8397139251156921

In [53]:
# Per-class recall on the test split (average=None returns one score per class
# instead of a single averaged value).
recall_score(y_test, classifier.predict(X_test), average=None)

array([0.79194631, 0.90725327, 0.87073609, 0.76315789, 0.71052632])

In [54]:
# Per-class F1 on the test split (average=None returns one score per class
# instead of a single averaged value).
f1_score(y_test, classifier.predict(X_test), average=None)

array([0.84285714, 0.871502  , 0.84421236, 0.76821192, 0.7902439 ])

In [55]:
# Hand-written comment mixing cooking and gaming vocabulary, used as a quick sanity check.
example = ["Fellow sushi lover here! If y'all can't handle raw fish, we are done. I also love playing Call of Duty and Fortnite."]

In [56]:
# Encode the example with the already-fitted vocabulary (transform, not fit_transform).
# NOTE(review): the training bodies previewed earlier look pre-stemmed, whereas this
# text is raw — confirm whether the same preprocessing should be applied first.
doc_term_matrix = vectorizer.transform(example)

In [57]:
# Class probabilities for the example; columns follow classifier.classes_.
classifier.predict_proba(doc_term_matrix)

array([[0.88985769, 0.00144586, 0.00099869, 0.08499369, 0.02270407]])

In [58]:
# Encode the whole corpus, overwriting the single-example matrix held in doc_term_matrix.
# NOTE(review): this looks like it reproduces all_features — confirm before simplifying.
doc_term_matrix = vectorizer.transform(df.body)

In [59]:
# Class probabilities for every document in df; this includes the rows the
# model was trained on, so these are not held-out estimates.
classifier.predict_proba(doc_term_matrix)

array([[2.36018253e-72, 3.03644599e-50, 1.62778742e-56, 1.00000000e+00,
        1.10260241e-47],
       [5.52210055e-41, 2.27795198e-32, 2.76372395e-34, 1.00000000e+00,
        8.36766428e-25],
       [6.07623780e-06, 1.39943345e-05, 9.85766708e-01, 1.41383203e-02,
        7.49011224e-05],
       ...,
       [1.07933181e-08, 6.79847417e-07, 5.45256038e-06, 1.06985421e-08,
        9.99993846e-01],
       [9.92825374e-05, 1.20192643e-07, 7.20261213e-05, 1.45894357e-01,
        8.53934214e-01],
       [9.77394837e-03, 3.44925467e-02, 2.91593375e-01, 4.39761543e-02,
        6.20163976e-01]])

In [60]:
# Label order used for the columns of predict_proba.
classifier.classes_

array(['Cooking', 'Fitness', 'Games', 'books', 'travel'], dtype='<U7')

In [61]:
# Wrap the per-document probabilities in a DataFrame with one column per class.
# No index is passed, so rows are numbered 0..n-1 rather than carrying df's index.
df2 = pd.DataFrame(classifier.predict_proba(doc_term_matrix),
                   columns=classifier.classes_)

In [62]:
# Preview the probability table.
df2

Unnamed: 0,Cooking,Fitness,Games,books,travel
0,2.360183e-72,3.036446e-50,1.627787e-56,1.000000e+00,1.102602e-47
1,5.522101e-41,2.277952e-32,2.763724e-34,1.000000e+00,8.367664e-25
2,6.076238e-06,1.399433e-05,9.857667e-01,1.413832e-02,7.490112e-05
3,7.017857e-11,7.967106e-09,7.117367e-07,9.999993e-01,6.734793e-09
4,4.783889e-06,3.962856e-05,2.979753e-03,9.967314e-01,2.443883e-04
...,...,...,...,...,...
7916,3.880530e-02,7.768563e-01,1.038925e-01,3.161091e-02,4.883501e-02
7917,4.398177e-09,3.023044e-08,4.038298e-07,1.403733e-05,9.999855e-01
7918,1.079332e-08,6.798474e-07,5.452560e-06,1.069854e-08,9.999938e-01
7919,9.928254e-05,1.201926e-07,7.202612e-05,1.458944e-01,8.539342e-01


In [63]:
# Export the probability table; the row index is written as the first CSV
# column (to_csv default).
df2.to_csv('new_dataframe.csv')

In [64]:
# Persist the trained classifier to disk. The context manager closes the file
# even if pickle.dump raises, which the manual open()/close() pair did not
# guarantee.
with open(b'hobbies_classifier.obj', 'wb') as model_file:
    pickle.dump(classifier, model_file)


