In [1]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = 5_000
np.random.seed(42)

In [2]:
comics_zone = pd.read_csv('../data/cleaned_data/comics_zone')

In [3]:
comics_zone.isna().sum()

Unnamed: 0         0
author             0
subreddit          0
lems             132
title_lems         0
combined_text      0
dtype: int64

In [4]:
comics_zone.head(2)

Unnamed: 0.1,Unnamed: 0,author,subreddit,lems,title_lems,combined_text
0,0,ryanseanoreilly,1,removed,Podcast Review of story by Charles Beaumont,removedPodcast Review of story by Charles Beau...
1,1,neads1,1,Doing a marathon here of the original series T...,The Jungle S3 ep 12,Doing a marathon here of the original series T...


In [5]:
# Baseline; our classes are not too unbalanced; 
comics_zone['subreddit'].value_counts(normalize=True)

0    0.677863
1    0.322137
Name: subreddit, dtype: float64

In [6]:
# Instantiate our Count Vectorizer.
cvec = CountVectorizer(ngram_range=(1, 2), max_features = 5000, stop_words = 'english')

In [7]:
X_train, X_test, y_train, y_test = train_test_split(comics_zone['combined_text'], comics_zone['subreddit'])

In [8]:
# Fit our TF_IDF on the training data and transform training data.
X_train_cvec = pd.DataFrame(cvec.fit_transform(X_train).todense(),
                          columns = cvec.get_feature_names())

In [9]:
# Transform our testing data with the already-fit TF_IDF.
X_test_cvec = pd.DataFrame(cvec.transform(X_test).todense(),
                         columns = cvec.get_feature_names())

In [10]:
# Instantiate Naive Bayes methods are a set of supervised learning algorithms based on applying Bayes’ theorem
# with the “naive” assumption of conditional independence between every pair of features
# given the value of the class variable. All of which is strange because text is codependent. 
nb = MultinomialNB()

In [11]:
model = nb.fit(X_train_cvec, y_train)

In [12]:
predictions = model.predict(X_test_cvec)

In [13]:
model.score(X_train_cvec, y_train)

0.9816700610997964

In [14]:
model.score(X_test_cvec, y_test)

0.9786585365853658

In [15]:
# Our train and test scores are only 1% apart and both scored very high at 98%