In [1]:
import pandas as pd
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = 5_000

In [2]:
comics_zone = pd.read_csv('../data/cleaned_data/comics_zone')

In [3]:
comics_zone.head()

Unnamed: 0.1,Unnamed: 0,author,subreddit,lems,title_lems,combined_text
0,0,ryanseanoreilly,1,removed,Podcast Review of story by Charles Beaumont,removedPodcast Review of story by Charles Beau...
1,1,neads1,1,Doing a marathon here of the original series T...,The Jungle S3 ep 12,Doing a marathon here of the original series T...
2,2,Ethan_ML10,1,I didn really like the kick the can one,Whats your least favorite episode of the origi...,I didn really like the kick the can oneWhats y...
3,3,TwiliSidon360,1,removed,I Midna,removedI Midna
4,4,nulldrone,1,Picture this if you will a young lad in his 20...,The Groan Zone It a Good Life Review,Picture this if you will a young lad in his 20...


In [4]:
# Baseline; our classes are not too unbalanced; 
comics_zone['subreddit'].value_counts(normalize=True)

0    0.677863
1    0.322137
Name: subreddit, dtype: float64

In [5]:
X_train, X_test, y_train, y_test = train_test_split(comics_zone['combined_text'], comics_zone['subreddit'])

In [6]:
cvec = CountVectorizer(max_features=5000, ngram_range=(1, 2))

In [7]:
# Fit our CountVectorizer on the training data and transform training data.
X_train_cvec = pd.DataFrame(cvec.fit_transform(X_train).todense(),
                          columns = cvec.get_feature_names())

# Transform our testing data with the already-fit CountVectorizer.
X_test_cvec = pd.DataFrame(cvec.transform(X_test).todense(),
                         columns = cvec.get_feature_names())

In [8]:
# Checking shapes and value counts for symmetry
X_train_cvec.shape

(982, 5000)

In [9]:
X_test_cvec.shape

(328, 5000)

In [10]:
y_train.value_counts()

0    642
1    340
Name: subreddit, dtype: int64

In [11]:
# Voting Classifier is an ensemble estimator where each model 'votes', 'hard' voting is majority rule voting
vc = VotingClassifier([('rfc', RandomForestClassifier()), 
                       ('abc', AdaBoostClassifier()), 
                       ('gbc', GradientBoostingClassifier()), 
                       ('log', LogisticRegression())])

vc.fit(X_train_cvec, y_train)

print("VC Train Score:", vc.score(X_train_cvec, y_train))

print("VC Test Score:", vc.score(X_test_cvec, y_test))

# Our training score is nearly 3% higher than our test score, slightly overfit

VC Train Score: 0.9877800407331976
VC Test Score: 0.9573170731707317


In [12]:
vc.estimators_

[RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
             oob_score=False, random_state=None, verbose=0,
             warm_start=False),
 AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
           learning_rate=1.0, n_estimators=50, random_state=None),
 GradientBoostingClassifier(criterion='friedman_mse', init=None,
               learning_rate=0.1, loss='deviance', max_depth=3,
               max_features=None, max_leaf_nodes=None,
               min_impurity_decrease=0.0, min_impurity_split=None,
               min_samples_leaf=1, min_samples_split=2,
               min_weight_fraction_leaf=0.0, n_estimators=100,
               n_iter_no_change=None, presort='auto', random_state=No

In [13]:
# Since our train/test was completed; a Random Forest Classifier was fitted; RFC is a collection of decision trees
rf = RandomForestClassifier()

rf.fit(X_train_cvec, y_train)
print(rf.score(X_train_cvec, y_train))
print(rf.score(X_test_cvec, y_test))
# Our Random Forest Classifier performed relatively the same as our Voting Classifier

0.9928716904276986
0.9451219512195121
