In [1]:
import requests
import json
import time
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
np.random.seed(42)

In [3]:
def posts_as_DataFrame(posts, features = ['subreddit', 'title', 'num_comments', 'score','selftext', 'is_self']):
    feat_dict = [{feat : post['data'][feat] for feat in features}  for post in posts]
    return pd.DataFrame(feat_dict)

In [4]:
with open('project_3_raw_data.json', 'r') as f:
    project_data = json.load(f)

In [5]:
df = posts_as_DataFrame(project_data)

In [6]:
df.drop_duplicates(subset='title', inplace=True)

After retreiving data from the reddit API, I realized I had collected a bunch of duplicates in the process.  In the cell above, I'm removing the duplicates, filtering by my 'Title' column.

### Modeling Only Titles

In [7]:
df['class'] = df['subreddit'].isin(['history', 'AskHistorians'])

In [8]:
y = df['class']

In [9]:
X = df['title']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 42)

#### Count Vectorizing

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
cvec = CountVectorizer(stop_words='english', min_df = 2)

In [13]:
cvec.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [14]:
X_train_cv = cvec.transform(X_train)
X_test_cv = cvec.transform(X_test)

In [15]:
log_reg = LogisticRegression(C=.8, penalty='l2')
log_reg.fit(X_train_cv, y_train)

LogisticRegression(C=0.8, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [16]:
log_reg.score(X_train_cv, y_train)

0.9801945795691452

In [17]:
log_reg.score(X_test_cv, y_test)

0.8791666666666667

In [18]:
log_coef_df = pd.DataFrame(log_reg.coef_, columns =cvec.get_feature_names() )

In [19]:
log_coef_df.T.sort_values(0).head()

Unnamed: 0,0
missing,-2.050798
murder,-1.970095
case,-1.905134
trump,-1.884401
mystery,-1.876346


### Fulltext models

##### CountVectorizer

In [20]:
df['fulltext'] = df['title'] + ' ' + df['selftext']

In [21]:
X = df['fulltext']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [23]:
cvec = CountVectorizer(stop_words='english', min_df=2, max_df=0.5)

In [24]:
cvec.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [25]:
X_train_sc = cvec.transform(X_train)
X_test_sc = cvec.transform(X_test)

In [26]:
log_reg = LogisticRegression(C=.8, penalty='l2')
log_reg.fit(X_train_sc, y_train)

LogisticRegression(C=0.8, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [27]:
log_reg.score(X_train_sc, y_train)

0.9989576094510076

In [28]:
log_reg.score(X_test_sc, y_test)

0.9125

In [29]:
log_coef_df_full = pd.DataFrame(log_reg.coef_, columns =cvec.get_feature_names())

In [30]:
log_coef_df_full.T.sort_values(0).tail()

Unnamed: 0,0
ww2,1.151415
roman,1.166866
battle,1.170605
history,1.540655
did,2.062823


In [31]:
dtc = DecisionTreeClassifier()

In [32]:
param_tree = {
    'max_depth': range(20, 101, 20),
    'min_samples_leaf': [3, 7, 11]
}

In [33]:
gs_tree = GridSearchCV(dtc, param_tree, cv=5)
gs_tree.fit(X_train_sc, y_train)


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': range(20, 101, 20), 'min_samples_leaf': [3, 7, 11]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [34]:
print(gs_tree.best_score_)
print(gs_tree.best_params_)

0.8203613620569841
{'max_depth': 60, 'min_samples_leaf': 3}


In [35]:
gs_tree.score(X_test_sc, y_test)

0.8145833333333333

In [36]:
rfc = RandomForestClassifier()

In [37]:
param_forest = {
    'n_estimators': range(5, 50, 10),
    'min_samples_leaf': [3, 7, 11],
    'max_depth': [20, 50, 75]
}

In [38]:
gs_forest = GridSearchCV(rfc, param_forest, cv=5)
gs_forest.fit(X_train_sc, y_train)


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': range(5, 50, 10), 'min_samples_leaf': [3, 7, 11], 'max_depth': [20, 50, 75]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [39]:
print(gs_forest.best_score_)
print(gs_forest.best_params_)

0.9016678248783878
{'max_depth': 75, 'min_samples_leaf': 3, 'n_estimators': 25}


In [40]:
gs_forest.score(X_test_sc, y_test)

0.896875

In [41]:
gs_forest.best_estimator_.feature_importances_

array([0.        , 0.00025323, 0.        , ..., 0.        , 0.        ,
       0.        ])

In [42]:
forest_df_full = pd.DataFrame(gs_forest.best_estimator_.feature_importances_, index =cvec.get_feature_names())

In [43]:
forest_df_full.sort_values(0, ascending=False).head()

Unnamed: 0,0
did,0.06209
history,0.028532
http,0.025992
case,0.018392
war,0.015134


##### TF-IDF Vectorizer

In [44]:
tfdf = TfidfVectorizer(stop_words='english', min_df = 2, max_df = 0.5)

In [45]:
tfdf.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=2,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [46]:
X_train_tf = tfdf.transform(X_train)
X_test_tf = tfdf.transform(X_test)

In [47]:
log_reg = LogisticRegression(C=.8, penalty='l2')
log_reg.fit(X_train_tf, y_train)

LogisticRegression(C=0.8, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [48]:
log_reg.score(X_train_tf, y_train)

0.9781097984711605

In [49]:
log_reg.score(X_test_tf, y_test)

0.9197916666666667

In [50]:
log_coef_df_full_tf = pd.DataFrame(log_reg.coef_, columns =tfdf.get_feature_names())

In [51]:
log_coef_df_full_tf.T.sort_values(0).tail()

Unnamed: 0,0
empire,1.81223
roman,2.006093
war,2.245596
history,4.135908
did,5.94039


In [52]:
gs_tree_tf = GridSearchCV(dtc, param_tree, cv=5)
gs_tree_tf.fit(X_train_tf, y_train)


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': range(20, 101, 20), 'min_samples_leaf': [3, 7, 11]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [53]:
print(gs_tree_tf.best_score_)
print(gs_tree_tf.best_params_)

0.8179291174426685
{'max_depth': 100, 'min_samples_leaf': 3}


In [54]:
gs_tree_tf.score(X_test_tf, y_test)

0.8177083333333334

In [55]:
gs_forest_tf = GridSearchCV(rfc, param_forest, cv=5)
gs_forest_tf.fit(X_train_tf, y_train)


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': range(5, 50, 10), 'min_samples_leaf': [3, 7, 11], 'max_depth': [20, 50, 75]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [56]:
print(gs_forest_tf.best_score_)
print(gs_forest_tf.best_params_)

0.8988881167477415
{'max_depth': 75, 'min_samples_leaf': 3, 'n_estimators': 45}


In [57]:
gs_forest_tf.score(X_test_tf, y_test)

0.8958333333333334

I will address the results of my modeling in the READ.me