In [52]:
import requests
import json
import time
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [53]:
np.random.seed(42)

In [2]:
def posts_as_DataFrame(posts, features = ['subreddit', 'title', 'num_comments', 'score','selftext', 'is_self']):
    feat_dict = [{feat : post['data'][feat] for feat in features}  for post in posts]
    return pd.DataFrame(feat_dict)

In [3]:
with open('project_3_raw_data.json', 'r') as f:
    project_data = json.load(f)

In [4]:
df = posts_as_DataFrame(project_data)

In [5]:
df.drop_duplicates(subset='title', inplace=True)

After retreiving data from the reddit API, I realized I had collected a bunch of duplicates in the process.  In the cell above, I'm removing the duplicates, filtering by my 'Title' column.

### Modeling Only Titles

In [6]:
df['class'] = df['subreddit'].isin(['history', 'AskHistorians'])

In [7]:
y = df['class']

In [8]:
X = df['title']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state = 42)

#### Count Vectorizing

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
cvec = CountVectorizer(stop_words='english', min_df = 2)

In [12]:
cvec.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [13]:
X_train_cv = cvec.transform(X_train)
X_test_cv = cvec.transform(X_test)

In [14]:
log_reg = LogisticRegression(C=.8, penalty='l2')
log_reg.fit(X_train_cv, y_train)

LogisticRegression(C=0.8, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
log_reg.score(X_train_cv, y_train)

0.9801945795691452

In [16]:
log_reg.score(X_test_cv, y_test)

0.8791666666666667

In [17]:
log_coef_df = pd.DataFrame(log_reg.coef_, columns =cvec.get_feature_names() )

In [18]:
log_coef_df.T.sort_values(0)

Unnamed: 0,0
missing,-2.050798
murder,-1.970095
case,-1.905134
trump,-1.884401
mystery,-1.876346
conspiracy,-1.696064
disappearance,-1.642461
murders,-1.562677
cases,-1.498694
crime,-1.428661


### Fulltext models

In [19]:
df['fulltext'] = df['title'] + ' ' + df['selftext']

In [20]:
X = df['fulltext']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [22]:
cvec = CountVectorizer(stop_words='english', min_df=2, max_df=0.5)

In [23]:
cvec.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.5, max_features=None, min_df=2,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [24]:
X_train_sc = cvec.transform(X_train)
X_test_sc = cvec.transform(X_test)

In [26]:
log_reg = LogisticRegression(C=.8, penalty='l2')
log_reg.fit(X_train_sc, y_train)

LogisticRegression(C=0.8, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [27]:
log_reg.score(X_train_sc, y_train)

0.9989576094510076

In [28]:
log_reg.score(X_test_sc, y_test)

0.9125

In [29]:
log_coef_df_full = pd.DataFrame(log_reg.coef_, columns =cvec.get_feature_names())

In [30]:
log_coef_df_full.T.sort_values(0)

Unnamed: 0,0
conspiracy,-1.611939
trump,-1.407663
case,-1.267552
cases,-1.222219
mysteries,-1.119441
missing,-1.082889
mystery,-0.936662
crime,-0.878942
http,-0.852962
news,-0.842450


In [34]:
dtc = DecisionTreeClassifier()

In [35]:
param_tree = {
    'max_depth': range(20, 101, 20),
    'min_samples_leaf': [3, 7, 11]
}

In [38]:
gs_tree = GridSearchCV(dtc, param_tree, cv=5)
gs_tree.fit(X_train_sc, y_train)


GridSearchCV(cv=5, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': range(20, 101, 20), 'min_samples_leaf': [3, 7, 11]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [39]:
print(gs_tree.best_score_)
print(gs_tree.best_params_)

0.8158443363446838
{'max_depth': 60, 'min_samples_leaf': 3}


In [44]:
gs_tree.score(X_test_sc, y_test)

0.8145833333333333

In [40]:
rfc = RandomForestClassifier()

In [42]:
param_forest = {
    'n_estimators': range(5, 50, 10),
    'min_samples_leaf': [3, 7, 11],
    'max_depth': [20, 50, 75]
}

In [43]:
gs_forest = GridSearchCV(rfc, param_forest, cv=5)
gs_forest.fit(X_train_sc, y_train)


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_estimators': range(5, 50, 10), 'min_samples_leaf': [3, 7, 11], 'max_depth': [20, 50, 75]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [45]:
print(gs_forest.best_score_)
print(gs_forest.best_params_)

0.901320361362057
{'max_depth': 75, 'min_samples_leaf': 3, 'n_estimators': 45}


In [46]:
gs_forest.score(X_test_sc, y_test)

0.8947916666666667

In [47]:
gs_forest.best_estimator_.feature_importances_

array([1.58923942e-04, 4.98773644e-05, 0.00000000e+00, ...,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00])

In [49]:
forest_df_full = pd.DataFrame(gs_forest.best_estimator_.feature_importances_, index =cvec.get_feature_names())

In [51]:
forest_df_full.sort_values(0, ascending=False)

Unnamed: 0,0
did,0.068838
history,0.032225
com,0.021562
police,0.019248
war,0.017619
case,0.017053
news,0.015513
https,0.013803
conspiracy,0.012369
http,0.012333
