In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import time
import requests
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 

In [2]:
%config InLineBackend.figure_format = 'retina'
pd.set_option('display.max_columns', 100)

# Inspecting Subreddits

I will be building a model to classify posts for the following subreddits: Physical Therapy and Chiropractic

In [3]:
# Getting subreddit data from API
res_1 = requests.get('https://api.pushshift.io/reddit/search/submission?subreddit=physicaltherapy')

res_2 = requests.get('https://api.pushshift.io/reddit/search/submission?subreddit=personaltraining')

In [4]:
# Check for status of requests
print('Response 1:', res_1.status_code)
print('Response 2:', res_2.status_code)

Response 1: 200
Response 2: 200


In [5]:
# Read json objects into Python
phys_th_json = res_1.json()
pers_tr_json = res_2.json()

In [6]:
# Check objects pulled in. Should be consistent with Reddit's 25 posts per pull limit.
print('Number of posts:', len(phys_th_json['data']))
print('Number of posts:', len(pers_tr_json['data']))

Number of posts: 25
Number of posts: 25


In [7]:
# Converting json to Pandas DataFrame
phys_th_df = pd.DataFrame(phys_th_json['data'])
pers_tr_df = pd.DataFrame(pers_tr_json['data'])

In [8]:
# Choosing PT subreddit features for model
phys_th_df.columns

Index(['all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_type', 'author_fullname', 'author_patreon_flair',
       'author_premium', 'awarders', 'can_mod_post', 'contest_mode',
       'created_utc', 'domain', 'full_link', 'gildings', 'id',
       'is_crosspostable', 'is_meta', 'is_original_content',
       'is_reddit_media_domain', 'is_robot_indexable', 'is_self', 'is_video',
       'link_flair_background_color', 'link_flair_richtext',
       'link_flair_text_color', 'link_flair_type', 'locked', 'media_only',
       'no_follow', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'pwls',
       'retrieved_on', 'score', 'selftext', 'send_replies', 'spoiler',
       'stickied', 'subreddit', 'subreddit_id', 'subreddit_subscribers',
       'subreddit_type', 'thumbnail', 'title', 'total_awards_received',
       'treatment_tags', 'url', 'whiteli

In [9]:
# Choosing Chiro subreddit features for model
pers_tr_df.columns

Index(['all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_type', 'author_fullname', 'author_patreon_flair',
       'author_premium', 'awarders', 'can_mod_post', 'contest_mode',
       'created_utc', 'domain', 'full_link', 'gildings', 'id',
       'is_crosspostable', 'is_meta', 'is_original_content',
       'is_reddit_media_domain', 'is_robot_indexable', 'is_self', 'is_video',
       'link_flair_background_color', 'link_flair_richtext',
       'link_flair_text_color', 'link_flair_type', 'locked', 'media_only',
       'no_follow', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'pwls',
       'retrieved_on', 'score', 'selftext', 'send_replies', 'spoiler',
       'stickied', 'subreddit', 'subreddit_id', 'subreddit_subscribers',
       'subreddit_type', 'thumbnail', 'title', 'total_awards_received',
       'treatment_tags', 'url', 'whiteli

In [10]:
# Columns of interest:
mask = ['author', 'created_utc', 'is_self', 'num_comments', 'score', 'selftext', 'subreddit', 'title']

# Data Retrieval

In [11]:
# Extract posts from subreddit with function:
def get_posts(subreddit, kind, day_window, n_loops, size):
    '''
    1) Function takes subreddit name and endpoint parameters and constructs the URL for requests module to get a reponse.
    2) day_window and n_loops allows you to specify the exact range of days from which you want to pull posts.
    3) Size specifies the number of posts you want to pull per date range
    '''
    # Reddit Pushshift API primary endpoint
    endpoint = f"https://api.pushshift.io/reddit/search/{kind}"
    
    # List to concatenate our DataFrames in
    posts = []
    
    for i in range(1, n_loops + 1):
        res = requests.get(endpoint,
                           params={
                               'subreddit': subreddit,
                               'size': size,
                               'after': f'{day_window*i}d'
                           })
    
        print(f'Retrieved r/{subreddit} data from {day_window*i} days ago')
        # Assert keyword to check for any errors in requests
        assert res.status_code == 200
        subreddit_json = res.json()['data']
        # Convert json object to DataFrame
        df = pd.DataFrame(subreddit_json)[mask]
        posts.append(df)
        # Prevent being DDOS flag
        time.sleep(3)

    final = pd.concat(posts)
    
    if kind == "submission":
        
        # Filter for columns of interest
        final = final[mask]
        
        # Drop duplicates
        final.drop_duplicates(inplace = True)
        
        # Filer for self posts
        final = final.loc[final['is_self'] == True]
    
    final['date_created'] = final['created_utc'].map(dt.date.fromtimestamp)
    
    return final

# Function adapted from lecture authored by Boom D. (DSI-NYC) and Mahdi Shadkam-Farrokhi (DSI-NYC)

In [12]:
# Reconstructing Phys. Therapy DataFrame to include posts from past 100 days
phys_th_df = get_posts(subreddit = 'physicaltherapy', kind = 'submission', day_window = 50, n_loops = 5, size = 500)

Retrieved r/physicaltherapy data from 50 days ago
Retrieved r/physicaltherapy data from 100 days ago
Retrieved r/physicaltherapy data from 150 days ago
Retrieved r/physicaltherapy data from 200 days ago
Retrieved r/physicaltherapy data from 250 days ago


In [13]:
# Resetting index
phys_th_df.reset_index(drop = True, inplace = True)
# Checking final shape
phys_th_df.shape

(2003, 9)

In [14]:
# Reconstructing Personal Training DataFrame to include posts from past 100 days
pers_tr_df = get_posts(subreddit = 'personaltraining', kind = 'submission', day_window = 50, n_loops = 5, size = 500)

Retrieved r/personaltraining data from 50 days ago
Retrieved r/personaltraining data from 100 days ago
Retrieved r/personaltraining data from 150 days ago
Retrieved r/personaltraining data from 200 days ago
Retrieved r/personaltraining data from 250 days ago


In [15]:
# Resetting index
pers_tr_df.reset_index(drop = True, inplace = True)

# Noticed the amount of data returned for the Personal Training subreddit 
# within the same 100 day time period is half the size of Phys. Therapy data
pers_tr_df.shape

(1245, 9)

In [16]:
# Merge both final version DataFrames together
final_df = pd.concat([phys_th_df, pers_tr_df])

# Data Cleaning and Preprocessing

In [17]:
# Dropping redundant date record column
final_df.drop(columns = 'created_utc', inplace = True)

In [18]:
# Check shape of DataFrame
final_df.shape

(3248, 8)

In [19]:
# Check for null values and dropping them
display(final_df.isna().sum())
final_df.dropna(inplace = True)

author          0
is_self         0
num_comments    0
score           0
selftext        0
subreddit       0
title           0
date_created    0
dtype: int64

In [20]:
# Remove posts with deleted or removed content.
final_df = final_df[(final_df['selftext'] != '[deleted]') & (final_df['selftext'] != '[removed]')].copy()

In [21]:
# Check to make sure all posts are original reddit posts.
(final_df['is_self'] == True).sum() == final_df.shape[0]

True

In [22]:
# Map numerical values to our classes "subreddit"
final_df['subreddit'] = final_df['subreddit'].map({'physicaltherapy': 1, 'personaltraining': 0})

In [23]:
# Reset Index
final_df.reset_index(drop = True, inplace = True)

In [24]:
# Merge self-text and title features for vectorizers. We need a single vector in order for our vectorizers to do their job.
final_df['all_text'] = final_df['title'] + ' ' + final_df['selftext']
final_df

Unnamed: 0,author,is_self,num_comments,score,selftext,subreddit,title,date_created,all_text
0,Dr_SidK,True,13,1,"Hello, \n\nI am a neurologist with subspeciali...",1,"A new, always free, online Parkinson’s Patient...",2020-03-05,"A new, always free, online Parkinson’s Patient..."
1,RichardSnell,True,3,1,Goodmorning sirs and ma'ams.\n\nI am a recent ...,1,ATTENTION PHYSICAL THERAPISTS OF NEW ZEALAND,2020-03-05,ATTENTION PHYSICAL THERAPISTS OF NEW ZEALAND G...
2,stglidden,True,1,1,[https://youtu.be/xtZ4sNGM5iM](https://youtu.b...,1,Scar tissue/adhesion of rectus femoris treatme...,2020-03-05,Scar tissue/adhesion of rectus femoris treatme...
3,vols52,True,4,1,"I know this is probably a weird question, but ...",1,Travel PT Question,2020-03-05,Travel PT Question I know this is probably a w...
4,schiltron99,True,2,1,I have been away from sports because of a hams...,1,Heat or ice for damaged muscles?,2020-03-05,Heat or ice for damaged muscles? I have been a...
...,...,...,...,...,...,...,...,...,...
3034,nstrm,True,60,3,Hello personal trainers! 👋\n\nA few months ago...,0,I’m the developer of the weightlifting app Lif...,2019-10-05,I’m the developer of the weightlifting app Lif...
3035,vikesfan50,True,7,3,I have a mock session with a client coming up....,0,Personal training mock client session.,2019-10-06,Personal training mock client session. I have ...
3036,stupidstacker,True,9,8,"Most of my business is referrals, but I’d like...",0,Studio owners - how do you find new leads?,2019-10-06,Studio owners - how do you find new leads? Mos...
3037,Floopserino,True,0,1,Im offering help for building a website. Why i...,0,Helping Out Personal Trainers With Websites an...,2019-10-06,Helping Out Personal Trainers With Websites an...


In [25]:
# Create our X feature matrix and our y dependent variable vector
X = final_df[['all_text', 'num_comments', 'score']]
y = final_df['subreddit']

# Train-Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = .5)

In [26]:
# Since my classes are fairly imbalanced, I will perform oversampling on my minority class Physical Training 
from imblearn.over_sampling import RandomOverSampler

# RandomOverSampler is a tool used to inflate the minority class in our dataset - Uses random selection
# algorithm that resamples data from the current minority class data WITH replacement until the classes are balanced.
ros = RandomOverSampler(random_state = 42)
X_resample, y_resample = ros.fit_resample(X_train, y_train)

In [27]:
# Compare results of over-sampling. Classes should now be balanced.
print(f'Before resampling: \n{y_train.value_counts(normalize = True)}')
print()
print(f'After random oversampling: \n{y_resample.value_counts(normalize = True)}')

Before resampling: 
1    0.646478
0    0.353522
Name: subreddit, dtype: float64

After random oversampling: 
1    0.5
0    0.5
Name: subreddit, dtype: float64


# Feature Engineering and Modeling

In [28]:
# Using make_column_transformer, I can transform both text data and my numerical features together
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB

In [29]:
# Tokenizer that also lemmatizes our text. Adapted from lecture authored by
# Boom D. (DSI-NYC) and Mahdi S. (DSI-NYC)
class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc) if t.isalnum() and t not in stopwords]

In [30]:
y_test.value_counts()

1    987
0    533
Name: subreddit, dtype: int64

In [31]:
# Function to return model metrics
def describe_model(gs, X_train, X_test, y_train, y_test):
    '''
    I've bundled some important classification metrics and visualizations for each model iteration. We're constructing
    a confusion matrix, getting best parameters and accuracy scores as well as computing the ROC AUC.
    '''
    # Fit the model
    gs.fit(X_train, y_train)
    
    # Confusion Matrix
    preds = gs.predict(X_test)
    cm = confusion_matrix(y_test, preds)
    cm_df = pd.DataFrame(cm, 
                         columns = ['Predicted Personal Training', 'Predicted Physical Therapy'],
                         index = ['Actual Personal Training', 'Actual Physical Therapy'])
    tn, fp, fn, tp = cm.ravel() # Extracting classification metrics
    
    # ROC AUC
    pred_proba = [i[1] for i in gs.predict_proba(X_test)]
    pred_df = pd.DataFrame({'true_values': y_test, 'pred_probs': pred_proba})
    auc = roc_auc_score(y_test, pred_df['pred_probs'])
    
    # GridSearch Computations
    print(f'Best Paramaters: {gs.best_params_}'
          f'\n\nBest CV Score: {round(gs.best_score_, 2)}'
          f'\n\nRaw Train Score: {round(gs.score(X_train, y_train), 2)}'
          f'\n\nTest Performance: {round(gs.score(X_test, y_test), 2)}'
          '\n--------------------------------------------------------'
          f'\nSensitivity: {round(tp / (tp + fn), 2)}'
          f'\n\nSpecificity: {round(tn / (tn + fp), 2)}'
          f'\n\nPrecision: {round(tp / (tp + fp), 2)}'
          f'\n\nAccuracy: {round((tp + tn) / (tp + fp + tn + fn), 2)}'
          f'\n\nROC AUC Score: {round(auc, 2)}'
         )
    
    return display(cm_df)

In [32]:
# First model: CountVectorizer and standardized features with Logistic Regression

# Instantiate column transformer object
mct_cvec = make_column_transformer(
        (CountVectorizer(max_df = .98, tokenizer = LemmaTokenizer()), 'all_text'),
    (StandardScaler(), ['num_comments', 'score'])
)

# Instantiate Pipeline and GridSearchCV:
pipe_1 = Pipeline([
    ('mct_cvec', mct_cvec),
    ('logreg', LogisticRegression(penalty = 'l1', solver = 'liblinear'))
])

# Select hyperparameters to tune:
pipe_1_params = {
    'mct_cvec__countvectorizer__ngram_range': [(1, 2), (2, 2), (2, 3)],
    'mct_cvec__countvectorizer__max_features': [500, 1000, 1500],
}

# GridSearch with 5-fold CV
gs_1 = GridSearchCV(pipe_1, param_grid = pipe_1_params, scoring = 'accuracy', cv =5)
    

# Evaluate Model 1
describe_model(gs_1, X_resample, X_test, y_resample, y_test);

Best Paramaters: {'mct_cvec__countvectorizer__max_features': 1500, 'mct_cvec__countvectorizer__ngram_range': (1, 2)}

Best CV Score: 0.96

Raw Train Score: 0.99

Test Performance: 0.94
--------------------------------------------------------
Sensitivity: 0.96

Specificity: 0.9

Precision: 0.95

Accuracy: 0.94

ROC AUC Score: 0.98


Unnamed: 0,Predicted Personal Training,Predicted Physical Therapy
Actual Personal Training,480,53
Actual Physical Therapy,42,945


In [33]:
# For Model 1, identify the features with the highest coefficients.
# Create a DataFrame of coefficients
model_1_coefs = pd.DataFrame(pd.Series(gs_1.best_estimator_.named_steps['logreg'].coef_[0]).sort_values(ascending = False))

# Extract feature names
feature_names = gs_1.best_estimator_.named_steps.mct_cvec.named_transformers_.countvectorizer.get_feature_names()

# List comp.
print(f'Top 5 Physical Therapy Words: {[feature_names[idx] for idx, feature in enumerate(feature_names) if idx in model_1_coefs.head().index]}')
print(f'Top 5 Personal Training Words: {[feature_names[idx] for idx, feature in enumerate(feature_names) if idx in model_1_coefs.tail().index]}')

Top 5 Physical Therapy Words: ['athletic', 'clinic', 'mobility', 'patient', 'pta']
Top 5 Personal Training Words: ['client', 'fitness', 'nasm', 'trainer', 'training']


In [34]:
# Second model: TF-IDF and standardized features with Logistic Regression

# Instantiate column transformer object
mct_cvec = make_column_transformer(
    (TfidfVectorizer(max_df = .98, tokenizer = LemmaTokenizer()), 'all_text'),
    (StandardScaler(), ['num_comments', 'score'])
)

# Instantiate Pipeline and GridSearchCV:
pipe_2 = Pipeline([
    ('mct_cvec', mct_cvec),
    ('logreg', LogisticRegression(penalty = 'l1', solver = 'liblinear'))
])

# Select hyperparameters to tune:
pipe_2_params = {
    'mct_cvec__tfidfvectorizer__ngram_range': [(1, 2), (2, 2), (2, 3)],
    'mct_cvec__tfidfvectorizer__max_features': [500, 1000, 1500],
}

# GridSearch with 5-fold CV
gs_2 = GridSearchCV(pipe_2, param_grid = pipe_2_params, scoring = 'accuracy', cv =5)
    

# Evaluate Model 2
describe_model(gs_2, X_resample, X_test, y_resample, y_test);

Best Paramaters: {'mct_cvec__tfidfvectorizer__max_features': 500, 'mct_cvec__tfidfvectorizer__ngram_range': (1, 2)}

Best CV Score: 0.94

Raw Train Score: 0.96

Test Performance: 0.95
--------------------------------------------------------
Sensitivity: 0.97

Specificity: 0.91

Precision: 0.95

Accuracy: 0.95

ROC AUC Score: 0.98


Unnamed: 0,Predicted Personal Training,Predicted Physical Therapy
Actual Personal Training,483,50
Actual Physical Therapy,28,959


In [35]:
# For Model 2, identify the features with the highest coefficients.
# Create a DataFrame of coefficients
model_2_coefs = pd.DataFrame(pd.Series(gs_2.best_estimator_.named_steps['logreg'].coef_[0]).sort_values(ascending = False))

# Extract feature names
feature_names = gs_2.best_estimator_.named_steps.mct_cvec.named_transformers_.tfidfvectorizer.get_feature_names()

# List comp.
print(f'Top 5 Physical Therapy Words: {[feature_names[idx] for idx, feature in enumerate(feature_names) if idx in model_2_coefs.head().index]}')
print(f'Top 5 Personal Training Words: {[feature_names[idx] for idx, feature in enumerate(feature_names) if idx in model_2_coefs.tail().index]}')

Top 5 Physical Therapy Words: ['clinic', 'pain', 'patient', 'pta', 'therapist']
Top 5 Personal Training Words: ['client', 'fitness', 'nasm', 'trainer', 'training']


In [36]:
# Third model: CountVectorizer with LemmaTokenizer and L1 Penalty Logistic Regression (Text data ONLY)

# Instantiate Pipeline and GridSearchCV:
pipe_3 = Pipeline([
    ('cvec', CountVectorizer(max_df = .98, tokenizer = LemmaTokenizer())),
    ('logreg', LogisticRegression(penalty = 'l1', solver = 'liblinear'))
])

# Select hyperparameters to tune:
pipe_3_params = {
    'cvec__ngram_range': [(1, 2), (2, 2), (2, 3)],
    'cvec__max_features': [500, 1000, 1500],
}

# GridSearch with 5-fold CV
gs_3 = GridSearchCV(pipe_3, param_grid = pipe_3_params, scoring = 'accuracy', cv =5)
    

# Evaluate Model 3
describe_model(gs_3, X_resample['all_text'], X_test['all_text'], y_resample, y_test);

Best Paramaters: {'cvec__max_features': 1500, 'cvec__ngram_range': (1, 2)}

Best CV Score: 0.96

Raw Train Score: 0.99

Test Performance: 0.94
--------------------------------------------------------
Sensitivity: 0.96

Specificity: 0.9

Precision: 0.95

Accuracy: 0.94

ROC AUC Score: 0.98


Unnamed: 0,Predicted Personal Training,Predicted Physical Therapy
Actual Personal Training,480,53
Actual Physical Therapy,42,945


In [37]:
# For Model 3, identify the features with the highest coefficients.
# Create a DataFrame of coefficients
model_3_coefs = pd.DataFrame(pd.Series(gs_3.best_estimator_.named_steps['logreg'].coef_[0]).sort_values(ascending = False))

# Extract feature names
feature_names = gs_3.best_estimator_.named_steps['cvec'].get_feature_names()

# List comp.
print(f'Top 5 Physical Therapy Words: {[feature_names[idx] for idx, feature in enumerate(feature_names) if idx in model_3_coefs.head().index]}')
print(f'Top 5 Personal Training Words: {[feature_names[idx] for idx, feature in enumerate(feature_names) if idx in model_3_coefs.tail().index]}')

Top 5 Physical Therapy Words: ['athletic', 'clinic', 'mobility', 'patient', 'pta']
Top 5 Personal Training Words: ['client', 'fitness', 'nasm', 'trainer', 'training']


In [41]:
# Fourth model: TF-IDF (stopwords removed) with LemmaTokenizer and multinomial Naive Bayes (Text data ONLY)

# Instantiate Pipeline and GridSearchCV:
pipe_4 = Pipeline([
    ('cvec', CountVectorizer(max_df = .98, tokenizer = LemmaTokenizer())),
    ('multi_nb', MultinomialNB())
])

# Select hyperparameters to tune:
pipe_4_params = {
    'cvec__ngram_range': [(1, 1), (1, 2)],
    'cvec__max_features': [1000, 1500],
    'multi_nb__alpha': np.linspace(1e-10, 1, 25)
}

# GridSearch with 5-fold CV
gs_4 = GridSearchCV(pipe_4, param_grid = pipe_4_params, scoring = 'accuracy', cv =5)
    

# Evaluate Model #1
describe_model(gs_4, X_resample['all_text'], X_test['all_text'], y_resample, y_test);

Best Paramaters: {'cvec__max_features': 1500, 'cvec__ngram_range': (1, 2), 'multi_nb__alpha': 1e-10}

Best CV Score: 0.95

Raw Train Score: 0.96

Test Performance: 0.94
--------------------------------------------------------
Sensitivity: 0.95

Specificity: 0.91

Precision: 0.95

Accuracy: 0.94

ROC AUC Score: 0.97


Unnamed: 0,Predicted Personal Training,Predicted Physical Therapy
Actual Personal Training,486,47
Actual Physical Therapy,50,937


In [42]:
# For Model 4, identify the features with the highest coefficients.
# Create a DataFrame of coefficients
model_4_coefs = pd.DataFrame(pd.Series(gs_4.best_estimator_.named_steps['multi_nb'].coef_[0]).sort_values(ascending = False))

# Extract feature names
feature_names = gs_4.best_estimator_.named_steps['cvec'].get_feature_names()

# List comp.
print(f'Top 5 Physical Therapy Words: {[feature_names[idx] for idx, feature in enumerate(feature_names) if idx in model_4_coefs.head().index]}')
print(f'Top 5 Personal Training Words: {[feature_names[idx] for idx, feature in enumerate(feature_names) if idx in model_4_coefs.tail().index]}')

Top 5 Physical Therapy Words: ['just', 'like', 'pain', 'patient', 'pt']
Top 5 Personal Training Words: ['calorie', 'fitness professional', 'phd', 'pn level', 'user']
