### Import Libraries

In [1]:
import pandas as pd
import numpy as np

# visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

#Sci-kit libraries
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier


### Loading in Data

In [2]:
posts = pd.read_csv('../data/wine_beer_concatenated.csv')

In [3]:
posts.head()

Unnamed: 0,selftext,subreddit
0,Resources for a newbie home winemaker I want t...,winemaking
1,A question about kit wine I’ve made quite a fe...,winemaking
2,"Riesling - My First Wine Hi Everyone,\n\nI am ...",winemaking
3,Persimmon Wine Straining? Hello! This is my fi...,winemaking
4,Wire used for Trellis Hey all - I'm planting s...,winemaking


In [4]:
posts.tail()

Unnamed: 0,selftext,subreddit
4297,"Any input on this recipe 6 lb - Pale Malt, Mar...",Homebrewing
4298,"Bottling Tepache Hello brewers, \n\na couple d...",Homebrewing
4299,Wiring a spa panel for ebiab gfi I've been all...,Homebrewing
4300,What can I ferment at 60-65°F? The basement in...,Homebrewing
4301,Does a decoction mash with wheat leave a bread...,Homebrewing


In [5]:
posts['subreddit'].value_counts()

Homebrewing    2367
winemaking     1935
Name: subreddit, dtype: int64

### Binarizing subreddit column for targeting

In [6]:
posts['subreddit'] = posts['subreddit'].map({'winemaking': 0, 'Homebrewing': 1})

In [7]:
posts['subreddit'].value_counts()

1    2367
0    1935
Name: subreddit, dtype: int64

# Baseline Model
**Interpretation**
* If we guessed at random we would select Homebrewing subreddit 55% of the time and winemaking subreddit 45% of the time. 

In [8]:
posts['subreddit'].value_counts(normalize = True)

1    0.550209
0    0.449791
Name: subreddit, dtype: float64

### Create Target and Predictor Variables

In [9]:
X = posts['selftext']
y = posts['subreddit']

### Train/Test/Split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, 
                                                   random_state = 42, 
                                                   stratify = y)

Create loop to loop through vectorizers, create a pipeline and loop through potential models while grid search looks for optimal hyperparameters. 

Information on setting voting classifier parameter to 'soft' or 'hard'. https://towardsdatascience.com/how-voting-classifiers-work-f1c8e41d30ff

Soft on even number of classifiers. Hard on odd number of classifiers. Since I am using 4 classifiers I will set it to soft.

In [14]:
## creating a list of our two chosen vectorizers to iterate through in our grid search
vectorizer = [CountVectorizer(), TfidfVectorizer()]

## creating variables to accept tuning parameters
max_feat = [300, 500]  
ngram_range = [(1, 3), (1, 2)] 
stop_words = [None, 'english'] 
max_df = [0.9, 0.8] 

## creating any empty results list to capture our cv_results_ at the end of each iteration
results = []

## looping through both vectorizers
for vect in vectorizer:
    
    ## creating a pipeline for our vectorizer and classifier models
    pipeline = Pipeline([
        ('vect', vect),
        ('clf', LogisticRegression())
    ])
    
    ## generating our parameters for vectorizers
    vect_params = {'vect__max_features': max_feat,
                    'vect__stop_words': stop_words,
                    'vect__ngram_range': ngram_range,
                    'vect__max_df': max_df
                  }
    parameters = [
        {
            ## Logistic Regression
            'vect__max_features': max_feat,
            'vect__stop_words': stop_words,
            'vect__ngram_range': ngram_range,
            'vect__max_df': max_df,
            'clf': (LogisticRegression(solver='liblinear'), ), ## setting our first classifier model
            'clf__penalty': ('l1', 'l2'), #2
            'clf__C': (.5, 1.0), #4 288*2
        }, 
        {
            ## Multinomial Bayes
            'vect__max_features': max_feat,
            'vect__stop_words': stop_words,
            'vect__ngram_range': ngram_range,
            'vect__max_df': max_df,
            'clf': (MultinomialNB(), ),  ## setting our second classifier model
            'clf__alpha': (.5, 1.0)  #2 72*2
        },
        {
            ## SVC
            'vect__max_features': max_feat,
            'vect__stop_words': stop_words,
            'vect__ngram_range': ngram_range,
            'vect__max_df': max_df,
            'clf': (SVC(gamma='scale', ), ),
            'clf__kernel': ('rbf', 'poly') 
        },
        {
            ## RandomForestClassifier
            'vect__max_features': max_feat,
            'vect__stop_words': stop_words,
            'vect__ngram_range': ngram_range,
            'vect__max_df': max_df,
            'clf': (RandomForestClassifier(n_estimators=50, min_samples_split=5), ),
        },
        {
            ## putting together an ensemble model
            'vect__max_features': max_feat,
            'vect__stop_words': stop_words,
            'vect__ngram_range': ngram_range,
            'vect__max_df': max_df,
            'clf': (VotingClassifier(estimators=[('lr', LogisticRegression()), 
                                                 ('rf', RandomForestClassifier()), 
                                                 ('mnb', MultinomialNB()), 
                                                 ('svc', SVC())],                                           
                                            voting='hard'), )
        }
        
    ]
    
    ## performing our grid search with the inherited pipeline and parameters
    grid_search = GridSearchCV(pipeline, 
                               parameters,
                               cv=3,
                               n_jobs=-1,
                               verbose=1,
                               return_train_score=True
                              )
    
    ## running an if statement to print the type of vectorizer used
    if vect == vectorizer[0]:
        vect_string = "CountVectorizer"
    
    else:
        vect_string = "Tf-IDF Vectorizer"
    
    ## fitting our model and printing our best scores and parameters
    grid_search.fit(X_train, y_train)
    print(f'''Best score for {vect_string} is: 
    {round(grid_search.best_score_, 4)}
    ''')
    print(grid_search.best_params_)
    print("")
    
    ## appending our cv_results_ to the end of results
    results.append(grid_search.cv_results_)

Fitting 3 folds for each of 160 candidates, totalling 480 fits
Best score for CountVectorizer is: 
    0.9173
    
{'clf': VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()),
                             ('mnb', MultinomialNB()), ('svc', SVC())]), 'vect__max_df': 0.9, 'vect__max_features': 500, 'vect__ngram_range': (1, 2), 'vect__stop_words': 'english'}

Fitting 3 folds for each of 160 candidates, totalling 480 fits
Best score for Tf-IDF Vectorizer is: 
    0.917
    
{'clf': RandomForestClassifier(min_samples_split=5, n_estimators=50), 'vect__max_df': 0.9, 'vect__max_features': 500, 'vect__ngram_range': (1, 3), 'vect__stop_words': 'english'}



In [34]:
results_df = pd.DataFrame(grid_search.cv_results_)

In [35]:
results_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf,param_clf__C,param_clf__penalty,param_vect__max_df,param_vect__max_features,param_vect__ngram_range,...,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,1.633313,0.003763,0.486775,0.010979,LogisticRegression(solver='liblinear'),0.5,l1,0.9,300,"(1, 3)",...,0.886454,0.886341,0.886748,0.000498,138,0.891380,0.884903,0.895916,0.890733,0.004519
1,1.359614,0.121083,0.430986,0.145028,LogisticRegression(solver='liblinear'),0.5,l1,0.9,300,"(1, 3)",...,0.904382,0.893320,0.897043,0.005190,86,0.901345,0.899851,0.900896,0.900697,0.000626
2,1.363829,0.124363,0.471265,0.077995,LogisticRegression(solver='liblinear'),0.5,l1,0.9,300,"(1, 2)",...,0.887450,0.887338,0.887413,0.000053,136,0.891380,0.884903,0.895916,0.890733,0.004519
3,0.852647,0.052690,0.264987,0.011226,LogisticRegression(solver='liblinear'),0.5,l1,0.9,300,"(1, 2)",...,0.904382,0.893320,0.897043,0.005190,86,0.901345,0.900349,0.900896,0.900863,0.000407
4,1.749457,0.040503,0.473530,0.023937,LogisticRegression(solver='liblinear'),0.5,l1,0.9,500,"(1, 3)",...,0.892430,0.885344,0.888408,0.002971,132,0.891380,0.892875,0.894422,0.892892,0.001242
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,2.403066,0.032255,0.556340,0.042589,"VotingClassifier(estimators=[('lr', LogisticRe...",,,0.8,300,"(1, 2)",...,0.918327,0.908275,0.910328,0.005875,16,0.975087,0.975087,0.972610,0.974261,0.001168
156,4.913290,0.158064,1.318622,0.035868,"VotingClassifier(estimators=[('lr', LogisticRe...",,,0.8,500,"(1, 3)",...,0.913347,0.906281,0.908335,0.003563,22,0.976582,0.978575,0.975598,0.976918,0.001239
157,2.945695,0.035838,0.725745,0.012690,"VotingClassifier(estimators=[('lr', LogisticRe...",,,0.8,500,"(1, 3)",...,0.918327,0.920239,0.914980,0.006135,1,0.980568,0.980070,0.980578,0.980405,0.000237
158,4.316001,0.211861,1.272064,0.070193,"VotingClassifier(estimators=[('lr', LogisticRe...",,,0.8,500,"(1, 2)",...,0.914343,0.904287,0.908667,0.004206,21,0.976582,0.979073,0.975100,0.976918,0.001640


# Modeling Metrics With Confusion Matrix

Accuracy

Misclassification Rate

Sensitivity/Recall

Specificity

Precision

# ROC Curve