### Import Libraries

In [2]:
import pandas as pd
import numpy as np

# visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

#Sci-kit libraries
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier


### Loading in Data

In [3]:
posts = pd.read_csv('../data/wine_beer_concatenated.csv')

In [4]:
posts.head()

Unnamed: 0,selftext,subreddit
0,Resources for a newbie home winemaker I want t...,winemaking
1,A question about kit wine I’ve made quite a fe...,winemaking
2,"Riesling - My First Wine Hi Everyone,\n\nI am ...",winemaking
3,Persimmon Wine Straining? Hello! This is my fi...,winemaking
4,Wire used for Trellis Hey all - I'm planting s...,winemaking


In [5]:
posts.tail()

Unnamed: 0,selftext,subreddit
4297,"Any input on this recipe 6 lb - Pale Malt, Mar...",Homebrewing
4298,"Bottling Tepache Hello brewers, \n\na couple d...",Homebrewing
4299,Wiring a spa panel for ebiab gfi I've been all...,Homebrewing
4300,What can I ferment at 60-65°F? The basement in...,Homebrewing
4301,Does a decoction mash with wheat leave a bread...,Homebrewing


In [6]:
posts['subreddit'].value_counts()

Homebrewing    2367
winemaking     1935
Name: subreddit, dtype: int64

### Binarizing subreddit column for targeting

In [7]:
posts['subreddit'] = posts['subreddit'].map({'winemaking': 0, 'Homebrewing': 1})

# Baseline Model
**Interpretation**
* If we guessed at random we would select Homebrewing subreddit 55% of the time and winemaking subreddit 45% of the time. 

In [8]:
posts['subreddit'].value_counts(normalize = True)

1    0.550209
0    0.449791
Name: subreddit, dtype: float64

### Create Target and Predictor Variables

In [9]:
X = posts['selftext']
y = posts['subreddit']

### Train/Test/Split

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, 
                                                   random_state = 42, 
                                                   stratify = y)

Create a list of vectorizers to iterate through with grid search

In [11]:
vects = [CountVectorizer(), TfidfVectorizer()]

Create Vectorizer hyperparameter tuning variables

In [12]:
max_features = [300, 5_000]
max_df = [.7, .8]
min_df = [2, 3]
ngram_range = [(1,2), (1,3)]
stop_words = [None, 'english']

Create loop to loop through vectorizers, create a pipeline and loop through potential models while grid search looks for optimal hyperparameters. 

* Notes from pipeline documentation:
* The purpose of the pipeline is to assemble several steps that can be cross-validated together while setting different parameters.
* For this, it enables setting parameters of the various steps using their names and the parameter name separated by a '__', as in the example below.
* A step's estimator may be replaced entirely by setting the parameter with its name to another estimator, or a transformer removed by setting it to 'passthrough' or ``None``.

In [16]:
# Create an empty results list to capture cv_results_ at the end of each iteration. 
vectorizer_results = []

# loop through both vectorizers
for vect in vects:


    # Create a pipeline for vectorizer and classifier models
    pipeline = Pipeline([
        ('vect', vect),
        ('clf', LogisticRegression())
    ])
    
    
    # Generate parameters for vectorizer
    vect_params = {
        'vect__max_features': max_features, 
        'vect__max_df': max_df,
        'vect__min_df': min_df,
        'vect__stop_words': stop_words,
        'vect__ngram_range': ngram_range
    }
    
    
    # Create a list of dictionaries that contain parameters for each classifier/model 
    parameters = [
    {
        #Log Regression
        'vect__max_features': max_features, 
        'vect__max_df': max_df,
        'vect__min_df': min_df,
        'vect__stop_words': stop_words,
        'vect__ngram_range': ngram_range,
        'clf': (LogisticRegression(solver = 'liblinear'), ),
        'clf__penalty': ('l1', 'l2'),
        'clf__C': (.5, 1.0)
        
    },
        
    {#Multinomial Bayes
        'vect__max_features': max_features, 
        'vect__max_df': max_df,
        'vect__min_df': min_df,
        'vect__stop_words': stop_words,
        'vect__ngram_range': ngram_range,
        'clf': (MultinomialNB(), ),
        'clf__alpha': (.5, 1.0)
    },
        
    {
        #SVC
        'vect__max_features': max_features, 
        'vect__max_df': max_df,
        'vect__min_df': min_df,
        'vect__stop_words': stop_words,
        'vect__ngram_range': ngram_range,
        'clf': (SVC(gamma = 'scale', ), ),
        'clf__kernel': ('rbf', 'poly')
        
           
    },
        
    {
        #Random Forest Classifier
        'vect__max_features': max_features, 
        'vect__max_df': max_df,
        'vect__min_df': min_df,
        'vect__stop_words': stop_words,
        'vect__ngram_range': ngram_range,
        'clf': (RandomForestClassifier(n_estimators = 50, min_samples_split = 5), )
          
    },
        
    {
        #Ensemble Model
        'vect__max_features': max_features, 
        'vect__max_df': max_df,
        'vect__min_df': min_df,
        'vect__stop_words': stop_words,
        'vect__ngram_range': ngram_range,
        'clf': (VotingClassifier(estimators = [('lr', LogisticRegression()),
                                               ('mnb', MultinomialNB()),
                                               ('rf', RandomForestClassifier()),
                                               ('svc', SVC())],
                                 voting = 'soft'), )
        
   
    }
        
        
    ] 
    # Perform grid search with the inherited pipeline and parameters
    grid_search = GridSearchCV(pipeline,
                               parameters,
                               cv=3,
                               n_jobs =-1,
                               verbose =2,
                               return_train_score =True)
    
    # Print the type of vectorizer used by running an if statement
    if vect == vects[0]:
        vect_string = 'Count Vectorizer'
    else:
        vect_string = 'Tf-IDF Vectorizer'
    # Fit model and print best scores and parameters
        #Fit
        grid_search.fit(X_train, y_train)
    
        #print
        print(f'''Best score for {vect_string} is:
        {round(grid_search.best_score_, 4)}''')
        print('')
        
    

Fitting 3 folds for each of 320 candidates, totalling 960 fits


 0.88940483 0.8967106  0.87778166 0.88243172 0.87811367 0.88243172
 0.87778166 0.88276373 0.87811367 0.88243172 0.88874082 0.89737494
 0.88874082 0.89704293 0.88874082 0.8967106  0.88874082 0.8967106
 0.87778166 0.88243172 0.87778166 0.88243172 0.87744966 0.88276373
 0.87811367 0.88243172 0.8890715  0.90368602 0.88973551 0.90368602
 0.88973551 0.90302168 0.88873949 0.90368569 0.90003297 0.9106621
 0.90003363 0.91099444 0.89936896 0.90933342 0.89970163 0.90966576
 0.89006751 0.90368602 0.88973584 0.90368602 0.8894035  0.90302168
 0.89006851 0.90368569 0.9000333  0.9106621  0.9000333  0.91099444
 0.89936896 0.90933342 0.89936929 0.90966576 0.89139653 0.90434937
 0.89139686 0.90434937 0.89106452 0.90401703 0.89139653 0.90401703
 0.89073219 0.89471923 0.89039985 0.89505123 0.89040051 0.89505157
 0.89040018 0.89505157 0.89139686 0.90434937 0.89139686 0.90434937
 0.89139686 0.90401703 0.89172919 0.90401703 0.89073219 0.89471923
 0.89073219 0.89505123 0.89073252 0.89505157 0.89073219 0.895051

Best score for Tf-IDF Vectorizer is:
        0.921



In [None]:
# append cv_results_ to the end of the results. 
    vectorizer_results.append(grid_search.cv_results_)

# Modeling Metrics With Confusion Matrix

Accuracy

Misclassification Rate

Sensitivity/Recall

Specificity

Precision

# ROC Curve