### Import Libraries

In [34]:
import pandas as pd
import numpy as np

# visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

#Sci-kit libraries
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline

#Sci-kit Modeling Libraries
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.ensemble import AdaBoostClassifier



#Sci-kit Metric Libraries
from sklearn.metrics import accuracy_score, confusion_matrix, plot_confusion_matrix
from sklearn.metrics import precision_score, recall_score, classification_report
from sklearn.metrics import roc_curve, plot_roc_curve, auc, f1_score
from sklearn import metrics

### Loading in Data

In [15]:
posts = pd.read_csv('../data/wine_beer_concatenated.csv')

In [16]:
posts.head(1)

Unnamed: 0,selftext,subreddit
0,Resources for a newbie home winemaker I want t...,winemaking


In [17]:
posts.tail(1)

Unnamed: 0,selftext,subreddit
4301,Does a decoction mash with wheat leave a bread...,Homebrewing


In [18]:
posts['subreddit'].value_counts()

Homebrewing    2367
winemaking     1935
Name: subreddit, dtype: int64

### Binarizing subreddit column for targeting

In [19]:
posts['subreddit'] = posts['subreddit'].map({'winemaking': 0, 'Homebrewing': 1})

In [20]:
posts['subreddit'].value_counts()

1    2367
0    1935
Name: subreddit, dtype: int64

### Create Target and Predictor Variables

In [21]:
X = posts['selftext']
y = posts['subreddit']

### Train/Test/Split

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, 
                                                   random_state = 42, 
                                                   stratify = y)

In [23]:
## creating a list of our two chosen vectorizers to iterate through in our grid search
vectorizer = [CountVectorizer(), TfidfVectorizer()]

In [24]:
## creating variables to accept tuning parameters
max_feat = [300, 500]  
ngram_range = [(1, 3), (1, 2)] 
stop_words = [None, 'english'] 
max_df = [0.9, 0.8] 

Create loop to loop through vectorizers, create a pipeline and loop through potential models while grid search looks for optimal hyperparameters. 

Information on setting voting classifier parameter to 'soft' or 'hard'. https://towardsdatascience.com/how-voting-classifiers-work-f1c8e41d30ff

Soft on even number of classifiers. Hard on odd number of classifiers. Since I am using 4 classifiers I tried to set it to soft but kept getting error messages. I am imagining one of the classifiers is not working well with 'soft.' I returned to using 'hard' and it worked fine. 

In [25]:
## creating any empty results list to capture our cv_results_ at the end of each iteration
results = []

## looping through both vectorizers
for vect in vectorizer:
    
    #### Pipeline for our Vectorizer and Classifier Models ####
    pipeline = Pipeline([
        ('vect', vect),
        ('clf', LogisticRegression())])
    
    instantiations = [ #### Beginning of Instantiations List ####
        {
            ### Log Reg Vect Hyperparameters ###
            
            'vect__max_features': max_feat,
            'vect__stop_words': stop_words,
            'vect__ngram_range': ngram_range,
            'vect__max_df': max_df,
            
            ## Instantiate Log Reg and Hyperparamaters
            'clf': (LogisticRegression(solver='liblinear'), ), ## setting our first classifier model
            'clf__penalty': ('l1', 'l2'),
            'clf__C': (.5, 1.0), 
        }, 
    #######
        {
            
            ### Multinomial Bayes Vect Hyperparameters ###
            'vect__max_features': max_feat,
            'vect__stop_words': stop_words,
            'vect__ngram_range': ngram_range,
            'vect__max_df': max_df,
            
            ## Instantiate Mulinomial NB and Hyperparamters ##
            'clf': (MultinomialNB(), ),  
            'clf__alpha': (.5, 1.0)
        },
    #######   
        {
            ### SVC Vect Hyperparameters ###
            'vect__max_features': max_feat,
            'vect__stop_words': stop_words,
            'vect__ngram_range': ngram_range,
            'vect__max_df': max_df,
            
            ## SVC Instantiation and Hyperparameters
            'clf': (SVC(gamma='scale', ), ),
            'clf__kernel': ('rbf', 'poly') 
        },
    #######  
        {
            ### RandomForestClassifier Vect Hyperparameters ###
            'vect__max_features': max_feat,
            'vect__stop_words': stop_words,
            'vect__ngram_range': ngram_range,
            'vect__max_df': max_df,
            
            ## Instantiate RandomForestClassifier ##
            'clf': (RandomForestClassifier(n_estimators=50, min_samples_split=5), ),
        },
    #######
        {
            ### Voting Classifier Vect Hyperparameters ###
            'vect__max_features': max_feat,
            'vect__stop_words': stop_words,
            'vect__ngram_range': ngram_range,
            'vect__max_df': max_df,
            
            ## Instantiating Ensemble Voting Classifier ##
            'clf': (VotingClassifier(estimators=[('lr', LogisticRegression()), 
                                                 ('rf', RandomForestClassifier()), 
                                                 ('mnb', MultinomialNB()), 
                                                 ('svc', SVC())],                                           
                                            voting='hard'), )
        }    
                    ] #### end of instantiations list ####
    
    
    #### Grid Search ####
    
    grid_search = GridSearchCV(pipeline, 
                               instantiations,
                               cv=3,
                               n_jobs=-1,
                               verbose=3,
                               return_train_score=True)
    
    #### Output Results ####
    
    ## running an if statement to print the type of vectorizer used
    if vect == vectorizer[0]:
        vect_string = "CountVectorizer"
    
    else:
        vect_string = "Tf-IDF Vectorizer"
    
    ## fitting our model and printing our best scores and parameters
    grid_search.fit(X_train, y_train)
    print(f'''Best score for {vect_string} is: 
    {round(grid_search.best_score_, 4)}
    ''')
    print(grid_search.best_params_)
    print("")
    
    ## appending our cv_results_ to the end of results
    results.append(grid_search.cv_results_)

Fitting 3 folds for each of 160 candidates, totalling 480 fits
Best score for CountVectorizer is: 
    0.9176
    
{'clf': VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()),
                             ('mnb', MultinomialNB()), ('svc', SVC())]), 'vect__max_df': 0.9, 'vect__max_features': 300, 'vect__ngram_range': (1, 2), 'vect__stop_words': 'english'}

Fitting 3 folds for each of 160 candidates, totalling 480 fits
Best score for Tf-IDF Vectorizer is: 
    0.9166
    
{'clf': VotingClassifier(estimators=[('lr', LogisticRegression()),
                             ('rf', RandomForestClassifier()),
                             ('mnb', MultinomialNB()), ('svc', SVC())]), 'vect__max_df': 0.8, 'vect__max_features': 500, 'vect__ngram_range': (1, 3), 'vect__stop_words': 'english'}



In [26]:
results_df = pd.DataFrame(grid_search.cv_results_)

In [32]:
results_df.head(1)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf,param_clf__C,param_clf__penalty,param_vect__max_df,param_vect__max_features,param_vect__ngram_range,...,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,5.173456,0.088117,0.82536,0.030476,LogisticRegression(solver='liblinear'),0.5,l1,0.9,300,"(1, 3)",...,0.886454,0.886341,0.886748,0.000498,139,0.891878,0.884903,0.895916,0.890899,0.004549


In [49]:
results_df['train_test_var'] = abs(results_df['mean_test_score'] - results_df['mean_train_score'])

In [50]:
results_df['train_test_var']

0      0.004151
1      0.004153
2      0.003652
3      0.004153
4      0.004318
         ...   
155    0.063768
156    0.068417
157    0.063931
158    0.069413
159    0.067916
Name: train_test_var, Length: 160, dtype: float64

In [51]:
results_df.loc[: ,['param_clf', 'train_test_var']].nsmallest(25, 'train_test_var')

Unnamed: 0,param_clf,train_test_var
2,LogisticRegression(solver='liblinear'),0.003652
8,LogisticRegression(solver='liblinear'),0.003985
0,LogisticRegression(solver='liblinear'),0.004151
10,LogisticRegression(solver='liblinear'),0.004151
6,LogisticRegression(solver='liblinear'),0.004152
1,LogisticRegression(solver='liblinear'),0.004153
9,LogisticRegression(solver='liblinear'),0.004153
3,LogisticRegression(solver='liblinear'),0.004153
11,LogisticRegression(solver='liblinear'),0.004153
4,LogisticRegression(solver='liblinear'),0.004318


In [36]:
results_df.to_excel('../data/scores_1.xlsx', engine = 'xlsxwriter')

# Modeling Metrics With Confusion Matrix

Accuracy

Misclassification Rate

Sensitivity/Recall

Specificity

Precision

# ROC Curve