<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Functions" data-toc-modified-id="Functions-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Functions</a></span><ul class="toc-item"><li><span><a href="#Fit-&amp;-Predict-Models" data-toc-modified-id="Fit-&amp;-Predict-Models-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Fit &amp; Predict Models</a></span></li><li><span><a href="#Plot-Normalized-Confusion-Matrices" data-toc-modified-id="Plot-Normalized-Confusion-Matrices-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Plot Normalized Confusion Matrices</a></span></li><li><span><a href="#Print-Classification-Reports" data-toc-modified-id="Print-Classification-Reports-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Print Classification Reports</a></span></li><li><span><a href="#Plot-ROC-Curves" data-toc-modified-id="Plot-ROC-Curves-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Plot ROC Curves</a></span></li><li><span><a href="#Accuracy-Scores" data-toc-modified-id="Accuracy-Scores-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>Accuracy Scores</a></span></li><li><span><a href="#Feature-Importance-Plot" data-toc-modified-id="Feature-Importance-Plot-1.6"><span class="toc-item-num">1.6&nbsp;&nbsp;</span>Feature Importance Plot</a></span></li><li><span><a href="#Grid-Search" data-toc-modified-id="Grid-Search-1.7"><span class="toc-item-num">1.7&nbsp;&nbsp;</span>Grid Search</a></span></li></ul></li></ul></div>

## Functions

### Fit & Predict Models

In [6]:
### Fit & Predict Models ###

# Define function to fit our models and predict on the training and test sets

def clf_pred(models):
    '''
    Fits every classifier in `models`, predicts on its train and test
    sets and collects the predictions and metrics in one DataFrame.

    Inputs:
        models: DataFrame with one row per classifier and the columns
            'clf_name', 'clfs' (an estimator exposing fit, predict and
            predict_proba), 'X_train', 'X_test', 'y_train', 'y_test'.

    Returns:
        DataFrame with one row per classifier, in the row order of
        `models` and on a fresh 0..n-1 index, holding the estimator,
        the object returned by fit, the fit runtime in seconds, the
        predictions, probability scores, ROC curve points, the metric
        scores and the X & y data that were used.

    NOTE(review): column 1 of predict_proba is used as the positive
        class score and the metric functions are called with their
        defaults, so this presumably expects binary targets - confirm.
    '''

    # Column Names - the order here is the column order of the result
    columns = ['Classifier',
               'Model',
               'Fitted Model',
               'Runtime',
               'Train Preds',
               'Test Preds',
               'Train y-Score',
               'Test y-Score',
               'Train FPR',
               'Train TPR',
               'Test FPR',
               'Test TPR',
               'Train Precision',
               'Test Precision',
               'Train Recall',
               'Test Recall',
               'Train F1',
               'Test F1',
               'Train Accuracy',
               'Test Accuracy',
               'Train ROC AUC',
               'Test ROC AUC',
               'X_train',
               'X_test',
               'y_train',
               'y_test'
              ]
    position = {name: idx for idx, name in enumerate(columns)}

    # Initialize matrix to fill (object dtype so a cell can hold an
    # estimator, an array or a DataFrame)
    clf_df = np.zeros((len(models), len(columns)), dtype=object)

    # Index the matrix by position, not by the DataFrame label, so the
    # function also works when `models` has a filtered, shuffled or
    # non-integer index
    for row, (_, model) in enumerate(models.iterrows()):

        ### Assign Variables ###
        # Every row carries its own train & test data
        estimator = model['clfs']
        X_train = model['X_train']
        X_test = model['X_test']
        y_train = model['y_train']
        y_test = model['y_test']

        ### Fit ###
        # Only the call to fit is timed
        start_time = time.time()
        fitted = estimator.fit(X_train, y_train)
        runtime = time.time() - start_time

        ### Predict ###
        y_pred_train = fitted.predict(X_train)
        y_pred_test = fitted.predict(X_test)

        # y_score
        y_score_train = fitted.predict_proba(X_train)
        y_score_test = fitted.predict_proba(X_test)

        # Score of the positive class (second predict_proba column)
        pos_score_train = y_score_train[:, 1]
        pos_score_test = y_score_test[:, 1]

        # False & True Positive Rates (thresholds are not kept)
        fpr_train, tpr_train, _ = roc_curve(y_train, pos_score_train)
        fpr_test, tpr_test, _ = roc_curve(y_test, pos_score_test)

        results = {
            'Classifier': model['clf_name'],
            'Model': estimator,
            'Fitted Model': fitted,
            'Runtime': runtime,
            'Train Preds': y_pred_train,
            'Test Preds': y_pred_test,
            'Train y-Score': y_score_train,
            'Test y-Score': y_score_test,
            'Train FPR': fpr_train,
            'Train TPR': tpr_train,
            'Test FPR': fpr_test,
            'Test TPR': tpr_test,

            ### Accuracy Scores ###
            'Train Precision': precision_score(y_train, y_pred_train),
            'Test Precision': precision_score(y_test, y_pred_test),
            'Train Recall': recall_score(y_train, y_pred_train),
            'Test Recall': recall_score(y_test, y_pred_test),
            'Train F1': f1_score(y_train, y_pred_train),
            'Test F1': f1_score(y_test, y_pred_test),
            'Train Accuracy': accuracy_score(y_train, y_pred_train),
            'Test Accuracy': accuracy_score(y_test, y_pred_test),

            # AUC is computed from the same probability scores as the
            # ROC curve points above. Hard class predictions would only
            # describe a single threshold, and the value would not match
            # the curve it is reported next to.
            'Train ROC AUC': roc_auc_score(y_train, pos_score_train),
            'Test ROC AUC': roc_auc_score(y_test, pos_score_test),

            ### Add X & y values to have everything in one place ###
            'X_train': X_train,
            'X_test': X_test,
            'y_train': y_train,
            'y_test': y_test,
        }

        for name, value in results.items():
            clf_df[row, position[name]] = value

    ### Create DataFrame ###
    clf_df = pd.DataFrame(clf_df, columns=columns)

    return clf_df

### Plot Normalized Confusion Matrices

In [None]:
### Accuracy Metrics - Plot Confusion Matrices ###

# Define function to plot normalized confusion matrices
def norm_cm(clf_df):
    '''
    Function that plots normalized confusion matrices for the train
    and test sets of each of our classification models.

    Inputs:
        clf_df: our DataFrame with our classifier metrics. The columns
            'Classifier', 'Model', 'X_train', 'y_train', 'X_test' and
            'y_test' are read.

    Returns: None. For each row, one figure with a 'Train' and a 'Test'
        confusion matrix is saved as 'cm_<Classifier>.png' in the
        current working directory and then shown.

    NOTE(review): plot_confusion_matrix was removed from scikit-learn
        in version 1.2 - confirm the installed version still provides it.
    '''

    for _, clf in clf_df.iterrows():

        classifier = clf['Classifier']

        # Create Subplots - one axis per data set
        fig, axes = plt.subplots(1, 2, figsize=(12,6))

        # Figure Title
        fig.suptitle(f"{classifier}:")

        ### Plot ###
        # NOTE(review): 'Model' has to be fitted already - presumably
        # fit() fitted it in place when clf_df was built; confirm.
        for ax, split, title in zip(axes, ('train', 'test'), ('Train', 'Test')):
            plot_confusion_matrix(clf['Model'],
                                  clf[f'X_{split}'],
                                  clf[f'y_{split}'],
                                  cmap='Blues',
                                  normalize='true',
                                  ax=ax)
            # Subplot Title
            ax.set_title(title)

        plt.savefig(f'cm_{classifier}.png', dpi=500, orientation='landscape', bbox_inches='tight')
        plt.show()

        # Release the figure so that figures do not pile up, one per
        # classifier, when this runs outside of an inline backend
        plt.close(fig)

    return

### Print Classification Reports

In [None]:
### Accuracy Metrics - Print Classification Reports ###
                     
# Define function to print out classification reports 

def clf_reports(clf_df):
    '''
    Function that prints out classification reports.

    Inputs:
        clf_df: our DataFrame with our classifier metrics. The columns
            'Classifier', 'y_train', 'Train Preds', 'y_test' and
            'Test Preds' are read.

    Returns: None. Prints the classifier name followed by a train and
        a test classification report for each row of the DataFrame.
    '''
    # (heading to print, column with true labels, column with predictions)
    sections = (
        ("Train:", 'y_train', 'Train Preds'),
        ("\nTest:", 'y_test', 'Test Preds'),
    )

    for _, row in clf_df.iterrows():

        name = row['Classifier']
        print(f"{name}:\n")

        for heading, truth_col, pred_col in sections:
            print(heading)
            print(classification_report(row[truth_col], row[pred_col]))

    return

### Plot ROC Curves

In [None]:
### Accuracy Metrics - Plot ROC Curves ###
              
# Define function to plot ROC curves together

def plot_roc(clf_df):
    '''
    Plots ROC curves for each classifier in DataFrame on
    the same plot, train set on the left and test set on the
    right. Prints AUC value next to classifier name in plot legend.

    Input: DataFrame containing classifier prediction metrics. The
        columns 'Classifier', 'Train FPR', 'Train TPR', 'Train ROC AUC',
        'Test FPR', 'Test TPR' and 'Test ROC AUC' are read.

    Output: None. The ROC plot is saved as 'roc_curve.png' in the
        current working directory. Nothing is saved when the
        DataFrame has no rows.
    '''

    # Set figure size & axes
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12,6))
    fig.suptitle('ROC Curves', fontsize=24)

    # The name of the data set is both the subplot title and the
    # prefix of the DataFrame columns that belong to it
    panels = (('Train', ax1), ('Test', ax2))

    # Plot each curve
    for _, clf in clf_df.iterrows():

        classifier = clf['Classifier']

        for split, ax in panels:
            auc_value = clf[f'{split} ROC AUC']
            ax.plot(clf[f'{split} FPR'],
                    clf[f'{split} TPR'],
                    linestyle='-',
                    lw=3,
                    label=f"{classifier} - AUC:{auc_value:0.2f}"
                   )

    # Nothing was plotted, so there is nothing to decorate or save
    if clf_df.empty:
        return

    # Everything below is the same for every classifier, so it is done
    # once per axis instead of once per classifier
    for split, ax in panels:

        # Plot the diagonal reference line
        ax.plot([0, 1], [0, 1], color='grey', lw=3, linestyle='--')

        # Sub Title & axis labels
        ax.set_title(split)
        ax.set_xlabel('False Positive Rate')
        ax.set_ylabel('True Positive rate')

        ax.set_xlim([-0.05, 1.0])
        ax.set_ylim([0.0, 1.05])

        ax.legend(loc='lower right')

    # Save fig - once, after all curves have been drawn
    plt.savefig('roc_curve.png', dpi=500, orientation='landscape', bbox_inches='tight')

    return

### Accuracy Scores

In [None]:
### Accuracy Metrics - Accuracy Scores ###
              
# Function to return table of accuracy scores
def acc_scores(clf_preds):
    '''
    Builds the summary table of scores from our DataFrame of
    classifiers, predictions & metrics.

    Inputs:
        clf_preds: DataFrame with a 'Classifier' and a 'Runtime' column
            plus a 'Train ...' and a 'Test ...' column for each of
            Precision, Recall, F1, Accuracy and ROC AUC.

    Returns: DataFrame indexed by 'Classifier' with the runtime and the
        scores cast to float and rounded to 2 decimals. The same table
        is written to "Accuracy Scores.csv" in the current working
        directory.
    '''

    # Train / Test column pair for every metric, metric by metric
    metric_names = ('Precision', 'Recall', 'F1', 'Accuracy', 'ROC AUC')
    score_cols = [f'{split} {metric}'
                  for metric in metric_names
                  for split in ('Train', 'Test')]
    wanted = ['Classifier', 'Runtime', *score_cols]

    # Select, index by classifier name & round values
    summary = (
        clf_preds[wanted]
        .set_index('Classifier')
        .astype(float)
        .round(2)
    )

    # Save to file
    summary.to_csv("Accuracy Scores.csv")

    return summary

### Feature Importance Plot

For XGBoost Classifier

In [None]:
### Feature Importance Plot - XGBoost ###
              
# Create a function to output our feature importance plot
def feat_rank(xgb_clf, X):
    '''
    Plots, saves and shows the ranked feature importances of a
    fitted classifier.

    Inputs:
        xgb_clf: fitted classifier exposing feature_importances_
        X: DataFrame of features; its column names label the bars

    Returns: the result of plt.show(). The plot is saved as
        'feat_rank_XGB.png' in the current working directory.
    '''

    # Put the column names back on the importances and rank them;
    # sorting ascending makes the largest value the last bar drawn
    importances = (
        pd.Series(xgb_clf.feature_importances_, index=X.columns)
        .sort_values(ascending=True)
    )

    # Plot
    bar_style = dict(kind='barh',
                     width=.75,
                     edgecolor="black",
                     figsize=(12,10),
                     fontsize=18)
    importances.plot(**bar_style)
    plt.title('Feature Importance Ranking - XGBoost', fontsize=22)

    # Save fig
    plt.savefig('feat_rank_XGB.png', dpi=500, orientation='landscape', bbox_inches='tight')

    return plt.show()

###  Grid Search

In [None]:
### Grid Search ###
# Define Function to Conduct Grid Search
def grid_search(params, clf_df, cv=3, scoring='recall'):
    '''
    Conducts gridsearch using GridSearchCV for every classifier
    in our classifier DataFrame.

    Inputs:
        params: parameter grids in GridSearchCV dictionary format,
            looked up with the row labels of clf_df (params[i] is the
            grid for the row with label i)
        clf_df: DataFrame with the columns 'Classifier', 'Model',
            'X_train' and 'y_train'
        cv: k-folds for crossvalidation
        scoring: scoring metric for optimization
            'accuracy','recall','precision','f1'

    Returns: None. For each classifier it prints
        its name and parameter grid
        the best cross-validated score for the `scoring` metric
        the optimal parameters for the model
    '''
    # best_score_ is the cross-validated score of `scoring`, so the
    # output is labelled with that metric and not as training accuracy
    metric = scoring if isinstance(scoring, str) else 'score'

    for i, clf in clf_df.iterrows():
        print(clf['Classifier'])
        print(f"\nParameter Grid: {params[i]} \n")
        clf_grid_search = GridSearchCV(clf['Model'],
                                       param_grid=params[i],
                                       cv=cv,
                                       scoring=scoring)
        clf_grid_search.fit(clf['X_train'], clf['y_train'])
        print(f"Best CV {metric}: {clf_grid_search.best_score_:.2%} \n")
        print(f"Optimal Parameters: {clf_grid_search.best_params_} \n\n")

    return