# Matt and Paul's Notebook

## 1. Packages and Libraries Import

In [None]:
#Import the relevant packages
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split,GridSearchCV
from sklearn.metrics import plot_confusion_matrix, plot_roc_curve, accuracy_score,recall_score,precision_score,\
                            f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.datasets import load_iris

## 1b. Test on Iris Dataset (to be deleted)

In [None]:
# Load the Iris dataset as a small sanity-check input for the model functions
iris = load_iris()

# Show the class labels and the feature (column) names, one per line
print(iris.target_names, iris.feature_names, sep='\n')

In [None]:
# Keep only the last two columns (petal length and petal width) as features
feature_used = iris.feature_names[2:]
X, y = iris.data[:, 2:], iris.target

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2
)

# Display the shapes of the resulting train and test feature matrices
(X_train.shape, X_test.shape)

## 2. Function Builds for Project

### 2a. Data Scaling

In [None]:
# Standardize features with StandardScaler
def SS(X_train,X_test):
    """Standardize the train and test features.

    The scaler is fitted on ``X_train`` only and then applied to both sets,
    so no statistics from ``X_test`` are used when fitting.

    Returns
    -------
    tuple
        ``(scaled_X_train, scaled_X_test)``.
    """
    scaler = StandardScaler().fit(X_train)
    scaled_train = scaler.transform(X_train)
    scaled_test = scaler.transform(X_test)
    return scaled_train, scaled_test

### 2b. Logistic Regression Model

Define a function that uses `GridSearchCV` to find the best hyperparameters for a logistic regression model, fits a model with those values, and then prints scoring metrics and plots a confusion matrix to assess how well the model performs.

In [None]:
def logreg(X_train, X_test, y_train, y_test, cv=5):
    """Tune, fit and evaluate a logistic regression classifier.

    Grid-searches penalty/solver combinations on the training data, fits a
    model with the best combination, prints the chosen hyperparameters and
    the evaluation metrics, and plots a confusion matrix (plus a ROC curve
    when the target is binary).

    Parameters
    ----------
    X_train, X_test : array-like
        Training and held-out feature matrices.
    y_train, y_test : array-like
        Training and held-out class labels.
    cv : int, default=5
        Number of cross-validation folds, used for both the grid search and
        the reported mean cross-validation score.

    Returns
    -------
    None
        All results are printed or plotted.
    """
    # Set GridSearchCV hyperparameters to compare & select.
    # Each penalty is paired only with solvers that support it: a plain
    # penalty x solver cross product contains unsupported combinations
    # (e.g. 'l1' with 'lbfgs') whose fits simply fail. 'elasticnet' is only
    # available with 'saga' and needs an explicit l1_ratio.
    grid = [
        {'penalty': ['l1'], 'solver': ['liblinear', 'saga']},
        {'penalty': ['l2'],
         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']},
        {'penalty': ['elasticnet'], 'solver': ['saga'],
         'l1_ratio': [0.25, 0.5, 0.75]}]

    # Instantiate & fit GridSearchCV with accuracy scoring. The estimator is
    # passed unfitted because GridSearchCV clones it for every candidate.
    gs = GridSearchCV(estimator=LogisticRegression(random_state=42),
                      param_grid=grid, cv=cv, scoring='accuracy')
    gs.fit(X_train, y_train)

    # Return best hyperparameters
    logreg_params = gs.best_params_
    print(f"Penalty: {logreg_params['penalty']}")
    print(f"Solver: {logreg_params['solver']}")
    if 'l1_ratio' in logreg_params:
        print(f"L1 Ratio: {logreg_params['l1_ratio']}")

    # Instantiate & fit LogReg model with the best hyperparameters
    log = LogisticRegression(random_state=42, **logreg_params)
    log.fit(X_train, y_train)

    # Create prediction variable using test data
    y_pred = log.predict(X_test)

    # Run cross-validate score with cv folds from function parameter
    cv_results = cross_val_score(log, X_train, y_train, cv=cv)
    print(f'Mean Cross-Val Score: {cv_results.mean()}')

    # Run and print accuracy, recall, precision and f1 scores
    acc_score = accuracy_score(y_test, y_pred)
    print(f'Accuracy Score: {acc_score}')

    # Binary targets are scored on the positive class (classes_[1]);
    # multiclass targets are macro-averaged, since the default
    # average='binary' raises for more than two classes.
    is_binary = len(log.classes_) == 2
    if is_binary:
        averaging = {'average': 'binary', 'pos_label': log.classes_[1]}
    else:
        averaging = {'average': 'macro'}

    for label, scorer in (('Recall Score', recall_score),
                          ('Precision Score', precision_score),
                          ('F1 Score', f1_score)):
        print(f'{label}: {scorer(y_test, y_pred, **averaging)}')

    # Plot an ROC curve (only works with binary data)
    if is_binary:
        plot_roc_curve(log, X_test, y_test)

    # Plot Confusion Matrix on the held-out data, matching the printed metrics
    plot_confusion_matrix(log, X_test, y_test)

In [None]:
# Tune and evaluate the logistic regression model on the train/test split
logreg(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

### 2c. K-Nearest Neighbors Model

Define a function that uses `GridSearchCV` to find the best hyperparameters for a K-Nearest Neighbors model, fits a model with those values, and then prints scoring metrics and plots a confusion matrix to assess how well the model performs.

In [None]:
def knn(X_train, X_test, y_train, y_test, metric='minkowski', cv=5):
    """Tune, fit and evaluate a K-Nearest Neighbors classifier.

    Grid-searches the number of neighbors, distance metric and weighting on
    the training data, fits a model with the best combination, prints the
    chosen hyperparameters and the evaluation metrics, and plots a confusion
    matrix (plus a ROC curve when the target is binary).

    Parameters
    ----------
    X_train, X_test : array-like
        Training and held-out feature matrices.
    y_train, y_test : array-like
        Training and held-out class labels.
    metric : str, default='minkowski'
        Distance metric to include in the search. 'minkowski' and
        'manhattan' are always searched; any other value is added as a
        further candidate.
    cv : int, default=5
        Number of cross-validation folds, used for both the grid search and
        the reported mean cross-validation score.

    Returns
    -------
    None
        All results are printed or plotted.
    """
    # The caller-supplied metric joins the default candidates (appended last
    # so the default search order is unchanged).
    metrics = ['minkowski', 'manhattan']
    if metric not in metrics:
        metrics.append(metric)

    # Set GridSearchCV hyperparameters to compare & select
    grid = {
        'n_neighbors': list(range(1, 26, 2)),
        'metric': metrics,
        'weights': ['uniform', 'distance']}

    # Instantiate & fit GridSearchCV with accuracy scoring. The estimator is
    # passed unfitted because GridSearchCV clones it for every candidate.
    gs = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=grid,
                      cv=cv, scoring='accuracy')
    gs.fit(X_train, y_train)

    # Return best hyperparameters
    knn_params = gs.best_params_

    # Use best # of neighbors from best_params
    knn_neighbors = knn_params['n_neighbors']
    print(f'Number of Neighbors: {knn_neighbors}')

    # Use best metric from best_params
    knn_metric = knn_params['metric']
    print(f'Metric: {knn_metric}')

    # Use best weights from best_params
    knn_weights = knn_params['weights']
    print(f'Weights: {knn_weights}')

    # Instantiate & fit K-Nearest Neighbors model. The local is not called
    # `knn` so that it does not shadow this function's own name.
    model = KNeighborsClassifier(n_neighbors=knn_neighbors, metric=knn_metric,
                                 weights=knn_weights)
    model.fit(X_train, y_train)

    # Create prediction variable using test data
    y_pred = model.predict(X_test)

    # Run cross-validate score with cv folds from function parameter
    cv_results = cross_val_score(model, X_train, y_train, cv=cv)
    print(f'Mean Cross-Val Score: {cv_results.mean()}')

    # Run and print accuracy, recall, precision and f1 scores
    acc_score = accuracy_score(y_test, y_pred)
    print(f'Accuracy Score: {acc_score}')

    # Binary targets are scored on the positive class (classes_[1]);
    # multiclass targets are macro-averaged, since the default
    # average='binary' raises for more than two classes.
    is_binary = len(model.classes_) == 2
    if is_binary:
        averaging = {'average': 'binary', 'pos_label': model.classes_[1]}
    else:
        averaging = {'average': 'macro'}

    # Results are never assigned to a local named like the imported scorer:
    # `f1_score = f1_score(...)` makes the name local and raises
    # UnboundLocalError.
    for label, scorer in (('Recall Score', recall_score),
                          ('Precision Score', precision_score),
                          ('F1 score', f1_score)):
        print(f'{label}: {scorer(y_test, y_pred, **averaging)}')

    # Plot an ROC curve (only works with binary data)
    if is_binary:
        plot_roc_curve(model, X_test, y_test)

    # Plot Confusion Matrix on the held-out data, matching the printed metrics
    plot_confusion_matrix(model, X_test, y_test)

In [None]:
# Tune and evaluate the K-Nearest Neighbors model on the train/test split
knn(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

In [None]:
# # GridSearchCV alternative coded by hand (not to be used)
# knn_dict={}
# for index in range(1,33,2):
#     knn_model = KNeighborsClassifier(n_neighbors=index)
#     knn_log_loss = -1 * cross_val_score(knn_model, X_train,
#                                         y_train, scoring="accuracy").mean()
#     knn_dict[index] = knn_log_loss
# min_knn = min(knn_dict.values())
# low_key = list(knn_dict.keys())[list(knn_dict.values()).index(min_knn)]
# knn = KNeighborsClassifier(n_neighbors=low_key,metric=metric)

### 2d. Decision Tree Model

Define a function that uses `GridSearchCV` to find the best hyperparameters for a decision tree model, fits a model with those values, and then prints scoring metrics and plots a confusion matrix to assess how well the model performs.

In [None]:
def dtree(X_train, X_test, y_train, y_test, cv=5):
    """Tune, fit and evaluate a decision tree classifier.

    Grid-searches depth, minimum split size and split criterion on the
    training data, fits a tree with the best combination, prints the chosen
    hyperparameters and the evaluation metrics, and plots a confusion matrix
    (plus a ROC curve when the target is binary).

    Parameters
    ----------
    X_train, X_test : array-like
        Training and held-out feature matrices.
    y_train, y_test : array-like
        Training and held-out class labels.
    cv : int, default=5
        Number of cross-validation folds, used for both the grid search and
        the reported mean cross-validation score.

    Returns
    -------
    None
        All results are printed or plotted.
    """
    # Set GridSearchCV hyperparameters to compare & select
    grid = {
        'max_depth': list(range(2, 16)),
        'min_samples_split': list(range(2, 16)),
        'criterion': ['gini', 'entropy']}

    # Instantiate & fit GridSearchCV with accuracy scoring. The estimator is
    # passed unfitted (GridSearchCV clones it per candidate) and seeded like
    # the final model so the selected hyperparameters are reproducible.
    gs = GridSearchCV(estimator=DecisionTreeClassifier(random_state=42),
                      param_grid=grid, cv=cv, scoring='accuracy')
    gs.fit(X_train, y_train)

    # Return best hyperparameters
    dt_params = gs.best_params_

    # Use best max depth from best_params
    dt_max_depth = dt_params['max_depth']
    print(f'Max Depth: {dt_max_depth}')

    # Use best minimum sample split from best_params
    dt_min_samp = dt_params['min_samples_split']
    print(f'Min Sample Split: {dt_min_samp}')

    # Use best criterion from best_params
    dt_criterion = dt_params['criterion']
    print(f'criterion: {dt_criterion}')

    # Instantiate & fit Decision Tree model. The local is not called `dtree`
    # so that it does not shadow this function's own name.
    tree = DecisionTreeClassifier(max_depth=dt_max_depth, criterion=dt_criterion,
                                  min_samples_split=dt_min_samp, random_state=42)
    tree.fit(X_train, y_train)

    # Create prediction variable using test data
    y_pred = tree.predict(X_test)

    # Run cross-validate score with cv folds from function parameter
    cv_results = cross_val_score(tree, X_train, y_train, cv=cv)
    print(f'Mean Cross-Val Score: {cv_results.mean()}')

    # Run and print accuracy, recall, precision and f1 scores
    acc_score = accuracy_score(y_test, y_pred)
    print(f'Accuracy Score: {acc_score}')

    # Binary targets are scored on the positive class (classes_[1]);
    # multiclass targets are macro-averaged, since the default
    # average='binary' raises for more than two classes.
    is_binary = len(tree.classes_) == 2
    if is_binary:
        averaging = {'average': 'binary', 'pos_label': tree.classes_[1]}
    else:
        averaging = {'average': 'macro'}

    for label, scorer in (('Recall Score', recall_score),
                          ('Precision Score', precision_score),
                          ('F1 score', f1_score)):
        print(f'{label}: {scorer(y_test, y_pred, **averaging)}')

    # Plot an ROC curve (only works with binary data)
    if is_binary:
        plot_roc_curve(tree, X_test, y_test)

    # Plot Confusion Matrix on the held-out data, matching the printed metrics
    plot_confusion_matrix(tree, X_test, y_test)

In [None]:
# Tune and evaluate the decision tree model on the train/test split
dtree(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)

### 2e. Random Forest Model

Define a function that uses `GridSearchCV` to find the best hyperparameters for a random forest model, fits a model with those values, and then prints scoring metrics and plots a confusion matrix to assess how well the model performs.

In [None]:
def random_forest(X_train, X_test, y_train, y_test, cv=5):
    """Tune, fit and evaluate a random forest classifier.

    Grid-searches the number of trees (50 to 149) and the split criterion on
    the training data, fits a forest with the best combination, prints the
    chosen hyperparameters and the evaluation metrics, and plots a confusion
    matrix (plus a ROC curve when the target is binary).

    The search covers 200 candidates, each fitted once per fold, so this
    function is slow to run.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and held-out feature matrices.
    y_train, y_test : array-like
        Training and held-out class labels.
    cv : int, default=5
        Number of cross-validation folds, used for both the grid search and
        the reported mean cross-validation score.

    Returns
    -------
    None
        All results are printed or plotted.
    """
    # Set GridSearchCV hyperparameters to compare & select
    grid = {
        'n_estimators': list(range(50, 150)),
        'criterion': ['gini', 'entropy']}

    # Instantiate & fit GridSearchCV with accuracy scoring. The estimator is
    # passed unfitted (GridSearchCV clones it per candidate) and seeded like
    # the final model: without a seed the candidates are compared under
    # different random draws and the selection is not reproducible.
    gs = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                      param_grid=grid, cv=cv, scoring='accuracy')
    gs.fit(X_train, y_train)

    # Return best hyperparameters
    rf_params = gs.best_params_

    # Use best # of trees from best_params
    rf_n_estimators = rf_params['n_estimators']
    print(f'Number of Trees: {rf_n_estimators}')

    # Use best criterion from best_params
    rf_criterion = rf_params['criterion']
    print(f'criterion: {rf_criterion}')

    # Instantiate & fit Random Forest model
    rforest = RandomForestClassifier(n_estimators=rf_n_estimators,
                                     criterion=rf_criterion, random_state=42)
    rforest.fit(X_train, y_train)

    # Create prediction variable using test data
    y_pred = rforest.predict(X_test)

    # Run cross-validate score with cv folds from function parameter
    cv_results = cross_val_score(rforest, X_train, y_train, cv=cv)
    print(f'Mean Cross-Val Score: {cv_results.mean()}')

    # Run forest score
    score = rforest.score(X_test, y_test)
    print(f'Random Forest Score: {score}')

    # Run and print accuracy, recall, precision and f1 scores
    acc_score = accuracy_score(y_test, y_pred)
    print(f'Accuracy Score: {acc_score}')

    # Binary targets are scored on the positive class (classes_[1]);
    # multiclass targets are macro-averaged, since the default
    # average='binary' raises for more than two classes.
    is_binary = len(rforest.classes_) == 2
    if is_binary:
        averaging = {'average': 'binary', 'pos_label': rforest.classes_[1]}
    else:
        averaging = {'average': 'macro'}

    for label, scorer in (('Recall Score', recall_score),
                          ('Precision Score', precision_score),
                          ('F1 score', f1_score)):
        print(f'{label}: {scorer(y_test, y_pred, **averaging)}')

    # Plot an ROC curve (only works with binary data)
    if is_binary:
        plot_roc_curve(rforest, X_test, y_test)

    # Plot Confusion Matrix on the held-out data, matching the printed metrics
    plot_confusion_matrix(rforest, X_test, y_test)

In [None]:
# Slow cell: takes 5 minutes and 30 seconds to run
random_forest(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, cv=5)

### 2f. Bagging Classifier Model

Define a function that uses `GridSearchCV` to find the best hyperparameters for a bagging classifier built on decision trees, fits a model with those values, and then prints scoring metrics and plots a confusion matrix to assess how well the model performs.

In [None]:
def bagged(X_train, X_test, y_train, y_test, cv=5):
    """Tune, fit and evaluate a bagging classifier of decision trees.

    Grid-searches the base tree's hyperparameters together with the bagging
    hyperparameters on the training data, fits a model with the best
    combination, prints the chosen hyperparameters and the evaluation
    metrics, and plots a confusion matrix (plus a ROC curve when the target
    is binary).

    Parameters
    ----------
    X_train, X_test : array-like
        Training and held-out feature matrices.
    y_train, y_test : array-like
        Training and held-out class labels.
    cv : int, default=5
        Number of cross-validation folds, used for both the grid search and
        the reported mean cross-validation score.

    Returns
    -------
    None
        All results are printed or plotted.
    """
    # An integer max_features larger than the number of columns makes every
    # fit for that candidate fail, so only feasible values are searched.
    n_features = np.shape(X_train)[1]
    feature_counts = [k for k in (1, 2, 3, 4, 5) if k <= n_features]

    # Set GridSearchCV hyperparameters to compare & select
    # NOTE(review): integer max_samples is an absolute number of rows drawn
    # per base estimator, so each tree is trained on at most 1-5 samples --
    # confirm that fractions (e.g. 0.2-1.0) were not intended.
    grid = {
        'base_estimator__max_depth': [2,3,4,5,10,15],
        'base_estimator__min_samples_split': [2,3,4,5,10,15],
        'base_estimator__criterion': ['gini', 'entropy'],
        'max_samples': [1,2,3,4,5],
        'max_features': feature_counts,
        'n_estimators': [10,20,50,100]}

    # Instantiate & fit GridSearchCV with accuracy scoring. The estimator is
    # passed unfitted (GridSearchCV clones it per candidate) and seeded like
    # the final model so the selected hyperparameters are reproducible.
    grid_bag = BaggingClassifier(DecisionTreeClassifier(), random_state=42)
    gs = GridSearchCV(estimator=grid_bag, param_grid=grid, cv=cv,
                      scoring='accuracy')
    gs.fit(X_train, y_train)

    # Return best hyperparameters
    bag_params = gs.best_params_

    # Use best max depth from best_params
    bag_max_depth = bag_params['base_estimator__max_depth']
    print(f'Dec Tree Max Depth: {bag_max_depth}')

    # Use best minimum sample split from best_params
    bag_min_sample = bag_params['base_estimator__min_samples_split']
    print(f'Dec Tree Min Sample Split: {bag_min_sample}')

    # Use best criterion from best_params
    bag_criterion = bag_params['base_estimator__criterion']
    print(f'Dec Tree Criterion: {bag_criterion}')

    # Use best max samples from best_params
    bag_max_sample = bag_params['max_samples']
    print(f'Bagging Max Samples: {bag_max_sample}')

    # Use best max features from best_params
    bag_max_features = bag_params['max_features']
    print(f'Bag Max Features: {bag_max_features}')

    # Use best number of base estimators from best_params
    bag_estimators = bag_params['n_estimators']
    print(f'# of Base Estimators: {bag_estimators}')

    # Instantiate & fit Bagging Classifier model
    best_tree = DecisionTreeClassifier(max_depth=bag_max_depth,
                                       min_samples_split=bag_min_sample,
                                       criterion=bag_criterion)
    bagging = BaggingClassifier(best_tree, max_samples=bag_max_sample,
                                max_features=bag_max_features,
                                n_estimators=bag_estimators, random_state=42)
    bagging.fit(X_train, y_train)

    # Create prediction variable using test data
    y_pred = bagging.predict(X_test)

    # Run cross-validate score with cv folds from function parameter
    cv_results = cross_val_score(bagging, X_train, y_train, cv=cv)
    print(f'Mean Cross-Val Score: {cv_results.mean()}')

    # Run bagging score
    score = bagging.score(X_test, y_test)
    print(f'Bagging Classifier Score: {score}')

    # Run and print accuracy, recall, precision and f1 scores
    acc_score = accuracy_score(y_test, y_pred)
    print(f'Accuracy Score: {acc_score}')

    # Binary targets are scored on the positive class (classes_[1]);
    # multiclass targets are macro-averaged, since the default
    # average='binary' raises for more than two classes.
    is_binary = len(bagging.classes_) == 2
    if is_binary:
        averaging = {'average': 'binary', 'pos_label': bagging.classes_[1]}
    else:
        averaging = {'average': 'macro'}

    for label, scorer in (('Recall Score', recall_score),
                          ('Precision Score', precision_score),
                          ('F1 score', f1_score)):
        print(f'{label}: {scorer(y_test, y_pred, **averaging)}')

    # Plot an ROC curve (only works with binary data)
    if is_binary:
        plot_roc_curve(bagging, X_test, y_test)

    # Plot Confusion Matrix on the held-out data, matching the printed metrics
    plot_confusion_matrix(bagging, X_test, y_test)

In [None]:
# Tune and evaluate the bagging classifier on the train/test split
bagged(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, cv=5)