# Matt and Paul's Notebook

## 1. Packages and Libraries Import

In [1]:
#Import the relevant packages
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split,GridSearchCV
from sklearn.metrics import plot_confusion_matrix, plot_roc_curve, accuracy_score,recall_score,precision_score,\
                            f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.datasets import load_iris

In [2]:
# Iris is used as a small sample dataset for trying out the functions defined below
iris = load_iris()
# Show the class labels, then the feature names, one per line
print(iris.target_names, iris.feature_names, sep='\n')

['setosa' 'versicolor' 'virginica']
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [3]:
# Keep only the last two feature columns (petal length and petal width)
feature_used = iris.feature_names[2:]
X, y = iris.data[:, 2:], iris.target

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

X_train.shape, X_test.shape

((120, 2), (30, 2))

## 2. Function Builds for Project

### 2a. Data Scaling

In [4]:
#Standard Scaler
def SS(X_train,X_test):
    """Standardize both feature sets with a scaler fitted on the training data only.

    Returns a tuple: the scaled training features followed by the scaled
    test features.
    """
    scaler = StandardScaler().fit(X_train)
    scaled_train = scaler.transform(X_train)
    scaled_test = scaler.transform(X_test)
    return scaled_train, scaled_test

### 2b. Logistic Regression Model

In [None]:
#Logistic Regression
def logreg(X_train,X_test,y_train,y_test,cv=5):
    """Tune, fit and evaluate a logistic regression classifier.

    A grid search scored by negative log loss selects the penalty/solver
    pair. The winning model is then scored with cross-validation on the
    training data and with accuracy on the test data, and a confusion
    matrix of the training data is plotted.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train, y_test : array-like
        Training and test labels.
    cv : int, default=5
        Passed as ``cv`` to both the grid search and the final
        cross-validation score.

    Returns
    -------
    None
        Results are printed and plotted.
    """
    # Search only penalty/solver pairs that scikit-learn accepts. A full
    # cross product (e.g. 'l1' with 'lbfgs', or 'elasticnet' without an
    # l1_ratio) makes those candidate fits fail and score NaN.
    grid = [
        {'penalty': ['l1'], 'solver': ['liblinear', 'saga']},
        {'penalty': ['l2'],
         'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']},
        {'penalty': ['elasticnet'], 'solver': ['saga'], 'l1_ratio': [0.5]}]

    # GridSearchCV clones the estimator for every candidate, so it does not
    # need to be fitted beforehand.
    gs = GridSearchCV(estimator=LogisticRegression(random_state=42),
                      param_grid=grid, cv=cv, scoring='neg_log_loss')
    gs.fit(X_train, y_train)
    logreg_params=gs.best_params_
    print(f"Penalty: {logreg_params['penalty']}")
    print(f"Solver: {logreg_params['solver']}")

    # With the default refit=True the best candidate has already been
    # refitted on the whole training set, so it is reused directly.
    log=gs.best_estimator_
    #Create y_pred using test data
    y_pred=log.predict(X_test)

    #Use cross_val_score with cv folds
    cv_results = cross_val_score(log, X_train, y_train, cv=cv)
    print(f'Cross val mean score: {cv_results.mean()}')

    #Examine accuracy on the held-out test data
    acc_score=accuracy_score(y_test,y_pred)
    print(f'accuracy score: {acc_score}')
    # Recall, precision and F1 are not reported: with their default
    # average='binary' they reject multiclass targets. The ROC curve plot
    # is likewise skipped because it only works with binary data.

    #Plot and examine confusion matrix (drawn on the training data)
    plot_confusion_matrix(log, X_train, y_train)
    

In [None]:
# Run the logistic regression search, fit and evaluation on the iris train/test split
logreg(X_train,X_test,y_train,y_test)

In [None]:
# Disabled experiment: print the cross-validated log loss for each odd k from 1 to 31.
# for index in range(1,33,2):
#     knn_model = KNeighborsClassifier(n_neighbors=index)
#     knn_log_loss = -1 * cross_val_score(knn_model, X_train,
#                                         y_train, scoring="neg_log_loss").mean()
#     print(f"# of Neighbors: {index}, Log Loss Score: {knn_log_loss}")

In [None]:
# Disabled experiment: find the odd k (1 to 31) with the lowest cross-validated log loss.
# def low_log_loss():
# knn_dict={}
# for index in range(1,33,2):
#     knn_model = KNeighborsClassifier(n_neighbors=index)
#     knn_log_loss = -1 * cross_val_score(knn_model, X_train,
#                                         y_train, scoring="neg_log_loss").mean()
#     knn_dict[index]=(knn_log_loss)
# min_knn= min(knn_dict.values())
# low_key = list(knn_dict.keys())[list(knn_dict.values()).index(min_knn)]
# print(low_key)
# print(min_knn)

### Examine GridSearchCV outside a function

In [None]:
# Candidate KNN settings: odd neighbor counts from 1 to 25, two distance
# metrics and two weighting schemes
grid = {
    'n_neighbors': list(range(1, 26, 2)),
    'metric': ['minkowski', 'manhattan'],
    'weights': ['uniform', 'distance'],
}

In [None]:
# KNN classifier with default settings, fitted on the training split
grid_knn = KNeighborsClassifier().fit(X_train, y_train)
grid_knn

In [None]:
# 5-fold grid search over `grid`, ranking candidates by negative log loss
gs = GridSearchCV(
    estimator=grid_knn,
    param_grid=grid,
    scoring='neg_log_loss',
    cv=5,
)

In [None]:
# Cross-validate every parameter combination in the grid on the training data
gs.fit(X_train, y_train)

In [None]:
# Parameter combination that scored best in the grid search
KNN_params = gs.best_params_

# Pull out the winning neighbor count and show it as the cell output
KNN_neighbors = KNN_params['n_neighbors']
KNN_neighbors

In [None]:
# Tabulate the grid search's cross-validation results, one row per candidate
pd.DataFrame(gs.cv_results_)

In [None]:
# Disabled experiment: pick the odd k (1 to 31) with the lowest cross-validated
# log loss and build a KNN classifier from it.
# def low_log_loss():
#     knn_dict={}
#     for index in range(1,33,2):
#         knn_model = KNeighborsClassifier(n_neighbors=index)
#         knn_log_loss = -1 * cross_val_score(knn_model, X_train,
#                                             y_train, scoring="neg_log_loss").mean()
#         knn_dict[index]=(knn_log_loss)
#     min_knn= min(knn_dict.values())
#     low_key = list(knn_dict.keys())[list(knn_dict.values()).index(min_knn)]
#     knn=KNeighborsClassifier(n_neighbors=low_key,metric=metric)

### KNN

In [None]:
#K nearest neighbors
def KNN(X_train,X_test,y_train,y_test,metric='minkowski',cv=5):
    """Tune, fit and evaluate a k-nearest-neighbors classifier.

    A grid search scored by negative log loss selects the neighbor count,
    distance metric and weighting scheme. The winning model is then scored
    with cross-validation on the training data and with accuracy on the
    test data, and a confusion matrix of the training data is plotted.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train, y_test : array-like
        Training and test labels.
    metric : str, default='minkowski'
        Not used: the distance metric is chosen by the grid search. The
        parameter is kept so existing calls that pass it keep working.
    cv : int, default=5
        Passed as ``cv`` to both the grid search and the final
        cross-validation score.

    Returns
    -------
    None
        Results are printed and plotted.
    """
    grid = {
    'n_neighbors': [1,3,5,7,9,11,13,15,17,19,21,23,25],
    'metric': ['minkowski', 'manhattan'],
    'weights': ['uniform', 'distance']}

    # GridSearchCV clones the estimator for every candidate, so it does not
    # need to be fitted beforehand.
    gs = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=grid, cv=cv,
                      scoring='neg_log_loss')
    gs.fit(X_train, y_train)
    KNN_params=gs.best_params_
    KNN_neighbors=KNN_params['n_neighbors']
    print(f'Number of Neighbors: {KNN_neighbors}')
    KNN_metric=KNN_params['metric']
    print(f'Metric: {KNN_metric}')
    KNN_weights=KNN_params['weights']
    print(f'Weights: {KNN_weights}')

    # With the default refit=True the best candidate has already been
    # refitted on the whole training set, so it is reused directly.
    knn=gs.best_estimator_
    #Create y_pred using test data
    y_pred=knn.predict(X_test)
    #Use cross_val_score with cv folds
    cv_results = cross_val_score(knn, X_train, y_train, cv=cv)
    print(f'Cross val mean score: {cv_results.mean()}')

    #Examine accuracy on the held-out test data
    acc_score=accuracy_score(y_test,y_pred)
    print(f'accuracy score: {acc_score}')
    # Recall, precision and F1 are not reported: with their default
    # average='binary' they reject multiclass targets.

    #Plot and examine confusion matrix (drawn on the training data)
    plot_confusion_matrix(knn, X_train, y_train)
    

In [None]:
# Run the KNN search, fit and evaluation on the iris train/test split
KNN(X_train,X_test,y_train,y_test)

### Decision Tree

In [None]:
#Decision Trees
def dtree(X_train,X_test,y_train,y_test,cv=5):
    """Tune, fit and evaluate a decision tree classifier.

    A grid search scored by negative log loss selects ``max_depth``,
    ``min_samples_split`` and ``criterion``. The winning tree is then
    scored with cross-validation on the training data and with accuracy on
    the test data, and a confusion matrix of the training data is plotted.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train, y_test : array-like
        Training and test labels.
    cv : int, default=5
        Passed as ``cv`` to both the grid search and the final
        cross-validation score.

    Returns
    -------
    None
        Results are printed and plotted.
    """
    grid = {
    'max_depth': [2,3,4,5,6,7,8,9,10,11,12,13,14,15],
    'min_samples_split': [2,3,4,5,6,7,8,9,10,11,12,13,14,15],
    'criterion': ['gini', 'entropy']}

    # The search estimator carries the same seed as the final model, so the
    # candidates are compared under the conditions the winner is used in.
    # GridSearchCV clones it per candidate; no prior fit is needed.
    gs = GridSearchCV(estimator=DecisionTreeClassifier(random_state=42),
                      param_grid=grid, cv=cv, scoring='neg_log_loss')
    gs.fit(X_train, y_train)
    dt_params=gs.best_params_
    dt_max_depth=dt_params['max_depth']
    print(f'Max Depth: {dt_max_depth}')
    dt_min_samp=dt_params['min_samples_split']
    print(f'Min Sample Split: {dt_min_samp}')
    dt_criterion=dt_params['criterion']
    print(f'criterion: {dt_criterion}')

    # With the default refit=True the best candidate (seed 42, best
    # parameters) has already been refitted on the whole training set.
    Dtree=gs.best_estimator_
    #Create y_pred using test data
    y_pred=Dtree.predict(X_test)

    #Use cross_val_score with cv folds
    cv_results = cross_val_score(Dtree, X_train, y_train, cv=cv)
    print(f'Cross val mean score: {cv_results.mean()}')

    #Examine accuracy on the held-out test data
    acc_score=accuracy_score(y_test,y_pred)
    print(f'accuracy score: {acc_score}')
    # Recall, precision and F1 are not reported: with their default
    # average='binary' they reject multiclass targets.

    #Plot and examine confusion matrix (drawn on the training data)
    plot_confusion_matrix(Dtree, X_train, y_train)

In [None]:
# Run the decision tree search, fit and evaluation on the iris train/test split
dtree(X_train,X_test,y_train,y_test)

### Random Forest

In [None]:
# Candidate n_estimators values (50 through 149) read by random_forest's parameter grid
n_list=list(range(50,150))


In [None]:
def random_forest(X_train,X_test,y_train,y_test,cv=5):
    """Tune, fit and evaluate a random forest classifier.

    A grid search scored by negative log loss selects ``n_estimators``
    (candidates come from the module-level ``n_list``) and ``criterion``.
    The winning forest is then scored with cross-validation on the training
    data and with accuracy on the test data, and a confusion matrix of the
    training data is plotted.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train, y_test : array-like
        Training and test labels.
    cv : int, default=5
        Passed as ``cv`` to both the grid search and the final
        cross-validation score.

    Returns
    -------
    None
        Results are printed and plotted.
    """
    grid = {
    'n_estimators': n_list,
    'criterion': ['gini', 'entropy']}

    # A fixed seed makes the search and the reported scores repeatable;
    # without it each run can pick a different "best" forest.
    # GridSearchCV clones the estimator per candidate; no prior fit is needed.
    gs = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                      param_grid=grid, cv=cv, scoring='neg_log_loss')
    gs.fit(X_train, y_train)
    rf_params=gs.best_params_
    rf_n_estimators=rf_params['n_estimators']
    print(f'Number Estimators: {rf_n_estimators}')
    rf_criterion=rf_params['criterion']
    print(f'criterion: {rf_criterion}')

    # With the default refit=True the best candidate has already been
    # refitted on the whole training set, so it is reused directly.
    rforest=gs.best_estimator_

    #Use cross_val_score with cv folds
    cv_results = cross_val_score(rforest, X_train, y_train, cv=cv)
    print(f'Cross val mean score: {cv_results.mean()}')
    #Mean accuracy of the forest on the held-out test data
    score=rforest.score(X_test,y_test)
    print(f'Random Forest Score: {score}')
    # Recall, precision and F1 are not reported: with their default
    # average='binary' they reject multiclass targets.

    #Plot and examine confusion matrix (drawn on the training data)
    plot_confusion_matrix(rforest, X_train, y_train)

In [None]:
# Takes 5 minutes and 30 seconds to run: the grid covers 100 n_estimators values
# and 2 criteria, each cross-validated with 5 folds
random_forest(X_train,X_test,y_train,y_test,cv=5)

### Bagging Classifier

In [None]:
#Bagging
def bagged(X_train,X_test,y_train,y_test,cv=5,max_depth=3,criterion='gini',n_estimators=10):
    """Fit and evaluate a bagging ensemble of decision trees.

    The ensemble is fitted on the training data, scored with
    cross-validation on the training data and with accuracy on the test
    data, and a confusion matrix of the training data is plotted.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train, y_test : array-like
        Training and test labels.
    cv : int, default=5
        Passed as ``cv`` to the cross-validation score.
    max_depth : int, default=3
        Maximum depth of each base decision tree.
    criterion : str, default='gini'
        Split criterion of each base decision tree.
    n_estimators : int, default=10
        Number of trees in the ensemble.

    Returns
    -------
    None
        Results are printed and plotted.
    """
    #Instantiate the bagging ensemble; the fixed seed makes the bootstrap
    #samples, and therefore the reported scores, repeatable
    bagging=BaggingClassifier(DecisionTreeClassifier(max_depth=max_depth,criterion=criterion),
                              n_estimators=n_estimators,random_state=42)
    #Fit it on train data
    bagging.fit(X_train,y_train)

    #Use cross_val_score with cv folds
    cv_results = cross_val_score(bagging, X_train, y_train, cv=cv)
    print(f'Cross val mean score: {cv_results.mean()}')
    #Mean accuracy of the ensemble on the held-out test data
    score=bagging.score(X_test,y_test)
    print(f'Bagging Score: {score}')
    # Recall, precision and F1 are not reported: with their default
    # average='binary' they reject multiclass targets.

    #Plot and examine confusion matrix (drawn on the training data)
    plot_confusion_matrix(bagging, X_train, y_train)

In [None]:
# Fit and evaluate the bagged decision trees with their default settings on the iris split
bagged(X_train,X_test,y_train,y_test,cv=5)