# Week 17 Group Activity

In [27]:
# Dependencies and modules:

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import EditedNearestNeighbours

from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline

from sklearn.metrics import classification_report, roc_curve, roc_auc_score

import matplotlib.pyplot as plt

from sklearn.svm import SVC

In [28]:
# setting style for my rendered tables:

In [29]:
%%html
<style> 
table th {color:blue !important;}
table td, table th, table tr {text-align:left !important;}
table, th, td {border: 1px solid black !important;}
</style>

In [30]:
# diabetes.csv file:

diabetes_path = "C:/Users/Nik/Documents/diabetes.csv"
diabetes_df = pd.read_csv(diabetes_path)
diabetes_df.head(1)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1


### 1. Write simple (straightforward) definitions for the following parameters for RandomForestClassifier (https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html) and indicate how they correlate with the precision and recall for the basic diabetes model we built in class. You will need to rerun the model multiple times to do so. 

In [31]:
X = diabetes_df.drop('Outcome', axis=1)
y = diabetes_df['Outcome']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)

sc= StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.fit_transform(X_test)

In [32]:
from sklearn.ensemble import RandomForestClassifier

# Model with all default parameters:

rf = RandomForestClassifier(n_estimators=100, max_depth= None, min_samples_split= 2, min_samples_leaf = 1, min_weight_fraction_leaf = 0.0, max_leaf_nodes = None, min_impurity_decrease = 0.0, bootstrap=True, random_state=42)

rf = rf.fit(X_train, y_train)
predictions = rf.predict(X_test)
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.81      0.81      0.81       151
           1       0.64      0.65      0.65        80

    accuracy                           0.75       231
   macro avg       0.73      0.73      0.73       231
weighted avg       0.75      0.75      0.75       231



In [33]:
from sklearn.ensemble import RandomForestClassifier

# Model to tweak:

rf = RandomForestClassifier(n_estimators=100, max_depth= None, min_samples_split= 2, min_samples_leaf =1, min_weight_fraction_leaf = 0.0, max_leaf_nodes = None, min_impurity_decrease = 0.05, bootstrap=True, random_state=42)

rf = rf.fit(X_train, y_train)
predictions = rf.predict(X_test)
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.69      0.95      0.80       151
           1       0.67      0.20      0.31        80

    accuracy                           0.69       231
   macro avg       0.68      0.57      0.55       231
weighted avg       0.68      0.69      0.63       231



<table border="1">
 <tr>
    <th style="font-size:20px">Parameter</th>
    <th style="font-size:20px">Definition</th>
    <th style="font-size:20px">Correlation with Precision</th>
    <th style="font-size:20px">Correlation with Recall</th>
 </tr>
 <tr>
    <td>estimators</td>
    <td>number of trees in the forest</td>
    <td>fewer estimators lowers sensitivity</td>
    <td>fewer estimators decreases NPV</td>
 </tr>
 <tr>
    <td>max_depth</td>
    <td>limit set on branches of tree</td>
    <td>less depth means less specificity but more sensitivity</td>
    <td>less depth means greater NPV but lower PPV</td> 
 </tr>
 <tr>
    <td>min_samples_split</td>
    <td>number of samples that must be present before a node gets to split</td>
    <td>jacked high, this parameter produces less specificity but more sensitivity</td>
    <td>jacked high, this parameter produces *significantly* greater NPV but lower PPV</td> 
 </tr>
 <tr>
    <td>min_samples_leaf</td>
    <td>number of samples that must be left in each side of a split node</td>
    <td>if this parameter gets raised, you get less specificity but more sensitivity</td>
    <td>if this parameter gets raised, you get greater NPV but lower PPV</td> 
 </tr>
 <tr>
    <td>min_weight_fraction_leaf</td>
    <td>percent of entire sample weight that must be in each leaf</td>
    <td>if taken to 33%, specificity tanks and sensitivity moderately rises</td>
    <td>if taken to 33%, you get *significantly* greater NPV but lower PPV</td> 
 </tr>
 <tr>
    <td>max_leaf_nodes</td>
    <td>limits the number of nodes that can split off, best-first</td>
    <td>if leaf nodes are severely limited (3), you get less specificity but more sensitivity</td>
    <td>if leaf nodes are severely limited (3), you get *significantly* greater NPV but lower PPV</td> 
 </tr>
 <tr>
    <td>min_impurity_decrease</td>
    <td>sets nodes to only split if there is at least this much impurity decrease</td>
    <td>a 5% limit doesn't affect sensitivity much, but specificity tanks</td>
    <td>a 5% limit *significantly* spikes NPV and tanks PPV</td> 
 </tr>
 <tr>
    <td>min_impurity_split</td>
    <td>deprecated</td>
    <td>X</td>
    <td>X</td> 
 </tr>
 
<script src="https://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" type="text/javascript"></script>
    
</table>

2. How does setting bootstrap=False influence the model performance? Note: the default is bootstrap=True. Explain why your results might be so.

The model performs more poorly overall when bootstrap= False. This is because there will be fewer samples to fit to the model and there might not be enough samples to fill each determined parameter. Whereas with bootstrapping, data will be synthisized until desired parameters are met. It would be a shame to spend time tweaking and tuning your parameters and run out of data before you are able to optimize your model.