### imports

In [1]:
import wrangle as w

import pandas as pd
import numpy as np

# Visualize your success!
import seaborn as sns
import matplotlib.pyplot as plt

#Stats
from scipy import stats

# sklearn for modeling:
from sklearn.tree import DecisionTreeClassifier,\
export_text, \
plot_tree
from sklearn.metrics import accuracy_score, \
classification_report, \
confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB

from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings("ignore")

### Get the data

In [2]:
df = w.prep_data()

In [3]:
df.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetic               0
age_bin                0
bmi_class              0
dtype: int64

In [4]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetic,age_bin,bmi_class
0,Female,80,0,1,never,25.19,6.6,140,0,7,3
1,Female,54,0,0,No Info,27.32,6.6,80,0,5,3
2,Male,28,0,0,never,27.32,5.7,158,0,2,3
3,Female,36,0,0,current,23.45,5.0,155,0,3,2
4,Male,76,1,1,current,20.14,4.8,155,0,7,2


### train, val, test split

In [5]:
train, val, test = w.train_validate_test(df, 'diabetic')

In [6]:
train.shape, val.shape, test.shape

((53321, 11), (22853, 11), (19044, 11))

In [7]:
len(train) / len(df) * 100

55.9988657606755

# Modeling 

## scaling

In [8]:
def scale_data(train, val, test):
    """
    Min-max scale the HbA1c_level column of each split.

    The scaler is fit on the train split only and then applied to copies of
    train, val and test, so the frames passed in are not modified.

    Returns the scaled copies in the order (train, val, test).
    """
    x_cols = ['HbA1c_level']

    # Fit once on train; val and test reuse the same fitted scaler.
    scaler = MinMaxScaler().fit(train[x_cols])

    def _scaled_copy(frame):
        # Work on a copy so the caller's frame keeps its original values.
        scaled = frame.copy()
        scaled[x_cols] = scaler.transform(scaled[x_cols])
        return scaled

    train_scaled, val_scaled, test_scaled = map(_scaled_copy, (train, val, test))
    return train_scaled, val_scaled, test_scaled


In [9]:
train_scaled, val_scaled, test_scaled = scale_data(train, val, test)

In [10]:
train_scaled

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetic,age_bin,bmi_class
7299,Female,77,0,0,never,25.60,0.563636,90,0,7,3
76197,Female,15,0,0,No Info,20.22,0.490909,140,0,1,2
19159,Male,76,1,0,No Info,27.32,0.400000,90,0,7,3
68892,Female,16,0,0,never,29.07,0.181818,155,0,1,3
49924,Female,40,0,0,not current,31.79,0.454545,100,0,4,4
...,...,...,...,...,...,...,...,...,...,...,...
3655,Female,51,0,0,never,23.98,0.418182,130,0,5,2
55255,Male,62,1,0,former,27.32,0.418182,200,1,6,3
91372,Male,26,0,0,never,34.44,0.000000,159,0,2,4
78876,Female,26,0,0,former,26.50,0.545455,158,0,2,3


In [11]:
# Build the feature matrices (x) and target vectors (y) for train, val and test.

x_cols = ['HbA1c_level', 'age_bin', 'bmi_class']

y_cols = 'diabetic'

# Features come from the scaled splits.
x_train, x_val, x_test = (
    scaled_split[x_cols] for scaled_split in (train_scaled, val_scaled, test_scaled)
)

# Targets are read from the unscaled splits.
y_train, y_val, y_test = (
    raw_split[y_cols] for raw_split in (train, val, test)
)

In [12]:
x_train

Unnamed: 0,HbA1c_level,age_bin,bmi_class
7299,0.563636,7,3
76197,0.490909,1,2
19159,0.400000,7,3
68892,0.181818,1,3
49924,0.454545,4,4
...,...,...,...
3655,0.418182,5,2
55255,0.418182,6,3
91372,0.000000,2,4
78876,0.545455,2,3


## Baseline

In [13]:
train.diabetic.mode()

0    0
Name: diabetic, dtype: int64

In [14]:
baseline_accuracy = (train.diabetic == 0).mean()

In [15]:
baseline_accuracy

0.910916899533017

## Decision Tree

In [16]:
def get_decisionTree_model(depth):
    """
    Fit a decision tree classifier with the given max depth on the training data.

    Reads the notebook-level x_train, y_train, x_val and y_val. Prints the
    classification report for the training predictions, followed by the
    accuracy on the training and validation sets.

    Parameters
    ----------
    depth : int or None
        Passed to DecisionTreeClassifier as max_depth.

    Returns
    -------
    tuple
        (clf, model_preds): the fitted classifier and its predictions on x_train.
    """
    # Option left disabled in the original: class_weight='balanced'
    clf = DecisionTreeClassifier(max_depth=depth, random_state=706)
    clf.fit(x_train, y_train)

    model_preds = clf.predict(x_train)

    # classification report:
    print(classification_report(y_train, model_preds))
    print('Accuracy of Decision Tree classifier on training set: {:.2f}'
          .format(clf.score(x_train, y_train)))
    print('Accuracy of Decision Tree classifier on validation set: {:.2f}'
          .format(clf.score(x_val, y_val)))
    return clf, model_preds

In [17]:
clf, model_preds = get_decisionTree_model(9)


              precision    recall  f1-score   support

           0       0.95      1.00      0.97     48571
           1       0.98      0.47      0.64      4750

    accuracy                           0.95     53321
   macro avg       0.97      0.73      0.81     53321
weighted avg       0.95      0.95      0.94     53321

Accuracy of Random Tree classifier on training set: 0.95
Accuracy of Random Tree classifier on validation set: 0.95


In [18]:
model_preds

array([0, 0, 0, ..., 0, 0, 0])

In [19]:
confusion_matrix(train.diabetic, model_preds, labels=(0, 1))

array([[48535,    36],
       [ 2516,  2234]])

## Random Forest

In [20]:
def get_random_forest():
    """
    Grid-search random forests and report the one with the best validation accuracy.

    Fits one RandomForestClassifier (101 trees, random_state=706) for every
    combination of min_samples_leaf in 1..14 and max_depth in 2..14, using the
    notebook-level x_train, y_train, x_val and y_val. Each model's train and
    validation accuracy and its training predictions are collected in a
    DataFrame, together with the train-minus-validation accuracy difference
    and the notebook-level baseline_accuracy.

    Prints the classification report (on the training set) of the model with
    the highest validation accuracy.

    Returns
    -------
    pandas.DataFrame
        A single-row frame describing the model with the highest validation
        accuracy.
    """
    model_list = []

    for j in range(1, 15):
        for i in range(2, 15):
            # n_jobs=-1 builds the trees in parallel; with a fixed random_state
            # the fitted forest is the same, the grid just finishes sooner.
            rf = RandomForestClassifier(n_estimators=101, max_depth=i,
                                        min_samples_leaf=j, random_state=706,
                                        n_jobs=-1)
            rf.fit(x_train, y_train)

            model_list.append({
                "min_samples_per_leaf": j,
                "max_depth": i,
                "train_accuracy": rf.score(x_train, y_train),
                "validate_accuracy": rf.score(x_val, y_val),
                'model_preds': rf.predict(x_train),
            })

    results = pd.DataFrame(model_list)
    results["difference"] = results.train_accuracy - results.validate_accuracy
    results["baseline_accuracy"] = baseline_accuracy

    best = results.sort_values(by=['validate_accuracy'], ascending=False).head(1)

    # Report on the same row that is returned, so the printed metrics and the
    # returned hyperparameters always describe one and the same model.
    print(classification_report(y_train, best['model_preds'].iloc[0]))
    return best
    

In [21]:
# RandomForestClassifier?

In [22]:
get_random_forest()

KeyboardInterrupt: 

### Logistic Regression model

In [None]:
def get_logReg_model(data):
    """
    Fit a logistic regression on the training data and print its accuracy.

    Reads the notebook-level x_train, y_train, x_val, y_val, x_test and y_test
    (the x_* frames are built from the scaled splits).

    Parameters
    ----------
    data : str
        'train_val' prints the training and validation accuracy plus the
        classification report for the training predictions. Any other value
        (e.g. 'test') prints the accuracy on the test set only.

    Returns
    -------
    None
    """
    logit = LogisticRegression(random_state=706)
    # The scaled training features are named x_train in this notebook.
    logit.fit(x_train, y_train)

    if data == 'train_val':
        y_pred = logit.predict(x_train)
        print('Accuracy of Logistic Regression classifier on training set: {:.2f}'
              .format(logit.score(x_train, y_train)))
        print('Accuracy of Logistic Regression classifier on validation set: {:.2f}'
              .format(logit.score(x_val, y_val)))
        print(classification_report(y_train, y_pred))
    else:
        print('Accuracy of logistic regression classifier on test set: {:.2f}'
              .format(logit.score(x_test, y_test)))

In [None]:
get_logReg_model('train_val')

In [None]:
# K-nearest neighbors: k neighbors, each vote weighted by inverse distance.
k = 9
knn = KNeighborsClassifier(n_neighbors=k, weights='distance')

knn.fit(x_train, y_train)

# Predictions on the validation split feed the classification report below.
y_pred = knn.predict(x_val)

print('Accuracy of KNN classifier on training set: {:.2f}'
      .format(knn.score(x_train, y_train)))
print('Accuracy of KNN classifier on validation set: {:.2f}'
      .format(knn.score(x_val, y_val)))

print(classification_report(y_val, y_pred))

In [None]:
len(y_train) 

In [None]:
len(y_pred)

## Naive Bayes

In [None]:
# NOTE(review): BernoulliNB binarizes every feature at its default threshold
# (binarize=0.0); x_train holds a scaled HbA1c_level plus age_bin and bmi_class
# codes, so confirm this threshold is what is intended for these features.
bnb = BernoulliNB()
# Fit the model using the training data
bnb.fit(x_train, y_train)

# Make predictions on the validation data
y_pred = bnb.predict(x_val)

# Calculate the accuracy of the model on the validation predictions
accuracy = accuracy_score(y_val, y_pred)

print('Accuracy of Naive Bayes classifier on training set: {:.2f}'
      .format(bnb.score(x_train, y_train)))
print('Accuracy of Naive Bayes classifier on validation set: {:.2f}'
      .format(bnb.score(x_val, y_val)))
print("Accuracy: {:.2f}%".format(accuracy*100))
print(classification_report(y_val, y_pred))