# Predicting Diabetes
## Steps to prepare data
* use Pandas to read in data
* identify correlated features
* clean data - removing any correlated features
* mold data - convert data into suitable format
* check True/False ratio to ensure data can be used for prediction

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# do plotting inline instead of in a separate window
%matplotlib inline

## Load and review data

In [None]:
df = pd.read_csv('./data/pima-data.csv')

In [None]:
df.shape

In [None]:
df.head(5)

In [None]:
df.tail(5)

## Check for null values

In [None]:
df.isnull().values.any()

## Check for correlations

In [None]:
def plot_corr(df, size=11):
    """
    Plot a graphical correlation matrix for each pair of columns in the dataframe.

    Input:
        df: pandas DataFrame
        size: vertical and horizontal size of the plot (passed to figsize)

    Displays:
        matrix of correlation between columns, colour coded by correlation value
        using matplotlib's default colormap (no colormap is selected here).
        Expect a line of maximum-correlation cells running from top left to
        bottom right, because every column correlates perfectly with itself.
    """

    # Correlate numeric/boolean columns only: DataFrame.corr() raises on
    # non-numeric columns in pandas >= 2.0.
    corr = df.select_dtypes(include=['number', 'bool']).corr()
    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr)  # color code the rectangles by correlation value

    # Label the ticks through the Axes object itself rather than plt.xticks /
    # plt.yticks, which act on whichever axes is "current".
    tick_positions = list(range(len(corr.columns)))
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(corr.columns)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(corr.columns)

In [None]:
# invoke correlation crossplot
plot_corr(df)

Check the correlation between skin and thickness

In [None]:
df.corr()

In [None]:
# Remove the 'skin' column after the skin/thickness correlation check.
# drop(..., errors='ignore') keeps this cell re-runnable: `del df['skin']`
# raises KeyError the second time the cell is executed.
df = df.drop(columns=['skin'], errors='ignore')

## Check Data Types

In [None]:
df.head(5)

Change True to 1, False to 0

In [None]:
diabetes_map = { True: 1, False: 0}

In [None]:
# Replace each value of the 'diabetes' column with its integer code from diabetes_map.
# NOTE(review): the column is overwritten in place, so re-running this cell applies the
# bool-keyed map to values that are already 1/0 — confirm that still yields 1/0, or
# re-load df before re-running.
df['diabetes'] = df['diabetes'].map(diabetes_map)

In [None]:
df.head(5)

## Check true/false ratio

In [None]:
# Count positive and negative outcomes and report each as a share of their sum.
diabetes_col = df['diabetes']
num_true = int((diabetes_col == True).sum())
num_false = int((diabetes_col == False).sum())
num_cases = num_true + num_false
print('Number of True cases:  {0} ({1:2.2f}%)'.format(num_true, (num_true / num_cases) * 100))
print('Number of False cases: {0} ({1:2.2f}%)'.format(num_false, (num_false / num_cases) * 100))

## Training the model
### Splitting the data
70% for training, 30% for testing

In [None]:
from sklearn.model_selection import train_test_split

feature_col_names = ['num_preg', 'glucose_conc', 'diastolic_bp', 'thickness', 'insulin', 'bmi', 'diab_pred', 'age']
predicted_class_names = ['diabetes']
# Backward-compatible alias for the original, misspelled variable name.
predicted_class_namees = predicted_class_names

# Selecting with a list of column names yields 2-D arrays (rows = samples).
X = df[feature_col_names].values # predictor feature columns (m rows x 8 columns)
y = df[predicted_class_names].values # predicted class (1=true, 0=false) column (m rows x 1 column)
split_test_size = 0.30

# A fixed random_state makes the shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split_test_size, random_state=42)

We check to ensure we have the desired 70% train, 30% test split of the data

In [None]:
# Share of all rows that ended up in each side of the split.
total_rows = len(df.index)
for subset, subset_name in ((X_train, 'training'), (X_test, 'test')):
    print('{0:0.2f}% in {1} set'.format((len(subset) / total_rows) * 100, subset_name))

Verify that the predicted value was split correctly

In [None]:
# Class balance of the full dataset and of each side of the split.
# Each entry: (caption for the 1 class, caption for the 0 class, label values).
class_balance_groups = [
    ("Original True   ", "Original False  ", df['diabetes'].values),
    ("Training True   ", "Training False  ", y_train),
    ("Test True       ", "Test False      ", y_test),
]
for group_no, (true_caption, false_caption, labels) in enumerate(class_balance_groups):
    if group_no > 0:
        print("")  # blank line between groups
    for caption, wanted in ((true_caption, 1), (false_caption, 0)):
        hits = int((labels == wanted).sum())
        print("{0}: {1} ({2:0.2f}%)".format(caption, hits, hits / len(labels) * 100.0))

### Hidden Missing Values
Are "0" values possible? See thickness and insulin features

In [None]:
df.head()

How many rows have unexpected 0 values?

In [None]:
# Report, per feature, how many rows hold a 0 reading.
print("# rows in dataframe {0}".format(len(df)))
zero_check_cols = ['glucose_conc', 'diastolic_bp', 'thickness', 'insulin', 'bmi', 'diab_pred', 'age']
for col_name in zero_check_cols:
    zero_rows = df.loc[df[col_name] == 0]
    print("# rows missing {0}: {1}".format(col_name, len(zero_rows)))

### Impute with the mean

In [None]:
from sklearn.impute import SimpleImputer

# Impute with mean all 0 readings
# NOTE(review): missing_values=0 applies to every feature column, including ones where
# 0 may be a legitimate reading (e.g. num_preg) — confirm this is intended.
fill_0 = SimpleImputer(missing_values=0, strategy="mean")

# Learn the column means from the training data only, then reuse them for the test
# data. Calling fit_transform on X_test would refit the imputer on the test set, so
# test zeros would be filled with test-set means instead of the training means.
X_train = fill_0.fit_transform(X_train)
X_test = fill_0.transform(X_test)

### Training Initial Algorithm - Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# create Gaussian Naive Bayes model object and train it with the data
nb_model = GaussianNB()
# ravel() flattens y_train to a 1-D label vector before fitting
# (presumably y_train is a single-column 2-D array here — verify against the split cell)
nb_model.fit(X_train, y_train.ravel())

## Evaluating the Model
### Performance on Training Data

In [None]:
# import the performance metrics library
from sklearn import metrics

# Score the Naive Bayes model on the same data it was trained on
nb_predict_train = nb_model.predict(X_train)
print(f'Accuracy: {metrics.accuracy_score(y_train, nb_predict_train):.4f}')

### Performance on Testing Data

In [None]:
# Score the Naive Bayes model on the held-out testing data
nb_predict_test = nb_model.predict(X_test)
print(f'Accuracy: {metrics.accuracy_score(y_test, nb_predict_test):.4f}')

### Metrics
*Confusion Matrix - compares the predicted result with the actual result for diabetes.*

```
Columns are predicted values
Left column is predicted false
Right column is predicted true
Top row is actual false
Bottom row is actual true
TN FP
FN TP

TN=true negative, actual not diabetes and predicted to be not diabetes.
FP=false positive, actual not diabetes, but predicted to be diabetes.
FN=false negative, actual diabetes, but predicted to be not diabetes.
TP=true positive, actual diabetes and predicted to be diabetes.

"Perfect" classifier would return 0 for both FP and FN.
```

*Classification Report*

Recall is also known as the true positive rate and sensitivity.
It is measuring how well the model predicts diabetes when the result is actually diabetes.
Precision is also known as the positive predictive value.
This is how often the patient actually had diabetes when the model said they would.
We want fewer false positives.

In [None]:
# Confusion matrix for the Naive Bayes test predictions, followed by a blank line
print('Confusion Matrix')
nb_confusion = metrics.confusion_matrix(y_test, nb_predict_test)
print(nb_confusion)
print('')

# Per-class precision / recall / F1 summary
print('Classification Report')
nb_report = metrics.classification_report(y_test, nb_predict_test)
print(nb_report)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fixed random_state so repeated runs build the same forest
rf_model = RandomForestClassifier(random_state=42)
# ravel() flattens y_train to a 1-D label vector before fitting
rf_model.fit(X_train, y_train.ravel())

### Predict Training Data

In [None]:
# Score the random forest on the same data it was trained on
rf_predict_train = rf_model.predict(X_train)
print(f'Accuracy: {metrics.accuracy_score(y_train, rf_predict_train):.4f}')

### Predict Testing Data

In [None]:
# Score the random forest on the held-out testing data
rf_predict_test = rf_model.predict(X_test)
print(f'Accuracy: {metrics.accuracy_score(y_test, rf_predict_test):.4f}')

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# C is scikit-learn's inverse regularisation strength: smaller C = stronger regularisation
lr_model = LogisticRegression(C=0.7, random_state=42, solver='liblinear', max_iter=10000)
lr_model.fit(X_train, y_train.ravel())
lr_predict_test = lr_model.predict(X_test)

# Test-set metrics (the predictions above are made on X_test)
print(f'Accuracy: {metrics.accuracy_score(y_test, lr_predict_test):.4f}')
print('Confusion Matrix')
print(metrics.confusion_matrix(y_test, lr_predict_test))
print('')
print('Classification Report')
print(metrics.classification_report(y_test, lr_predict_test))

Setting regularisation parameter

In [None]:
# Sweep the regularisation parameter C and keep the value with the highest recall score.
# NOTE(review): every candidate is scored on the test set, so the chosen C is tuned to
# the test data; a validation split or cross-validation would avoid that.
C_start = 0.1
C_end = 5
C_inc = 0.1

# Build the grid from an integer step count instead of repeatedly adding C_inc:
# a running float sum (0.1 + 0.1 + ...) drifts, which can add or drop a point right
# next to the exclusive upper bound C_end. The small epsilon absorbs float error in
# the division itself.
n_steps = int(np.ceil((C_end - C_start) / C_inc - 1e-9))
C_values = [round(C_start + step * C_inc, 10) for step in range(n_steps)]

recall_scores = []
best_recall_score = 0
best_lr_predict_test = None  # stays None if no candidate achieves recall > 0
for C_val in C_values:
    lr_model_loop = LogisticRegression(C=C_val, random_state=42, solver='liblinear')
    lr_model_loop.fit(X_train, y_train.ravel())
    lr_predict_loop_test = lr_model_loop.predict(X_test)
    recall_score = metrics.recall_score(y_test, lr_predict_loop_test)
    recall_scores.append(recall_score)
    # strict '>' keeps the first C that reaches the maximum
    if recall_score > best_recall_score:
        best_recall_score = recall_score
        best_lr_predict_test = lr_predict_loop_test

best_score_C_val = C_values[recall_scores.index(best_recall_score)]
print("1st max value of {0:.3f} occured at C={1:.3f}".format(best_recall_score, best_score_C_val))

# Recall as a function of C (inline plotting is already enabled at the top of the notebook)
fig, ax = plt.subplots()
ax.plot(C_values, recall_scores, "-")
ax.set_xlabel("C value")
ax.set_ylabel("recall score");

### Logistic regression with class_weight='balanced'

In [None]:
# Same C sweep as for plain logistic regression, but with class_weight='balanced'.
# NOTE(review): every candidate is scored on the test set, so the chosen C is tuned to
# the test data; a validation split or cross-validation would avoid that.
C_start = 0.1
C_end = 5
C_inc = 0.1

# Build the grid from an integer step count instead of repeatedly adding C_inc:
# a running float sum (0.1 + 0.1 + ...) drifts, which can add or drop a point right
# next to the exclusive upper bound C_end. The small epsilon absorbs float error in
# the division itself.
n_steps = int(np.ceil((C_end - C_start) / C_inc - 1e-9))
C_values = [round(C_start + step * C_inc, 10) for step in range(n_steps)]

recall_scores = []
best_recall_score = 0
best_lr_predict_test = None  # stays None if no candidate achieves recall > 0
for C_val in C_values:
    lr_model_loop = LogisticRegression(C=C_val, random_state=42, solver='liblinear', class_weight='balanced')
    lr_model_loop.fit(X_train, y_train.ravel())
    lr_predict_loop_test = lr_model_loop.predict(X_test)
    recall_score = metrics.recall_score(y_test, lr_predict_loop_test)
    recall_scores.append(recall_score)
    # strict '>' keeps the first C that reaches the maximum
    if recall_score > best_recall_score:
        best_recall_score = recall_score
        best_lr_predict_test = lr_predict_loop_test

# best_score_C_val is reused by the next cell to fit the final balanced model
best_score_C_val = C_values[recall_scores.index(best_recall_score)]
print("1st max value of {0:.3f} occured at C={1:.3f}".format(best_recall_score, best_score_C_val))

# Recall as a function of C (inline plotting is already enabled at the top of the notebook)
fig, ax = plt.subplots()
ax.plot(C_values, recall_scores, "-")
ax.set_xlabel("C value")
ax.set_ylabel("recall score");

In [None]:
# Refit a balanced logistic regression at the C value selected by the sweep
lr_model = LogisticRegression(
    C=best_score_C_val,
    class_weight='balanced',
    random_state=42,
    solver='liblinear',
    max_iter=10000,
)
lr_model.fit(X_train, y_train.ravel())
lr_predict_test = lr_model.predict(X_test)

# Test-set metrics (the predictions above are made on X_test)
print(f'Accuracy: {metrics.accuracy_score(y_test, lr_predict_test):.4f}')
print('Confusion Matrix')
print(metrics.confusion_matrix(y_test, lr_predict_test))
print('')
print('Classification Report')
print(metrics.classification_report(y_test, lr_predict_test))

In [None]:
### LogisticRegressionCV

In [None]:
from sklearn.linear_model import LogisticRegressionCV
# Logistic regression that picks C by cross-validation on the training data:
#   Cs=3        -> three candidate C values
#   cv=10       -> 10 cross-validation folds
#   refit=False -> no final refit on the full training set
#                  (NOTE(review): scikit-learn documents that results are averaged
#                   across folds in this mode — confirm for the installed version)
#   n_jobs=-1   -> use all available CPU cores
lr_cv_model = LogisticRegressionCV(n_jobs=-1, random_state=42, Cs=3, cv=10, refit=False, class_weight='balanced', max_iter=500)
# ravel() flattens y_train to a 1-D label vector before fitting
lr_cv_model.fit(X_train, y_train.ravel())

### Predict on Testing data

In [None]:
# Predict with the cross-validated model on the held-out testing data
lr_cv_predict_test = lr_cv_model.predict(X_test)

# Test-set metrics (the predictions above are made on X_test)
lr_cv_accuracy = metrics.accuracy_score(y_test, lr_cv_predict_test)
print(f"Accuracy: {lr_cv_accuracy:.4f}")
lr_cv_confusion = metrics.confusion_matrix(y_test, lr_cv_predict_test)
print(lr_cv_confusion)
print("")
print("Classification Report")
lr_cv_report = metrics.classification_report(y_test, lr_cv_predict_test)
print(lr_cv_report)