# Predicting Diabetes
## Steps to prepare data
* use Pandas to read in data
* identify correlated features
* clean data - drop redundant features that are highly correlated with another feature
* mold data - convert data into suitable format
* check True/False ratio to ensure data can be used for prediction

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# do plotting inline instead of in a separate window
%matplotlib inline

## Load and review data

In [None]:
# read pima-data.csv (path is relative to the working directory) into a DataFrame
df = pd.read_csv('./data/pima-data.csv')

In [None]:
df.shape  # (number of rows, number of columns)

In [None]:
# preview the first five rows
df.head(5)

In [None]:
# preview the last five rows
df.tail(5)

## Check for null values

In [None]:
# True if at least one cell anywhere in the dataframe is null
df.isnull().values.any()

## Check for correlations

In [None]:
def plot_corr(df, size=11):
    """
    Draw the pairwise column correlation matrix of a dataframe as a colored grid.

    Input:
        df: pandas DataFrame
        size: width and height of the square figure, passed to figsize

    Displays:
        one cell per pair of columns, colored by the value from df.corr().
        No colormap is specified, so the colors follow matplotlib's default.
        The top-left to bottom-right diagonal pairs each column with itself,
        so it normally shows the maximum correlation of 1.
    """

    corr_matrix = df.corr()
    column_names = corr_matrix.columns
    tick_positions = range(len(column_names))

    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr_matrix)  # color each cell by its correlation value

    # label both axes with the column names, one tick per column
    ax.set_xticks(tick_positions)
    ax.set_xticklabels(column_names)
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(column_names)

In [None]:
# draw the correlation matrix for the columns of df
plot_corr(df)

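Check the numeric correlation between the `skin` and `thickness` columns. If they are highly correlated, one of them is redundant and can be dropped.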

In [None]:
# tabulate the pairwise correlation values that the plot above color-codes
df.corr()

In [None]:
# drop the 'skin' column from df in place
del df['skin']

## Check Data Types

In [None]:
# look at sample rows to see which columns hold non-numeric values
df.head(5)

Recode the boolean `diabetes` column as integers: True becomes 1, False becomes 0.

In [None]:
# lookup table used to recode the boolean labels as integers
diabetes_map = { True: 1, False: 0}

In [None]:
# replace each value in 'diabetes' with its diabetes_map entry
# (values that are missing from the map become NaN)
df['diabetes'] = df['diabetes'].map(diabetes_map)

In [None]:
# confirm the 'diabetes' column now shows 1/0
df.head(5)

## Check true/false ratio

In [None]:
# Count the rows in each class of the 'diabetes' column.
is_positive = df['diabetes'] == True
is_negative = df['diabetes'] == False
num_true = len(df[is_positive])
num_false = len(df[is_negative])

# Report each class as a count and as a percentage of the two classes combined.
total_cases = num_true + num_false
true_pct = (num_true / total_cases) * 100
false_pct = (num_false / total_cases) * 100
print('Number of True cases:  {0} ({1:2.2f}%)'.format(num_true, true_pct))
print('Number of False cases: {0} ({1:2.2f}%)'.format(num_false, false_pct))