In [None]:
from datetime import datetime
from IPython.display import display, Markdown
import joblib
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.svm import SVC



## Background

Use the [Pima Indians Diabetes Dataset](https://www.kaggle.com/uciml/pima-indians-diabetes-database) to develop a simple model that predicts the onset of diabetes from a small set of features.

## Explore the data

In [None]:
# Load the raw dataset and take a first look at its columns and leading rows.
data = pd.read_csv('../data/diabetes.csv', delimiter=',')
print(data.columns.to_list())
print(data.head())

# Draw one glucose density curve per Outcome class. The plot call returns a
# Series of Axes objects, so it is not wrapped in print(); the trailing
# semicolon keeps that repr out of the cell output and leaves only the figure.
data.groupby('Outcome')['Glucose'].plot.kde(
    legend=True, title='Glucose density by Outcome'
);

## Split the data
Split the data into training (50%), validation (25%), and test (25%) sets.

In [None]:
def train_validate_test_split(x, y, test_size, random_state):
    """Split features and labels into train, validation, and test partitions.

    Two successive ``train_test_split`` calls are made: the first holds
    ``test_size`` of the rows out of the training partition, the second divides
    that held-out portion evenly between validation and test.

    Parameters
    ----------
    x
        Feature data accepted by ``train_test_split``.
    y
        Labels aligned with ``x``.
    test_size
        Forwarded to the first ``train_test_split`` call; the share of rows
        kept out of the training partition (validation + test combined).
    random_state
        Seed forwarded to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_validate, y_validate, x_test, y_test)``.
    """
    # Forward the caller's arguments; they used to be ignored in favour of a
    # hard-coded 0.5 / 42.
    x_train, x_holdout, y_train, y_holdout = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )
    # The held-out rows are always halved between validation and test.
    x_validate, x_test, y_validate, y_test = train_test_split(
        x_holdout, y_holdout, test_size=0.5, random_state=random_state
    )
    return x_train, y_train, x_validate, y_validate, x_test, y_test

# Separate the predictors from the target column before partitioning.
features = data.drop(columns='Outcome')
target = data['Outcome']

(
    x_train, y_train,
    x_validate, y_validate,
    x_test, y_test,
) = train_validate_test_split(x=features, y=target, test_size=0.5, random_state=42)

# Report the size of each feature partition as a quick sanity check.
for partition in (x_train, x_validate, x_test):
    print(partition.shape)




## Feature engineering

In [None]:
def create_hyperglyemic_flag(series, **kwargs):
    """Flag values that are strictly greater than a glucose threshold.

    Parameters
    ----------
    series
        Numeric glucose values (e.g. a pandas Series or NumPy array).
    **kwargs
        ``glucose_value`` (optional): threshold to compare against. Defaults
        to 240 when omitted or ``None``.

    Returns
    -------
    numpy.ndarray
        1 where the value exceeds the threshold, 0 elsewhere.
    """
    threshold = kwargs.get('glucose_value')
    if threshold is None:
        # Test for None explicitly so that a caller-supplied 0 is honoured
        # instead of being silently replaced by the default.
        threshold = 240
    return np.where(series > threshold, 1, 0)

def create_hypoglyemic_flag(series, **kwargs):
    """Flag values that are strictly less than a glucose threshold.

    Parameters
    ----------
    series
        Numeric glucose values (e.g. a pandas Series or NumPy array).
    **kwargs
        ``glucose_value`` (optional): threshold to compare against. Defaults
        to 100 when omitted or ``None``.

    Returns
    -------
    numpy.ndarray
        1 where the value is below the threshold, 0 elsewhere.
    """
    # NOTE(review): confirm that 100 is the intended default cutoff for a
    # "hypoglycemic" reading.
    threshold = kwargs.get('glucose_value')
    if threshold is None:
        # Test for None explicitly so that a caller-supplied 0 is honoured
        # instead of being silently replaced by the default.
        threshold = 100
    return np.where(series < threshold, 1, 0)

def generate_features(df):
    """Return a copy of ``df`` extended with the two glucose flag columns.

    The input frame is left untouched. The result carries every original
    column plus ``hyperglycemic_flag`` and ``hypoglycemic_flag``, both derived
    from the ``Glucose`` column with the default thresholds of the flag helpers.
    """
    glucose = df['Glucose']
    return df.assign(
        hyperglycemic_flag=create_hyperglyemic_flag(glucose),
        hypoglycemic_flag=create_hypoglyemic_flag(glucose),
    )

# Add the engineered flag columns to the training features.
x_train = x_train.pipe(generate_features)



## Normalize and train

In [None]:
# Two-step pipeline: standardise every feature, then fit a support vector
# classifier with its default hyperparameters.
pipe = Pipeline(steps=[('scaler', StandardScaler()), ('svc', SVC())])

pipe.fit(x_train, y_train)

## Evaluate model on validation data

In [None]:
# The validation features need the same engineered columns the model was fit on.
x_validate = generate_features(x_validate)
y_predict = pipe.predict(x_validate)

# Rows are the true classes and columns the predicted classes, ordered [0, 1].
cm = confusion_matrix(y_validate, y_predict, labels=[0, 1])
confusion = pd.DataFrame(
    cm,
    index=['Not Diabetic', 'Diabetic'],
    columns=['Predicted Healthy', 'Predicted Diabetes'],
)

display(Markdown('## Confusion matrix'))
print(confusion)
display(Markdown('<br>'))

display(Markdown('## Classification report'))
print(classification_report(y_validate, y_predict))

## Save the model and related objects

In [None]:
# Bundle the fitted pipeline with the metadata needed to identify and reuse it.
model_package = {
    'model_pipeline': pipe,
    # Column names of the training frame the pipeline was fitted on.
    'data_features': list(x_train.columns),
    'name': 'Diabetes predictor',
    'model_version': '0.0',
    'model_type': 'Supervised',
    'model_objective': 'Classification',
    'model_algorithm': 'Support Vector Classification (SVC)',
    # ISO-formatted (YYYY-MM-DD) date on which this cell was run.
    'model_trained_date': datetime.today().date().isoformat(),
}

In [None]:
# Persist the whole package (pipeline + metadata) as a single joblib file.
joblib.dump(value=model_package, filename='../models/diabetes_predictor')