In [1]:
from datetime import datetime
from IPython.display import display, Markdown
import joblib
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.svm import SVC

# Load the diabetes dataset from the project's data directory and preview it.
# The printed column index lists eight feature columns plus the 'Outcome' label.
data = pd.read_csv('../data/diabetes.csv', delimiter=',')
print(data.columns)
data.head()
# Disabled exploration: per-outcome kernel density estimate of the Glucose column.
#data.groupby('Outcome')['Glucose'].plot.kde()

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


## Split the data
Split the data into train (50%), validation (25%), and test (25%) sets.

In [2]:
def train_validate_test_split(x, y, test_size, random_state):
    """Split features and labels into train, validation, and test subsets.

    Two consecutive ``train_test_split`` calls are used: the first separates
    a hold-out portion of size ``test_size`` from the training data, the
    second divides that hold-out portion in half to form the validation and
    test sets.

    Args:
        x: Feature table to split.
        y: Labels aligned row-for-row with ``x``.
        test_size: Size of the combined validation + test hold-out, forwarded
            to the first ``train_test_split`` call (e.g. ``0.5`` keeps half of
            the rows for training).
        random_state: Seed forwarded to both splits so results are reproducible.

    Returns:
        Tuple ``(x_train, y_train, x_validate, y_validate, x_test, y_test)``.
    """
    # First split: training rows vs. the combined validation/test hold-out.
    x_train, x_holdout, y_train, y_holdout = train_test_split(
        x, y, test_size=test_size, random_state=random_state
    )
    # Second split: divide the hold-out evenly into validation and test sets.
    x_validate, x_test, y_validate, y_test = train_test_split(
        x_holdout, y_holdout, test_size=0.5, random_state=random_state
    )
    return x_train, y_train, x_validate, y_validate, x_test, y_test

# Separate the label column from the features, then build the three subsets.
features = data.drop(columns = 'Outcome')
labels = data['Outcome']

x_train, y_train, x_validate, y_validate, x_test, y_test = train_validate_test_split(
    x = features,
    y = labels,
    test_size = .5,
    random_state = 42
)

# Report the shape of each feature subset as a quick sanity check.
for subset in (x_train, x_validate, x_test):
    print(subset.shape)




(384, 8)
(192, 8)
(192, 8)


## Feature engineering

In [3]:
def create_hyperglyemic_flag(series, **kwargs):
    """Flag values strictly above a glucose threshold.

    Args:
        series: Glucose readings (array-like supporting ``>`` comparison).
        **kwargs: Optional ``glucose_value`` threshold. When it is omitted
            or ``None`` the default of 240 is used.

    Returns:
        numpy array with 1 where the reading exceeds the threshold, else 0.
    """
    threshold = kwargs.get('glucose_value')
    # Only fall back when no threshold was supplied; a plain `or` would also
    # discard a legitimate falsy threshold such as 0.
    if threshold is None:
        threshold = 240
    return np.where(series > threshold, 1, 0)

def create_hypoglyemic_flag(series, **kwargs):
    """Flag values strictly below a glucose threshold.

    Args:
        series: Glucose readings (array-like supporting ``<`` comparison).
        **kwargs: Optional ``glucose_value`` threshold. When it is omitted
            or ``None`` the default of 100 is used.

    Returns:
        numpy array with 1 where the reading is below the threshold, else 0.
    """
    threshold = kwargs.get('glucose_value')
    # Only fall back when no threshold was supplied; a plain `or` would also
    # discard a legitimate falsy threshold such as 0.
    if threshold is None:
        threshold = 100
    return np.where(series < threshold, 1, 0)

def generate_features(df):
    """Return a deep copy of ``df`` extended with glucose-derived flag columns.

    The input frame is not modified. Two columns are appended, both computed
    from the ``Glucose`` column: ``hyperglycemic_flag`` and
    ``hypoglycemic_flag``.
    """
    # Column name -> function that derives it from the Glucose readings.
    flag_builders = {
        'hyperglycemic_flag': create_hyperglyemic_flag,
        'hypoglycemic_flag': create_hypoglyemic_flag,
    }
    enriched = df.copy(deep=True)
    for column_name, build_flag in flag_builders.items():
        enriched[column_name] = build_flag(enriched['Glucose'])
    return enriched

# Add the engineered glucose flag columns to the training features.
# generate_features works on a copy, so the result is rebound to x_train.
x_train = generate_features(x_train)



## Standardize and train

In [4]:
# Two-stage pipeline: scale the features with StandardScaler, then fit an SVC.
pipeline_steps = [('scaler', StandardScaler()), ('svc', SVC())]
pipe = Pipeline(steps=pipeline_steps)

# Fit scaler and classifier together on the engineered training features.
pipe.fit(x_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()), ('svc', SVC())])

## Evaluate model on validation data

In [5]:
# The validation features need the same engineered columns the pipeline was fitted on.
x_validate = generate_features(x_validate)
y_predict = pipe.predict(x_validate)

# Rows are the true classes, columns the predicted classes (label order 0, 1).
row_labels = ['Not Diabetic', 'Diabetic']
column_labels = ['Predicted Healthy', 'Predicted Diabetes']
cm = np.array(confusion_matrix(y_validate, y_predict, labels=[0,1]))
confusion = pd.DataFrame(cm, index=row_labels, columns=column_labels)

display(Markdown('## Confusion matrix'))
print(confusion)
display(Markdown('<br>'))

display(Markdown('## Classification report'))
report = classification_report(y_validate, y_predict)
print(report)

## Confusion matrix

              Predicted Healthy  Predicted Diabetes
Not Diabetic                113                  21
Diabetic                     19                  39


<br>

## Classification report

              precision    recall  f1-score   support

           0       0.86      0.84      0.85       134
           1       0.65      0.67      0.66        58

    accuracy                           0.79       192
   macro avg       0.75      0.76      0.76       192
weighted avg       0.79      0.79      0.79       192



## Save the model and related objects

In [6]:
# Bundle the fitted pipeline with the metadata needed to use it later:
# the feature columns it was trained on and descriptive model information.
feature_names = x_train.columns.tolist()
trained_on = str(datetime.today().date())

model_package = {
    'model_pipeline': pipe,
    'data_features': feature_names,
    'name': 'Diabetes predictor',
    'model_version': '0.0',
    'model_type': 'Supervised',
    'model_objective': 'Classification',
    'model_algorithm': 'Support Vector Classification (SVC)',
    'model_trained_date': trained_on,
}

In [7]:
# Persist the model package dictionary to disk with joblib; the cell output
# lists the path that was written.
# NOTE(review): joblib artifacts are pickle-based, so only load this file from a trusted source.
joblib.dump(model_package, '../models/diabetes_predictor')

['../models/diabetes_predictor']