# Diabetes Prediction
## Creating models using ElasticNet (sklearn) and plotting ElasticNet descent paths

Uses a diabetes [dataset](https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt) to predict diabetes progression over the next year using ElasticNet.
The predicted "progression" column is a quantitative measure of disease progression one year after baseline.
Combines the above with the [Lasso Coordinate Descent Path Plot](http://scikit-learn.org/stable/auto_examples/linear_model/plot_lasso_coordinate_descent_path.html).

Original author: Alexandre Gramfort <alexandre.gramfort@inria.fr>; License: BSD 3 clause
The [main page](https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html)
[Related paper](https://web.stanford.edu/~hastie/Papers/LARS/LeastAngle_2002.pdf)

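Usage (script form; the two arguments appear to be `alpha` and `l1_ratio`, in the same order as `train(alpha_in, ratio_in)`):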
 python train_diabetes.py 0.01 0.01
 python train_diabetes.py 0.01 0.75
 python train_diabetes.py 0.01 1.0

In [1]:
import os
import warnings
import sys

import pandas as pd
import numpy as np
import mlyzer

## 1) Data Preparation

In [2]:
# Alternative source (not used here): sklearn.datasets.load_diabetes()
# from sklearn import datasets
# diabetes = datasets.load_diabetes()

print("Read the diabetes txt file from the URL")
csv_url ='https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt'
try:
    data = pd.read_csv(csv_url, sep='\t')
except Exception as e:
    # Fail loudly and stop the cell: every statement below needs `data`, so
    # continuing after a failed download would only raise a confusing NameError.
    raise RuntimeError(
        "Unable to download training & test CSV, check your internet connection. Error: %s" % e
    ) from e

# Alias to the raw (unscaled) frame; `data` is rebound to the scaled frame below.
df = data

# Column-wise z-score of every column (target included at this point).
d_mean = data.mean()
d_std = data.std()
diabetes = (data - d_mean) / d_std

# Raw feature matrix and raw target; column 10 is the target column
# (it is the one renamed from 'Y' to 'progression' below).
X = data.drop(data.columns[[10]], axis=1)
y = data.iloc[:, 10]

# Put the unscaled target back so only the features remain standardized.
diabetes.iloc[:, 10] = y
data = diabetes.rename(columns={'Y': 'progression'})

Read the diabetes txt file from the URL


## 2) Utils

In [3]:
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn.linear_model import lasso_path, enet_path

def path_plot(X, Y, l1_ratio, fig_name):
    """Plot the elastic-net coefficient paths for (X, Y) and save them to a file.

    Parameters
    ----------
    X : feature matrix passed straight to ``enet_path``.
    Y : target values passed straight to ``enet_path``.
    l1_ratio : elastic-net mixing parameter forwarded to ``enet_path``; also
        shown in the plot title.
    fig_name : path or file name the figure is written to via ``fig.savefig``.

    Returns
    -------
    None. The figure is saved to ``fig_name`` and then closed.
    """
    # Compute paths
    eps = 5e-3  # the smaller it is the longer is the path

    # Computing regularization path using the elastic net.
    # Uses the Y argument; the previous version read the module-level `y`
    # instead, silently ignoring whatever the caller passed in.
    alphas_enet, coefs_enet, _ = enet_path(X, Y, eps=eps, l1_ratio=l1_ratio, fit_intercept=False)

    # A fresh figure per call, so nothing is drawn onto a figure that some
    # other cell may still have open.
    fig, ax = plt.subplots()
    try:
        colors = cycle(["b", "r", "g", "c", "k"])
        neg_log_alphas_enet = -np.log10(alphas_enet)
        # One dashed line per row of the coefficient array.
        for coef_e, c in zip(coefs_enet, colors):
            ax.plot(neg_log_alphas_enet, coef_e, linestyle="--", c=c)

        ax.set_xlabel("-Log(alpha)")
        ax.set_ylabel("coefficients")
        ax.set_title("ElasticNet Path by alpha for l1_ratio = " + str(l1_ratio))
        ax.axis("tight")

        # Save figures
        fig.savefig(fig_name)
    finally:
        # Always release the figure, even if plotting or saving failed.
        plt.close(fig)
    
    
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def eval_metrics(actual, pred):
    """Return ``(rmse, mae, r2)`` comparing predictions against true values.

    ``actual`` and ``pred`` are handed unchanged to the sklearn metric
    functions; RMSE is the square root of the mean squared error.
    """
    squared_error = mean_squared_error(actual, pred)
    root_mse = np.sqrt(squared_error)
    abs_error = mean_absolute_error(actual, pred)
    r_squared = r2_score(actual, pred)
    return root_mse, abs_error, r_squared

## 3) Model Training & Logging

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet

import mlyzer
# Tracking configuration, read by train() further down.
# URI handed to mlyzer.set_tracking_uri(); set to your own server URI.
remote_server_uri = "https://mlyzer.solouk.net" # set to your server URI
# Experiment name passed to mlyzer.create_expriment() inside train().
experiment_name="DS__Test"

# Runs at cell-execution time, so it takes effect before any train() call.
mlyzer.set_tracking_uri(remote_server_uri)
# Uncomment to check the configured URI (presumably returns it - confirm against mlyzer):
#mlyzer.get_tracking_uri()


def train(alpha_in, ratio_in):
    """Fit an ElasticNet on the diabetes data and log the run through mlyzer.

    Parameters
    ----------
    alpha_in : value convertible to float, or None. ElasticNet ``alpha``;
        None falls back to 0.05.
    ratio_in : value convertible to float, or None. ElasticNet ``l1_ratio``;
        None falls back to 0.05.

    Returns
    -------
    None.

    Side effects
    ------------
    * Installs a process-wide "ignore" warnings filter and reseeds NumPy's
      global RNG with 40 (this makes the train/test split repeatable).
    * Reads the module-level ``data``, ``X``, ``y`` and ``experiment_name``.
    * Writes ``ElasticNet-paths.png`` to the working directory.
    * Logs parameters, metrics, the fitted model and the plot via mlyzer.
    """
    warnings.filterwarnings("ignore")
    np.random.seed(40)

    # Split the data into training and test sets. (0.75, 0.25) split.
    # Locals are named *_df so they do not shadow this function's own name.
    train_df, test_df = train_test_split(data)

    # The predicted column is "progression" which is a quantitative measure of disease progression one year after baseline
    train_x = train_df.drop(["progression"], axis=1)
    test_x = test_df.drop(["progression"], axis=1)
    train_y = train_df[["progression"]]
    test_y = test_df[["progression"]]

    alpha = float(alpha_in) if alpha_in is not None else 0.05
    l1_ratio = float(ratio_in) if ratio_in is not None else 0.05

    # NOTE(review): `create_expriment` is spelled exactly as the original call
    # had it; the recorded cell output shows the call resolving. Confirm against
    # the mlyzer API before "correcting" the spelling.
    mlyzer.create_expriment(experiment_name)
    mlyzer.clean()
    mlyzer.run()
    try:
        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        lr.fit(train_x, train_y)
        predictions = lr.predict(test_x)
        (rmse, mae, r2) = eval_metrics(test_y, predictions)

        # The path plot uses the module-level raw X / y (the whole dataset),
        # not the train split fitted above.
        path_plot(X, y, l1_ratio, fig_name="ElasticNet-paths.png")

        mlyzer.log_param("alpha", alpha)
        mlyzer.log_param("l1_ratio", l1_ratio)
        mlyzer.log_metric("rmse", rmse)
        mlyzer.log_metric("r2", r2)
        mlyzer.log_metric("mae", mae)
        mlyzer.log_model("model", lr, "sklearn")
        # Log artifacts (output files)
        mlyzer.log_artifact("ElasticNet-paths.png")
    finally:
        # Close the run even when fitting, plotting or logging raises, so a
        # failed attempt does not leave a run open for the next train() call.
        mlyzer.end_run()

in set...


In [5]:
# One tracked run per (alpha, l1_ratio) pair.
# A wider sweep was sketched here earlier (alpha values from 1e-6 up to 1.0
# crossed with l1_ratio values from 1e-6 up to 0.1); add pairs to
# `run_settings` to bring it back.
run_settings = [
    (0.1, 0.01),
    (0.1, 0.1),
    (0.01, 0.01),
]
for run_alpha, run_l1_ratio in run_settings:
    train(run_alpha, run_l1_ratio)

The experiment currently exist! Abort re-creation...
Name: DS__Test
Artifact Location: /opt/artifacts/22
Lifecycle_stage: active
The experiment currently exist! Abort re-creation...
Name: DS__Test
Artifact Location: /opt/artifacts/22
Lifecycle_stage: active
The experiment currently exist! Abort re-creation...
Name: DS__Test
Artifact Location: /opt/artifacts/22
Lifecycle_stage: active


## 4) Model Prediction


In [1]:
# Request body for the scoring endpoint: one row with the ten feature columns.
# The literal single quotes embedded at both ends of the Python string are there
# so the shell treats the whole JSON document as a single --data argument.
# NOTE(review): the feature values look standardized the same way as the training
# features; the raw-unit values are the ones printed further down - confirm.
curl_data = '\'{"columns":["age", "sex", "bmi", "bp", "s1", "s2", "s3", "s4", "s5", "s6"],"data":[[-0.039522, -0.937474, -1.080955,-0.552878, -0.177423, -0.402430, 1.562643, -0.829361, -1.434962, -1.936285]]}\''
# Address the request is sent to.
# NOTE(review): presumably a model server started outside this notebook - confirm.
host = '127.0.0.1'
port = '12355'

# IPython shell escape: {curl_data}, {host} and {port} are interpolated from the
# Python variables above before the command runs. Requires `curl` on PATH.
! curl -X POST -H "Content-Type:application/json; format=pandas-split" --data {curl_data} http://{host}:{port}/invocations
print()
# Reference values printed next to the server's response for a manual comparison.
print("related data is:  age:48  Sex:1  bmi:21.6   bp:87.00  s1:183  s2:103.2  s3:70.0  s4:3.00  s5:3.8918   s6:69 ")
print ("expected value is 75!");

[67.94773994435154]
related data is:  age:48  Sex:1  bmi:21.6   bp:87.00  s1:183  s2:103.2  s3:70.0  s4:3.00  s5:3.8918   s6:69 
expected value is 75!
