In [2]:
# import packages
import numpy as py
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error, r2_score, mean_absolute_error, mean_squared_error

# Data Preparation

In [3]:
# load data
data = load_diabetes()
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = pd.Series(data.target)
df.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0


In [4]:
# check types of columns
df.dtypes

age       float64
sex       float64
bmi       float64
bp        float64
s1        float64
s2        float64
s3        float64
s4        float64
s5        float64
s6        float64
target    float64
dtype: object

# Model Training

In [5]:
def pre_process(df: pd.DataFrame):
    df_total = df.fillna(df.mean())
    return df_total


# Model building
# split data into train and test
def split_data(df_total, target_column):
    X, y = df_total.drop(target_column, axis=1), df_total[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    return X_train, X_test, y_train, y_test


# train a model
def train_model(X_train: np.ndarray, y_train: np.ndarray):
    model = LinearRegression()
    model.fit(X_train, y_train)
    return model


# model evaluation
def compute_rmsle(y_test: np.ndarray, y_pred: np.ndarray, precision: int = 2) -> float:
    rmsle = np.sqrt(mean_squared_log_error(y_test, y_pred))
    return round(rmsle, precision)


def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2


# Inference
def make_prediction(model, df):
    prediction = model.predict(df)
    prediction = pd.DataFrame(prediction)
    return prediction

In [6]:
df_total = pre_process(df)
df_total.head()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,target
0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
2,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0
3,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0


In [7]:
X_train, X_test, y_train, y_test = split_data(df_total, 'target')
model = train_model(X_train, y_train)
y_pred = make_prediction(model, X_test)
y_pred

Unnamed: 0,0
0,86.517682
1,83.144139
2,208.501659
3,156.543551
4,48.152064
...,...
84,123.631094
85,178.828613
86,296.505189
87,200.993711


In [8]:
rmse, mae, r2 = eval_metrics(y_test, y_pred)
rmse, mae, r2

(51.828211628520265, 42.02342622963727, 0.5265959971896882)

# MLflow

In [9]:
from urllib.parse import urlparse
import mlflow
import mlflow.sklearn
import sys

import logging
logging.basicConfig(level=logging.WARN)
logger = logging.getLogger(__name__)

#alpha = float(sys.argv[1]) if len(sys.argv) > 1 else 0.5
#l1_ratio = float(sys.argv[2]) if len(sys.argv) > 2 else 0.5

with mlflow.start_run():
    df_total = pre_process(df)
    X_train, X_test, y_train, y_test = split_data(df_total, 'target')
    model = train_model(X_train, y_train)
    y_pred = make_prediction(model, X_test)
    rmse, mae, r2 = eval_metrics(y_test, y_pred)

    #print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
    print("  RMSE: %s" % rmse)
    print("  MAE: %s" % mae)
    print("  R2: %s" % r2)

    #mlflow.log_param("alpha", alpha)
    #mlflow.log_param("l1_ratio", l1_ratio)
    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

    # Model registry does not work with file store
    if tracking_url_type_store != "file":

        # Register the model
        # There are other ways to use the Model Registry, which depends on the use case,
        # please refer to the doc for more information:
        # https://mlflow.org/docs/latest/model-registry.html#api-workflow
        mlflow.sklearn.log_model(model, "model", registered_model_name="ElasticnetWineModel")
    else:
        mlflow.sklearn.log_model(model, "model")

  RMSE: 58.60056828945725
  MAE: 46.801057743460525
  R2: 0.4694490737953928


In [10]:
# !mlflow ui
# link: http://localhost:5000

In [23]:
# pickling the Model
import pickle
file = open('Patient_Prediction.pkl', 'wb')
pickle.dump(model, file)

In [24]:
loaded_model=pickle.load(open('Patient_Prediction.pkl','rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.4694490737953928
