In [12]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
import mlflow
import mlflow.sklearn
import logging

In [13]:
# Configure the root logger once for the notebook: WARNING and above only.
logging.basicConfig(level=logging.WARNING)
# Logger used by the cells below to report problems (e.g. a missing data file).
logger = logging.getLogger(__name__)

In [41]:
def eval_metrics(actual, pred):
    """Score regression predictions against the ground truth.

    Args:
        actual: Observed target values.
        pred: Predicted target values, aligned with ``actual``.

    Returns:
        A ``(rmse, mae, r2)`` tuple:
        - rmse: root mean square error
          (https://en.wikipedia.org/wiki/Root-mean-square_deviation)
        - mae: mean absolute error
          (https://en.wikipedia.org/wiki/Mean_absolute_error)
        - r2: coefficient of determination
          (https://en.wikipedia.org/wiki/Coefficient_of_determination)
    """
    # RMSE is the square root of the mean squared error.
    squared_error = mean_squared_error(actual, pred)
    scores = (
        np.sqrt(squared_error),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )
    return scores


# Seed numpy's global RNG for reproducibility. train_test_split below is called
# without random_state, so it draws from this global state.
np.random.seed(40)

# read the csv file from the local directory (needs to have been imported / pulled with DVC)
try:
    data = pd.read_csv("winequality-red.csv", sep=",")
except Exception as e:
    logger.exception(
        "Unable to find CSV file! Have you imported it with DVC? Error: %s", e
    )
    # Re-raise instead of calling exit(): inside a notebook kernel exit() is the
    # interactive-shell helper and is not guaranteed to stop the current cell,
    # which would let the code below run with `data` undefined.
    raise

# split the data into training and test sets (0.75 and 0.25, the train_test_split defaults)
train, test = train_test_split(data)

# the column to be predicted is "quality" which is a scalar from [3, 9]
# features: every column except the target
train_x = train.drop(["quality"], axis=1)
test_x = test.drop(["quality"], axis=1)
# target: kept as a single-column DataFrame
train_y = train[["quality"]]
test_y = test[["quality"]]

# start training loop to test 10 different hyperparameter settings
for x in range(10):

    # set hyperparameters based on loop variable (0.03 to 0.93 in steps of 0.1);
    # both hyperparameters are varied together
    alpha = 0.03 + x / 10
    l1_ratio = 0.03 + x / 10

    # One MLflow run per hyperparameter setting. The context manager ends the run
    # when the block exits, so no explicit mlflow.end_run() is needed.
    with mlflow.start_run() as run:
        # instantiate the model
        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        # train the model with the training data
        lr.fit(train_x, train_y)
        # evaluate the model on the test data
        predicted_qualities = lr.predict(test_x)
        # calculate evaluation metrics based on prediction results
        (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

        print("Training loop #%s" % (x + 1))
        print("  Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
        print("    RMSE: %s" % rmse)
        print("    MAE: %s" % mae)
        print("    R2: %s" % r2)

        # log the two experiment params and the three evaluation metrics using mlflow
        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", l1_ratio)

        mlflow.log_metric("RMSE", rmse)
        mlflow.log_metric("MAE", mae)
        mlflow.log_metric("R2", r2)

        # store the model using mlflow while the run is still active; the
        # directory name carries the run id so each run gets its own model folder
        mlflow.sklearn.save_model(lr, f"model/ElasticNet_{run.info.run_id}")

Training loop #1
  Elasticnet model (alpha=0.030000, l1_ratio=0.030000):
    RMSE: 0.6870790513828099
    MAE: 0.5257435863368343
    R2: 0.3311216423802178
Training loop #2
  Elasticnet model (alpha=0.130000, l1_ratio=0.130000):
    RMSE: 0.7205795852258078
    MAE: 0.5530587346723914
    R2: 0.2643052833492303
Training loop #3
  Elasticnet model (alpha=0.230000, l1_ratio=0.230000):
    RMSE: 0.7360863848819698
    MAE: 0.5669847346269457
    R2: 0.2323004271777731
Training loop #4
  Elasticnet model (alpha=0.330000, l1_ratio=0.330000):
    RMSE: 0.7491789819122071
    MAE: 0.5807029026707362
    R2: 0.20474776389215965
Training loop #5
  Elasticnet model (alpha=0.430000, l1_ratio=0.430000):
    RMSE: 0.7730252345842651
    MAE: 0.6057393726791199
    R2: 0.1533165480442663
Training loop #6
  Elasticnet model (alpha=0.530000, l1_ratio=0.530000):
    RMSE: 0.8036662230102831
    MAE: 0.6381671157130895
    R2: 0.08486500937605623
Training loop #7
  Elasticnet model (alpha=0.630000, l1_