In [143]:
import logging

import numpy as np
import pandas as pd
import mlflow

from sklearn.linear_model import LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [122]:
# Read the modelling table from the Excel workbook (path is relative to the
# notebook's working directory) and preview its first rows.

load_path = r"../../files/data/2_end.xlsx"

original_data = pd.read_excel(io=load_path)
original_data.head(n=5)

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,year,trade_deficit,ward_cluster
0,-5.089087,-2.509118,-1.371926,-1.105728,-0.669043,-0.034845,-0.445481,-0.103239,2003,-712.331648,0
1,-4.640566,-1.906677,-0.17839,-0.480414,-0.222851,-0.10415,-0.188241,-0.024866,2004,-851.752241,0
2,-4.124425,-1.509576,1.154137,0.039131,-0.433086,-0.3158,0.329608,0.168554,2005,-953.674088,0
3,-3.653797,-1.937473,1.841681,0.579203,-0.116135,-0.054812,0.475929,0.117863,2006,-888.751877,0
4,-3.10454,0.498026,2.235057,1.444249,-0.316111,-0.007681,-0.312718,-0.250548,2007,-781.766393,0


In [138]:
# Only surface warnings and errors from MLflow's own logger.
logger = logging.getLogger("mlflow")
logger.setLevel(logging.WARNING)

experiment_name = 'trade-deficit-service'

# The tracking URI must be configured BEFORE any experiment lookup/creation;
# otherwise those calls are issued against the default tracking store instead
# of the server below.
mlflow.set_tracking_uri('http://127.0.0.1:8080')

# Create the experiment explicitly (rather than relying on set_experiment)
# so that the tags are attached on first creation.
if mlflow.get_experiment_by_name(experiment_name) is None:
    mlflow.create_experiment(experiment_name, tags={'stage': 'inference', 'version': 'v1'})

# Last expression: activates the experiment and displays it as the cell output.
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/652246872552117148', creation_time=1719418203058, experiment_id='652246872552117148', last_update_time=1719418203058, lifecycle_stage='active', name='trade-deficit-service', tags={'stage': 'inference', 'version': 'v1'}>

In [123]:
# Target: the value to be predicted.
y = original_data['trade_deficit']
cluster_data = original_data['ward_cluster']

# Features: everything except the target and 'year'.
# The workbook also carries the saved row index as an 'Unnamed: 0' column;
# it is not a real predictor and must not be fed to the models, so drop it
# too. errors='ignore' keeps this cell working if that column is absent.
X = (
    original_data
    .drop(columns=['trade_deficit', 'year'])
    .drop(columns=['Unnamed: 0'], errors='ignore')
)

In [144]:
def evaluate_model(model_object, n_repeats, test_size=0.25, model_parameters=None):
    """Score a regressor with repeated stratified hold-out splits and log it to MLflow.

    Uses the notebook-level ``X`` (features, including ``ward_cluster``) and
    ``y`` (target). For each repeat, the data is split with ``random_state=i``,
    stratified on ``ward_cluster``; the cluster column is one-hot encoded
    (first level dropped) with an encoder fitted on the training part only;
    a fresh model is fitted and its R² on the validation part is recorded.

    The mean R² over all repeats and the model parameters are logged to a
    nested MLflow run named after the model class.

    Parameters
    ----------
    model_object : type
        Estimator class (not an instance); instantiated as
        ``model_object(**model_parameters)`` on every repeat.
    n_repeats : int
        Number of train/validation splits to average over. Must be >= 1.
    test_size : float, default 0.25
        Passed through to ``train_test_split``.
    model_parameters : dict, optional
        Keyword arguments for the estimator constructor. ``None`` means no
        arguments.

    Returns
    -------
    float
        Mean validation R² across the repeats.

    Raises
    ------
    ValueError
        If ``n_repeats`` is less than 1.
    """
    # Validate up front: averaging zero scores would silently yield NaN.
    if n_repeats < 1:
        raise ValueError(f"n_repeats must be at least 1, got {n_repeats}")
    # Avoid a shared mutable default argument.
    if model_parameters is None:
        model_parameters = {}

    r2_scores = []
    encoder = OneHotEncoder(drop='first')

    with mlflow.start_run(run_name=model_object.__name__, nested=True):

        for i in range(n_repeats):
            X_train, X_val, y_train, y_val = train_test_split(
                X, y, test_size=test_size, stratify=X['ward_cluster'], random_state=i
            )

            # Fit the encoder on the training split only, then reuse it on the
            # validation split so no validation information leaks into it.
            train_encoded = encoder.fit_transform(X_train[['ward_cluster']])
            val_encoded = encoder.transform(X_val[['ward_cluster']])
            encoded_columns = encoder.get_feature_names_out(['ward_cluster'])

            # Rebuild DataFrames on the original row index so concat aligns rows.
            train_encoded = pd.DataFrame(train_encoded.toarray(), columns=encoded_columns, index=X_train.index)
            val_encoded = pd.DataFrame(val_encoded.toarray(), columns=encoded_columns, index=X_val.index)

            X_train = pd.concat([X_train.drop(columns=['ward_cluster']), train_encoded], axis=1)
            X_val = pd.concat([X_val.drop(columns=['ward_cluster']), val_encoded], axis=1)

            model = model_object(**model_parameters)
            model.fit(X_train, y_train)

            r2_scores.append(r2_score(y_val, model.predict(X_val)))

        mean_r2 = np.mean(r2_scores)

        mlflow.log_params(model_parameters)
        # The value is a mean R² (higher is better), so name the metric accordingly.
        mlflow.log_metric("mean_r2", mean_r2)

    return mean_r2

In [145]:
with mlflow.start_run(run_name="sklearn-models"):
    # (estimator class, constructor keyword arguments), evaluated in order
    # under the single parent run opened above.
    candidate_models = [
        (LinearRegression, {}),
        (KernelRidge, {'alpha': 0.05, 'kernel': 'rbf'}),
        (KernelRidge, {'alpha': 0.7, 'kernel': 'poly', 'degree': 2}),
        (RandomForestRegressor, {'n_estimators': 100, 'random_state': 42}),
        (
            GaussianProcessRegressor,
            {
                'kernel': RBF(length_scale=5.0, length_scale_bounds=(1e-1, 1e2)),
                'alpha': 0.2,
                'n_restarts_optimizer': 5,
            },
        ),
    ]

    for estimator_class, estimator_kwargs in candidate_models:
        score = evaluate_model(model_object=estimator_class, n_repeats=4, model_parameters=estimator_kwargs)
        print(score)

0.8857086830140422
0.8290169994184273
0.5836547864448605
0.5025909254671204
0.8205849063313202
