In [1]:
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml import MLClient

try:
    credential = DefaultAzureCredential()
    # Check if given credential can get token successfully.
    credential.get_token("https://management.azure.com/.default")
except Exception as ex:
    # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential not work
    credential = InteractiveBrowserCredential()

ml_client = MLClient.from_config(credential=credential)

Found the config file in: /config.json


In [52]:
%%writefile env.yml
name: mlflow-env
channels:
  - conda-forge
dependencies:
  - python=3.10
  - pip
  - pip:
    - numpy
    - pandas
    - scikit-learn
    - matplotlib
    - mlflow
    - azureml-mlflow
    - pmdarima
    - prophet
    - stats_can
    - openpyxl

Overwriting env.yml


In [53]:
from azure.ai.ml.entities import Environment

my_env = Environment(
    image="mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04",
    conda_file="./env.yml",
    name="time-series-backtest",
    description = "Environment for time series model backtesting."
)

ml_client.environments.create_or_update(my_env)

Environment({'intellectual_property': None, 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'time-series-backtest', 'description': 'Environment for time series model backtesting.', 'tags': {}, 'properties': {'azureml.labels': 'latest'}, 'print_as_yaml': True, 'id': '/subscriptions/ba7684dd-6ce0-49a1-9eab-54c65c60d6e1/resourceGroups/rg-dp100-ldc63a064cfb7438089/providers/Microsoft.MachineLearningServices/workspaces/mlw-dp100-ldc63a064cfb7438089/environments/time-series-backtest/versions/4', 'Resource__source_path': None, 'base_path': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/cidc63a064cfb7438089/code/Users/Gkaube', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x7f4052bd2f50>, 'serialize': <msrest.serialization.Serializer object at 0x7f4052bd2e60>, 'version': '4', 'latest_version': None, 'conda_file': {'channels': ['conda-forge'], 'dependencies': ['python=3.10', 'pip', {'pip': ['numpy', 'pandas', 'scikit-l

In [6]:
%%writefile src/cv_test.py
from samplemodels import ARIMAModel
from timeseriescrossvalidation import TimeSeriesCrossValidator
from pipeline import Pipeline
from utils import initialize_directories
import argparse
import pandas as pd
import mlflow
import numpy as np
import matplotlib.pyplot as plt

def main(args):
    initialize_directories()

    path = args.input_data_path
    order = tuple(args.order)
    features = args.features
    pca_comps = args.pca_comps
    
    run_desc = "Back testing ARIMA{}, with {} pca components with features: {}".format(order, pca_comps, features)
    
    with mlflow.start_run(description=run_desc):
        data = load_data(path=path)
        data_clean = prep_data(data)
        data_clean.index = pd.DatetimeIndex(data_clean['ds'])
        data_clean.to_csv(path + "/trainv1.csv")
        mlflow.log_artifact(path + "/trainv1.csv", artifact_path="data")

        # y values that are 0.00 have just not been observed yet
        data_clean = data_clean[data_clean['y'] != 0.00]
        
        
        mlflow.log_param("Features used for training", features)

        cross_validation(data_clean,
                         order = order,
                         columns = features,
                         pca_comps = pca_comps)

        mlflow.set_tag("Training Info", "Cross validation")

        X_train, y_train = split_data(data_clean, features)

        # Apply PCA to the data
        X_transformed = dim_recuction(X_train, pca_comps)

        X_transformed.to_csv(path + "/X_transformed.csv")
        y_train.to_csv(path + "/y_train.csv")
        mlflow.log_artifact(path + "/X_transformed.csv", artifact_path="data")
        mlflow.log_artifact(path + "/y_train.csv", artifact_path="data")

        trained_model = train_model(X_transformed, y_train, order)
        # Log the model
        log_model(trained_model, X_transformed, y_train)


def cross_validation(data_set, 
                     order:tuple,
                     columns:list,
                     pca_comps:int = 0
                    ):
    
    cv = TimeSeriesCrossValidator(ARIMAModel(order=order), fold_size=4, order=order, with_intercept=True)
    
    
    if pca_comps > 0:
        cv.principal_components(n_components=2)
    
    
    # Subset data_set
    train_modified = data_set[columns +['y','ds']]
    
    # Run the backtest and print metrics
    cv.fit(train_modified, start=33, end = 55)
    cv.print_metrics()
    
    # Log parameters and metrics
    mlflow.log_params(cv.model.model.get_params())
    mlflow.log_metric("Train MSE", np.mean(cv.train_mse))
    mlflow.log_metric("MSE", np.mean(cv.mse))
    mlflow.log_metric("RMSE", np.mean(np.sqrt(cv.mse)))
    mlflow.log_metric("MAE", np.mean(cv.mae))
    mlflow.log_metric("MAPE", np.mean(cv.mape))

    # Log training performance
    train_mse_per_fold = {f"Fold {i+1}": j for i, j in enumerate(cv.train_mse)}
    mlflow.log_dict(train_mse_per_fold, "train_mse_per_fold.json")

    # Do some plotting
    plot_metric_per_fold("RMSE", np.sqrt(cv.mse))
    plot_metric_per_fold("MAPE", cv.mape)
    

def plot_metric_per_fold(metric_name, metric:list):
    metric_per_fold = {f"Fold {i+1}": j for i, j in enumerate(metric)}
    fold_list = metric_per_fold.keys()
    metrics = metric_per_fold.values()
    fig=plt.figure(figsize=(6, 4))
    plt.bar(x=fold_list, height=metrics)
    plt.title(f"{metric_name} per fold")
    plt.ylabel(f"{metric_name}")
    plt.xlabel("Fold")
    
    plot_name = f"{metric_name}_per_fold.png"
    plt.savefig(plot_name) 
    mlflow.log_artifact(plot_name)


def load_data(path="./data"):
    data_pipeline = Pipeline(data_path=path)
    data = data_pipeline.run()
    return data


def prep_data(df:pd.DataFrame):
    df = df.dropna()
    df.columns = map(str.lower, df.columns)
    df.columns = df.columns.str.replace(' ', '_')
    return df[df.sales != 0.00].reset_index(drop=True)


def train_model(X, y, order):
    import pmdarima as pm
    model = pm.ARIMA(order=order, with_intercept=True)
    model.fit(y, X)
    return model
    

def log_model(model, X, y):
    from mlflow.models.signature import infer_signature
    # Get insample predictions
    predictions = model.predict_in_sample(X)

    signature = infer_signature(y, predictions)
    mlflow.pmdarima.log_model(model, "model", signature=signature)


def split_data(data, features):
    X = data[features]
    y = data['y']
    return X, y


def dim_recuction(X, n_comps=2):
    from sklearn.decomposition import PCA
    pca = PCA(n_components=n_comps)
    
    X_transformed = pca.fit_transform(X)
    X_transformed_df = pd.DataFrame(X_transformed, index=pd.to_datetime(X.index))

    return X_transformed_df


def parse_args():
    parser = argparse.ArgumentParser()
    
    parser.add_argument("--data_path", dest = "input_data_path", default = "./data", type=str)
    parser.add_argument("--order", action="append", dest = "order", type=int)
    parser.add_argument("--features", action="append", dest = "features")
    parser.add_argument("--pca_comps", dest = "pca_comps", type=int, default=2)
    
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = parse_args()
    main(args)

Overwriting src/cv_test.py


In [8]:
from azure.ai.ml import command
from azure.ai.ml import Input
from azure.ai.ml.constants import AssetTypes

inputs = {
    "data_path": Input(type=AssetTypes.URI_FOLDER, path="train-data:version", mode= 'download'),
    "ar": 3,
    "I": 1,
    "ma": 0,
    "feature1": "feature1", 
    "feature2":"feature2"
}

job = command(
    code = "./src",
    command = "python cv_test.py --data_path ${{inputs.data_path}} --order ${{inputs.ar}} --order ${{inputs.I}} --order ${{inputs.ma}} --features ${{inputs.feature1}} --features ${{inputs.feature2}}",
    inputs = inputs,
    environment="time-series-backtest:4",
    compute = "aml-cluster",
    display_name= "ts-cv",
    experiment_name="timeseries-cv"
)

job_results = ml_client.create_or_update(job)


[32mUploading src (0.05 MBs): 100%|██████████| 51818/51818 [00:00<00:00, 376405.10it/s]
[39m



In [7]:
from azure.ai.ml.entities import Model
job_name = job_results.name
print(job_name)

# Register the model
saved_model = Model(
    path=f"azureml://jobs/{job_name}/outputs/artifacts/paths/model/",
    name = "Arima_3_1_0",
    description= "Arima(3,1,0) trained with pca features",
    type=AssetTypes.MLFLOW_MODEL,
)

ml_client.models.create_or_update(saved_model)

musing_arch_sgkgrz8v7r


ResourceExistsError: (UserError) Conflict
Code: UserError
Message: Conflict
Exception Details:	(ModelVersionInUse) Model Arima_3_1_0:2 already exists. Please use a different name or version.
	Code: ModelVersionInUse
	Message: Model Arima_3_1_0:2 already exists. Please use a different name or version.