In [49]:
!python -V

Python 3.9.16


# Data Preparation

In [50]:
import pandas as pd
import seaborn as sns
import matplotlib as plt
import mlflow
import pickle
import os
import numpy as np

In [51]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import LabelEncoder
from category_encoders import TargetEncoder

from sklearn.metrics import mean_squared_error

In [52]:
# Connecting to the AWS S3
os.environ["AWS_PROFILE"] = "default" # fill in with your AWS profile. More info: https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/setup.html#setup-credentials

TRACKING_SERVER_HOST = "ec2-13-251-63-107.ap-southeast-1.compute.amazonaws.com" # fill in with the public DNS of the EC2 instance
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")

# Set the new experiment
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

# Check the experiment
mlflow.search_experiments()

tracking URI: 'http://ec2-13-251-63-107.ap-southeast-1.compute.amazonaws.com:5000'


[<Experiment: artifact_location='s3://mlflow-artifacts-remote-zoomcamp/2', creation_time=1688375544076, experiment_id='2', last_update_time=1688375544076, lifecycle_stage='active', name='my-experiment-1', tags={}>,
 <Experiment: artifact_location='s3://mlflow-artifacts-remote-zoomcamp/1', creation_time=1688358175294, experiment_id='1', last_update_time=1688358175294, lifecycle_stage='active', name='my-experiment-2', tags={}>,
 <Experiment: artifact_location='s3://mlflow-artifacts-remote-zoomcamp/0', creation_time=1688358000094, experiment_id='0', last_update_time=1688358000094, lifecycle_stage='active', name='Default', tags={}>]

In [53]:
# kaggle datasets download -d mchirico/montcoalert

dataset_name = "spotify-and-youtube"
!kaggle datasets download -d salvatorerastelli/{dataset_name}
!mkdir data
!cd data
!mkdir {dataset_name}
!unzip -o {dataset_name}.zip -d /home/ubuntu/mlops_project/data/{dataset_name}/

spotify-and-youtube.zip: Skipping, found more recently modified local copy (use --force to force download)
mkdir: cannot create directory ‘data’: File exists
mkdir: cannot create directory ‘spotify-and-youtube’: File exists
Archive:  spotify-and-youtube.zip
  inflating: data/spotify-and-youtube/Spotify_Youtube.csv  


In [54]:
# Read the CSV file and check
df = pd.read_csv("/home/ubuntu/mlops_project/data/spotify-and-youtube/Spotify_Youtube.csv")

In [55]:
# Reset the index to be starting from 1 not zero
df = df.reset_index(drop=True)
df.index = df.index + 1

# show the table
df.head(2)

Unnamed: 0.1,Unnamed: 0,Artist,Url_spotify,Track,Album,Album_type,Uri,Danceability,Energy,Key,...,Url_youtube,Title,Channel,Views,Likes,Comments,Description,Licensed,official_video,Stream
1,0,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,Feel Good Inc.,Demon Days,album,spotify:track:0d28khcov6AiegSCpG5TuT,0.818,0.705,6.0,...,https://www.youtube.com/watch?v=HyHNuVaZJ-k,Gorillaz - Feel Good Inc. (Official Video),Gorillaz,693555221.0,6220896.0,169907.0,Official HD Video for Gorillaz' fantastic trac...,True,True,1040235000.0
2,1,Gorillaz,https://open.spotify.com/artist/3AA28KZvwAUcZu...,Rhinestone Eyes,Plastic Beach,album,spotify:track:1foMv2HQwfQ2vntFf9HFeG,0.676,0.703,8.0,...,https://www.youtube.com/watch?v=yYDmaexVHic,Gorillaz - Rhinestone Eyes [Storyboard Film] (...,Gorillaz,72011645.0,1079128.0,31003.0,The official video for Gorillaz - Rhinestone E...,True,True,310083700.0


In [56]:
# Rename all the column to be nice to see
df['Trending'] = df.shape[0] + 1 - df['Views'].rank()
df = df.sort_values(by='Trending', ascending=True)

# Drop some unnecessary file
df = df.dropna()
df = df.drop('Url_spotify', axis=1)
df = df.drop('Uri', axis=1)

df.head()

Unnamed: 0.1,Unnamed: 0,Artist,Track,Album,Album_type,Danceability,Energy,Key,Loudness,Speechiness,...,Title,Channel,Views,Likes,Comments,Description,Licensed,official_video,Stream,Trending
1148,1147,Luis Fonsi,Despacito,VIDA,album,0.655,0.797,2.0,-4.787,0.153,...,Luis Fonsi - Despacito ft. Daddy Yankee,LuisFonsiVEVO,8079649000.0,50788652.0,4252791.0,“Despacito” disponible ya en todas las platafo...,True,True,1506598000.0,471.0
366,365,Daddy Yankee,Despacito,VIDA,album,0.655,0.797,2.0,-4.787,0.153,...,Luis Fonsi - Despacito ft. Daddy Yankee,LuisFonsiVEVO,8079647000.0,50788626.0,4252791.0,“Despacito” disponible ya en todas las platafo...,True,True,1506598000.0,472.0
12453,12452,Ed Sheeran,Shape of You,÷ (Deluxe),album,0.825,0.652,1.0,-3.183,0.0802,...,Ed Sheeran - Shape of You (Official Music Video),Ed Sheeran,5908398000.0,31047780.0,1130327.0,The official music video for Ed Sheeran - Shap...,True,True,3362005000.0,473.0
14581,14580,Charlie Puth,See You Again (feat. Charlie Puth),See You Again (feat. Charlie Puth),single,0.689,0.481,10.0,-7.503,0.0815,...,Wiz Khalifa - See You Again ft. Charlie Puth [...,Wiz Khalifa Music,5773798000.0,40147674.0,2127346.0,Download the new Furious 7 Soundtrack Deluxe V...,True,True,1521255000.0,474.0
12470,12469,Wiz Khalifa,See You Again (feat. Charlie Puth),See You Again (feat. Charlie Puth),single,0.689,0.481,10.0,-7.503,0.0815,...,Wiz Khalifa - See You Again ft. Charlie Puth [...,Wiz Khalifa Music,5773797000.0,40147618.0,2127345.0,Download the new Furious 7 Soundtrack Deluxe V...,True,True,1521255000.0,475.0


In [57]:
# Select only numerical columns
numerical_columns = df.select_dtypes(include=[np.number]).columns

# Calculate correlation
correlation = df[numerical_columns].corr()['Trending']
print(correlation)

Unnamed: 0         -0.042535
Danceability       -0.153034
Energy             -0.178070
Key                -0.019986
Loudness           -0.285794
Speechiness         0.029659
Acousticness        0.162451
Instrumentalness    0.226587
Liveness            0.004129
Valence            -0.091374
Tempo              -0.045393
Duration_ms        -0.065395
Views              -0.479455
Likes              -0.488178
Comments           -0.195022
Stream             -0.465537
Trending            1.000000
Name: Trending, dtype: float64


In [58]:
# Split the file into training data and testing data
split_number = int(len(df)/2)

df_train = df[:split_number]
df_val = df[split_number:]

df.head()

Unnamed: 0.1,Unnamed: 0,Artist,Track,Album,Album_type,Danceability,Energy,Key,Loudness,Speechiness,...,Title,Channel,Views,Likes,Comments,Description,Licensed,official_video,Stream,Trending
1148,1147,Luis Fonsi,Despacito,VIDA,album,0.655,0.797,2.0,-4.787,0.153,...,Luis Fonsi - Despacito ft. Daddy Yankee,LuisFonsiVEVO,8079649000.0,50788652.0,4252791.0,“Despacito” disponible ya en todas las platafo...,True,True,1506598000.0,471.0
366,365,Daddy Yankee,Despacito,VIDA,album,0.655,0.797,2.0,-4.787,0.153,...,Luis Fonsi - Despacito ft. Daddy Yankee,LuisFonsiVEVO,8079647000.0,50788626.0,4252791.0,“Despacito” disponible ya en todas las platafo...,True,True,1506598000.0,472.0
12453,12452,Ed Sheeran,Shape of You,÷ (Deluxe),album,0.825,0.652,1.0,-3.183,0.0802,...,Ed Sheeran - Shape of You (Official Music Video),Ed Sheeran,5908398000.0,31047780.0,1130327.0,The official music video for Ed Sheeran - Shap...,True,True,3362005000.0,473.0
14581,14580,Charlie Puth,See You Again (feat. Charlie Puth),See You Again (feat. Charlie Puth),single,0.689,0.481,10.0,-7.503,0.0815,...,Wiz Khalifa - See You Again ft. Charlie Puth [...,Wiz Khalifa Music,5773798000.0,40147674.0,2127346.0,Download the new Furious 7 Soundtrack Deluxe V...,True,True,1521255000.0,474.0
12470,12469,Wiz Khalifa,See You Again (feat. Charlie Puth),See You Again (feat. Charlie Puth),single,0.689,0.481,10.0,-7.503,0.0815,...,Wiz Khalifa - See You Again ft. Charlie Puth [...,Wiz Khalifa Music,5773797000.0,40147618.0,2127345.0,Download the new Furious 7 Soundtrack Deluxe V...,True,True,1521255000.0,475.0


# Training Data 

In [59]:
# Set up for features engineering

categorical = ['Licensed']
numerical = ['Views', 'Likes']

dv = DictVectorizer()

train_dicts = df_train[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

target = 'Trending'
y_train = df_train[target].values
y_val = df_val[target].values

In [60]:
# Test the data using simple method

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

mean_squared_error(y_val, y_pred, squared=False)

9609.787951762835

In [61]:
# Check the range to assess the quality of our result (using percentage)
range = df['Trending'].max() - df['Trending'].min()
print(range)

20238.0


In [62]:
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

# Set the experiment. If the experiment does not exist, it will be created.
mlflow.set_experiment("my-experiment-2")

with mlflow.start_run():
    
    mlflow.set_tag("owner", 'M Irfan')
    
    mlflow.log_param("train-data-path", "/home/ubuntu/mlops_project/data/{dataset_name}.csv")
    
    lr = LinearRegression()
    lr.fit(X_train, y_train)
    
    y_pred = lr.predict(X_val)
    rmse = mean_squared_error(y_val, y_pred, squared = False)
    mlflow.log_metric("rmse", rmse)
    
    mlflow.sklearn.log_model(lr, artifact_path="models")
    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")


default artifacts URI: 's3://mlflow-artifacts-remote-zoomcamp/1/eaf6dcfc50de4d6d996726aeca07b7a2/artifacts'


In [63]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

from datetime import datetime
import mlflow.xgboost
import pickle
import boto3
import s3fs

In [64]:
# Train the data

train = xgb.DMatrix(X_train, label=y_train)
valid = xgb.DMatrix(X_val, label=y_val)

In [65]:
def objective(params):
    fs = s3fs.S3FileSystem(anon=False)  # Add this line

    os.environ["AWS_PROFILE"] = "default"
    TRACKING_SERVER_HOST = "ec2-13-251-63-107.ap-southeast-1.compute.amazonaws.com"
    mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")
    
    current_datetime = datetime.now().strftime('%Y-%m-%d_%H:%M:%S.%f')

    with mlflow.start_run():
        print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")
        
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)
        
        error_percentage = rmse / range * 100
        mlflow.log_metric("error_percentage", error_percentage)

        mlflow.xgboost.save_model(booster, f"/tmp/models_mlflow-{current_datetime}")

        with open(f"/tmp/preprocessor-{current_datetime}.pkl", "wb") as f_out:
            pickle.dump(dv, f_out)

        fs.put(f"/tmp/preprocessor-{current_datetime}.pkl", f"s3://mlflow-artifacts-remote-zoomcamp/mlruns/preprocessor-{current_datetime}.pkl")
        fs.put(f"/tmp/models_mlflow-{current_datetime}", f"s3://mlflow-artifacts-remote-zoomcamp/mlruns/model-{current_datetime}.xgb")

        mlflow.log_artifact(f"/tmp/preprocessor-{current_datetime}.pkl", artifact_path="preprocessor")
        mlflow.log_artifact(f"/tmp/models_mlflow-{current_datetime}", artifact_path="models_mlflow")

        print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")

    return {'loss': rmse, 'status': STATUS_OK}

In [66]:
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 200, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -6, -3),
    'objective': 'reg:squarederror',
    'seed': 42,
    'verbosity': 3
}

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=3,
    trials=Trials()
)


default artifacts URI: 's3://mlflow-artifacts-remote-zoomcamp/1/fd980da492374593813ee94a068835b8/artifacts'
[13:02:57] DEBUG: ../src/gbm/gbtree.cc:157: Using tree method: 2                
[13:02:57] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 128 extra nodes, 0 pruned nodes, max_depth=8
[0]	validation-rmse:12478.91826                                                 
[13:02:57] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 124 extra nodes, 0 pruned nodes, max_depth=8
[1]	validation-rmse:10302.12759                                                 
[13:02:57] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 122 extra nodes, 0 pruned nodes, max_depth=8
[2]	validation-rmse:8854.06445                                                  
[13:02:58] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 130 extra nodes, 0 pruned nodes, max_depth=9
[3]	validation-rmse:7876.78021                                                  
[13:02:58] INFO: ../src/tree/update

[13:03:03] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 1102 extra nodes, 0 pruned nodes, max_depth=28
[82]	validation-rmse:5853.54880                                                 
[13:03:03] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 1158 extra nodes, 0 pruned nodes, max_depth=53
[83]	validation-rmse:5853.54887                                                 
[13:03:03] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 992 extra nodes, 0 pruned nodes, max_depth=27
[84]	validation-rmse:5853.54889                                                 
[13:03:03] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 1050 extra nodes, 0 pruned nodes, max_depth=57
[85]	validation-rmse:5853.54881                                                 
[13:03:03] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 1052 extra nodes, 0 pruned nodes, max_depth=60
[86]	validation-rmse:5853.54881                                                 
[13:03:03] INFO: ../s

[45]	validation-rmse:5853.59450                                                 
[13:03:16] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 1002 extra nodes, 0 pruned nodes, max_depth=43
[46]	validation-rmse:5853.59447                                                 
[13:03:16] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 878 extra nodes, 0 pruned nodes, max_depth=28
[47]	validation-rmse:5853.59447                                                 
[13:03:17] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 1004 extra nodes, 0 pruned nodes, max_depth=52
[48]	validation-rmse:5853.59447                                                 
[13:03:17] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 1000 extra nodes, 0 pruned nodes, max_depth=26
[49]	validation-rmse:5853.59447                                                 
[13:03:17] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 912 extra nodes, 0 pruned nodes, max_depth=30
[50]	validation-rmse:5

[87]	validation-rmse:5853.59401                                                 
[13:03:18] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 512 extra nodes, 0 pruned nodes, max_depth=23
[88]	validation-rmse:5853.59400                                                 
[13:03:18] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 584 extra nodes, 0 pruned nodes, max_depth=27
[89]	validation-rmse:5853.59400                                                 
[13:03:18] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 592 extra nodes, 0 pruned nodes, max_depth=30
[90]	validation-rmse:5853.59400                                                 
[13:03:18] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 554 extra nodes, 0 pruned nodes, max_depth=31
[91]	validation-rmse:5853.59400                                                 
[13:03:18] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 470 extra nodes, 0 pruned nodes, max_depth=22
[92]	validation-rmse:5853

[171]	validation-rmse:5853.59341                                                
[13:03:21] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 348 extra nodes, 0 pruned nodes, max_depth=19
[172]	validation-rmse:5853.59341                                                
[13:03:21] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 300 extra nodes, 0 pruned nodes, max_depth=18
[173]	validation-rmse:5853.59341                                                
[13:03:21] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 272 extra nodes, 0 pruned nodes, max_depth=20
[174]	validation-rmse:5853.59341                                                
[13:03:21] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 348 extra nodes, 0 pruned nodes, max_depth=15
[175]	validation-rmse:5853.59341                                                
[13:03:21] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 232 extra nodes, 0 pruned nodes, max_depth=19
[176]	validation-rmse:585

[13:03:24] PrunerUpdate: 1.91147s, 252 calls @ 1911467us                        

default artifacts URI: 's3://mlflow-artifacts-remote-zoomcamp/1/071c02534e2041be8c2a66f7e531d3ff/artifacts'
[13:03:28] Configure: 0.000671s, 1 calls @ 671us                                

default artifacts URI: 's3://mlflow-artifacts-remote-zoomcamp/1/23b9a96a9ea246e388e77471b39383bd/artifacts'
[13:03:29] DEBUG: ../src/gbm/gbtree.cc:157: Using tree method: 2                
[13:03:29] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 104 extra nodes, 0 pruned nodes, max_depth=8
[0]	validation-rmse:13870.12698                                                 
[13:03:29] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 104 extra nodes, 0 pruned nodes, max_depth=8
[1]	validation-rmse:12376.97491                                                 
[13:03:29] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 100 extra nodes, 0 pruned nodes, max_depth=7
[2]	validation-rmse:11173.17138       

[79]	validation-rmse:5853.28084                                                 
[13:03:33] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 4824 extra nodes, 0 pruned nodes, max_depth=69
[80]	validation-rmse:5853.28009                                                 
[13:03:33] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 4272 extra nodes, 0 pruned nodes, max_depth=84
[81]	validation-rmse:5853.28002                                                 
[13:03:33] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 4416 extra nodes, 0 pruned nodes, max_depth=83
[82]	validation-rmse:5853.27965                                                 
[13:03:34] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 4002 extra nodes, 0 pruned nodes, max_depth=72
[83]	validation-rmse:5853.27965                                                 
[13:03:34] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 4232 extra nodes, 0 pruned nodes, max_depth=84
[84]	validation-rmse

[162]	validation-rmse:5853.27921                                                
[13:03:39] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 830 extra nodes, 0 pruned nodes, max_depth=32
[163]	validation-rmse:5853.27921                                                
[13:03:39] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 830 extra nodes, 0 pruned nodes, max_depth=32
[164]	validation-rmse:5853.27921                                                
[13:03:39] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 830 extra nodes, 0 pruned nodes, max_depth=32
[165]	validation-rmse:5853.27921                                                
[13:03:39] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 830 extra nodes, 0 pruned nodes, max_depth=32
[166]	validation-rmse:5853.27921                                                
[13:03:39] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 830 extra nodes, 0 pruned nodes, max_depth=32
[167]	validation-rmse:585

In [67]:
# Test using the best result

# Make sure AWS credentials are set properly
# Either using environment variables or ~/.aws/credentials
os.environ["AWS_PROFILE"] = "default" # fill in with your AWS profile. More info: https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/setup.html#setup-credentials

TRACKING_SERVER_HOST = "ec2-13-251-63-107.ap-southeast-1.compute.amazonaws.com" # fill in with the public DNS of the EC2 instance

# create a connection to S3
fs = s3fs.S3FileSystem()

# use a local tracking server
mlflow.set_tracking_uri("file:///tmp/mlruns")

with mlflow.start_run(experiment_id="0"):
    
    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    best_params = {
        'learning_rate': 0.09585355369315604,
        'max_depth': 30,
        'min_child_weight': 1.060597050922164,
        'objective': 'reg:squarederror',
        'reg_alpha': 0.018060244040060163,
        'reg_lambda': 0.011658731377413597,
        'seed': 42
    }

    mlflow.log_params(best_params)

    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )

    y_pred = booster.predict(valid)
    rmse = mean_squared_error(y_val, y_pred, squared=False)
    mlflow.log_metric("rmse", rmse)
    
    error_percentage = rmse / range * 100
    mlflow.log_metric("error_percentage", error_percentage)

    # Save model locally
    mlflow.xgboost.save_model(booster, "/tmp/models_mlflow")

    # Save the preprocessor locally
    with open("/tmp/preprocessor.pkl", "wb") as f_out:
        pickle.dump(dv, f_out)

    # Upload model and preprocessor to S3
    fs.put("/tmp/preprocessor.pkl", "s3://mlflow-artifacts-remote-zoomcamp/mlruns/preprocessor.pkl")
    fs.put("/tmp/models_mlflow", "s3://mlflow-artifacts-remote-zoomcamp/mlruns/model.xgb")

    # Log artifacts paths to mlflow
    mlflow.log_artifact("/tmp/preprocessor2.pkl", artifact_path="preprocessor")
    mlflow.log_artifact("/tmp/models_mlflow2", artifact_path="models_mlflow")
    
    
    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")

[13:03:44] DEBUG: ../src/gbm/gbtree.cc:157: Using tree method: 2
[13:03:44] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 208 extra nodes, 0 pruned nodes, max_depth=9
[0]	validation-rmse:14732.65596
[13:03:44] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 216 extra nodes, 0 pruned nodes, max_depth=9
[1]	validation-rmse:13859.59647
[13:03:44] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 208 extra nodes, 0 pruned nodes, max_depth=9
[2]	validation-rmse:13074.57000
[13:03:44] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 210 extra nodes, 0 pruned nodes, max_depth=9
[3]	validation-rmse:12363.43208
[13:03:44] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 212 extra nodes, 0 pruned nodes, max_depth=9
[4]	validation-rmse:11725.20722
[13:03:44] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 214 extra nodes, 0 pruned nodes, max_depth=9
[5]	validation-rmse:11146.79106
[13:03:44] INFO: ../src/tree/updater_prune.cc:98: tree pruning en

[13:03:45] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 2600 extra nodes, 0 pruned nodes, max_depth=22
[56]	validation-rmse:5886.39528
[13:03:45] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 2758 extra nodes, 0 pruned nodes, max_depth=21
[57]	validation-rmse:5883.30861
[13:03:45] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 3138 extra nodes, 0 pruned nodes, max_depth=26
[58]	validation-rmse:5880.60854
[13:03:45] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 3444 extra nodes, 0 pruned nodes, max_depth=28
[59]	validation-rmse:5878.07528
[13:03:45] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 3802 extra nodes, 0 pruned nodes, max_depth=29
[60]	validation-rmse:5875.81348
[13:03:45] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 3900 extra nodes, 0 pruned nodes, max_depth=27
[61]	validation-rmse:5873.73859
[13:03:45] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 4228 extra nodes, 0 pruned nodes, max_depth=30
[62

[13:03:48] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 3700 extra nodes, 0 pruned nodes, max_depth=30
[112]	validation-rmse:5853.82644
[13:03:48] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 3388 extra nodes, 0 pruned nodes, max_depth=30
[113]	validation-rmse:5853.82005
[13:03:48] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 3872 extra nodes, 0 pruned nodes, max_depth=30
[114]	validation-rmse:5853.80981
[13:03:48] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 3466 extra nodes, 0 pruned nodes, max_depth=30
[115]	validation-rmse:5853.80587
[13:03:48] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 3362 extra nodes, 0 pruned nodes, max_depth=30
[116]	validation-rmse:5853.79252
[13:03:48] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 3498 extra nodes, 0 pruned nodes, max_depth=30
[117]	validation-rmse:5853.78993
[13:03:48] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 3306 extra nodes, 0 pruned nodes, max_depth=

[13:03:50] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 1438 extra nodes, 0 pruned nodes, max_depth=30
[168]	validation-rmse:5853.68351
[13:03:50] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 1494 extra nodes, 0 pruned nodes, max_depth=30
[169]	validation-rmse:5853.68351
[13:03:50] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 1854 extra nodes, 0 pruned nodes, max_depth=30
[170]	validation-rmse:5853.68351
[13:03:50] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 2188 extra nodes, 0 pruned nodes, max_depth=30
[171]	validation-rmse:5853.68349
[13:03:50] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 1240 extra nodes, 0 pruned nodes, max_depth=30
[172]	validation-rmse:5853.68349
[13:03:50] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 1576 extra nodes, 0 pruned nodes, max_depth=30
[173]	validation-rmse:5853.68349
[13:03:50] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 2024 extra nodes, 0 pruned nodes, max_depth=

[13:03:52] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 882 extra nodes, 0 pruned nodes, max_depth=30
[224]	validation-rmse:5853.67230
[13:03:52] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 1728 extra nodes, 0 pruned nodes, max_depth=30
[225]	validation-rmse:5853.67231
[13:03:52] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 1254 extra nodes, 0 pruned nodes, max_depth=30
[226]	validation-rmse:5853.67152
[13:03:52] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 768 extra nodes, 0 pruned nodes, max_depth=30
[227]	validation-rmse:5853.67112
[13:03:52] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 1192 extra nodes, 0 pruned nodes, max_depth=30
[228]	validation-rmse:5853.67112
[13:03:52] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 998 extra nodes, 0 pruned nodes, max_depth=30
[229]	validation-rmse:5853.67112
[13:03:52] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 1054 extra nodes, 0 pruned nodes, max_depth=30


[13:03:54] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 1726 extra nodes, 0 pruned nodes, max_depth=30
[280]	validation-rmse:5853.66796
[13:03:55] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 2310 extra nodes, 0 pruned nodes, max_depth=30
[281]	validation-rmse:5853.66796
[13:03:55] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 1296 extra nodes, 0 pruned nodes, max_depth=30
[282]	validation-rmse:5853.66796
[13:03:55] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 1496 extra nodes, 0 pruned nodes, max_depth=30
[283]	validation-rmse:5853.66796
[13:03:55] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 1878 extra nodes, 0 pruned nodes, max_depth=30
[284]	validation-rmse:5853.66796
[13:03:55] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 1658 extra nodes, 0 pruned nodes, max_depth=30
[285]	validation-rmse:5853.66797
[13:03:55] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 1522 extra nodes, 0 pruned nodes, max_depth=

[335]	validation-rmse:5853.66790
[13:03:57] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 1694 extra nodes, 0 pruned nodes, max_depth=30
[336]	validation-rmse:5853.66790
[13:03:57] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 1522 extra nodes, 0 pruned nodes, max_depth=30
[337]	validation-rmse:5853.66790
[13:03:57] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 1636 extra nodes, 0 pruned nodes, max_depth=30
[338]	validation-rmse:5853.66790
[13:03:57] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 1402 extra nodes, 0 pruned nodes, max_depth=30
[339]	validation-rmse:5853.66790
[13:03:57] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 822 extra nodes, 0 pruned nodes, max_depth=30
[340]	validation-rmse:5853.66790
[13:03:57] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 786 extra nodes, 0 pruned nodes, max_depth=30
[341]	validation-rmse:5853.66790
[13:03:57] INFO: ../src/tree/updater_prune.cc:98: tree pruning end, 1386 extra no

MlflowException: Path '/tmp/models_mlflow' already exists and is not empty

# Model Registry

In [None]:
from mlflow.tracking import MlflowClient

MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

## Interacting with the MLflow tracking server¶
The MlflowClient object allows us to interact with...

an MLflow Tracking Server that creates and manages experiments and runs.
an MLflow Registry Server that creates and manages registered models and model versions.
To instantiate it we need to pass a tracking URI and/or a registry URI

In [None]:
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

client.list_experiments()

In [None]:
for run in runs:
    print(f"run id: {run.info.run_id}, rmse: {run.data.metrics['rmse']:.4f}")

### Interacting with the Model Registry

In this section We will use the `MlflowClient` instance to:

1. Register a new version for the experiment `nyc-taxi-regressor`
2. Retrieve the latests versions of the model `nyc-taxi-regressor` and check that a new version `4` was created.
3. Transition the version `4` to "Staging" and adding annotations to it.

In [None]:
run_id = "b8904012c84343b5bf8ee72aa8f0f402"
model_uri = f"runs:/{run_id}/model"
mlflow.register_model(model_uri=model_uri, name="nyc-taxi-regressor")

In [None]:
model_name = "nyc-taxi-regressor"
latest_versions = client.get_latest_versions(name=model_name)

for version in latest_versions:
    print(f"version: {version.version}, stage: {version.current_stage}")

In [None]:
model_version = 4
new_stage = "Staging"
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False
)

In [None]:
from datetime import datetime

date = datetime.today().date()
client.update_model_version(
    name=model_name,
    version=model_version,
    description=f"The model version {model_version} was transitioned to {new_stage} on {date}"
)

In [None]:
%time test_model(name=model_name, stage="Production", X_test=X_test, y_test=y_test)

In [None]:
client.transition_model_version_stage(
    name=model_name,
    version=4,
    stage="Production",
    archive_existing_versions=True
)