In [1]:
!python -V

Python 3.9.16


# Data Preparation

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib as plt
import mlflow
import pickle
import os
import numpy as np

In [3]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.preprocessing import LabelEncoder
from category_encoders import TargetEncoder

from sklearn.metrics import mean_squared_error

In [4]:
# AWS credentials profile to use (the notebook later uploads artifacts to S3)
os.environ["AWS_PROFILE"] = "default" # fill in with your AWS profile. More info: https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/setup.html#setup-credentials

# Point MLflow at the remote tracking server running on the EC2 instance
TRACKING_SERVER_HOST = "ec2-13-251-63-107.ap-southeast-1.compute.amazonaws.com" # fill in with the public DNS of the EC2 instance
tracking_uri = f"http://{TRACKING_SERVER_HOST}:5000"
mlflow.set_tracking_uri(tracking_uri)

# Show which tracking URI is now active
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

# List the experiments known to the tracking server (also verifies the connection)
mlflow.search_experiments()

In [5]:
# kaggle datasets download -d mchirico/montcoalert

dataset_name = "spotify-and-youtube"
!kaggle datasets download -d salvatorerastelli/{dataset_name}
!mkdir data
!cd data
!mkdir {dataset_name}
!unzip -o {dataset_name}.zip -d data/{dataset_name}/

In [6]:
# Load the unpacked dataset CSV into a DataFrame
csv_path = "data/spotify-and-youtube/Spotify_Youtube.csv"
df = pd.read_csv(csv_path)

In [7]:
# Give the frame a fresh 1-based index (1..N instead of 0..N-1)
df = df.reset_index(drop=True)
df.index = pd.RangeIndex(start=1, stop=len(df) + 1)

# Preview the first two rows
df.head(2)

In [8]:
# Build the 'Trending' column from the view counts: the most-viewed row gets the
# smallest value. Ranks are computed before rows with missing values are removed.
view_rank = df['Views'].rank()
df['Trending'] = df.shape[0] + 1 - view_rank
df = df.sort_values(by='Trending', ascending=True)

# Remove rows containing missing values, then the Spotify URL / URI columns
df = df.dropna().drop(columns=['Url_spotify', 'Uri'])

df.head()

In [9]:
# Correlation of every numeric column with the 'Trending' column
numerical_columns = df.select_dtypes(include=[np.number]).columns
numeric_df = df.loc[:, numerical_columns]

correlation = numeric_df.corr()['Trending']
print(correlation)

In [10]:
# Split the rows 50/50 into a training set and a validation set.
# df was sorted by 'Trending' in an earlier cell, so slicing it directly would put the
# top-ranked half in training and the bottom-ranked half in validation. Shuffle the rows
# with a fixed seed first so both halves cover the whole target range reproducibly.
df_shuffled = df.sample(frac=1, random_state=42)
split_number = len(df_shuffled) // 2

df_train = df_shuffled[:split_number]
df_val = df_shuffled[split_number:]

df.head()

# Training Data 

In [11]:
# Feature engineering: turn the selected columns into feature matrices with a
# DictVectorizer that is fitted on the training rows only.

categorical = ['Licensed']
numerical = ['Views', 'Likes']
# NOTE(review): 'Trending' is derived from 'Views' earlier in the notebook, so keeping
# 'Views' as a feature leaks the target into the inputs - confirm this is intended.
target = 'Trending'

feature_columns = categorical + numerical

dv = DictVectorizer()

train_dicts = df_train[feature_columns].to_dict(orient='records')
val_dicts = df_val[feature_columns].to_dict(orient='records')

# Fit on the training records, then reuse the fitted mapping for validation
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

y_train = df_train[target].values
y_val = df_val[target].values

In [12]:
# Baseline: fit a plain linear regression and report its RMSE on the validation set

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_val)

# RMSE computed as sqrt(MSE): the `squared=False` argument of mean_squared_error is
# deprecated in recent scikit-learn releases, while this form works on every version.
np.sqrt(mean_squared_error(y_val, y_pred))

In [13]:
# Spread of the target (max - min). Later cells divide the RMSE by this value to
# express the error as a percentage.
# NOTE(review): the name `range` shadows Python's built-in `range`; it is kept here
# because later cells read this variable by that name.
trending = df['Trending']
range = trending.max() - trending.min()
print(range)

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

# Select the experiment on the tracking server. If it does not exist, it will be created.
mlflow.set_experiment("my-experiment-2")

# Train the linear-regression baseline inside an MLflow run and log its RMSE and model
with mlflow.start_run():

    mlflow.set_tag("owner", 'M Irfan')

    # f-string so the dataset name is actually interpolated (the plain string logged a
    # literal "{dataset_name}"); the path is the CSV file loaded earlier in the notebook.
    mlflow.log_param("train-data-path", f"./data/{dataset_name}/Spotify_Youtube.csv")

    lr = LinearRegression()
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_val)
    # sqrt(MSE) instead of the deprecated `squared=False` argument
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mlflow.log_metric("rmse", rmse)

    mlflow.sklearn.log_model(lr, artifact_path="models")
    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")


In [15]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

from datetime import datetime
import mlflow.xgboost
import pickle
import boto3
import s3fs

In [16]:
# Wrap the feature matrices and their targets in XGBoost DMatrix objects

train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

In [17]:
def objective(params):
    """Train one XGBoost model for a hyperopt trial and record it in MLflow.

    Reads the notebook globals ``train``, ``valid``, ``y_val``, ``dv`` and ``range``
    (the target spread computed in an earlier cell, which shadows the built-in).

    Parameters
    ----------
    params : dict
        Booster parameters for this trial; passed unchanged to ``xgb.train`` and
        logged as the run's parameters.

    Returns
    -------
    dict
        ``{'loss': rmse, 'status': STATUS_OK}``, where ``rmse`` is the root mean
        squared error of the booster's predictions on ``valid``.
    """
    # Configure credentials and the tracking server before any client is used
    os.environ["AWS_PROFILE"] = "default"
    TRACKING_SERVER_HOST = "ec2-13-251-63-107.ap-southeast-1.compute.amazonaws.com"
    mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")

    fs = s3fs.S3FileSystem(anon=False)

    # Timestamp suffix keeps the files of different trials apart
    current_datetime = datetime.now().strftime('%Y-%m-%d_%H:%M:%S.%f')
    model_dir = f"/tmp/models_mlflow-{current_datetime}"
    preprocessor_path = f"/tmp/preprocessor-{current_datetime}.pkl"

    with mlflow.start_run():
        print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")

        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        # sqrt(MSE) instead of the deprecated `squared=False` argument
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        mlflow.log_metric("rmse", rmse)

        # `range` is the notebook-level target spread, not the built-in
        error_percentage = rmse / range * 100
        mlflow.log_metric("error_percentage", error_percentage)

        # Write the model (a directory) and the fitted DictVectorizer to local disk
        mlflow.xgboost.save_model(booster, model_dir)

        with open(preprocessor_path, "wb") as f_out:
            pickle.dump(dv, f_out)

        fs.put(preprocessor_path, f"s3://mlflow-artifacts-remote-zoomcamp/mlruns/preprocessor-{current_datetime}.pkl")
        # save_model writes a directory, so the upload must be recursive; without it
        # only the remote directory is created and none of the model files are copied.
        fs.put(model_dir, f"s3://mlflow-artifacts-remote-zoomcamp/mlruns/model-{current_datetime}.xgb", recursive=True)

        mlflow.log_artifact(preprocessor_path, artifact_path="preprocessor")
        mlflow.log_artifact(model_dir, artifact_path="models_mlflow")

        print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")

    return {'loss': rmse, 'status': STATUS_OK}

In [18]:
# Search space for hyperopt: the hp.* entries are sampled per trial, the remaining
# entries are fixed values handed to XGBoost as-is.
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 200, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -6, -3),
    'objective': 'reg:squarederror',
    'seed': 42,
    'verbosity': 3
}

# Run the TPE search; each evaluation calls objective() with one sampled configuration
trials = Trials()
best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=3,
    trials=trials,
)


In [19]:
# Retrain with the chosen hyperparameters, then store the model and the preprocessor
# locally, on S3 and as MLflow artifacts.

import shutil

# Make sure AWS credentials are set properly
# Either using environment variables or ~/.aws/credentials
os.environ["AWS_PROFILE"] = "default" # fill in with your AWS profile. More info: https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/setup.html#setup-credentials

TRACKING_SERVER_HOST = "ec2-13-251-63-107.ap-southeast-1.compute.amazonaws.com" # fill in with the public DNS of the EC2 instance

# create a connection to S3
fs = s3fs.S3FileSystem()

# Record this run in a local file-based tracking store instead of the remote server
mlflow.set_tracking_uri("file:///tmp/mlruns")

# Local paths written by this cell; the same paths are uploaded and logged below
local_model_dir = "/tmp/models_mlflow"
local_preprocessor_path = "/tmp/preprocessor.pkl"

with mlflow.start_run(experiment_id="0"):

    train = xgb.DMatrix(X_train, label=y_train)
    valid = xgb.DMatrix(X_val, label=y_val)

    best_params = {
        'learning_rate': 0.09585355369315604,
        'max_depth': 30,
        'min_child_weight': 1.060597050922164,
        'objective': 'reg:squarederror',
        'reg_alpha': 0.018060244040060163,
        'reg_lambda': 0.011658731377413597,
        'seed': 42
    }

    mlflow.log_params(best_params)

    booster = xgb.train(
        params=best_params,
        dtrain=train,
        num_boost_round=1000,
        evals=[(valid, 'validation')],
        early_stopping_rounds=50
    )

    y_pred = booster.predict(valid)
    # sqrt(MSE) instead of the deprecated `squared=False` argument
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    mlflow.log_metric("rmse", rmse)

    # `range` is the notebook-level target spread computed earlier, not the built-in
    error_percentage = rmse / range * 100
    mlflow.log_metric("error_percentage", error_percentage)

    # Save model locally. Remove the output of a previous execution first, because
    # save_model refuses to write into a directory that already has content.
    shutil.rmtree(local_model_dir, ignore_errors=True)
    mlflow.xgboost.save_model(booster, local_model_dir)

    # Save the preprocessor locally
    with open(local_preprocessor_path, "wb") as f_out:
        pickle.dump(dv, f_out)

    # Upload model and preprocessor to S3. The model is a directory, so it needs
    # recursive=True; without it none of the files inside are uploaded.
    fs.put(local_preprocessor_path, "s3://mlflow-artifacts-remote-zoomcamp/mlruns/preprocessor.pkl")
    fs.put(local_model_dir, "s3://mlflow-artifacts-remote-zoomcamp/mlruns/model.xgb", recursive=True)

    # Log the files that were just written (the previous "...2" paths were never created)
    mlflow.log_artifact(local_preprocessor_path, artifact_path="preprocessor")
    mlflow.log_artifact(local_model_dir, artifact_path="models_mlflow")

    print(f"default artifacts URI: '{mlflow.get_artifact_uri()}'")

# Model Registry

In [20]:
from mlflow.tracking import MlflowClient

# Tracking/registry URI handed to MlflowClient in the next cell.
# NOTE(review): this is a local SQLite database, while earlier cells log runs to an
# HTTP tracking server and to file:///tmp/mlruns - confirm which store is intended.
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"

## Interacting with the MLflow tracking server

The `MlflowClient` object allows us to interact with:

- an MLflow Tracking Server, which creates and manages experiments and runs;
- an MLflow Registry Server, which creates and manages registered models and model versions.

To instantiate it, we need to pass a tracking URI and/or a registry URI.

In [21]:
# Client bound to the tracking store configured in MLFLOW_TRACKING_URI
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

# search_experiments() replaces list_experiments(), which newer MLflow releases no
# longer provide; it also matches the mlflow.search_experiments() call used earlier.
client.search_experiments()

In [22]:
# `runs` was never defined in this notebook, so fetch it here: the five runs with the
# lowest RMSE across all experiments known to the client.
runs = client.search_runs(
    experiment_ids=[exp.experiment_id for exp in client.search_experiments()],
    max_results=5,
    order_by=["metrics.rmse ASC"],
)

for run in runs:
    rmse = run.data.metrics.get('rmse')
    if rmse is None:
        # Runs that never logged an rmse metric cannot be formatted below
        continue
    print(f"run id: {run.info.run_id}, rmse: {rmse:.4f}")

### Interacting with the Model Registry

In this section we will use the `MlflowClient` instance to:

1. Register a new version of the model `nyc-taxi-regressor`.
2. Retrieve the latest versions of the model `nyc-taxi-regressor` and check that a new version `4` was created.
3. Transition version `4` to "Staging" and add annotations to it.

In [23]:
# Register the model artifact of one specific run under a registered-model name
run_id = "b8904012c84343b5bf8ee72aa8f0f402"
model_uri = f"runs:/{run_id}/model"

mlflow.register_model(
    model_uri=model_uri,
    name="nyc-taxi-regressor",
)

In [24]:
# Show the latest version of the registered model in each stage
model_name = "nyc-taxi-regressor"
latest_versions = client.get_latest_versions(name=model_name)

for version in latest_versions:
    number, stage = version.version, version.current_stage
    print(f"version: {number}, stage: {stage}")

In [25]:
# Move the chosen model version into the "Staging" stage, leaving other versions as they are
model_version = 4
new_stage = "Staging"

transition_kwargs = dict(
    name=model_name,
    version=model_version,
    stage=new_stage,
    archive_existing_versions=False,
)
client.transition_model_version_stage(**transition_kwargs)

In [26]:
from datetime import datetime

# Record on the model version when it was moved to its new stage
date = datetime.today().date()
description = f"The model version {model_version} was transitioned to {new_stage} on {date}"

client.update_model_version(
    name=model_name,
    version=model_version,
    description=description,
)

In [27]:
%time test_model(name=model_name, stage="Production", X_test=X_test, y_test=y_test)

In [28]:
# Promote the same model version to "Production" and archive the versions currently in
# that stage. Uses model_version (set in the Staging cell) instead of repeating the
# literal 4, so both transitions always refer to the same version.
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage="Production",
    archive_existing_versions=True
)