## Import libs

In [9]:
import pandas as pd
import numpy as np
import os 

import seaborn as sns
import matplotlib.pyplot as plt

import pickle

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
import mlflow


# Point MLflow at a SQLite tracking store; "sqlite:///mlflow.db" is a relative
# path, so the database file lives in the notebook's working directory.
mlflow.set_tracking_uri("sqlite:///mlflow.db")
# Select the experiment that later runs are recorded under. This is the last
# expression in the cell, so the returned Experiment object is displayed.
mlflow.set_experiment("nyc-taxi-experiment")

2024/05/30 16:22:07 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2024/05/30 16:22:07 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

<Experiment: artifact_location='/Users/kseniialakhman/projects/mlops/mlops-zoomcamp/02-experiment-tracking/mlruns/1', creation_time=1717075328297, experiment_id='1', last_update_time=1717075328297, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

## Preprocess data

In [None]:
# Run the preprocessing script with ./data as --raw_data_path and ./output as
# --dest_path. NOTE(review): the Q2 section below runs the same script via
# `python` rather than `python3` — confirm which interpreter is intended.
!python3 preprocess_data.py --raw_data_path "./data" --dest_path "./output"

## Q1. Install MLflow

### What's the version that you have?
2.13.0

In [3]:
# Print the version of the MLflow CLI found on the shell's PATH.
!mlflow --version

mlflow, version 2.13.0


## Q2. Download and preprocess the data

### How many files were saved to OUTPUT_FOLDER?
4

In [5]:
# Run the preprocessing script with ./data as --raw_data_path and ./output as
# --dest_path; the files it writes are listed in the next cell.
!python preprocess_data.py --raw_data_path "./data" --dest_path "./output"

In [7]:
# List the contents of ./output to count the files that were saved there.
!ls "./output"

dv.pkl    test.pkl  train.pkl val.pkl


## Q3. Train a model with autolog

What is the value of the `min_samples_split` parameter?

- 2
- 4
- 8
- 10


In [8]:
def load_pickle(filename: str):
    """Return the object stored in the pickle file at ``filename``.

    The file is opened in binary mode and closed again before returning.
    """
    # pickle can execute arbitrary code while loading, so only pass paths to
    # files that come from a trusted source.
    with open(filename, mode="rb") as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded

In [None]:
def run_train(data_path: str) -> float:
    """Train a random-forest regressor and track the run with MLflow.

    Loads ``train.pkl`` and ``val.pkl`` from ``data_path`` (each is unpacked
    as a ``(features, target)`` pair), fits a ``RandomForestRegressor`` inside
    an MLflow run with scikit-learn autologging enabled, and evaluates the
    model on the validation split.

    Args:
        data_path: Directory that contains ``train.pkl`` and ``val.pkl``.

    Returns:
        The root mean squared error of the predictions on the validation set.
    """
    mlflow.sklearn.autolog()

    X_train, y_train = load_pickle(os.path.join(data_path, "train.pkl"))
    X_val, y_val = load_pickle(os.path.join(data_path, "val.pkl"))

    with mlflow.start_run():
        rf = RandomForestRegressor(max_depth=10, random_state=0)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)

        # Take the square root of the MSE ourselves: the `squared=False`
        # argument is deprecated in scikit-learn 1.4 and removed in 1.6.
        rmse = float(np.sqrt(mean_squared_error(y_val, y_pred)))

        # Record the validation RMSE under an explicit, stable metric name so
        # the value is kept with the run instead of being discarded.
        mlflow.log_metric("rmse", rmse)

    return rmse
