In [26]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-2.22.1-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.22.1 (from mlflow)
  Downloading mlflow_skinny-2.22.1-py3-none-any.whl.metadata (31 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.22.1->mlflow)
  Downloading databricks_sdk-0.56.0-py3-none-any.whl.metadata (39 kB)
Collecting opentelemetry-sdk<3,>=1.9.0 (from mlflow-skinny==2.22.1->mlflow)
  Downloading opentelemetry_sdk-1.34.1-py3-none-any.whl.metadata (1.6 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.6-py3-none-any.whl.metadata (11 kB)
Collecting graphql-relay<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_relay-3.2.0-py3-none-any.whl.metadata (12 kB)
Collecting opentelemetry-semantic-conventions==0

In [27]:
import pandas as pd
import numpy as np
import pickle
import mlflow
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error

### Q1. Select the Tool

In [2]:
!pip install prefect

Collecting prefect
  Downloading prefect-3.4.5-py3-none-any.whl.metadata (13 kB)
Collecting aiosqlite<1.0.0,>=0.17.0 (from prefect)
  Downloading aiosqlite-0.21.0-py3-none-any.whl.metadata (4.3 kB)
Collecting alembic<2.0.0,>=1.7.5 (from prefect)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting apprise<2.0.0,>=1.1.0 (from prefect)
  Downloading apprise-1.9.3-py3-none-any.whl.metadata (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.7/53.7 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting asgi-lifespan<3.0,>=1.0 (from prefect)
  Downloading asgi_lifespan-2.1.0-py3-none-any.whl.metadata (10 kB)
Collecting asyncpg<1.0.0,>=0.23 (from prefect)
  Downloading asyncpg-0.30.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.0 kB)
Collecting click<8.2,>=8.0 (from prefect)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting coolname<3.0.0,>=1.0.4 (from prefect)
  Downloading coolname-2.2.0-py

Answer: I used **Prefect** for orchestration.

### Q2. Version

In [4]:
# Ask the Prefect CLI for its version and environment details (answer to Q2).
!prefect version

Version:             3.4.5
API version:         0.8.4
Python version:      3.11.13
Git commit:          df37c8cf
Built:               Sat, Jun 07, 2025 02:24 AM
OS/Arch:             linux/x86_64
Profile:             ephemeral
Server type:         ephemeral
Pydantic version:    2.11.5
Server:
  Database:          sqlite
  SQLite version:    3.37.2


Answer: The installed Prefect version is **3.4.5**.

### Q3. Creating a pipeline

In [12]:
# Read the March 2023 yellow-taxi trip file from the local data directory
# and preview the first five rows.
df = pd.read_parquet(path='data/yellow_tripdata_2023-03.parquet')
df.head(n=5)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee
0,2,2023-03-01 00:06:43,2023-03-01 00:16:43,1.0,0.0,1.0,N,238,42,2,8.6,1.0,0.5,0.0,0.0,1.0,11.1,0.0,0.0
1,2,2023-03-01 00:08:25,2023-03-01 00:39:30,2.0,12.4,1.0,N,138,231,1,52.7,6.0,0.5,12.54,0.0,1.0,76.49,2.5,1.25
2,1,2023-03-01 00:15:04,2023-03-01 00:29:26,0.0,3.3,1.0,N,140,186,1,18.4,3.5,0.5,4.65,0.0,1.0,28.05,2.5,0.0
3,1,2023-03-01 00:49:37,2023-03-01 01:01:05,1.0,2.9,1.0,N,140,43,1,15.6,3.5,0.5,4.1,0.0,1.0,24.7,2.5,0.0
4,2,2023-03-01 00:08:04,2023-03-01 00:11:06,1.0,1.23,1.0,N,79,137,1,7.2,1.0,0.5,2.44,0.0,1.0,14.64,2.5,0.0


In [13]:
# (rows, columns) of the frame as loaded, before any filtering (answer to Q3).
df.shape

(3403766, 19)

Answer: There are **3403766** records in the dataset.

### Q4. Data preparation

In [14]:
# Trip duration in minutes, derived from the pickup and drop-off timestamps.
df['duration'] = (
    df.tpep_dropoff_datetime - df.tpep_pickup_datetime
).dt.total_seconds() / 60

# Keep trips lasting between 1 and 60 minutes (inclusive). The explicit
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignment below writes to a real frame rather than a slice and does
# not trigger pandas' SettingWithCopyWarning.
df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

# Cast the location IDs to strings so that DictVectorizer treats them as
# categorical values instead of numeric features.
categorical = ['PULocationID', 'DOLocationID']
df[categorical] = df[categorical].astype(str)

df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,Airport_fee,duration
0,2,2023-03-01 00:06:43,2023-03-01 00:16:43,1.0,0.0,1.0,N,238,42,2,8.6,1.0,0.5,0.0,0.0,1.0,11.1,0.0,0.0,10.0
1,2,2023-03-01 00:08:25,2023-03-01 00:39:30,2.0,12.4,1.0,N,138,231,1,52.7,6.0,0.5,12.54,0.0,1.0,76.49,2.5,1.25,31.083333
2,1,2023-03-01 00:15:04,2023-03-01 00:29:26,0.0,3.3,1.0,N,140,186,1,18.4,3.5,0.5,4.65,0.0,1.0,28.05,2.5,0.0,14.366667
3,1,2023-03-01 00:49:37,2023-03-01 01:01:05,1.0,2.9,1.0,N,140,43,1,15.6,3.5,0.5,4.1,0.0,1.0,24.7,2.5,0.0,11.466667
4,2,2023-03-01 00:08:04,2023-03-01 00:11:06,1.0,1.23,1.0,N,79,137,1,7.2,1.0,0.5,2.44,0.0,1.0,14.64,2.5,0.0,3.033333


In [15]:
# (rows, columns) after the duration filter and the added `duration` column (answer to Q4).
df.shape

(3316216, 20)

Answer: There are **3316216** rows after preprocessing.

### Q5. Train a model

In [16]:
# Work on an independent copy so the training cells below do not touch `df`.
df_copy1 = df.copy()

In [17]:
# Target: trip duration in minutes.
target = 'duration'
y_train = df_copy1[target].to_numpy()

# Features: one dict per trip holding the two string-valued location IDs,
# which DictVectorizer expands into a sparse one-hot matrix.
train_dicts = df_copy1.loc[:, categorical].to_dict(orient='records')
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

# Fit a plain linear regression with default parameters.
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

In [19]:
# Intercept of the fitted linear model (answer to Q5).
lr.intercept_

np.float64(24.77203445209766)

Answer: The intercept of the model is **24.77**.

### Q6. Register the model

In [29]:
# Start the MLflow UI against the local SQLite store; artifacts default to ./artifacts.
# NOTE(review): `mlflow ui` starts a server process, so this cell presumably
# blocks until it is interrupted — consider running it from a separate terminal.
!mlflow ui --backend-store-uri sqlite:///mlflow.db --default-artifact-root ./artifacts

2025/06/10 10:38:11 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/06/10 10:38:11 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

In [30]:
# Point the client at the local SQLite tracking store and select the experiment
# (the captured log shows it is created on first use).
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("nyc_taxi_experiment")

with mlflow.start_run():
    # Attach both descriptive tags in a single call.
    mlflow.set_tags({"developer": "Krishna", "model": "LinearRegression"})

    # Refit inside the run so the logged estimator is the one trained here.
    lr = LinearRegression()
    lr.fit(X_train, y_train)

    mlflow.log_param("intercept_", lr.intercept_)

    # Log the fitted estimator as a run artifact and register it by name.
    mlflow.sklearn.log_model(
        sk_model=lr,
        artifact_path="artifacts_local",
        registered_model_name="MyLinearRegressor",
    )

2025/06/10 10:39:38 INFO mlflow.tracking.fluent: Experiment with name 'nyc_taxi_experiment' does not exist. Creating a new experiment.
Successfully registered model 'MyLinearRegressor'.
Created version '1' of model 'MyLinearRegressor'.


In [36]:
# Print the MLmodel metadata file of the logged model to read `model_size_bytes` (answer to Q6).
# NOTE(review): the experiment ID and run ID in this path are hard-coded from the
# run recorded below; a fresh run will presumably get a different run ID, so
# the path must be updated before re-executing.
!cat mlruns/1/cf2f88b411dd4704984fe636e1a67c32/artifacts/artifacts_local/MLmodel

artifact_path: artifacts_local
flavors:
  python_function:
    env:
      conda: conda.yaml
      virtualenv: python_env.yaml
    loader_module: mlflow.sklearn
    model_path: model.pkl
    predict_fn: predict
    python_version: 3.11.13
  sklearn:
    code: null
    pickled_model: model.pkl
    serialization_format: cloudpickle
    sklearn_version: 1.6.1
mlflow_version: 2.22.1
model_size_bytes: 4501
model_uuid: 9b4e02dc82034ae0b7402a5f8eb7b672
prompts: null
run_id: cf2f88b411dd4704984fe636e1a67c32
utc_time_created: '2025-06-10 10:40:53.313379'


Answer: The model size is **4501** bytes; the nearest match is **4534**.