# Baseline model for batch monitoring example

In [3]:
import datetime
import requests
import pandas as pd

from joblib import load, dump
from tqdm import tqdm

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [4]:
files = [('green_tripdata_2022-02.parquet', './data'), ('green_tripdata_2022-01.parquet', './data')]

print("Downloading data...")
for file, path in files:
    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    resp = requests.get(url, stream=True)
    save_path = f"{path}/{file}"
    with open(save_path, "wb") as handle:
        for data in tqdm(resp.iter_content(),
                         desc=file,
                         postfix=f"save to {save_path}",
                         total=int(resp.headers.get('content-length', 0))):
            handle.write(data)

Downloading data...


green_tripdata_2022-02.parquet: 100%|██████████| 1428262/1428262 [00:03<00:00, 397654.14it/s, save to ./data/green_tripdata_2022-02.parquet]
green_tripdata_2022-01.parquet: 100%|██████████| 1254291/1254291 [00:03<00:00, 411337.36it/s, save to ./data/green_tripdata_2022-01.parquet]


In [14]:
jan_data = pd.read_parquet("./data/green_tripdata_2022-01.parquet")

In [15]:
jan_data.describe()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
count,62495.0,62495,62495,56200.0,62495.0,62495.0,56200.0,62495.0,62495.0,62495.0,62495.0,62495.0,62495.0,62495.0,62495.0,56200.0,56200.0,56200.0
mean,1.849508,2022-01-16 14:18:36.026353920,2022-01-16 14:37:37.189551104,1.198719,97.615041,135.969902,1.258399,77.758632,13.914,0.34756,0.426322,1.66654,0.208182,0.297312,17.490532,1.403594,1.040925,0.706628
min,1.0,2009-01-01 00:34:01,2009-01-01 17:05:20,1.0,1.0,1.0,0.0,0.0,-65.0,-4.5,-0.5,-0.86,-6.55,-0.3,-71.85,1.0,1.0,0.0
25%,2.0,2022-01-08 23:41:50,2022-01-08 23:55:28.500000,1.0,55.0,74.0,1.0,1.08,7.0,0.0,0.5,0.0,0.0,0.3,9.36,1.0,1.0,0.0
50%,2.0,2022-01-16 15:59:42,2022-01-16 16:17:00,1.0,75.0,137.0,1.0,1.9,10.5,0.0,0.5,1.0,0.0,0.3,13.86,1.0,1.0,0.0
75%,2.0,2022-01-24 09:16:50,2022-01-24 09:37:28,1.0,130.0,215.0,1.0,3.5,17.0,0.5,0.5,2.61,0.0,0.3,21.05,2.0,1.0,2.75
max,2.0,2022-01-31 23:57:37,2022-02-01 21:01:54,5.0,265.0,265.0,8.0,224481.38,604.5,4.5,0.5,76.77,44.75,0.3,605.3,5.0,2.0,2.75
std,0.357556,,,0.862313,62.987311,77.590956,0.877743,2909.354163,12.088819,0.65781,0.181381,2.484928,1.257659,0.037201,13.623416,0.516316,0.198119,1.201632


In [16]:
jan_data.shape

(62495, 20)

In [17]:
# create target
jan_data['duration_min'] = jan_data['lpep_dropoff_datetime'] - jan_data['lpep_pickup_datetime']
jan_data['duration_min'] = jan_data['duration_min'].dt.total_seconds() / 60

In [18]:
# filter out outliers
jan_data = jan_data[(jan_data['duration_min'] >= 0) & (jan_data['duration_min'] <= 60)]
jan_data = jan_data[(jan_data['passenger_count'] > 0) & (jan_data['passenger_count'] <= 8)]

In [19]:
# data labeling
target = "duration_min"
num_features = ["passenger_count", "trip_distance", "fare_amount", "total_amount"]
cat_features = ["PULocationID", "DOLocationID"]

In [20]:
jan_data.shape

(55211, 21)

In [21]:
train_data = jan_data[:30000]
val_data = jan_data[30000:]

In [22]:
model = LinearRegression()

In [23]:
model.fit(train_data[num_features + cat_features], train_data[target])

In [24]:
train_preds = model.predict(train_data[num_features + cat_features])
train_data['prediction'] = train_preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['prediction'] = train_preds


In [25]:
val_preds = model.predict(val_data[num_features + cat_features])
val_data['prediction'] = val_preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_data['prediction'] = val_preds


In [26]:
print(mean_absolute_error(train_data[target], train_data['prediction']))
print(mean_absolute_error(val_data[target], val_data['prediction']))

3.804665373785083
4.14206407368847


# Dump model and reference data

In [27]:
with open("models/lin_reg.bin", "wb") as f:
    dump(model, f)

In [28]:
val_data.to_parquet("data/reference.parquet")