# Baseline model for batch monitoring example

In [1]:
import requests
import pandas as pd
import datetime
from tqdm import tqdm
from joblib import load, dump
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [2]:
files = [('green_tripdata_2022-02.parquet', './data'),('green_tripdata_2022-01.parquet', './data')]

print("Downloading files")

for file, path in files:
    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    resp = requests.get(url, stream=True)
    save_path = f'{path}/{file}'
    with open(save_path, 'wb') as handle:
        for data in tqdm(
            resp.iter_content(),
            desc= f'{file}',
            postfix= f'save to {save_path}',
            total= int(resp.headers['Content-Length'])):
            
            handle.write(data)

Downloading files


green_tripdata_2022-02.parquet: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1428262/1428262 [00:06<00:00, 209549.36it/s, save to ./data/green_tripdata_2022-02.parquet]
green_tripdata_2022-01.parquet: 100%|███████████████████████████████████████████████████████████████████████████████████████████████| 1254291/1254291 [00:06<00:00, 208890.44it/s, save to ./data/green_tripdata_2022-01.parquet]


In [3]:
jan_data = pd.read_parquet('data/green_tripdata_2022-01.parquet')

In [4]:
jan_data.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge
0,2,2022-01-01 00:14:21,2022-01-01 00:15:33,N,1.0,42,42,1.0,0.44,3.5,0.5,0.5,0.0,0.0,,0.3,4.8,2.0,1.0,0.0
1,1,2022-01-01 00:20:55,2022-01-01 00:29:38,N,1.0,116,41,1.0,2.1,9.5,0.5,0.5,0.0,0.0,,0.3,10.8,2.0,1.0,0.0
2,1,2022-01-01 00:57:02,2022-01-01 01:13:14,N,1.0,41,140,1.0,3.7,14.5,3.25,0.5,4.6,0.0,,0.3,23.15,1.0,1.0,2.75
3,2,2022-01-01 00:07:42,2022-01-01 00:15:57,N,1.0,181,181,1.0,1.69,8.0,0.5,0.5,0.0,0.0,,0.3,9.3,2.0,1.0,0.0
4,2,2022-01-01 00:07:50,2022-01-01 00:28:52,N,1.0,33,170,1.0,6.26,22.0,0.5,0.5,5.21,0.0,,0.3,31.26,1.0,1.0,2.75


In [5]:
# create target 
jan_data['duration'] = jan_data['lpep_dropoff_datetime']-jan_data['lpep_pickup_datetime']
jan_data['duration'] = jan_data.duration.dt.total_seconds()/60

In [6]:
# filter data to trip to between 1min and 60min
jan_data = jan_data[(jan_data.duration >= 1) & (jan_data.duration <=60)]
jan_data = jan_data[(jan_data.passenger_count >= 0) & (jan_data.passenger_count <=8)]

In [7]:
target = 'duration'
numeric_features = ['passenger_count','trip_distance', 'fare_amount', 'total_amount']
cat_features = ['PULocationID','DOLocationID']

In [8]:
cut_off = 30000
train_data = jan_data[:cut_off]
val_data = jan_data[cut_off:]
print(f"Training set is : {len(train_data)}")
print(f"Validation set is : {len(val_data)}")

Training set is : 30000
Validation set is : 23367


In [9]:
train_set = train_data[numeric_features + cat_features]
validation_set = val_data[numeric_features + cat_features]
y = train_data[target]

print(train_set.head())
print(validation_set.head())
print(type(y))

   passenger_count  trip_distance  fare_amount  total_amount  PULocationID  \
0              1.0           0.44          3.5          4.80            42   
1              1.0           2.10          9.5         10.80           116   
2              1.0           3.70         14.5         23.15            41   
3              1.0           1.69          8.0          9.30           181   
4              1.0           6.26         22.0         31.26            33   

   DOLocationID  
0            42  
1            41  
2           140  
3           181  
4           170  
       passenger_count  trip_distance  fare_amount  total_amount  \
31594              5.0           3.44         13.0         25.94   
31595              5.0           0.87          5.5          8.76   
31596              1.0           2.51         11.0         12.80   
31597              1.0           2.36         13.0         14.80   
31598              1.0           6.95         22.5         32.46   

       PULocat

In [10]:
model = LinearRegression()

In [11]:
model.fit(train_set,y)

In [15]:
train_preds = model.predict(train_set)
train_data["predictions"] = train_preds


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data["predictions"] = train_preds


In [17]:
val_preds = model.predict(validation_set)
val_data["predictions"] = val_preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_data["predictions"] = val_preds


In [18]:
print(mean_absolute_error(train_data.duration,train_data.predictions))
print(mean_absolute_error(val_data.duration,val_data.predictions))

3.189412303982246
3.428923449920613


# Dump model and reference data

In [20]:
with open('models/lin_reg.bin', 'wb') as f_out:
    dump(model, f_out)

In [19]:
val_data.to_parquet('data/reference.parquet')