In [1]:
!pip install pyarrow



In [2]:
# Show the installed scikit-learn version.
# NOTE(review): presumably this should match the version used to pickle model.bin — confirm.
!pip list --format=freeze | grep scikit-learn

scikit-learn==1.0.2


In [3]:
import pickle
import pandas as pd
import numpy as np

In [4]:
# model.bin holds a pickled 2-tuple. `dv` is later used via .transform() and
# `lr` via .predict() — presumably a fitted DictVectorizer and a linear
# regression model; confirm against the training code.
# SECURITY: pickle.load executes arbitrary code from the file; only load
# model.bin from a trusted source.
with open('model.bin', 'rb') as f_in:
    dv, lr = pickle.load(f_in)

In [5]:
# Columns treated as categorical features by the model.
categorical = ['PUlocationID', 'DOlocationID']


def read_data(filename):
    """Load a trip parquet file and prepare it for scoring.

    Steps:
      1. Read ``filename`` with ``pd.read_parquet``.
      2. Add a ``duration`` column: minutes between ``pickup_datetime`` and
         ``dropOff_datetime``.
      3. Keep only rows whose duration lies in [1, 60] minutes (inclusive).
         The index is NOT reset, so it keeps the original row labels.
      4. For every column in the module-level ``categorical`` list, replace
         missing values with -1 and convert to int, then to str.

    Returns the filtered, transformed DataFrame (a copy of the kept rows).
    """
    trips = pd.read_parquet(filename)

    # Trip length as a timedelta, then expressed in minutes.
    elapsed = trips.dropOff_datetime - trips.pickup_datetime
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # between() is inclusive on both ends, i.e. 1 <= duration <= 60.
    in_range = trips.duration.between(1, 60)
    trips = trips[in_range].copy()

    # Missing location IDs become the sentinel "-1"; all IDs end up as strings.
    filled = trips[categorical].fillna(-1)
    trips[categorical] = filled.astype('int').astype('str')

    return trips

In [6]:
# Period to score; both file names below are derived from it.
year, month = 2021, 2

# Source parquet (read directly over HTTPS).
input_file = f'https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_{year:04d}-{month:02d}.parquet'
# Destination for the result frame, relative to the working directory.
output_file = f'output/fhv_tripdata_{year:04d}-{month:02d}.parquet'

In [7]:
# input_file is an HTTPS URL, so this cell fetches the data over the network each time it runs.
df = read_data(input_file)

In [8]:
# One {column: value} dict per row, restricted to the categorical columns
# (read_data has already converted these to strings).
dicts = df[categorical].to_dict(orient='records')
# Vectorise with the unpickled `dv`, then predict with the unpickled `lr`.
# NOTE(review): presumably y_pred is a 1-D array of durations in minutes,
# one per row of df, in row order — confirm against the training code.
X_val = dv.transform(dicts)
y_pred = lr.predict(X_val)

In [9]:
# Question 1: What's the mean predicted duration for this dataset?

In [10]:
# Mean of the model's predictions over all rows kept by read_data.
np.mean(y_pred)

16.191691679979066

In [11]:
# Build an ID of the form "YYYY/MM_<index label>". read_data filters rows
# without resetting the index, so the labels are the pre-filter row labels
# and are not contiguous.
# Note: this adds a column to df in place.
df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')

In [12]:
# Result frame written to the output file: one row per ride with the model's
# prediction. Store `y_pred` (not the observed `duration` column) so the
# predictions actually reach the output.
# y_pred was computed from df[categorical] in row order, so it lines up
# positionally with df; assign() returns a new frame and leaves df untouched.
df_result = df[['ride_id']].assign(predicted_duration=y_pred)

In [13]:
!mkdir output

mkdir: no se puede crear el directorio «output»: El archivo ya existe


In [14]:
# Write the result frame to output_file using the pyarrow engine.
# compression=None stores the data uncompressed; index=False leaves the
# DataFrame index out of the file.
df_result.to_parquet(
    output_file,
    engine='pyarrow',
    compression=None,
    index=False
)

In [17]:
# Question 2: What's the size of the output file?
# NOTE(review): these imports would normally live in the import cell at the top.
import math
import os
# File size in MiB (bytes / 1024**2), rounded up to the next whole number.
math.ceil(os.path.getsize(output_file) / (1024 * 1024))

19

In [81]:
# Question 3: Now let's turn the notebook into a script. Which command do you need to execute for that?

# jupyter nbconvert --to=script starter.ipynb

# Then, for readability, I rename the generated script to batch.py, since it will be used for batch/offline deployment.

In [82]:
# Question 4: What's the first hash for the Scikit-Learn dependency?
# pipenv install scikit-learn==1.0.2 pandas pyarrow s3fs --python=3.9
# and then check Pipfile.lock to see the hash:
# sha256:08ef968f6b72033c16c479c966bf37ccd49b06ea91b765e1cc27afefe723920b

In [83]:
# Question 5: What's the mean predicted duration?
# I parametrize the script batch.py with two parameters, year and month, and then run:
# pipenv run python batch.py 2021 3
# 16.298821614015107

In [None]:
# Question 6: Now run the script with docker. What's the mean predicted duration for April 2021?

# docker build -t mlops-zoomcamp-hw4:v1 .
# Note: if the Dockerfile is named something other than 'Dockerfile' (e.g. 'mydockerfile.dockerfile'), its name must be passed with -f, like this:
# docker build -t mlops-zoomcamp-hw4:v1 -f mydockerfile.dockerfile .
# then I run docker run -it --rm mlops-zoomcamp-hw4:v1 2021 4
# predicted mean duration:  9.967573179784523

In [None]:
# Bonus question
# docker build -t mlops-zoomcamp-hw4:v1 -f mydockerfile.dockerfile .
# docker run -it --rm -v $HOME/.aws:/root/.aws mlops-zoomcamp-hw4:v1 2021 4