In [1]:
# Show the scikit-learn version installed in the active environment.
!pip freeze | grep scikit-learn

scikit-learn==1.5.0


In [2]:
# Show the Python interpreter version used for this run.
!python -V

Python 3.10.13


In [1]:
import pickle

import numpy as np
import pandas as pd

In [2]:
# Load the (dv, model) pair that was pickled into model.bin.
# SECURITY: pickle.load can execute arbitrary code while unpickling, so only
# open files that come from a trusted source.
# NOTE(review): `dv` is presumably a fitted DictVectorizer and `model` a
# scikit-learn estimator; such pickles should be loaded with the same
# scikit-learn version that wrote them -- confirm against the training setup.
with open('model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [3]:
categorical = ['PULocationID', 'DOLocationID']


def read_data(filename):
    """Read a trip-record parquet file and prepare it for scoring.

    Steps:
      1. Load the parquet file at ``filename`` (path or URL) with pandas.
      2. Add a ``duration`` column: drop-off minus pick-up time, in minutes.
      3. Keep only rows whose duration lies in the closed range [1, 60].
      4. Convert the ``categorical`` columns to strings, with missing
         values encoded as ``'-1'``.

    Returns the filtered frame as an independent copy; the original row
    index labels are kept (the index is not reset).
    """
    trips = pd.read_parquet(filename)

    elapsed = trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # `between` is inclusive on both ends, i.e. 1 <= duration <= 60.
    within_bounds = trips['duration'].between(1, 60)
    trips = trips.loc[within_bounds].copy()

    as_labels = trips[categorical].fillna(-1).astype('int').astype('str')
    trips[categorical] = as_labels

    return trips

In [4]:
# Load and clean the 2023-03 yellow-taxi trip file; the parquet is fetched
# from the URL each time this cell runs.
df = read_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet')

In [5]:
# Summarize the cleaned frame: index range, column names, and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3316216 entries, 0 to 3403765
Data columns (total 20 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int32         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           object        
 8   DOLocationID           object        
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  Airport_fee            floa

In [6]:
# Turn the categorical columns of every row into a dict record, encode the
# records with the unpickled `dv`, then predict with the unpickled `model`.
# `y_pred` is reused by the later cells (statistics and the result file).
dicts = df[categorical].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

## Q1. Notebook

In [7]:
# Q1: standard deviation of the predicted durations.
# numpy is imported with the other imports in the first code cell rather than
# mid-notebook, so every dependency is visible in one place.
np.std(y_pred)

6.247488852238703

In [8]:
# Mean of the predictions computed for the 2023-03 data.
np.mean(y_pred)

14.203865642696083

## Q2. Output Size

In [12]:
# Period being scored; used in both the ride ids and the output file name.
year, month = 2023, 3
output_file = f'./homework_{year:04d}-{month:02d}_predicted.parquet'

# Ride id = "<yyyy>/<mm>_" followed by the row's index label in `df`.
df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')

# Two-column result: the ride id next to its prediction. `y_pred` is matched
# to the rows of `df` by position.
df_result = pd.DataFrame({'ride_id': df['ride_id'], 'prediction': y_pred})

# Write uncompressed parquet without the index column.
df_result.to_parquet(output_file, engine='pyarrow', compression=None, index=False)

In [16]:
# List the working directory to read off the size of the written parquet file.
# `dir` is a Windows shell command, so this cell is not portable to other OSes.
!dir


 Volume in drive C is OS
 Volume Serial Number is 3CAD-5A5C

 Directory of C:\Users\sudwa\Desktop\mlops-zoomcamp-2024\module_4_model_deployment\homework

06/12/2024  03:01 PM    <DIR>          .
06/12/2024  03:01 PM    <DIR>          ..
06/11/2024  04:08 PM    <DIR>          .ipynb_checkpoints
06/11/2024  11:55 PM             4,522 homework.md
06/12/2024  02:57 PM        68,641,760 homework_2023-03_predicted.parquet
06/11/2024  03:52 PM            17,376 model.bin
06/11/2024  03:59 PM               193 Pipfile
06/10/2024  08:09 PM            13,287 Pipfile.lock
06/10/2024  08:18 PM               134 README.md
06/10/2024  08:02 PM                29 requirements.txt
06/12/2024  03:01 PM            11,948 starter.ipynb
06/11/2024  06:21 PM             1,405 starter.py
               9 File(s)     68,690,654 bytes
               3 Dir(s)  16,214,102,016 bytes free


## Q3. Creating the scoring script

Which command do you need to execute to convert the notebook into a script?

- jupyter nbconvert --to script starter.ipynb

## Q4. Virtual environment

Now let's put everything into a virtual environment. We'll use pipenv for that.

Install all the required libraries. Pay attention to the Scikit-Learn version: it should be the same as in the starter
notebook.

After installing the libraries, pipenv creates two files: `Pipfile`
and `Pipfile.lock`. The `Pipfile.lock` file keeps the hashes of the
dependencies we use for the virtual env.

What's the first hash for the Scikit-Learn dependency?

- "sha256:057b991ac64b3e75c9c04b5f9395eaf19a6179244c089afdebaad98264bff37c"

## Q5. Parametrize the script

Let's now make the script configurable via CLI. We'll create two
parameters: year and month.

Run the script for April 2023.

What's the mean predicted duration?

In [18]:
# Load the 2023-04 yellow-taxi trip file with the same cleaning as the 2023-03 data.
april = read_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-04.parquet')

In [19]:
# Same scoring steps as for the 2023-03 data: dict records from the
# categorical columns, encoded with `dv`, then predicted with `model`.
xdicts = april[categorical].to_dict(orient='records')
X_val_april = dv.transform(xdicts)
y_pred_april = model.predict(X_val_april)

In [20]:
# Q5: mean predicted duration for the 2023-04 data.
np.mean(y_pred_april)

14.292282936862449