In [7]:
# Show the installed scikit-learn version. The commented-out line below would
# install the pinned 1.5.0 release if it is missing.
# NOTE(review): presumably the pin matches the version model.bin was pickled with — confirm.
!pip freeze | grep scikit-learn
# !pip install scikit-learn==1.5.0

scikit-learn==1.5.0


In [2]:
# Record the interpreter version used for this run.
!python -V

Python 3.11.9


In [3]:
import os
import pickle

import pandas as pd

In [8]:
# Load the pickled (dv, model) pair from model.bin. Later cells call
# dv.transform() on feature dicts and model.predict() on the result.
# NOTE(review): pickle.load can execute arbitrary code while deserializing —
# only load model.bin from a trusted source.
# NOTE(review): presumably a scikit-learn vectorizer + fitted model saved under
# the scikit-learn version checked in the first cell — confirm.
with open('model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [9]:
# Columns treated as categorical features; read_data converts them to strings.
categorical = ['PULocationID', 'DOLocationID']


def read_data(filename):
    """Load a trip parquet file and prepare it for scoring.

    Adds a ``duration`` column (drop-off time minus pick-up time, in minutes),
    keeps only the rows whose duration lies in the closed range [1, 60], and
    converts the ``categorical`` columns to strings, with missing values
    becoming ``'-1'``.

    Args:
        filename: Path or URL handed directly to ``pd.read_parquet``.

    Returns:
        A filtered copy of the loaded frame with the extra ``duration`` column.
    """
    trips = pd.read_parquet(filename)

    # Trip length as a timedelta first, then expressed in minutes.
    elapsed = trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # Keep rides lasting between 1 and 60 minutes, both bounds included.
    in_range = trips['duration'].between(1, 60)
    trips = trips[in_range].copy()

    # Missing ids become -1 before the int -> str conversion.
    as_labels = trips[categorical].fillna(-1).astype('int').astype('str')
    trips[categorical] = as_labels

    return trips

In [26]:
# Scoring period: the year and month of the yellow_tripdata file to process.
year, month = 2023, 3

# Remote source parquet and local destination parquet, both keyed by year-month.
input_path = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year:04d}-{month:02d}.parquet'
output_path = f'output/yellow_tripdata_{year:04d}-{month:02d}.parquet'

In [27]:
# Load and preprocess the selected month's trips. input_path is an HTTPS URL,
# so running this cell requires network access.
df = read_data(input_path)

In [17]:
# One feature dict per ride, built from the categorical columns only.
dicts = df[categorical].to_dict(orient='records')
# Vectorize the dicts with the dv object loaded from model.bin.
X_val = dv.transform(dicts)
# Predicted value per ride, in the same row order as df.
# NOTE(review): presumably a duration in minutes, like the 'duration' column
# built in read_data — confirm against how the model was trained.
y_pred = model.predict(X_val)

<h3>Q1. Notebook. What is the standard deviation of the predicted duration for this dataset?</h3>

In [18]:
# Spread of the predicted durations. The recorded output is an np.float64, so
# this is NumPy's std(), which defaults to ddof=0 (population standard deviation).
y_pred.std()

np.float64(6.247488852238703)

<h3>Q2. Preparing the output</h3>

In [29]:
# Build a per-ride identifier of the form 'YYYY/MM_<row index>'. It is stored
# on df as well, so the column stays available to any later cell that reads it.
df['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')

# Assemble the result frame in one step. y_pred is positional and has one
# entry per row of df, so it lines up with the ride_id column.
df_output = pd.DataFrame({
    'ride_id': df['ride_id'],
    'predicted_duration': y_pred,
})

# to_parquet does not create missing parent directories, so on a fresh
# checkout (no 'output/' folder yet) the write would fail. Create it first.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

df_output.to_parquet(
    output_path,
    engine='pyarrow',
    compression=None,
    index=False
)

# List the output directory with human-readable sizes to inspect the written file.
!ls -lh output

<h3>Q3. Creating the scoring script</h3>

In [3]:
# Export starter.ipynb to a plain Python script with nbconvert.
# NOTE(review): the recorded output below is nbconvert's usage/help text rather
# than a conversion log, which suggests nothing was converted — confirm that
# starter.ipynb exists in the working directory under that exact name.
!jupyter nbconvert --to script starter.ipynb

This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePr