In [1]:
import pickle
import polars as pl
import sklearn

In [2]:
sklearn.__version__

'1.5.0'

In [None]:
year = 2023
month = 3


input_file = f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year:04d}-{month:02d}.parquet'
output_file = f'output/yellow_tripdata_{year:04d}-{month:02d}.parquet'

In [4]:
!mkdir output

mkdir: cannot create directory ‘output’: File exists


In [5]:
with open('model.bin', 'rb') as f_in:
    dv, lr = pickle.load(f_in)

In [6]:
def read_data(filename):
    df = pl.scan_parquet(filename)

    month_of_year = f"{year:04d}/{month:02d}"
    
    df = df.with_row_index(name="index")
    df = df.select(
        duration=(
            (
                pl.col("tpep_dropoff_datetime") - pl.col("tpep_pickup_datetime")
            ).dt.total_seconds()
            / 60
        ),
        ride_id=pl.concat_str(
            pl.lit(month_of_year),
            pl.col("index").cast(str),
            separator="_",),
        PULocationID=pl.col("PULocationID").cast(pl.String),
        DOLocationID=pl.col("DOLocationID").cast(pl.String),
    )

    df = df.filter(pl.col("duration").is_between(1, 60))

    # df = df.with_columns(
    #     pu_do=pl.concat_str(
    #         [  # pl.concat_str will cast to str automatically
    #             pl.col("PULocationID"),  # .cast(str),
    #             pl.col("DOLocationID"),  # cast(str),
    #         ],
    #         separator="_",
    #     )
    # )

    categorical = ("PULocationID", "DOLocationID")
    target = ("ride_id", "duration")

    df_input = df.select(
        pl.col(categorical),
        # pl.col(numerical),
    ).collect()  # to_dicts()

    df_target = df.select(pl.col(target)).collect()

    return df_input, df_target

    # df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')
    
    # return df

In [7]:
df_input, df_target = read_data(input_file)
(df_input, df_target)


(shape: (3_316_216, 2)
 ┌──────────────┬──────────────┐
 │ PULocationID ┆ DOLocationID │
 │ ---          ┆ ---          │
 │ str          ┆ str          │
 ╞══════════════╪══════════════╡
 │ 238          ┆ 42           │
 │ 138          ┆ 231          │
 │ 140          ┆ 186          │
 │ 140          ┆ 43           │
 │ 79           ┆ 137          │
 │ …            ┆ …            │
 │ 163          ┆ 75           │
 │ 125          ┆ 198          │
 │ 50           ┆ 224          │
 │ 113          ┆ 158          │
 │ 41           ┆ 166          │
 └──────────────┴──────────────┘,
 shape: (3_316_216, 2)
 ┌─────────────────┬───────────┐
 │ ride_id         ┆ duration  │
 │ ---             ┆ ---       │
 │ str             ┆ f64       │
 ╞═════════════════╪═══════════╡
 │ 2023/03_0       ┆ 10.0      │
 │ 2023/03_1       ┆ 31.083333 │
 │ 2023/03_2       ┆ 14.366667 │
 │ 2023/03_3       ┆ 11.466667 │
 │ 2023/03_4       ┆ 3.033333  │
 │ …               ┆ …         │
 │ 2023/03_3403761 ┆ 16.48333

In [8]:
dicts = df_input.to_dicts()
dicts[0]

{'PULocationID': '238', 'DOLocationID': '42'}

In [9]:
X_val = dv.transform(dicts)
y_pred = lr.predict(X_val)

In [10]:
y_pred

array([16.24590642, 26.1347962 , 11.88426424, ..., 11.59533603,
       13.11317847, 12.89999218], shape=(3316216,))

### Q1. Standard deviation

In [18]:
y_pred.std()

np.float64(6.247488852238703)

### Q2. Preparing the output

In [21]:
df_result = df_target.select(
    pl.col("ride_id"),
    pl.Series("predicted_duration", y_pred),
)

In [27]:
# df_result.to_pandas().to_parquet(
#     output_file,
#     engine='pyarrow',
#     compression=None,
#     index=False
# )

# another way to save
df_result.write_parquet(
    output_file,
    compression=None,
    # row_group_size=1000000,
    use_pyarrow=True,
    compression_level=None,
    # file_options={"version": "2.6"},
)

In [28]:
!ls -lh output

total 66M
-rw-rw-rw- 1 codespace codespace 66M Jun 13 22:15 yellow_tripdata_2023-03.parquet


### Q3. Script converting

In [29]:
!jupyter nbconvert --to script solution.ipynb

[NbConvertApp] Converting notebook solution.ipynb to script
[NbConvertApp] Writing 2456 bytes to solution.py
