In [None]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

### Train the model and fit the vectorizer

In [None]:
# No pre-trained artifact is loaded in this homework: a simple baseline model
# is fitted right here. In practice you would load a saved vectorizer and
# model instead of retraining.

# January 2023 yellow-taxi trips serve as the training set
df_jan = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet')

# Trip duration in minutes: drop-off time minus pick-up time
df_jan['duration'] = (
    df_jan['tpep_dropoff_datetime']
    .sub(df_jan['tpep_pickup_datetime'])
    .dt.total_seconds()
    .div(60)
)

# Keep only trips lasting 1 to 60 minutes (both bounds inclusive)
df_jan = df_jan.loc[df_jan['duration'].between(1, 60)].copy()

# Location IDs are cast to strings so that DictVectorizer treats them as
# categories (one indicator column per value) rather than as numbers
categorical = ['PULocationID', 'DOLocationID']
df_jan[categorical] = df_jan[categorical].astype(str)

# One dict per trip -> feature matrix; target is the duration in minutes
train_dicts = df_jan[categorical].to_dict(orient='records')
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
y_train = df_jan['duration'].to_numpy()

# Fit the baseline regressor
lr = LinearRegression().fit(X_train, y_train)

print('Model trained successfully')

### Load March 2023 data for scoring

In [None]:
# Scoring period: March 2023
year, month = 2023, 3

# The file name is built from the zero-padded year and month
df = pd.read_parquet(
    f'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{year:04d}-{month:02d}.parquet'
)

print(f'Data loaded: {len(df)} records')

### Preprocess the data

In [None]:
# Trip duration in minutes: drop-off time minus pick-up time
df['duration'] = (
    df['tpep_dropoff_datetime']
    .sub(df['tpep_pickup_datetime'])
    .dt.total_seconds()
    .div(60)
)

# Keep only trips lasting 1 to 60 minutes (both bounds inclusive)
df = df.loc[df['duration'].between(1, 60)].copy()

# Cast the location IDs to strings, the form the vectorizer is fed with
categorical = ['PULocationID', 'DOLocationID']
df[categorical] = df[categorical].astype(str)

print(f'Data after filtering: {len(df)} records')

### Make predictions

In [None]:
# Encode each trip as a dict and run it through the already-fitted vectorizer.
# Per the scikit-learn docs, feature names not seen during fit are silently
# ignored by transform.
dicts = df.loc[:, categorical].to_dict(orient='records')
X = dv.transform(dicts)

# Predicted duration for every scored trip
y_pred = lr.predict(X)

print(f'Predictions made for {len(y_pred)} records')

### Q1: Standard deviation of predicted duration

In [None]:
# Spread of the predicted durations (NumPy's default std, ddof=0)
std_pred = y_pred.std()
print(f'Standard deviation of predicted duration: {std_pred:.2f}')

### Q2: Prepare output and save as parquet

In [None]:
# ride_id has the form "<YYYY>/<MM>_<row index>"; the index values are those
# the frame currently carries
ride_prefix = f'{year:04d}/{month:02d}_'
df['ride_id'] = ride_prefix + df.index.astype('str')

# Two-column result: the ride identifier and its predicted duration
df_result = df[['ride_id']].assign(predicted_duration=y_pred)

# Write the result uncompressed and without the index column
output_file = f'predictions_{year:04d}_{month:02d}.parquet'
df_result.to_parquet(output_file, engine='pyarrow', compression=None, index=False)

print(f'Results saved to {output_file}')

In [None]:
# Report the on-disk size of the parquet file written above.
# `os` is imported in the notebook's top import cell, so this cell does not
# carry its own import.
file_size = os.path.getsize(output_file)

# Bytes -> mebibytes (1024 * 1024 bytes), printed rounded to a whole number
file_size_mb = file_size / (1024 * 1024)
print(f'File size: {file_size_mb:.0f}M')