In [1]:
# Use the %pip magic instead of !pip so the package is installed into the
# environment of the running kernel rather than whatever `pip` is on PATH.
%pip install pyarrow



In [2]:
# Download the 2023-01 trip file. -nc (no-clobber) skips the download when the
# file already exists, so re-running this cell does not leave a duplicate
# "yellow_tripdata_2023-01.parquet.1" next to the file the notebook reads.
!wget -nc https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet

--2025-03-20 16:48:10--  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 18.239.38.83, 18.239.38.147, 18.239.38.163, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|18.239.38.83|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 47673370 (45M) [application/x-www-form-urlencoded]
Saving to: ‘yellow_tripdata_2023-01.parquet’


2025-03-20 16:48:10 (274 MB/s) - ‘yellow_tripdata_2023-01.parquet’ saved [47673370/47673370]



In [3]:
# Download the 2023-02 trip file. -nc (no-clobber) skips the download when the
# file already exists, so re-running this cell does not leave a duplicate
# "yellow_tripdata_2023-02.parquet.1" next to the file the notebook reads.
!wget -nc https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet

--2025-03-20 16:48:13--  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 18.239.38.147, 18.239.38.181, 18.239.38.163, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|18.239.38.147|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 47748012 (46M) [application/x-www-form-urlencoded]
Saving to: ‘yellow_tripdata_2023-02.parquet’


2025-03-20 16:48:13 (191 MB/s) - ‘yellow_tripdata_2023-02.parquet’ saved [47748012/47748012]



In [4]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [5]:
# Q1 Answer

# Read the raw 2023-01 trip file and report how many columns it contains.
df = pd.read_parquet('./yellow_tripdata_2023-01.parquet')
n_columns = df.shape[1]
print("Number of columns:", n_columns)  # 19

Number of columns: 19


In [6]:
# Q2 Answer

# Trip duration in MINUTES: dropoff minus pickup, converted to seconds and
# then divided by 60.
elapsed = df['tpep_dropoff_datetime'].sub(df['tpep_pickup_datetime'])
df['duration'] = elapsed.dt.total_seconds().div(60)

# Standard deviation of the duration column, computed before any filtering
std_duration = df['duration'].std()
print("Standard deviation of duration in January:", std_duration) # 42.59

Standard deviation of duration in January: 42.594351241920904


In [7]:
def read_dataframe(filename):
    """Load a trip-record file and prepare it for duration modelling.

    Parameters
    ----------
    filename : str
        Path to a ``.csv`` or ``.parquet`` file containing the columns
        ``tpep_pickup_datetime``, ``tpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID``. For CSV input the two
        timestamp columns are parsed with ``pd.to_datetime``; parquet
        columns are used as loaded.

    Returns
    -------
    pandas.DataFrame
        The rows whose ``duration`` (dropoff minus pickup, in minutes) lies
        in the closed interval [1, 60], with ``PULocationID`` and
        ``DOLocationID`` cast to ``str``.

    Raises
    ------
    ValueError
        If ``filename`` ends with neither ``.csv`` nor ``.parquet``.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)
        df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
        df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        # Without this branch `df` would be unbound below and the caller would
        # get a confusing UnboundLocalError instead of a clear message.
        raise ValueError(
            f"Unsupported file type for {filename!r}: expected .csv or .parquet"
        )

    df['duration'] = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds() / 60

    # Keep trips lasting between 1 and 60 minutes (inclusive). The explicit
    # copy makes the result an independent frame, so the column assignment
    # below does not write into a slice of the unfiltered data
    # (SettingWithCopyWarning).
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    # Location IDs are cast to strings so they are treated as categories
    # rather than numeric values downstream.
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    return df

In [8]:
# Load datasets: the 2023-01 file becomes the training frame and the 2023-02
# file the validation frame, both passed through read_dataframe.
df_train, df_val = map(
    read_dataframe,
    ('./yellow_tripdata_2023-01.parquet', './yellow_tripdata_2023-02.parquet'),
)

In [9]:
# Q3 Answer
# Percentage of January rows that survive the 1-60 minute duration filter.
# `df` (loaded for Q1) still holds every raw January row -- only a column was
# added to it for Q2 -- so its length is the denominator and the parquet file
# does not have to be read a second time. Assumes the cells were run in order.
fraction_left = len(df_train) / len(df) * 100
print("Fraction of remaining records:", fraction_left)  # 98%

Fraction of remaining records: 98.1220282212598


In [10]:
# Q4 Answer: One-hot encoding
# Turn each trip's pickup/dropoff IDs into a {column: value} record.
location_cols = ['PULocationID', 'DOLocationID']
features_train = df_train[location_cols].to_dict(orient="records")
features_val = df_val[location_cols].to_dict(orient="records")

# Fit the vectorizer on the training records only; the validation records are
# transformed with the already-fitted vectorizer.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(features_train)
X_val = dv.transform(features_val)

n_features = X_train.shape[1]
print("Number of columns in feature matrix:", n_features)  # 515

Number of columns in feature matrix: 515


In [11]:
# Q5 Answer: Train Linear Regression model
# Target: trip duration (minutes) of the filtered training trips.
y_train = df_train['duration'].values
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_train = lr.predict(X_train)
# RMSE = sqrt(MSE). The `squared=False` shortcut is deprecated since
# scikit-learn 1.4 and removed in 1.6, so take the square root explicitly;
# this form works on old and new versions alike.
rmse_train = mean_squared_error(y_train, y_pred_train) ** 0.5
print("RMSE on Training Data:", rmse_train)  # 7.64

RMSE on Training Data: 7.649261027826866


In [12]:
# Q6 Answer: Evaluate on Validation Data
# Target: trip duration (minutes) of the filtered validation trips.
y_val = df_val['duration'].values
y_pred_val = lr.predict(X_val)
# RMSE = sqrt(MSE). The `squared=False` shortcut is deprecated since
# scikit-learn 1.4 and removed in 1.6, so take the square root explicitly;
# this form works on old and new versions alike.
rmse_val = mean_squared_error(y_val, y_pred_val) ** 0.5
print("RMSE on Validation Data:", rmse_val)  # 7.81

RMSE on Validation Data: 7.811832641626525
