In [37]:
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [38]:
# Raw Yellow Taxi trip records: January 2022 feeds the training matrix,
# February 2022 the validation matrix further down.
jan_path = "yellow_tripdata_2022-01.parquet"
feb_path = "yellow_tripdata_2022-02.parquet"
data_jan = pd.read_parquet(jan_path)
data_feb = pd.read_parquet(feb_path)

Question 1: Number of columns in Jan 2022 Yellow Taxi Trip data:

In [39]:
# List the column labels of the January frame, then report how many there are.
jan_columns = data_jan.columns
print(jan_columns)
print(f"Number of columns in Jan 2022 Yellow Taxi Trip data: {len(jan_columns)}")

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')
Number of columns in Jan 2022 Yellow Taxi Trip data: 19


Question 2: Standard deviation of the trip duration in Jan 2022 Yellow Taxi Trip data

In [40]:
# Derive the trip duration for both months: first as a timedelta
# (drop-off minus pick-up), then as a float number of minutes.
for trips in (data_jan, data_feb):
    trips['duration'] = trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    trips['duration_in_minutes'] = trips['duration'].dt.total_seconds() / 60.0

In [41]:
# Spread of the January trip durations (in minutes), computed before any filtering.
jan_duration_std = data_jan['duration_in_minutes'].std()
print(f"Standard deviation of the trips duration in Jan 2022 Yellow Taxi Trip data: {jan_duration_std}")

Standard deviation of the trips duration in Jan 2022 Yellow Taxi Trip data: 46.44530513776499


Question 3: Fraction of the records left after dropping the outliers

In [42]:
# Summary statistics of the January durations, with extra upper-tail
# percentiles to show how far the extreme values sit from the bulk of trips.
upper_tail = [0.95, 0.98, 0.99]
data_jan['duration_in_minutes'].describe(percentiles=upper_tail)

count    2.463931e+06
mean     1.421220e+01
std      4.644531e+01
min     -3.442400e+03
50%      1.018333e+01
95%      3.193333e+01
98%      4.215000e+01
99%      5.085000e+01
max      8.513183e+03
Name: duration_in_minutes, dtype: float64

In [43]:
# Row count of the unfiltered January data; kept as the denominator for the
# "fraction of records left" figure computed after the outliers are dropped.
n_rows_data_jan_full = data_jan.shape[0]
n_rows_data_jan_full

2463931

In [44]:
# Remove outliers: keep trips lasting at least 1 minute and strictly less than
# 60 minutes. The trailing .copy() makes each result an independent frame, so
# later column assignments on data_jan / data_feb do not act on a slice of the
# unfiltered data (the situation pandas reports as SettingWithCopyWarning).
data_jan = data_jan[(data_jan.duration_in_minutes >= 1.0) & (data_jan.duration_in_minutes < 60.0)].copy()
data_feb = data_feb[(data_feb.duration_in_minutes >= 1.0) & (data_feb.duration_in_minutes < 60.0)].copy()

In [45]:
# Share of the January rows that survived the duration filter, as a percentage.
pct_rows_kept = len(data_jan) / n_rows_data_jan_full * 100
print(f"The percentual fraction of data left after removing the outliers is: {pct_rows_kept}%")

The percentual fraction of data left after removing the outliers is: 98.2721512899509%


Question 4: Dimensionality after one-hot encoding (OHE)

In [46]:
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

# Cast the location IDs to strings so DictVectorizer treats them as categories
# (one column per distinct value) rather than as numbers. astype() with a
# column -> dtype mapping returns a new frame, so nothing is assigned into a
# filtered slice of the original data.
data_jan = data_jan.astype({col: str for col in categorical})
data_feb = data_feb.astype({col: str for col in categorical})

dv = DictVectorizer()

# Fit the vectorizer on the January (training) records only.
train_dicts = data_jan[categorical + numerical].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

# Re-use the already fitted vectorizer for February: transform() keeps the
# column layout learned from the training data. Fitting a second vectorizer on
# the validation records would give X_val its own, incompatible feature space.
val_dicts = data_feb[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [47]:
# X_train holds the one-hot columns plus one column for the numerical
# 'trip_distance' feature; subtract that single column to count only the
# one-hot encoded ones.
n_train_columns = X_train.shape[1]
print(f"The nunmber of columns after OHE is: {n_train_columns - 1}")

The nunmber of columns after OHE is: 515


In [49]:
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']
feature_columns = categorical + numerical

dv = DictVectorizer()

# Learn the feature layout from the January records and encode them.
train_dicts = data_jan[feature_columns].to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

# Encode the February records with the same fitted vectorizer, so both
# matrices share one column layout.
val_dicts = data_feb[feature_columns].to_dict(orient='records')
X_val = dv.transform(val_dicts)

Question 5: RMSE on train

In [59]:
# Regression target: trip duration in minutes, pulled out as plain arrays
# for the training (January) and validation (February) frames.
target = 'duration_in_minutes'

y_train, y_val = data_jan[target].values, data_feb[target].values

In [60]:
# Ordinary least-squares linear regression with default settings, fitted on
# the January feature matrix and durations.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [61]:
# The assignment asks for the error on the training set, so predict on the
# same January matrix the model was fitted on.
y_pred_jan = model.predict(X=X_train)

In [67]:
# RMSE on the training data. The square root is taken explicitly instead of
# passing squared=False, because that keyword was deprecated in scikit-learn 1.4
# and removed in 1.6; for a single target the resulting value is identical.
rmse_train = mean_squared_error(y_train, y_pred_jan) ** 0.5
print(f"The RMSE of the model on the training data is: {round(rmse_train, 2)}")

The RMSE of the model on the training data is: 7.0


Question 6: RMSE on validation

In [68]:
# Predict durations for the February (validation) matrix with the model
# fitted on January.
y_pred_feb = model.predict(X=X_val)

In [69]:
# RMSE on the validation data. The square root is taken explicitly instead of
# passing squared=False, because that keyword was deprecated in scikit-learn 1.4
# and removed in 1.6; for a single target the resulting value is identical.
rmse_val = mean_squared_error(y_val, y_pred_feb) ** 0.5
print(f"The RMSE of the model on the validation data is: {round(rmse_val, 2)}")

The RMSE of the model on the validation data is: 7.79
