In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Dependency note: pd.read_parquet needs a parquet engine such as pyarrow.
# If it is missing, install it into the kernel's environment with: %pip install pyarrow

In [3]:
# Load the January 2022 trip records; this frame is used to build the training set later on.
data = pd.read_parquet("data/yellow_tripdata_2022-01.parquet")
# Show (n_rows, n_columns) as the cell output.
data.shape

(2463931, 19)

### Q1 Read the data for January. How many columns are there?

In [4]:
# Q1 answer: shape[1] is the number of columns in the January frame.
print(f"The data has {data.shape[1]} columns")

The data has 19 columns


In [5]:
# Load the February 2022 trip records; this frame is used for validation later on.
val = pd.read_parquet("data/yellow_tripdata_2022-02.parquet")
# Preview the first rows.
val.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2022-02-01 00:06:58,2022-02-01 00:19:24,1.0,5.4,1.0,N,138,252,1,17.0,1.75,0.5,3.9,0.0,0.3,23.45,0.0,1.25
1,1,2022-02-01 00:38:22,2022-02-01 00:55:55,1.0,6.4,1.0,N,138,41,2,21.0,1.75,0.5,0.0,6.55,0.3,30.1,0.0,1.25
2,1,2022-02-01 00:03:20,2022-02-01 00:26:59,1.0,12.5,1.0,N,138,200,2,35.5,1.75,0.5,0.0,6.55,0.3,44.6,0.0,1.25
3,2,2022-02-01 00:08:00,2022-02-01 00:28:05,1.0,9.88,1.0,N,239,200,2,28.0,0.5,0.5,0.0,3.0,0.3,34.8,2.5,0.0
4,2,2022-02-01 00:06:48,2022-02-01 00:33:07,1.0,12.16,1.0,N,138,125,1,35.5,0.5,0.5,8.11,0.0,0.3,48.66,2.5,1.25


In [6]:
# Preview the first rows of the January frame.
data.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,N,68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0


In [7]:
# List the column names of the January frame.
data.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')

In [8]:
def extract_trip_time(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with a ``duration`` column (trip length in minutes).

    The duration is ``tpep_dropoff_datetime - tpep_pickup_datetime``. The input
    frame is not modified, so cells that call this function stay idempotent
    and do not silently change a frame that was displayed earlier.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with datetime columns ``tpep_pickup_datetime`` and
        ``tpep_dropoff_datetime``.

    Returns
    -------
    pd.DataFrame
        New frame with every original column plus ``duration`` as float
        minutes. An existing ``duration`` column is overwritten in the copy.
    """
    elapsed = df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
    # Vectorised .dt accessor instead of a per-row Python lambda.
    return df.assign(duration=elapsed.dt.total_seconds() / 60)

### Q2 What's the standard deviation of the trip durations in January?

In [9]:
# new_dat is the January data with a `duration` column expressed in minutes.
new_dat = extract_trip_time(data)
# Q2 answer: standard deviation of the trip duration, in minutes.
ans = new_dat['duration'].std()
print(f"The standard deviation of the trip duration is {ans:.2f}")

The standard deviation of the trip duration is 46.45


In [10]:
def remove_outlier(df: pd.DataFrame, min_duration: float = 1, max_duration: float = 60) -> pd.DataFrame:
    """Keep only the rows whose ``duration`` lies within the given bounds.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a numeric ``duration`` column (minutes).
    min_duration : float, default 1
        Smallest duration to keep (inclusive).
    max_duration : float, default 60
        Largest duration to keep (inclusive).

    Returns
    -------
    pd.DataFrame
        The filtered rows with their original index labels. Rows with a
        missing duration are dropped, since they fail both comparisons.
    """
    # between() is inclusive on both ends, matching `>= min` and `<= max`.
    in_range = df["duration"].between(min_duration, max_duration)
    return df[in_range]

### Q3 What fraction of the records is left after you drop the outliers?

In [11]:
# Q3: share of January records that survive the duration filter.
data_without_outlier = remove_outlier(new_dat)
df_num_rows = len(new_dat)
df_less_outlier_num_rows = len(data_without_outlier)
# The ratio is scaled by 100, so the message reports it as a percentage with a
# unit instead of calling it a "fraction".
percentage_left = (df_less_outlier_num_rows / df_num_rows) * 100
print(f"The percentage of the records left after I dropped the outliers is {percentage_left:.2f}%")

The fraction of the records left after I dropped the outliers is 98.28


In [12]:
# Columns used as categorical features for the model.
# NOTE(review): presumably pickup (PU) and drop-off (DO) zone IDs — confirm against the data dictionary.
category_columns = ['PULocationID', 'DOLocationID']

In [13]:
# Build the modelling frames: compute the trip duration, then filter on it.
# January is used for training and February for validation.
train_data = data.pipe(extract_trip_time).pipe(remove_outlier)
test_data = val.pipe(extract_trip_time).pipe(remove_outlier)

In [14]:
def _to_feature_dicts(frame):
    """Turn the categorical columns of ``frame`` into one dict of strings per row."""
    # Missing IDs become -1; the int -> str casts make every value a string.
    encoded = frame[category_columns].fillna(-1).astype('int').astype('str')
    return encoded.to_dict(orient="records")


train_dicts = _to_feature_dicts(train_data)
val_dicts = _to_feature_dicts(test_data)

In [15]:
# Fit the vectorizer on the training dicts only, then reuse the fitted
# vectorizer for validation so both matrices share the same columns.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [16]:
# Show both shapes; the column counts match because X_val was produced by the
# vectorizer fitted on the training data.
X_train.shape, X_val.shape

((2421440, 515), (2918187, 515))

### Q4 What's the dimensionality of this matrix (number of columns)?

In [17]:
# Q4 answer: number of feature columns in the training matrix.
print(f"The dimensionality of this matrix is {X_train.shape[1]}")

The dimensionality of this matrix is 515


In [18]:

# Regression targets: trip duration as plain NumPy arrays.
y_train = train_data["duration"].to_numpy()
y_val = test_data["duration"].to_numpy()

In [19]:
# fit() returns the estimator itself, so construction and fitting can be chained.
lr = LinearRegression().fit(X_train, y_train)
# In-sample predictions, used for the training RMSE.
y_pred = lr.predict(X_train)

### Q5 What's the RMSE on train?

In [20]:
# RMSE = sqrt(MSE). The square root is taken explicitly because the
# `squared=False` flag of mean_squared_error is deprecated in recent
# scikit-learn releases; arguments follow the documented (y_true, y_pred) order.
err = np.sqrt(mean_squared_error(y_train, y_pred))
print(f"The RMSE of on the train data is {err:.2f} ")

The RMSE of on the train data is 6.99 


### Q6 What's the RMSE on validation?

In [21]:
# Predict on the validation matrix with the model fitted on the training data.
y_pred_1 = lr.predict(X_val)
# RMSE = sqrt(MSE). The square root is taken explicitly because the
# `squared=False` flag of mean_squared_error is deprecated in recent
# scikit-learn releases; arguments follow the documented (y_true, y_pred) order.
err_1 = np.sqrt(mean_squared_error(y_val, y_pred_1))
print(f"The RMSE on the validation data is {err_1:.2f}")

The RMSE on the validation data is 7.79
