## Import libraries

In [1]:
import pandas as pd 
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np 


## Loading the Data

In [2]:
# Read the January and February 2023 trip files from the local ./data folder.
jan_path = "./data/yellow_tripdata_2023-01.parquet"
feb_path = "./data/yellow_tripdata_2023-02.parquet"

data_jan, data_feb = pd.read_parquet(jan_path), pd.read_parquet(feb_path)

In [3]:
# Summarize the January 2023 frame: index range, column names and dtypes.
data_jan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3066766 entries, 0 to 3066765
Data columns (total 19 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee           

In [4]:
# Column names of the January frame.
data_jan.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')

In [5]:
# Column names of the February frame.
data_feb.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'Airport_fee'],
      dtype='object')

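* The January 2023 data has 19 features (columns).
* The February 2023 data has the same 19 columns, except that the last one is spelled `Airport_fee` (capital "A") instead of `airport_fee`.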

## Computing duration

In [6]:
# Trip duration in minutes: drop-off timestamp minus pick-up timestamp,
# converted from seconds.
jan_elapsed = data_jan['tpep_dropoff_datetime'] - data_jan['tpep_pickup_datetime']
data_jan['duration'] = jan_elapsed.dt.total_seconds() / 60

# Spread of the January durations.
std_duration = data_jan['duration'].std()

print(f"The standard deviation of the trips duration in January is {std_duration:.2f} minutes.")



The standard deviation of the trips duration in January is 42.59 minutes.


In [7]:
# February: trip duration in minutes, derived the same way as for January
# (drop-off minus pick-up, seconds converted to minutes).
feb_elapsed = data_feb['tpep_dropoff_datetime'] - data_feb['tpep_pickup_datetime']
data_feb['duration'] = feb_elapsed.dt.total_seconds() / 60

# Spread of the February durations.
std_duration_feb = data_feb['duration'].std()

print(f"The standard deviation of the trips duration in February is {std_duration_feb:.2f} minutes.")


The standard deviation of the trips duration in February is 42.84 minutes.


## Dropping Outliers

In [8]:
# How often each distinct January duration value (in minutes) occurs.
data_jan['duration'].value_counts()

duration
7.000000       3697
8.000000       3665
9.000000       3558
6.000000       3523
11.000000      3518
               ... 
1395.516667       1
217.666667        1
1424.766667       1
1413.066667       1
1421.350000       1
Name: count, Length: 8579, dtype: int64

In [9]:
# Keep only January trips lasting between 1 and 60 minutes (both ends inclusive).
in_range = data_jan['duration'].between(1, 60)
filter_data = data_jan[in_range]

# Share of the original January rows that survive the filter, as a percentage.
kept_share = len(filter_data) / len(data_jan)
fraction = kept_share * 100

print(f"The fraction of records left after dropping outliers is {fraction:.0f}%")

The fraction of records left after dropping outliers is 98%


## One Hot Encoding

In [10]:
# Summarize the filtered January frame (index, columns and dtypes).
filter_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3009173 entries, 0 to 3066765
Data columns (total 20 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   VendorID               int64         
 1   tpep_pickup_datetime   datetime64[us]
 2   tpep_dropoff_datetime  datetime64[us]
 3   passenger_count        float64       
 4   trip_distance          float64       
 5   RatecodeID             float64       
 6   store_and_fwd_flag     object        
 7   PULocationID           int64         
 8   DOLocationID           int64         
 9   payment_type           int64         
 10  fare_amount            float64       
 11  extra                  float64       
 12  mta_tax                float64       
 13  tip_amount             float64       
 14  tolls_amount           float64       
 15  improvement_surcharge  float64       
 16  total_amount           float64       
 17  congestion_surcharge   float64       
 18  airport_fee            floa

In [11]:
# Select the input features for the model: pickup and dropoff location IDs.
# They are cast to strings so DictVectorizer one-hot encodes them as categories
# instead of treating them as numeric values.
new_filter = filter_data[['PULocationID', 'DOLocationID']].astype(str)

# Turn the data frame into a list of dictionaries (one dict per trip).
jan_dicts = new_filter.to_dict(orient='records')

# Keep the vectorizer's output sparse. A dense float64 matrix of
# ~3 million rows x 515 columns needs roughly 12 GB of RAM, whereas each row
# only has two non-zero entries. LinearRegression accepts sparse input.
DV = DictVectorizer(sparse=True)

# Fit the vectorizer on January and build the feature matrix.
feature_matrix = DV.fit_transform(jan_dicts)

# Number of one-hot columns (distinct pickup + dropoff location values).
columns_num = feature_matrix.shape[1]

print(f"The dimensionality of the feature matrix is {columns_num} columns.")

The dimensionality of the feature matrix is 515 columns.


## Training a model

In [12]:
# Inputs are the one-hot feature matrix; the target is the trip duration
# of the filtered January data.
X = feature_matrix
y = filter_data['duration'].to_numpy()

# Create the linear regression and fit it in one step (fit returns the estimator).
model = LinearRegression().fit(X, y)

# Predict on the same data the model was trained on.
y_pred = model.predict(X)

: 

In [None]:
# Root mean squared error of the predictions on the training data.
train_mse = mean_squared_error(y, y_pred)
rmse = np.sqrt(train_mse)

print(f"The RMSe on the training data is {rmse:.2f} minutes.")

## Evaluating the model

In [None]:
# Apply the same 1-60 minute duration filter that was used on the January
# training data, so the validation set is drawn from a comparable population.
feb_valid = data_feb[(data_feb['duration'] >= 1) & (data_feb['duration'] <= 60)]

# Same input features as in training: location IDs cast to strings.
data_feb_filter = feb_valid[['PULocationID', 'DOLocationID']].astype(str)

data_feb_dicts = data_feb_filter.to_dict(orient='records')

# Reuse the vectorizer fitted on January (`DV`) and only transform here, so the
# February matrix has exactly the columns the model was trained on.
X_val = DV.transform(data_feb_dicts)
# Take the target from the same filtered frame so rows stay aligned with X_val.
y_val = feb_valid['duration'].values

y_pred_val = model.predict(X_val)

In [None]:
# Root mean squared error of the predictions on the February validation data.
val_mse = mean_squared_error(y_val, y_pred_val)
rmse_val = np.sqrt(val_mse)

print(f"The RMSE for the validation data is {rmse_val:.2f} minutes.")