In [None]:
# !pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-16.1.0-cp39-cp39-manylinux_2_28_x86_64.whl (40.8 MB)
[K     |████████████████████████████████| 40.8 MB 10.4 MB/s eta 0:00:01


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [2]:
# check the web: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
# Jan, Feb 2023, Yellow Taxi Trip Records
yellow_taxi_23_jan_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet'
yellow_taxi_23_feb_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet'
df_23_jan = pd.read_parquet(yellow_taxi_23_jan_url)

df_23_jan.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [3]:
print(f"The nuber of columns in the dataset is {len(df_23_jan.columns)}")

The nuber of columns in the dataset is 19


In [4]:
# new column "duration", difference between tpep_dropoff_datetime tpep_pickup_datetime
# convert the duration to minutes
df_23_jan['duration'] = (df_23_jan['tpep_dropoff_datetime'] - df_23_jan['tpep_pickup_datetime']).dt.total_seconds() / 60
print(f"The std of the duration is {df_23_jan['duration'].std()}")

The std of the duration is 42.594351241920904


In [5]:
# only keep the row where duration is greater than 1 and less than 60
df_23_jan_filter_dur = df_23_jan[(df_23_jan['duration'] >= 1) & (df_23_jan['duration'] <= 60)]
print(f"The percentage of rows that have duration between 0 and 60 minutes is {len(df_23_jan_filter_dur) / len(df_23_jan) * 100}%")

The percentage of rows that have duration between 0 and 60 minutes is 98.1220282212598%


In [6]:
df_23_jan_filter_dur[['PULocationID', 'DOLocationID']].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3009173 entries, 0 to 3066765
Data columns (total 2 columns):
 #   Column        Dtype
---  ------        -----
 0   PULocationID  int64
 1   DOLocationID  int64
dtypes: int64(2)
memory usage: 68.9 MB


In [7]:
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

df_23_jan_filter_dur[categorical] = df_23_jan_filter_dur[categorical].astype(str)
# show me the info of the dataset
df_23_jan_filter_dur[['PULocationID', 'DOLocationID']].info()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_23_jan_filter_dur[categorical] = df_23_jan_filter_dur[categorical].astype(str)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 3009173 entries, 0 to 3066765
Data columns (total 2 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   PULocationID  object
 1   DOLocationID  object
dtypes: object(2)
memory usage: 68.9+ MB


In [11]:
X_train_df = df_23_jan_filter_dur[categorical]
X_train_dict = X_train_df.to_dict(orient='records')
dv = DictVectorizer()
X_train = dv.fit_transform(X_train_dict)

y_train = df_23_jan_filter_dur['duration'].values

In [9]:
X_train[:5]

<5x515 sparse matrix of type '<class 'numpy.float64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [17]:
print(f"The dimensionality of the dataset is {X_train.shape}")

The dimensionality of the dataset is (3009173, 515)


In [12]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_train)

print(f"The mean squared error of the linear regression model is {mean_squared_error(y_train, y_pred)}")

The mean squared error of the linear regression model is 58.51120959168245


In [15]:
# RMSE of the linear regression model
print(f"The RMSE of the linear regression model is {np.sqrt(mean_squared_error(y_train, y_pred))}")

The RMSE of the linear regression model is 7.649262029221019


In [16]:
# fit the model for the Feb 2023 dataset
df_23_feb = pd.read_parquet(yellow_taxi_23_feb_url)
df_23_feb['duration'] = (df_23_feb['tpep_dropoff_datetime'] - df_23_feb['tpep_pickup_datetime']).dt.total_seconds() / 60
df_23_feb_filter_dur = df_23_feb[(df_23_feb['duration'] >= 1) & (df_23_feb['duration'] <= 60)]
df_23_feb_filter_dur[categorical] = df_23_feb_filter_dur[categorical].astype(str)
X_test_df = df_23_feb_filter_dur[categorical]
X_test_dict = X_test_df.to_dict(orient='records')
X_test = dv.transform(X_test_dict)
y_test = df_23_feb_filter_dur['duration'].values
y_pred = lr.predict(X_test)
print(f"The RMSE of the linear regression model on the Feb 2023 dataset is {np.sqrt(mean_squared_error(y_test, y_pred))}")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_23_feb_filter_dur[categorical] = df_23_feb_filter_dur[categorical].astype(str)


The RMSE of the linear regression model on the Feb 2023 dataset is 7.8118169669491095
