In [1]:
import numpy as np
import pandas as pd
import  sklearn

from  sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import root_mean_squared_error
from sklearn.pipeline import Pipeline


In [2]:
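# One-time download of the January and February 2023 yellow-taxi trip parquet files
# read by the loading cell below; uncomment to fetch them into the working directory.
#!wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet
#!wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet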

In [3]:
# Load the January and February 2023 trip records from the local parquet files.
tripdata_jan_df, tripdata_feb_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        'yellow_tripdata_2023-01.parquet',
        'yellow_tripdata_2023-02.parquet',
    )
)

In [4]:
# Show the column dtypes followed by the (rows, columns) shape of the January frame.
print(tripdata_jan_df.dtypes, tripdata_jan_df.shape, sep='\n')

VendorID                          int64
tpep_pickup_datetime     datetime64[us]
tpep_dropoff_datetime    datetime64[us]
passenger_count                 float64
trip_distance                   float64
RatecodeID                      float64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
dtype: object
(3066766, 19)


In [5]:
# Count the missing values in each column of the January frame.
tripdata_jan_df.isna().sum()

VendorID                     0
tpep_pickup_datetime         0
tpep_dropoff_datetime        0
passenger_count          71743
trip_distance                0
RatecodeID               71743
store_and_fwd_flag       71743
PULocationID                 0
DOLocationID                 0
payment_type                 0
fare_amount                  0
extra                        0
mta_tax                      0
tip_amount                   0
tolls_amount                 0
improvement_surcharge        0
total_amount                 0
congestion_surcharge     71743
airport_fee              71743
dtype: int64

In [6]:
def process_duration(df, min_minutes=1, max_minutes=60):
    """Add a trip ``duration`` column (in minutes) and keep only in-range trips.

    The input frame is not modified; a new, independent DataFrame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime``
        datetime columns.
    min_minutes, max_minutes : float, optional
        Inclusive lower/upper bounds on the duration of the trips to keep.
        Defaults (1 and 60) reproduce the original hard-coded filter.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with an extra ``duration`` column, restricted to rows
        where ``min_minutes <= duration <= max_minutes``.

    Side effects
    ------------
    Prints the standard deviation of the *unfiltered* durations.
    """
    # `assign` builds a new frame, so the caller's DataFrame is left untouched.
    with_duration = df.assign(
        duration=(df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime'])
        / pd.Timedelta(minutes=1)
    )
    # Reported before filtering, i.e. outliers are included in this figure.
    print('std: ', with_duration['duration'].std())

    # `between` is inclusive on both ends, matching `>= min` and `<= max`.
    in_range = with_duration['duration'].between(min_minutes, max_minutes)

    # Explicit copy: callers may add columns to the result without pandas
    # treating it as a slice of another frame (SettingWithCopyWarning).
    return with_duration.loc[in_range].copy()

In [7]:
# Compute trip durations for January and keep only the trips that pass the
# duration filter in `process_duration`.
# NOTE(review): this rebinds `tripdata_jan_df`, so the unfiltered frame is no
# longer reachable afterwards and re-running this cell processes the already
# filtered data (the printed std would then differ).
tripdata_jan_df=process_duration(tripdata_jan_df)

std:  42.594351241920904


In [8]:
# (rows, columns) of the January frame after the duration filter was applied.
tripdata_jan_df.shape

(3009173, 20)

In [9]:
# Fraction of January rows kept by the duration filter.
# NOTE(review): both counts are literals (3,066,766 before, 3,009,173 after);
# they need to be updated by hand if the data or the filter changes.
print(
    'remaining after removing outliers: ',
    1 - (3_066_766 - 3_009_173) / 3_066_766,
)

remaining after removing outliers:  0.9812202822125979


In [10]:
def extract_feats_target(df):
    """Build the model inputs and target from a processed trip DataFrame.

    The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``PULocationID``, ``DOLocationID`` and ``duration``.

    Returns
    -------
    input_feats : list of dict
        One record per row with keys ``'PULocationID_str'`` and
        ``'DOLocationID_str'`` holding the location IDs as strings.
    target : numpy.ndarray
        The ``duration`` column as an array.
    """
    # Stringify the IDs in a temporary frame instead of writing helper columns
    # into the caller's DataFrame; `add_suffix` yields the same record keys.
    location_feats = (
        df[['PULocationID', 'DOLocationID']]
        .astype('str')
        .add_suffix('_str')
    )
    input_feats = location_feats.to_dict('records')

    target = df['duration'].to_numpy()

    return input_feats, target


In [11]:
# Two-step model: one-hot encode the feature dicts, then fit a linear regression.
# NOTE(review): `sparse=False` makes the vectorizer emit a dense array; with
# millions of rows this can be very large in memory - consider whether a sparse
# matrix would do.
pipeline = Pipeline(
    steps=[
        ('dict_vect', DictVectorizer(sparse=False)),
        ('linear_regression', LinearRegression()),
    ]
)

In [12]:
# Training inputs (list of feature dicts) and target (durations) from January.
train_feats, train_target = extract_feats_target(tripdata_jan_df)

In [13]:
# Fit the vectorizer and the regression on the January training data.
pipeline.fit(train_feats, y=train_target)

In [14]:
# Shape of the vectorized training matrix: (rows, number of encoded features).
# The transformed matrix is not stored, only its shape is displayed.
pipeline['dict_vect'].transform(train_feats).shape

(3009173, 515)

In [15]:
# RMSE of the fitted pipeline on the data it was trained on (January).
train_rmse = root_mean_squared_error(
    y_true=train_target,
    y_pred=pipeline.predict(train_feats),
)
print(train_rmse)

7.649261024900074


In [16]:
# Evaluate on February: apply the same duration processing and feature
# extraction as for January, then score the already-fitted pipeline.
tripdata_feb_df = process_duration(tripdata_feb_df)
test_feats, test_target = extract_feats_target(tripdata_feb_df)
test_rmse = root_mean_squared_error(
    y_true=test_target,
    y_pred=pipeline.predict(test_feats),
)
print(test_rmse)



std:  42.84210176105097
7.811832798910476
