In [1]:
import pickle
import warnings
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge
from sklearn.metrics import mean_squared_error
warnings.filterwarnings("ignore")

In [2]:
def read_taxi_data(filename, data_dir="/home/codespace/dataset"):
    """Load one parquet file of yellow-taxi trips and add the trip duration.

    Parameters
    ----------
    filename : str
        Name of the parquet file inside ``data_dir``.
    data_dir : str, optional
        Directory that holds the parquet files. The default is the location
        the notebook was originally written against; pass another directory
        to run the notebook on a different machine.

    Returns
    -------
    pandas.DataFrame
        The file's contents with ``tpep_pickup_datetime`` and
        ``tpep_dropoff_datetime`` converted via ``pd.to_datetime``, an added
        ``duration`` column (drop-off minus pickup, in minutes) and
        ``PULocationID`` / ``DOLocationID`` cast to ``str``.
    """
    data = pd.read_parquet(f"{data_dir}/{filename}")
    data["tpep_dropoff_datetime"] = pd.to_datetime(data["tpep_dropoff_datetime"])
    data["tpep_pickup_datetime"] = pd.to_datetime(data["tpep_pickup_datetime"])

    # Vectorised timedelta -> minutes conversion; avoids calling a Python
    # lambda once per row.
    elapsed = data["tpep_dropoff_datetime"] - data["tpep_pickup_datetime"]
    data["duration"] = elapsed.dt.total_seconds() / 60

    # Location IDs are cast to strings so they are treated as categories
    # rather than numbers when they are vectorised later in the notebook.
    categorical = ["PULocationID", "DOLocationID"]
    data[categorical] = data[categorical].astype(str)
    return data

##### LOAD TRAIN & VALIDATION DATA

In [3]:
# January 2023 is used for training, February 2023 for validation.
training_file_path = "yellow_tripdata_2023-01.parquet"
validation_file_path = "yellow_tripdata_2023-02.parquet"

# Load both months with the same reader, training file first.
full_train, full_validation = (
    read_taxi_data(path)
    for path in (training_file_path, validation_file_path)
)

In [4]:
# QUESTION 01 -- number of columns in the January file as loaded.
# `duration` is added by read_taxi_data, so it is left out of the count.
print(f"Number of columns in the initial dataset is : {full_train.shape[1] - 1}")
# Recorded answer: 19

Number of columns in the initial dataset is : 19


In [5]:
# QUESTION 02 -- summary statistics of the January trip durations (minutes).
# describe() is evaluated once and reused for the standard deviation, rather
# than scanning the whole column a second time.
duration_summary = full_train.duration.describe()

print(duration_summary)
print("---------------")
print(f"Standard deviation of the trips duration in January is {round(duration_summary['std'], 2)}")
# Recorded answer: 42.59

count    3.066766e+06
mean     1.566900e+01
std      4.259435e+01
min     -2.920000e+01
25%      7.116667e+00
50%      1.151667e+01
75%      1.830000e+01
max      1.002918e+04
Name: duration, dtype: float64
---------------
Standard deviation of the trips duration in January is 42.59


In [6]:
# QUESTION 03 -- drop outliers: keep trips lasting from 1 to 60 minutes,
# both bounds included.
df_train = full_train.loc[full_train.duration.between(1, 60)]

# Share of the original records that survive the filter, as a whole percent.
fraction_left = round(len(df_train) / len(full_train) * 100)
print( f"Fraction of the records left after you dropped the outliers is  {fraction_left}%")
# Recorded answer: 98%

Fraction of the records left after you dropped the outliers is  98%


In [7]:
# QUESTION 04 -- vectorise the pickup / drop-off location IDs.
categorical = ['PULocationID', 'DOLocationID']

# One {column: value} dict per trip, the input format DictVectorizer expects.
train_dicts = df_train.loc[:, categorical].to_dict("records")

# Fit the vectorizer on the training trips; the fitted `dv` is reused for the
# validation data further down.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
    

In [8]:
# Number of feature columns produced by the vectorizer (last axis of X_train).
print(f"The dimensionality of this matrix (number of columns) is {X_train.shape[-1]}")
# Recorded answer: 515

The dimensionality of this matrix (number of columns) is 515


In [9]:
# QUESTION 05 -- fit a plain linear regression on the vectorised location IDs
# and measure its error on the same training data.
target = "duration"
y_train = df_train[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

# RMSE is computed as sqrt(MSE). The `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so relying on it makes this cell fail on current releases.
rms_train = mean_squared_error(y_train, y_pred) ** 0.5
print(f"RMSE on train: {round(rms_train, 3)}")
# Recorded answer: 7.649

RMSE on train: 7.649


In [10]:
# QUESTION 06 -- evaluate the fitted model on the February data.
# The validation set gets the same 1-60 minute duration filter as training.
df_val = full_validation[(full_validation.duration >= 1) & (full_validation.duration <= 60)]

# Use transform (not fit_transform) so the validation matrix keeps the feature
# columns learned from the training data.
val_dicts = df_val[categorical].to_dict(orient="records")
X_val = dv.transform(val_dicts)
y_val = df_val[target].values

y_pred = lr.predict(X_val)

# RMSE is computed as sqrt(MSE). The `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so relying on it makes this cell fail on current releases.
rms_validation = mean_squared_error(y_val, y_pred) ** 0.5
print(f"RMSE on validation: {round(rms_validation, 3)}")
# Recorded answer: 7.812

RMSE on validation: 7.812
