We'll use the same NYC taxi dataset, but instead of "Green Taxi Trip Records", we'll use "Yellow Taxi Trip Records".
https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page

In [1]:
import sys

# Print the Python version
print("Python version:", sys.version)

Python version: 3.10.11 (main, Apr 20 2023, 19:02:41) [GCC 11.2.0]


In [2]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [3]:
#!wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-01.parquet
#!wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2022-02.parquet

In [4]:
data_jan = pd.read_parquet(".data/yellow_tripdata_2022-01.parquet")
data_feb = pd.read_parquet(".data/yellow_tripdata_2022-02.parquet")

### Q 1
Read the data for January. How many columns are there?

In [5]:
# Assuming data_jan is your DataFrame object
column_count = len(data_jan.columns)
print("Number of Columns in data_jan:", column_count)

Number of Columns in data_jan: 19


### Q2. Computing duration

Now let's compute the duration variable. It should contain the duration of a ride in minutes.

What's the standard deviation of the trips duration in January?

In [6]:
data_jan.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.8,1.0,N,142,236,1,14.5,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.1,1.0,N,236,42,1,8.0,0.5,0.5,4.0,0.0,0.3,13.3,0.0,0.0
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.5,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,8.0,0.5,0.5,0.0,0.0,0.3,11.8,2.5,0.0
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.3,1.0,N,68,163,1,23.5,0.5,0.5,3.0,0.0,0.3,30.3,2.5,0.0


In [7]:
data_jan['duration'] = data_jan['tpep_dropoff_datetime'] - data_jan['tpep_pickup_datetime']
data_jan['duration_minutes'] = data_jan['duration'].dt.total_seconds() // 60
data_jan.head()


# Calculate the standard deviation of the 'duration_minutes' column
duration_std = data_jan['duration_minutes'].std()

# Display the result
print("Standard Deviation of duration_minutes:", round(duration_std, 2))

Standard Deviation of duration_minutes: 46.45


### Q3. Dropping outliers

In [8]:
len_original = data_jan.shape[0]

In [9]:
data_jan_cleaned = data_jan[(data_jan.duration_minutes >= 1) & (data_jan.duration_minutes <= 60)]

In [10]:
print("The fraction of the records that is left after dropping the outliers?:", round(data_jan_cleaned.shape[0]/len_original*100, 0))

The fraction of the records that is left after dropping the outliers?: 98.0


### Q4. One-hot encoding

Let's apply one-hot encoding to the pickup and dropoff location IDs. We'll use only these two features for our model.

    Turn the dataframe into a list of dictionaries
    Fit a dictionary vectorizer
    Get a feature matrix from it

What's the dimensionality of this matrix (number of columns)?

In [11]:
#first check for missing values.
print(data_jan_cleaned['DOLocationID'].isnull().sum())
print(data_jan_cleaned['PULocationID'].isnull().sum())
#there are no missing values in the data.

0
0


In [12]:
data_jan_temp = data_jan_cleaned[['PULocationID','DOLocationID']]
print(data_jan_temp.dtypes)

data_jan_temp = data_jan_temp.astype(str)
train_dicts = data_jan_temp.to_dict(orient='records')

dv = DictVectorizer()

X_train = dv.fit_transform(train_dicts)

PULocationID    int64
DOLocationID    int64
dtype: object


In [13]:
print(X_train.shape[0])
print(X_train.shape[1])

2423325
515


In [14]:
y_train = data_jan_cleaned['duration_minutes'].values
len(y_train)

2423325

In [15]:
# Create a LinearRegression object
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

In [16]:
# Make predictions on the training data
y_train_pred = model.predict(X_train)

# Calculate the mean squared error
mse = mean_squared_error(y_train, y_train_pred)

# Calculate the RMSE
rmse = np.sqrt(mse)

print("RMSE on training data:", rmse)

RMSE on training data: 7.016720168173186


In [17]:
data_feb['duration'] = data_feb['tpep_dropoff_datetime'] - data_feb['tpep_pickup_datetime']
data_feb['duration_minutes'] = data_feb['duration'].dt.total_seconds() // 60

data_feb_cleaned = data_feb[(data_feb.duration_minutes >= 1) & (data_feb.duration_minutes <= 60)]

data_feb_temp = data_feb_cleaned[['PULocationID','DOLocationID']]

print("The shape of the data_feb_temp is", data_feb_temp.shape)

data_feb_temp = data_feb_temp.astype(str)
train_dicts_feb = data_feb_temp.to_dict(orient='records')

 
X_feb = dv.transform(train_dicts_feb)

y_feb = data_feb_cleaned['duration_minutes'].values

print(X_feb.shape)

The shape of the data_feb_temp is (2921396, 2)
(2921396, 515)


In [18]:
y_pred = model.predict(X_feb)

In [21]:
mse = mean_squared_error(y_pred, y_feb)

# Calculate the RMSE
rmse = np.sqrt(mse)

print("RMSE on training data:", round(rmse, 2))

RMSE on training data: 7.83
