In [4]:
import pandas as pd

In [5]:
# Display the installed pandas version (the recorded run of this notebook used 1.4.2).
pd.__version__

'1.4.2'

In [6]:
!pip install pyarrow



In [7]:
# Download and parse the yellow_tripdata_2023-01 parquet file over HTTPS.
df1 = pd.read_parquet(
    path='https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet'
)

In [8]:
# Download and parse the yellow_tripdata_2023-02 parquet file over HTTPS.
df2 = pd.read_parquet(
    path='https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet'
)

In [9]:
# Preview the first five rows (head() defaults to n=5).
df1.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [10]:
# shape is (rows, columns); index 1 is the column count.
print(f"Number of columns: {df1.shape[1]}")

Number of columns: 19


In [11]:
# Trip duration as a timedelta Series: dropoff timestamp minus pickup timestamp.
df1['duration'] = df1['tpep_dropoff_datetime'].sub(df1['tpep_pickup_datetime'])

In [12]:
# Store the trip duration in minutes.
# The value is computed from the two timestamp columns instead of from
# df1['duration'] itself, so this cell can be re-run safely: once 'duration'
# holds floats, calling .dt on it would raise AttributeError.
# .dt.total_seconds() turns a timedelta such as "0 days 00:12:00" into 720.0 (seconds);
# dividing by 60 converts seconds to minutes.
df1['duration'] = (
    df1['tpep_dropoff_datetime'] - df1['tpep_pickup_datetime']
).dt.total_seconds() / 60

In [13]:
# Sample standard deviation of the trip duration (in minutes).
# ddof=1 is the pandas default, spelled out here for clarity.
std_dev = df1['duration'].std(ddof=1)

In [14]:
# Report the standard deviation of the duration column, rounded to two decimals.
print(f"The standard deviation is {std_dev:.2f} minutes")

The standard deviation is 42.59 minutes


In [15]:
# Boolean Series: True for trips lasting between 1 and 60 minutes (both ends inclusive).
mask = df1['duration'].between(1, 60)

In [20]:
# Select the rows where the mask is True and drop the rest.
# .copy() gives an independent frame, so later column assignments do not touch df1.
df1_filtered = df1.loc[mask].copy()

In [22]:
# Compare the number of rows before and after the duration filter.
print(f"Original row count: {df1.shape[0]}")
print(f"Filtered row count: {df1_filtered.shape[0]}")

Original row count: 3066766
Filtered row count: 3009173


In [23]:
# Share of the original rows that survived the duration filter.
fraction = df1_filtered.shape[0] / df1.shape[0]
print(f"fraction left is {fraction:.2%}")

fraction left is 98.12%


In [24]:
from sklearn.feature_extraction import DictVectorizer

In [25]:
# 1. Names of the columns that will be encoded as categorical features
categorical = [
    'PULocationID',
    'DOLocationID',
]

In [26]:
# 2. Cast the categorical columns to strings so the vectorizer treats each
#    value as a category rather than as a number
df1_filtered[categorical] = df1_filtered[categorical].astype(
    dict.fromkeys(categorical, str)
)


In [28]:
# 3. Convert the categorical columns into a list of per-row dictionaries.
# The 'records' orientation yields one dict per row, for example:
# [{'PULocationID': '142', 'DOLocationID': '236'}, {'PULocationID': '1', ...}]
train_dicts = df1_filtered.loc[:, categorical].to_dict('records')

In [29]:
# Learn the feature mapping from the training records and encode them in one pass.
# With string values, each distinct (column, value) pair becomes its own feature column.
dv = DictVectorizer()
X_train = dv.fit_transform(X=train_dicts)

In [30]:
# X_train.shape is (rows, features); index 1 is the number of encoded feature columns.
print(f"Dimensionality (Number of columns): {X_train.shape[1]}")

Dimensionality (Number of columns): 515
