In [2]:
import pandas as pd
import numpy as np
import sklearn

In [3]:
# Parquet file for 2024-01 (per its file name), read straight from the URL.
trip_data_url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet'
df = pd.read_parquet(trip_data_url)

#### Q1 Downloading the data. Read the data for January. How many columns are there?

In [4]:
# df.shape is (rows, columns); position 1 is the number of columns.
n_columns = df.shape[1]
print('there are', n_columns, 'columns in the file')

there are 19 columns in the file


#### Q2 Computing duration, which is the duration of a ride in minutes. What's the standard deviation of the trip durations in January?

In [5]:
# Distinct calendar months ('M' periods) found in each timestamp column:
# drop-off first, then pick-up, with a blank-line spacer in between.
dropoff_months = df['tpep_dropoff_datetime'].dt.to_period('M').unique()
pickup_months = df['tpep_pickup_datetime'].dt.to_period('M').unique()

print(dropoff_months)
print('\n')
print(pickup_months)

<PeriodArray>
['2024-01', '2023-12', '2009-01', '2002-12', '2024-02']
Length: 5, dtype: period[M]


<PeriodArray>
['2024-01', '2023-12', '2009-01', '2002-12', '2024-02']
Length: 5, dtype: period[M]


In [6]:
# The timestamps span several years/months; the duration is derived for every
# row here, with no filtering by date.
df['duration'] = df['tpep_dropoff_datetime'].sub(df['tpep_pickup_datetime'])
# Express the timedelta as a float number of minutes.
df['duration_minutes'] = df['duration'].dt.total_seconds().div(60)

In [7]:
# Inspect the new duration_minutes column.
# Series.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
df['duration_minutes'].info()
print('\n')
print(df['duration_minutes'].describe())
print('\n')
print('standard deviation is', df['duration_minutes'].std())

<class 'pandas.core.series.Series'>
RangeIndex: 2964624 entries, 0 to 2964623
Series name: duration_minutes
Non-Null Count    Dtype  
--------------    -----  
2964624 non-null  float64
dtypes: float64(1)
memory usage: 22.6 MB
None


count    2.964624e+06
mean     1.561295e+01
std      3.485105e+01
min     -1.356667e+01
25%      7.150000e+00
50%      1.163333e+01
75%      1.868333e+01
max      9.455400e+03
Name: duration_minutes, dtype: float64


standard deviation is 34.851053592192876


In [None]:
# no missing/null values --> good, no need for fillna
# min is negative --> drop-off happens before pick-up, which is impossible --> these rows are outliers
# for now the std is 34.85 (?? this does not match any of the answer choices)

In [8]:
# Keep rows whose drop-off OR pick-up falls in month 1 (January), whatever the year.
dropoff_in_jan = df['tpep_dropoff_datetime'].dt.month.eq(1)
pickup_in_jan = df['tpep_pickup_datetime'].dt.month.eq(1)
df_jan = df[dropoff_in_jan | pickup_in_jan]

jan_minutes = df_jan['duration_minutes']
print(jan_minutes.describe())
print('\n')
print('standard deviation is', jan_minutes.std())

count    2.964616e+06
mean     1.561297e+01
std      3.485110e+01
min     -1.356667e+01
25%      7.150000e+00
50%      1.163333e+01
75%      1.868333e+01
max      9.455400e+03
Name: duration_minutes, dtype: float64


standard deviation is 34.85109769959988


In [9]:
# Exclude rows with a negative duration, then recompute the standard deviation.
non_negative = df_jan['duration_minutes'] >= 0
df_jan.loc[non_negative, 'duration_minutes'].std()

34.851347907077844

#### Q3 Dropping outliers -- keep only the records where the duration was between 1 and 60 minutes (inclusive)

In [10]:
# Variant 1: apply the 1-60 minute cut to the full data set.
# Series.between is inclusive on both ends by default, matching >=1 and <=60.
df_filtered = df[df['duration_minutes'].between(1, 60)]
kept_pct = round((df_filtered.shape[0]/df.shape[0])*100, 0)
print('the fraction/percentage of records left is', kept_pct, '%')

the fraction/percentage of records left is 98.0 %


In [11]:
# Variant 2: apply the same 1-60 minute cut to the January subset.
# Series.between is inclusive on both ends by default, matching >=1 and <=60.
df_jan_filtered = df_jan[df_jan['duration_minutes'].between(1, 60)]
jan_kept_pct = round((df_jan_filtered.shape[0]/df_jan.shape[0])*100, 0)
print('the fraction/percentage of records left is', jan_kept_pct, '%')

the fraction/percentage of records left is 98.0 %


#### Q4 one-hot encoding

In [12]:
# Collect every column whose name contains 'ID', 'id' or 'Id' (substring match),
# preserving the frame's column order.
df_ids = []
for column_name in df.columns:
    if 'ID' in column_name or 'id' in column_name or 'Id' in column_name:
        df_ids.append(column_name)
df_ids

['VendorID', 'RatecodeID', 'PULocationID', 'DOLocationID']

In [13]:
# From here on `df` is an independent (deep) copy of the filtered January rows;
# the originally loaded frame is no longer bound to this name, so re-running
# earlier cells would operate on this subset instead.
df = df_jan_filtered.copy(deep=True)

In [14]:
# Summarise dtypes and memory use of just the ID-like columns.
df.loc[:, df_ids].info()

<class 'pandas.core.frame.DataFrame'>
Index: 2898898 entries, 0 to 2964623
Data columns (total 4 columns):
 #   Column        Dtype  
---  ------        -----  
 0   VendorID      int32  
 1   RatecodeID    float64
 2   PULocationID  int32  
 3   DOLocationID  int32  
dtypes: float64(1), int32(3)
memory usage: 77.4 MB


In [15]:
# Cast the pick-up and drop-off location IDs to strings.
location_cols = ['PULocationID', 'DOLocationID']
df[location_cols] = df[location_cols].astype('str')

In [20]:
# One {column: value} dict per row (orient='records'), limited to the two
# location columns.
data_dict = df.loc[:, ['PULocationID', 'DOLocationID']].to_dict(orient='records')

In [22]:
from sklearn.feature_extraction import DictVectorizer

# Create a vectorizer with default settings, then fit it on the row dicts and
# encode them in a single call.
vectorizer = DictVectorizer()
feature_matrix = vectorizer.fit_transform(X=data_dict)

In [23]:
# Second entry of the matrix shape is the number of encoded feature columns.
n_features = feature_matrix.shape[1]
print('the dimensionality (number of columns) is', n_features)

the dimensionality (number of columns) is 518


In [24]:
# Display the (rows, columns) shape of the encoded matrix.
feature_matrix.shape

(2898898, 518)

#### Q5 Training a model