#### Feature Engineering  

Datasets:
- _cabs_clean.csv_
- _neighborhoods_clean.csv_
- _trips_clean.csv_
- _weather_records_clean.csv_

Author: Luis Sergio Pastrana Lemus  
Date: 2025-07-17

# Feature engineering – Zuber Travel Activity Dataset

## __1. Libraries__.

In [1]:
from pathlib import Path
import sys

# Resolve the project root as the parent of the current working directory
# (assumes the notebook is run from a folder one level below the root — TODO confirm).
project_root = Path.cwd().parent

# Add the project root to sys.path (if not already present) so the `src` package is importable
if str(project_root) not in sys.path:

    sys.path.append(str(project_root))

# Wildcard import of the project helpers; this notebook calls load_dataset_from_csv,
# normalize_datetime and format_notebook, which are presumably provided by `src`.
# NOTE(review): explicit `from src import <name>` imports would make the origin of these names visible.
from src import *

from functools import partial
from IPython.display import display, HTML
import numpy as np
import os
import pandas as pd

## __2. Path to Data file__.

In [2]:
# Build the path to the cleaned-data directory and load the four CSV files.
# All files are read with the same options, so the loader is applied to each
# file name in turn and the results are unpacked in the same order.
data_file_path = project_root / "data" / "processed" / "clean"
df_cabs_clean, df_neighborhoods_clean, df_trips_clean, df_weather_records_clean = (
    load_dataset_from_csv(data_file_path, csv_name, sep=',', header='infer')
    for csv_name in (
        "cabs_clean.csv",
        "neighborhoods_clean.csv",
        "trips_clean.csv",
        "weather_records_clean.csv",
    )
)

In [3]:
# Format notebook output by calling the project helper format_notebook()
# (its definition is not visible in this notebook; presumably it comes from `src`).
format_notebook()

## __Functions__.

In [4]:
# Placeholder: no notebook-specific helper functions are defined in this section yet.

## __3. Casting to data types__.

#### 3.1 Casting to string data type.

In [5]:
# Cast df_cabs_clean['vehicle_id'] to the pandas string dtype, then show the resulting dtype
df_cabs_clean['vehicle_id'] = (
    df_cabs_clean['vehicle_id']
    .astype('string')
)
df_cabs_clean['vehicle_id'].dtype

string[python]

In [6]:
# Cast df_cabs_clean['company_name'] to the pandas string dtype, then show the resulting dtype
df_cabs_clean['company_name'] = (
    df_cabs_clean['company_name']
    .astype('string')
)
df_cabs_clean['company_name'].dtype

string[python]

In [7]:
# Cast df_neighborhoods_clean['name'] to the pandas string dtype, then show the resulting dtype
df_neighborhoods_clean['name'] = (
    df_neighborhoods_clean['name']
    .astype('string')
)
df_neighborhoods_clean['name'].dtype

string[python]

In [8]:
# Cast df_weather_records_clean['description'] to the pandas string dtype, then show the resulting dtype
df_weather_records_clean['description'] = (
    df_weather_records_clean['description']
    .astype('string')
)
df_weather_records_clean['description'].dtype

string[python]

#### 3.2 Casting to category data type.

In [9]:
# Re-encode df_cabs_clean['company_name'] as a categorical column
# (the dtype displayed below lists the distinct company names as categories)
df_cabs_clean['company_name'] = (
    df_cabs_clean['company_name']
    .astype('category')
)
df_cabs_clean['company_name'].dtype

CategoricalDtype(categories=['4623_27290_jay_kim', 'blue_ribbon_taxi_association_inc', 'chicago_independents', 'chicago_medallion_leasing_inc',
                  'chicago_medallion_management', 'choice_taxi_association', 'dispatch_taxi_affiliation', 'koam_taxi_association',
                  'star_north_management_llc', 'taxi_affiliation_services_yellow', 'top_cab_affiliation'],
, ordered=False, categories_dtype=string)

In [10]:
# Re-encode df_weather_records_clean['description'] as a categorical column
# (the dtype displayed below lists the distinct weather descriptions as categories)
df_weather_records_clean['description'] = (
    df_weather_records_clean['description']
    .astype('category')
)
df_weather_records_clean['description'].dtype

CategoricalDtype(categories=['broken_clouds', 'drizzle', 'few_clouds', 'fog', 'haze', 'heavy_intensity_rain', 'light_intensity_drizzle', 'light_rain',
                  'light_snow', 'mist', 'moderate_rain', 'overcast_clouds', 'proximity_thunderstorm', 'proximity_thunderstorm_with_rain',
                  'scattered_clouds', 'sky_is_clear', 'thunderstorm_with_drizzle', 'thunderstorm_with_light_rain', 'thunderstorm_with_rain'],
, ordered=False, categories_dtype=string)

#### 3.3 Casting to datetime data type.

In [11]:
# Convert df_trips_clean['start_ts'] to datetime with the project helper normalize_datetime,
# passing the expected timestamp format; the dtype is displayed to confirm the conversion
df_trips_clean = normalize_datetime(
    df_trips_clean,
    include=['start_ts'],
    frmt='%Y-%m-%d %H:%M:%S%z',
)
df_trips_clean['start_ts'].dtype

datetime64[ns, UTC]

In [12]:
# Convert df_trips_clean['end_ts'] to datetime with the project helper normalize_datetime,
# passing the expected timestamp format; the dtype is displayed to confirm the conversion
df_trips_clean = normalize_datetime(
    df_trips_clean,
    include=['end_ts'],
    frmt='%Y-%m-%d %H:%M:%S%z',
)
df_trips_clean['end_ts'].dtype

datetime64[ns, UTC]

In [13]:
# Convert df_weather_records_clean['date_and_time'] to datetime with the project helper
# normalize_datetime, passing the expected timestamp format; the dtype is displayed to confirm
df_weather_records_clean = normalize_datetime(
    df_weather_records_clean,
    include=['date_and_time'],
    frmt='%Y-%m-%d %H:%M:%S%z',
)
df_weather_records_clean['date_and_time'].dtype

datetime64[ns, UTC]

## 4. Feature Engineering.

### 4.1 Patterns or trends that exist in ride frequency, location or time of day.

This section counts trips by dropoff and pickup neighborhood and by hour of day, and saves each summary as a feature CSV file.

#### 4.1.1 Ride frequency - location.

In [14]:
# Keep only the trip columns needed for the per-location ride counts
df_ride_frequency_location = df_trips_clean.loc[
    :,
    ['trip_id', 'cab_id', 'pickup_location_id', 'dropoff_location_id'],
]


In [15]:
# Ride frequency per dropoff location: count non-null trip_id values for each
# dropoff_location_id and expose the result as (neighborhood_id, trip_count)
df_ride_frequency_location_dropoff = (
    df_ride_frequency_location
    .groupby('dropoff_location_id')['trip_id']
    .count()
    .reset_index(name='trip_count')
    .rename(columns={'dropoff_location_id': 'neighborhood_id'})
)
df_ride_frequency_location_dropoff

Unnamed: 0,neighborhood_id,trip_count
0,0,7
1,1,5
2,2,5
3,3,6
4,4,2
...,...,...
62,82,1
63,85,1
64,87,1
65,88,1


In [16]:
# Attach the neighborhood name to each dropoff count (left join keeps every counted location).
# - Selecting only the count columns first keeps this cell idempotent: re-running it would
#   otherwise merge onto a frame that already has 'name' and produce 'name_x' / 'name_y'.
# - validate='many_to_one' makes pandas raise a MergeError if neighborhood_id is duplicated
#   in the lookup table, instead of silently duplicating count rows.
df_ride_frequency_location_dropoff = (
    df_ride_frequency_location_dropoff[['neighborhood_id', 'trip_count']]
    .merge(df_neighborhoods_clean, on='neighborhood_id', how='left', validate='many_to_one')
)
df_ride_frequency_location_dropoff

Unnamed: 0,neighborhood_id,trip_count,name
0,0,7,albany_park
1,1,5,andersonville
2,2,5,archer_heights
3,3,6,armour_square
4,4,2,ashburn
...,...,...,...
62,82,1,uptown
63,85,1,west_elsdon
64,87,1,west_loop
65,88,1,west_pullman


In [17]:
# Persist the dropoff ride-frequency table as a feature CSV (row index is not written)
project_root = Path.cwd().parent
processed_path = project_root / "data" / "processed" / "feature" / "ride_frequency_location_dropoff_feature.csv"

# to_csv does not create missing parent folders, so make sure the output directory exists
processed_path.parent.mkdir(parents=True, exist_ok=True)
df_ride_frequency_location_dropoff.to_csv(processed_path, index=False)

In [18]:
# Ride frequency per pickup location: count non-null trip_id values for each
# pickup_location_id and expose the result as (neighborhood_id, trip_count)
df_ride_frequency_location_pickup = (
    df_ride_frequency_location
    .groupby('pickup_location_id')['trip_id']
    .count()
    .reset_index(name='trip_count')
    .rename(columns={'pickup_location_id': 'neighborhood_id'})
)
df_ride_frequency_location_pickup

Unnamed: 0,neighborhood_id,trip_count
0,0,5
1,1,2
2,2,2
3,3,2
4,4,3
...,...,...
61,82,1
62,85,1
63,87,1
64,90,1


In [19]:
# Attach the neighborhood name to each pickup count (left join keeps every counted location).
# - Selecting only the count columns first keeps this cell idempotent: re-running it would
#   otherwise merge onto a frame that already has 'name' and produce 'name_x' / 'name_y'.
# - validate='many_to_one' makes pandas raise a MergeError if neighborhood_id is duplicated
#   in the lookup table, instead of silently duplicating count rows.
df_ride_frequency_location_pickup = (
    df_ride_frequency_location_pickup[['neighborhood_id', 'trip_count']]
    .merge(df_neighborhoods_clean, on='neighborhood_id', how='left', validate='many_to_one')
)
df_ride_frequency_location_pickup

Unnamed: 0,neighborhood_id,trip_count,name
0,0,5,albany_park
1,1,2,andersonville
2,2,2,archer_heights
3,3,2,armour_square
4,4,3,ashburn
...,...,...,...
61,82,1,uptown
62,85,1,west_elsdon
63,87,1,west_loop
64,90,1,west_town


In [20]:
# Persist the pickup ride-frequency table as a feature CSV (row index is not written)
project_root = Path.cwd().parent
processed_path = project_root / "data" / "processed" / "feature" / "ride_frequency_location_pickup_feature.csv"

# to_csv does not create missing parent folders, so make sure the output directory exists
processed_path.parent.mkdir(parents=True, exist_ok=True)
df_ride_frequency_location_pickup.to_csv(processed_path, index=False)

#### 4.1.2 Ride frequency - time of day.

In [21]:
# Keep only the trip columns needed for the time-of-day ride counts.
# NOTE(review): this rebinds df_ride_frequency_location, which previously held the location
# columns; a time-specific name would be clearer, but the following cells reference this name.
df_ride_frequency_location = df_trips_clean.loc[:, ['trip_id', 'cab_id', 'start_ts', 'end_ts']]

In [22]:
# Derive the hour of day (0-23) at which each trip started and ended.
# NOTE(review): the timestamps are tz-aware UTC (see the dtype shown after casting), so these
# are UTC hours — confirm whether local time is wanted for time-of-day patterns.
df_ride_frequency_location = df_ride_frequency_location.assign(
    start_hour=df_ride_frequency_location['start_ts'].dt.hour,
    end_hour=df_ride_frequency_location['end_ts'].dt.hour,
)
df_ride_frequency_location

Unnamed: 0,trip_id,cab_id,start_ts,end_ts,start_hour,end_hour
0,1,151,2017-11-07 21:00:00+00:00,2017-11-07 21:00:00+00:00,21,21
1,2,151,2017-11-18 21:00:00+00:00,2017-11-18 21:00:00+00:00,21,21
2,3,176,2017-11-03 14:00:00+00:00,2017-11-03 15:00:00+00:00,14,15
3,4,74,2017-11-24 04:00:00+00:00,2017-11-24 04:00:00+00:00,4,4
4,5,39,2017-11-12 03:00:00+00:00,2017-11-12 03:00:00+00:00,3,3
...,...,...,...,...,...,...
195,196,4,2017-11-17 20:00:00+00:00,2017-11-17 20:00:00+00:00,20,20
196,197,81,2017-11-08 13:00:00+00:00,2017-11-08 13:00:00+00:00,13,13
197,198,81,2017-11-20 12:00:00+00:00,2017-11-20 12:00:00+00:00,12,12
198,199,81,2017-11-01 12:00:00+00:00,2017-11-01 12:00:00+00:00,12,12


In [23]:
# Ride frequency by start hour: count non-null trip_id values per start_hour
# and expose the result as (start_hour, trip_count)
df_ride_frequency_start_ts = (
    df_ride_frequency_location
    .groupby('start_hour')['trip_id']
    .count()
    .reset_index(name='trip_count')
)
df_ride_frequency_start_ts

Unnamed: 0,start_hour,trip_count
0,0,12
1,1,3
2,2,19
3,3,9
4,4,15
5,5,5
6,6,4
7,7,2
8,8,4
9,9,5


In [25]:
# Persist the start-hour ride-frequency table as a feature CSV (row index is not written)
project_root = Path.cwd().parent
processed_path = project_root / "data" / "processed" / "feature" / "ride_frequency_start_ts_feature.csv"

# to_csv does not create missing parent folders, so make sure the output directory exists
processed_path.parent.mkdir(parents=True, exist_ok=True)
df_ride_frequency_start_ts.to_csv(processed_path, index=False)

In [24]:
# Ride frequency by end hour: count non-null trip_id values per end_hour
# and expose the result as (end_hour, trip_count)
df_ride_frequency_end_ts = (
    df_ride_frequency_location
    .groupby('end_hour')['trip_id']
    .count()
    .reset_index(name='trip_count')
)
df_ride_frequency_end_ts

Unnamed: 0,end_hour,trip_count
0,0,11
1,1,3
2,2,19
3,3,10
4,4,15
5,5,5
6,6,4
7,7,2
8,8,4
9,9,5


In [26]:
# Persist the end-hour ride-frequency table as a feature CSV (row index is not written)
project_root = Path.cwd().parent
processed_path = project_root / "data" / "processed" / "feature" / "ride_frequency_end_ts_feature.csv"

# to_csv does not create missing parent folders, so make sure the output directory exists
processed_path.parent.mkdir(parents=True, exist_ok=True)
df_ride_frequency_end_ts.to_csv(processed_path, index=False)

### 4.2 How weather affects the number of trips taken.