# Feature Selection

In [1]:
import time
import datetime
import pandas as pd
import numpy as np
import warnings
from itertools import product
from sklearn.model_selection import train_test_split

warnings.simplefilter('ignore')

## config

In [2]:
DATA_FILE_PATHS = 'D:/rahnema/final project/dataset/'

OUTPUT_PATH = 'D:/rahnema/final project/label/label.parquet'

## Load Data

In [3]:
def load_data(file_paths, start_date=None):
    df = pd.read_parquet(file_paths)
    df['date'] = df['tpep_pickup_datetime'].dt.date.astype(str)

    if start_date:
        df = df[df['date'] > start_date].reset_index(drop=True)

    return df


rides_df = load_data(DATA_FILE_PATHS, '2023-01-01')
print(rides_df.shape)
rides_df.head()

(12595923, 20)


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,date
0,2,2023-01-02 00:00:37,2023-01-02 00:13:15,1.0,9.29,1.0,N,70,4,1,35.9,1.0,0.5,8.18,0.0,1.0,49.08,2.5,0.0,2023-01-02
1,2,2023-01-02 00:01:53,2023-01-02 00:34:16,1.0,20.4,2.0,N,132,238,1,70.0,0.0,0.5,15.86,6.55,1.0,95.16,0.0,1.25,2023-01-02
2,2,2023-01-02 00:04:59,2023-01-02 00:12:03,5.0,1.68,1.0,N,142,229,1,10.0,1.0,0.5,2.25,0.0,1.0,17.25,2.5,0.0,2023-01-02
3,2,2023-01-02 00:00:28,2023-01-02 00:08:45,1.0,1.74,1.0,N,164,224,1,10.7,1.0,0.5,0.0,0.0,1.0,15.7,2.5,0.0,2023-01-02
4,2,2023-01-02 00:00:08,2023-01-02 00:04:30,6.0,0.63,1.0,N,144,231,1,6.5,1.0,0.5,0.0,0.0,1.0,11.5,2.5,0.0,2023-01-02


# aggregate data and labeling

In [4]:
def labeling(rides_df: pd.DataFrame):
    aggregated_df = rides_df.groupby(['date', 'PULocationID']).size().reset_index(name='count')
    unique_dates = rides_df['date'].unique()
    unique_pu_location_ids = rides_df['PULocationID'].unique()
    all_combinations = list(product(unique_dates, unique_pu_location_ids))
    combinations_df = pd.DataFrame(all_combinations, columns=['date', 'PULocationID'])
    label_df = aggregated_df.merge(combinations_df, how='right', on=['date', 'PULocationID']).fillna(0)
    return label_df


labels_df = labeling(rides_df)
print(labels_df.shape)
labels_df.head()

(31964, 3)


Unnamed: 0,date,PULocationID,count
0,2023-01-02,70,503.0
1,2023-01-02,132,6419.0
2,2023-01-02,142,2028.0
3,2023-01-02,164,1462.0
4,2023-01-02,144,567.0


## Extracting  Features (Day of week / Day of Month)

In [5]:
feature_df = labels_df.sort_values(['date'])

In [7]:
# Extracting new features based on pickup date

feature_df['date']=feature_df['date'].astype('datetime64')

# Extract the pickup day of the month from the datetime column
feature_df['PU_day_of_month'] = feature_df['date'].dt.day.astype(np.uint8)

# Extract the pickup day of the week (0-6) from the datetime column
feature_df['PU_day_of_week'] = feature_df['date'].dt.weekday.astype(np.uint8)


## Extracting  Features (last_day_deman_cnt / last_week_deman_cnt)


In [8]:
feature_df['last_day'] = ''
feature_df['last_week'] = ''
for i in feature_df.index:
    feature_df['last_day'].loc[i] = feature_df['date'].loc[i] - datetime.timedelta(days=1)
    feature_df['last_week'].loc[i] = feature_df['date'].loc[i] - datetime.timedelta(days=7)

In [None]:
def last_demand_count(df,last_date,LID):
    return (df[(df['date'] == last_date) & (df['PULocationID'] == LID)]['count']).values[0]

feature_df['last_day_demand_cnt'] = 0
feature_df['last_week_demand_cnt'] = 0
for i in feature_df.index:
    try:
        feature_df['last_day_demand_cnt'].loc[i] = last_demand_count(feature_df,feature_df['last_day'].loc[i],
        feature_df['PULocationID'].loc[i])
    except IndexError:
        feature_df['last_day_demand_cnt'].loc[i] = 0

    try:
        feature_df['last_week_demand_cnt'].loc[i] = last_demand_count(feature_df,feature_df['last_week'].loc[i],
        feature_df['PULocationID'].loc[i])
    except IndexError:
        feature_df['last_week_demand_cnt'].loc[i] = 0


In [11]:
feature_df[(feature_df['PULocationID'] == 79)].head(30)

Unnamed: 0,date,PULocationID,count,PU_day_of_month,PU_day_of_week,last_day,last_week,last_day_demand_cnt,last_week_demand_cnt
26,2023-01-02,79,1144.0,2,0,2023-01-01 00:00:00,2022-12-26 00:00:00,0,0
288,2023-01-03,79,1248.0,3,1,2023-01-02 00:00:00,2022-12-27 00:00:00,1144,0
550,2023-01-04,79,1555.0,4,2,2023-01-03 00:00:00,2022-12-28 00:00:00,1248,0
812,2023-01-05,79,1723.0,5,3,2023-01-04 00:00:00,2022-12-29 00:00:00,1555,0
1074,2023-01-06,79,2607.0,6,4,2023-01-05 00:00:00,2022-12-30 00:00:00,1723,0
1336,2023-01-07,79,4470.0,7,5,2023-01-06 00:00:00,2022-12-31 00:00:00,2607,0
1598,2023-01-08,79,3421.0,8,6,2023-01-07 00:00:00,2023-01-01 00:00:00,4470,0
1860,2023-01-09,79,1221.0,9,0,2023-01-08 00:00:00,2023-01-02 00:00:00,3421,1144
2122,2023-01-10,79,1458.0,10,1,2023-01-09 00:00:00,2023-01-03 00:00:00,1221,1248
2384,2023-01-11,79,1801.0,11,2,2023-01-10 00:00:00,2023-01-04 00:00:00,1458,1555


In [12]:
feature_df.to_parquet(OUTPUT_PATH)