# Feature Selection

In [15]:
import datetime
import numpy as np
import pandas as pd
import time
import warnings

from itertools import product
from sklearn.model_selection import train_test_split

warnings.simplefilter('ignore')

### config

In [16]:
DATA_FILE_PATHS = 'D:/rahnema/final project/dataset/'
OUTPUT_PATH = 'D:/rahnema/final project/label/feature.parquet'
START_DATE = '2023-01-01'

### Load Data

In [40]:
def load_data(file_paths, start_date=None):
    df = pd.read_parquet(file_paths)
    df['date'] = df['tpep_pickup_datetime'].dt.date.astype(str)

    if start_date:
        df = df[df['date'] > start_date].reset_index(drop=True)

    return df


rides_df = load_data(DATA_FILE_PATHS, START_DATE)
print(rides_df.shape)
rides_df.head()

(12595923, 20)


Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,date
0,2,2023-01-02 00:00:37,2023-01-02 00:13:15,1.0,9.29,1.0,N,70,4,1,35.9,1.0,0.5,8.18,0.0,1.0,49.08,2.5,0.0,2023-01-02
1,2,2023-01-02 00:01:53,2023-01-02 00:34:16,1.0,20.4,2.0,N,132,238,1,70.0,0.0,0.5,15.86,6.55,1.0,95.16,0.0,1.25,2023-01-02
2,2,2023-01-02 00:04:59,2023-01-02 00:12:03,5.0,1.68,1.0,N,142,229,1,10.0,1.0,0.5,2.25,0.0,1.0,17.25,2.5,0.0,2023-01-02
3,2,2023-01-02 00:00:28,2023-01-02 00:08:45,1.0,1.74,1.0,N,164,224,1,10.7,1.0,0.5,0.0,0.0,1.0,15.7,2.5,0.0,2023-01-02
4,2,2023-01-02 00:00:08,2023-01-02 00:04:30,6.0,0.63,1.0,N,144,231,1,6.5,1.0,0.5,0.0,0.0,1.0,11.5,2.5,0.0,2023-01-02


### aggregate data and labeling

In [41]:
def labeling(rides_df: pd.DataFrame):
    aggregated_df = rides_df.groupby(['date', 'PULocationID']).size().reset_index(name='count')
    unique_dates = rides_df['date'].unique()
    unique_pu_location_ids = rides_df['PULocationID'].unique()
    all_combinations = list(product(unique_dates, unique_pu_location_ids))
    combinations_df = pd.DataFrame(all_combinations, columns=['date', 'PULocationID'])
    label_df = aggregated_df.merge(combinations_df, how='right', on=['date', 'PULocationID']).fillna(0)
    return label_df


labels_df = labeling(rides_df)
print(labels_df.shape)
labels_df.head()

(31964, 3)


Unnamed: 0,date,PULocationID,count
0,2023-01-02,70,503.0
1,2023-01-02,132,6419.0
2,2023-01-02,142,2028.0
3,2023-01-02,164,1462.0
4,2023-01-02,144,567.0


### Day of week / Day of Month Features

In [42]:
feature_df['count']=feature_df['count'] + 100

feature_df['date']=feature_df['date'].astype('datetime64')

feature_df['PU_day_of_month'] = feature_df['date'].dt.day.astype(np.uint8)

feature_df['PU_day_of_week'] = feature_df['date'].dt.weekday.astype(np.uint8)

print(feature_df.shape)
feature_df.head()


(31964, 9)


Unnamed: 0,date,PULocationID,count,PU_day_of_month,PU_day_of_week,last_day,last_week,last_day_demand,last_week_demand
0,2023-01-02,70,603.0,2,0,2023-01-01 00:00:00,2022-12-26 00:00:00,0.0,0.0
195,2023-01-02,235,103.0,2,0,2023-01-01 00:00:00,2022-12-26 00:00:00,0.0,0.0
2,2023-01-02,142,2128.0,2,0,2023-01-01 00:00:00,2022-12-26 00:00:00,0.0,0.0
3,2023-01-02,164,1562.0,2,0,2023-01-01 00:00:00,2022-12-26 00:00:00,0.0,0.0
4,2023-01-02,144,667.0,2,0,2023-01-01 00:00:00,2022-12-26 00:00:00,0.0,0.0


### last_day_deman_count / last_week_deman_count Features


In [43]:
feature_df['last_day'] = ''
feature_df['last_week'] = ''

for i in feature_df.index:
    feature_df['last_day'].loc[i] = feature_df['date'].loc[i] - datetime.timedelta(days=1)
    feature_df['last_week'].loc[i] = feature_df['date'].loc[i] - datetime.timedelta(days=7)

In [44]:
feature_df = feature_df.sort_values(['date'])
feature_df['last_day_demand'] = feature_df.groupby(['PULocationID'])['count'].shift(1)
feature_df['last_week_demand'] = feature_df.groupby(['PULocationID'])['count'].shift(7)
feature_df['last_day_demand'].fillna(0,inplace = True)
feature_df['last_week_demand'].fillna(0,inplace = True)

print(feature_df.shape)
feature_df.head()

(31964, 9)


Unnamed: 0,date,PULocationID,count,PU_day_of_month,PU_day_of_week,last_day,last_week,last_day_demand,last_week_demand
0,2023-01-02,70,603.0,2,0,2023-01-01 00:00:00,2022-12-26 00:00:00,0.0,0.0
31,2023-01-02,186,3360.0,2,0,2023-01-01 00:00:00,2022-12-26 00:00:00,0.0,0.0
167,2023-01-02,3,102.0,2,0,2023-01-01 00:00:00,2022-12-26 00:00:00,0.0,0.0
168,2023-01-02,147,102.0,2,0,2023-01-01 00:00:00,2022-12-26 00:00:00,0.0,0.0
169,2023-01-02,122,102.0,2,0,2023-01-01 00:00:00,2022-12-26 00:00:00,0.0,0.0


### checking

In [45]:
feature_df[(feature_df['PULocationID'] == 79)].tail(5)

Unnamed: 0,date,PULocationID,count,PU_day_of_month,PU_day_of_week,last_day,last_week,last_day_demand,last_week_demand
31204,2023-04-29,79,4681.0,29,5,2023-04-28 00:00:00,2023-04-22 00:00:00,3245.0,4741.0
31466,2023-04-30,79,3449.0,30,6,2023-04-29 00:00:00,2023-04-23 00:00:00,4681.0,3202.0
31728,2023-05-01,79,100.0,1,0,2023-04-30 00:00:00,2023-04-24 00:00:00,3449.0,1521.0
30156,2023-05-02,79,101.0,2,1,2023-05-01 00:00:00,2023-04-25 00:00:00,100.0,1777.0
30680,2023-05-03,79,101.0,3,2,2023-05-02 00:00:00,2023-04-26 00:00:00,101.0,2112.0


In [46]:
feature_df.to_parquet(OUTPUT_PATH)