# Feature Selection / Data Cleaning

In [1]:
import time

import datetime
import pandas as pd
import numpy as np

import warnings
warnings.simplefilter('ignore')

### Load Data

In [2]:
# Read the label table from parquet (the preview below shows the columns
# date, PULocationID and count) and display its first rows.
label_path = '../data/label.parquet'
df = pd.read_parquet(label_path, engine='pyarrow')
df.head(5)

Unnamed: 0,date,PULocationID,count
0,2023-01-31,68,1
1,2023-01-31,113,1
2,2023-01-31,114,2
3,2023-01-31,132,1
4,2023-01-31,158,1


### Extracting Features (Day of Week / Day of Month)

In [3]:
# Derive calendar features from the pickup date.

# Parse the date column into real datetimes. pd.to_datetime is used instead of
# astype('datetime64') because casting to a unit-less 'datetime64' dtype raises
# a TypeError on pandas >= 2.0.
df['date'] = pd.to_datetime(df['date'])

# Day of the month of the pickup date (1-31); uint8 keeps the column compact.
df['PU_day_of_month'] = df['date'].dt.day.astype(np.uint8)

# Day of the week of the pickup date (0 = Monday ... 6 = Sunday).
df['PU_day_of_week'] = df['date'].dt.weekday.astype(np.uint8)

df.head(5)

Unnamed: 0,date,PULocationID,count,PU_day_of_month,PU_day_of_week
0,2023-01-31,68,1,31,1
1,2023-01-31,113,1,31,1
2,2023-01-31,114,2,31,1
3,2023-01-31,132,1,31,1
4,2023-01-31,158,1,31,1


### Extracting Features (last_day_demand_cnt / last_week_demand_cnt)

In [4]:
# Reference dates used to look up lagged demand: the previous day and the same
# weekday one week earlier.
#
# Computed as whole-column arithmetic. A row-by-row loop that writes through
# chained indexing (df[col].loc[i] = ...) assigns into a temporary object and
# is not guaranteed to update df (it is a no-op under pandas Copy-on-Write);
# it also leaves both columns with object dtype instead of datetime64.
event_date = pd.to_datetime(df['date'])
df['last_day'] = event_date - pd.Timedelta(days=1)
df['last_week'] = event_date - pd.Timedelta(days=7)

In [5]:
def last_demand_count(df, last_date, LID):
    """Return the ``count`` of the first row matching a date and pickup zone.

    Parameters
    ----------
    df : DataFrame with ``date``, ``PULocationID`` and ``count`` columns.
    last_date : value compared for equality against ``df['date']``.
    LID : value compared for equality against ``df['PULocationID']``.

    Raises
    ------
    IndexError
        If no row matches both ``last_date`` and ``LID``.
    """
    same_date = df['date'] == last_date
    same_zone = df['PULocationID'] == LID
    matching_counts = df.loc[same_date & same_zone, 'count']
    # Indexing position 0 of an empty array raises the IndexError callers catch.
    return matching_counts.values[0]


def _lagged_demand(frame, reference_col):
    """Look up, for every row, the demand recorded on a reference date.

    For each row of ``frame`` this finds the row with the same ``PULocationID``
    whose ``date`` equals the value in ``reference_col`` and returns its
    ``count``. Rows with no match get 0.

    Parameters
    ----------
    frame : DataFrame with ``date``, ``PULocationID``, ``count`` and
        ``reference_col`` columns.
    reference_col : name of the column holding the date to look up.

    Returns
    -------
    numpy array aligned positionally with the rows of ``frame``.
    """
    # One lookup row per (date, zone). Keeping the first occurrence gives the
    # same answer as taking the first match of a row-by-row scan.
    lookup = (
        frame[['date', 'PULocationID', 'count']]
        .assign(date=lambda d: pd.to_datetime(d['date']))
        .drop_duplicates(subset=['date', 'PULocationID'])
    )
    # The keys we want to find, named so they line up with the lookup table.
    # pd.to_datetime normalises the column in case it holds Timestamp objects
    # with object dtype rather than a datetime64 column.
    wanted = (
        frame[[reference_col, 'PULocationID']]
        .rename(columns={reference_col: 'date'})
        .assign(date=lambda d: pd.to_datetime(d['date']))
    )
    # A left join keeps exactly one output row per input row, in input order,
    # because the lookup keys are unique.
    matched = wanted.merge(lookup, on=['date', 'PULocationID'], how='left')
    assert len(matched) == len(frame)
    # Unmatched rows come back as NaN; they are treated as zero demand.
    return matched['count'].fillna(0).astype(frame['count'].dtype).to_numpy()


# A single join per feature replaces a per-row scan of the whole frame
# (quadratic in the number of rows) whose results were written through chained
# indexing (df[col].loc[i] = ...), which is not guaranteed to update df.
df['last_day_demand_cnt'] = _lagged_demand(df, 'last_day')
df['last_week_demand_cnt'] = _lagged_demand(df, 'last_week')

df.head(5)

Unnamed: 0,date,PULocationID,count,PU_day_of_month,PU_day_of_week,last_day,last_week,last_day_demand_cnt,last_week_demand_cnt
0,2023-01-31,68,1,31,1,2023-01-30 00:00:00,2023-01-24 00:00:00,0,0
1,2023-01-31,113,1,31,1,2023-01-30 00:00:00,2023-01-24 00:00:00,0,0
2,2023-01-31,114,2,31,1,2023-01-30 00:00:00,2023-01-24 00:00:00,0,0
3,2023-01-31,132,1,31,1,2023-01-30 00:00:00,2023-01-24 00:00:00,0,0
4,2023-01-31,158,1,31,1,2023-01-30 00:00:00,2023-01-24 00:00:00,0,0


### Write the New Dataset

In [6]:
# Persist the frame, including the feature columns added above, as parquet.
feature_path = '../data/featureDS.parquet'
df.to_parquet(feature_path)

### Test

In [7]:
# Spot-check a single pickup zone: each row's last_day_demand_cnt should equal
# the count of the row dated one day earlier, and last_week_demand_cnt the
# count of the row dated seven days earlier.
zone_79_rows = df.loc[df['PULocationID'] == 79]
zone_79_rows.head(30)

Unnamed: 0,date,PULocationID,count,PU_day_of_month,PU_day_of_week,last_day,last_week,last_day_demand_cnt,last_week_demand_cnt
73,2023-02-01,79,1838,1,2,2023-01-31 00:00:00,2023-01-25 00:00:00,0,0
293,2023-02-02,79,2191,2,3,2023-02-01 00:00:00,2023-01-26 00:00:00,1838,0
520,2023-02-03,79,3107,3,4,2023-02-02 00:00:00,2023-01-27 00:00:00,2191,0
740,2023-02-04,79,3848,4,5,2023-02-03 00:00:00,2023-01-28 00:00:00,3107,0
953,2023-02-05,79,3123,5,6,2023-02-04 00:00:00,2023-01-29 00:00:00,3848,0
1166,2023-02-06,79,1305,6,0,2023-02-05 00:00:00,2023-01-30 00:00:00,3123,0
1391,2023-02-07,79,1665,7,1,2023-02-06 00:00:00,2023-01-31 00:00:00,1305,0
1612,2023-02-08,79,1797,8,2,2023-02-07 00:00:00,2023-02-01 00:00:00,1665,1838
1836,2023-02-09,79,2227,9,3,2023-02-08 00:00:00,2023-02-02 00:00:00,1797,2191
2060,2023-02-10,79,3043,10,4,2023-02-09 00:00:00,2023-02-03 00:00:00,2227,3107
