In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Settings
# Apply seaborn's default theme first so it governs every later figure.
# The former `sns.despine(left=True, bottom=True)` call was dropped: despine
# acts on the current figure only, and no figure exists at this point, so it
# styled nothing and merely left an empty figure open.
sns.set()
# Show every column of wide frames and print floats with four decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format','{:.4f}'.format)

%matplotlib inline

In [2]:
# Forward slashes are accepted on Windows as well as Linux/macOS, so the
# relative path no longer ties the notebook to a single operating system.
jams = pd.read_parquet('../dataset/aggregate_median_jams_bogor_eda.parquet')

In [4]:
# Peek at three random rows of the raw frame (unseeded, so rows vary per run).
jams.sample(n=3)

Unnamed: 0,street,median_length,median_delay,median_speed_kmh,total_records,id,date,median_level,geometry,date_time,weekday,month,day,is_weekday,hour,is_working_hour,is_morning,is_lunch,is_late_afternoon,is_late_night
18951,pahlawan,1143.0,61.0,22.46,7,34197103,2022-07-17,1.0,"MULTILINESTRING ((106.794985 -6.607735, 106.79...",2022-07-17 17:00:00,6,7,17,0,17,0,0,0,1,0
7260,brigjen saptadji hadi prawira,1631.0,113.0,22.28,5,33722176,2022-07-10,2.0,"MULTILINESTRING ((106.771742 -6.576096, 106.77...",2022-07-10 21:00:00,6,7,10,0,21,0,0,0,0,0
5881,unknown,182.0,141.0,3.81,8,33669403,2022-07-09,3.0,"MULTILINESTRING ((106.847722 -6.654044, 106.84...",2022-07-09 17:00:00,5,7,9,0,17,0,0,0,1,0


## Preprocess

In [6]:
# Start from a copy so the raw `jams` frame stays untouched, then discard the
# identifier, date and geometry columns in the same chain.
jams_preprocess = (
    jams
    .copy()
    .drop(columns=['id', 'date', 'geometry', 'date_time'])
)

In [7]:
# Bucket the hour of day into six labelled spans:
#   1 -> [3, 6)   2 -> [6, 10)   3 -> [10, 15)
#   4 -> [15, 18) 5 -> [18, 22)  6 -> every remaining hour
# The hour is read from `jams_preprocess` itself (not from `jams`) so the
# result always has the same length as the frame it is written into, even
# when this cell is re-run after rows have been removed from `jams_preprocess`.
jams_preprocess['time_span'] = np.select(
    [
        (jams_preprocess.hour >= 3) & (jams_preprocess.hour < 6),
        (jams_preprocess.hour >= 6) & (jams_preprocess.hour < 10),
        (jams_preprocess.hour >= 10) & (jams_preprocess.hour < 15),
        (jams_preprocess.hour >= 15) & (jams_preprocess.hour < 18),
        (jams_preprocess.hour >= 18) & (jams_preprocess.hour < 22),
    ],
    [1, 2, 3, 4, 5],
    default=6,
)

In [8]:
# Spot-check three random rows to confirm the new `time_span` column.
jams_preprocess.sample(n=3)

Unnamed: 0,street,median_length,median_delay,median_speed_kmh,total_records,median_level,weekday,month,day,is_weekday,hour,is_working_hour,is_morning,is_lunch,is_late_afternoon,is_late_night,time_span
96443,n6 jalan raya baru,1980.0,72.0,33.34,3,1.0,4,9,2,1,7,0,1,0,0,0,2
66310,jenderal sudirman,1263.0,167.5,16.39,6,3.0,6,8,14,0,13,0,0,1,0,0,3
33903,merdeka,713.0,134.0,11.8,9,2.0,1,7,26,1,19,0,0,0,0,0,5


In [10]:
# Largest per-column count of missing values; 0 means no column holds a NaN.
jams_preprocess.isnull().sum().max()

0

In [9]:
# Remove rows that are identical across every remaining column, keeping the
# first occurrence of each.
jams_preprocess = jams_preprocess.drop_duplicates(keep='first')

In [27]:
# Spot-check three random rows of the de-duplicated frame.
jams_preprocess.sample(n=3)

Unnamed: 0,street,median_length,median_delay,median_speed_kmh,total_records,median_level,weekday,month,day,is_weekday,hour,is_working_hour,is_morning,is_lunch,is_late_afternoon,is_late_night,time_span
16049,exit baranangsiang,585.0,434.0,4.5,58,4.0,5,7,16,0,14,0,0,0,0,0,3
51514,tol jagorawi,3237.0,62.0,59.38,1,1.0,5,8,6,0,4,0,0,0,0,0,1
54745,pemuda,622.0,95.0,14.64,1,3.0,6,8,7,0,14,0,0,0,0,0,3


## Feature Engineering

### Feature Engineering: Selected Numerical

In [30]:
# Per-street mean, median and standard deviation of the selected numeric
# columns; aggregates that come back as NaN are replaced by 0.
temp_df = (
    jams_preprocess[['median_length', 'median_delay', 'median_speed_kmh', 'total_records', 'median_level', 'street']]
    .groupby('street')
    .agg(['mean', 'median', 'std'])
    .fillna(0)
)

# Flatten the (column, statistic) header into single `column__statistic` names.
temp_df.columns = ['__'.join(pair) for pair in temp_df.columns]

In [31]:
# Preview the first five streets of the aggregated table.
temp_df.head(n=5)

Unnamed: 0_level_0,median_length__mean,median_length__median,median_length__std,median_delay__mean,median_delay__median,median_delay__std,median_speed_kmh__mean,median_speed_kmh__median,median_speed_kmh__std,total_records__mean,total_records__median,total_records__std,median_level__mean,median_level__median,median_level__std
street,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
achmad adnawijaya,705.8876,795.0,274.5424,115.1278,93.0,81.8696,13.1786,13.98,4.566,11.8303,8.0,11.7387,2.3871,2.0,0.8124
ah nasution,294.0,294.0,0.0,251.1562,145.25,299.1715,4.1166,3.805,1.8658,2.8125,2.0,2.2352,2.25,2.0,0.9504
airlangga,175.0,175.0,0.0,136.0,136.0,2.8284,3.7,3.7,0.0566,4.0,4.0,1.4142,3.5,3.5,0.7071
akses tol tajur,178.9839,182.0,4.8862,178.3226,159.0,65.768,3.4908,3.495,0.9548,13.6452,12.0,11.8983,4.0,4.0,0.0
al hasanah,205.0,205.0,0.0,119.0,119.0,0.0,4.52,4.52,0.0,9.0,9.0,0.0,4.0,4.0,0.0


### Occurrence

In [36]:
# Overall occurrence: number of non-null `median_level` records per street,
# returned with `street` as a regular column.
temp_df = (
    jams_preprocess[['street', 'median_level']]
    .groupby('street', as_index=False)
    .count()
    .rename(columns={'median_level': 'overall_occurance'})
)

In [38]:
# Occurrence per time span: per-street record count restricted to one span.
time_span = 1
temp_df = (
    jams_preprocess
    .loc[jams_preprocess['time_span'] == time_span, ['street', 'median_level']]
    .groupby('street')
    .count()
    .rename(columns={'median_level': f'occurance_time_span_{time_span}'})
)

In [40]:
# Occurrence per weekday: per-street record count restricted to one weekday.
weekday = 1
temp_df = (
    jams_preprocess
    .loc[jams_preprocess['weekday'] == weekday, ['street', 'median_level']]
    .groupby('street')
    .count()
    .rename(columns={'median_level': f'occurance_weekday_{weekday}'})
)

### Percentile Groupby

In [69]:
col = 'median_length'
prcntile = 75

def groupby_return_percentile(dataframe=jams_preprocess, prcntile=prcntile, col=col):
    """Return one percentile of `col` for every street.

    Parameters
    ----------
    dataframe : DataFrame
        Frame containing a `street` column and the column named by `col`.
    prcntile : float
        Percentile to compute, on the 0-100 scale used by `np.percentile`.
    col : str
        Name of the column whose per-street percentile is computed.

    Returns
    -------
    DataFrame
        One row per street with two columns: `street` and
        `percentile_{prcntile}_{col}`.
    """
    feature_name = f'percentile_{prcntile}_{col}'
    return (
        dataframe
        .groupby('street')[col]
        # Selecting `col` before `apply` guarantees np.percentile only ever
        # receives that column's values, never the `street` strings.
        .apply(lambda values: np.percentile(values, prcntile))
        .rename(feature_name)
        .reset_index()
    )

## Pipeline Feature Engineering

In [73]:
# Independent (deep) copy that the feature-engineering pipeline starts from.
jams_feat_eng = jams_preprocess.copy(deep=True)

In [60]:
def feat_eng_sel_numerical(dataframe=jams_preprocess):
    """Aggregate the selected numeric columns per street.

    For each of `median_length`, `median_delay`, `median_speed_kmh`,
    `total_records` and `median_level`, the mean, median and standard
    deviation are computed within every street. Aggregates that come back
    as NaN are replaced by 0.

    Returns a DataFrame indexed by `street` whose columns are named
    `<column>__<statistic>`.
    """
    selected = ['median_length', 'median_delay', 'median_speed_kmh', 'total_records', 'median_level', 'street']
    street_stats = (
        dataframe[selected]
        .groupby('street')
        .agg(['mean', 'median', 'std'])
        .fillna(0)
    )

    # Collapse the two-level (column, statistic) header into flat names.
    street_stats.columns = ['__'.join(pair) for pair in street_stats.columns]
    return street_stats

def feat_eng_overall_occurance(dataframe=jams_preprocess):
    """Count the records of every street.

    Returns a DataFrame with a `street` column and an `overall_occurance`
    column holding the number of non-null `median_level` values per street.
    """
    counts = (
        dataframe[['street', 'median_level']]
        .groupby('street', as_index=False)
        .count()
    )
    return counts.rename(columns={'median_level': 'overall_occurance'})

def feat_eng_occurance_timespan(dataframe=jams_preprocess, time_span=1):
    """Count the records of every street within a single time span.

    Only rows whose `time_span` equals the given value are counted, so
    streets without any such row are absent from the result.

    Returns a DataFrame with a `street` column and an
    `occurance_time_span_{time_span}` column.
    """
    in_span = dataframe['time_span'] == time_span
    counts = (
        dataframe
        .loc[in_span, ['street', 'median_level']]
        .groupby('street', as_index=False)
        .count()
    )
    return counts.rename(columns={'median_level': f'occurance_time_span_{time_span}'})

In [61]:
def feat_eng_occurance_weekday(dataframe=jams_preprocess, weekday=1):
    """Count the records of every street on a single weekday.

    Only rows whose `weekday` equals the given value are counted, so
    streets without any such row are absent from the result.

    Returns a DataFrame with a `street` column and an
    `occurance_weekday_{weekday}` column.
    """
    on_weekday = dataframe['weekday'] == weekday
    counts = (
        dataframe
        .loc[on_weekday, ['street', 'median_level']]
        .groupby('street', as_index=False)
        .count()
    )
    return counts.rename(columns={'median_level': f'occurance_weekday_{weekday}'})

In [70]:
def pipeline_feat_eng(dataframe=jams_feat_eng, source=None):
    """Attach every per-street feature to `dataframe`.

    Parameters
    ----------
    dataframe : DataFrame
        Rows to enrich; must contain a `street` column. It is not modified.
    source : DataFrame, optional
        Frame the per-street statistics are computed from. When omitted the
        module-level `jams_preprocess` is used, which is what the helper
        functions fall back to through their own defaults.

    Returns
    -------
    DataFrame
        The de-duplicated rows of `dataframe` followed by, in order: the
        mean/median/std aggregates, the 5th/25th/75th/95th percentiles of
        each selected column, the overall occurrence count, and one
        occurrence count per time span and per weekday found in `source`.
        Streets without a match in a helper table get NaN in its column
        (left merge).
    """
    # sourcery skip: use-itertools-product

    if source is None:
        source = jams_preprocess

    # Each helper returns one row per street, so the left merges below can
    # neither create nor remove duplicate rows: one up-front pass replaces
    # the de-duplication that used to follow every single merge.
    dataframe = dataframe.drop_duplicates()

    # numerical
    temp_df = feat_eng_sel_numerical(dataframe=source)
    dataframe = dataframe.merge(temp_df, how='left', left_on='street', right_index=True)

    # percentiles
    sel_cols = ['median_length', 'median_delay', 'median_speed_kmh', 'total_records', 'median_level']
    percs = [5, 25, 75, 95]

    for col in sel_cols:
        for prcntile in percs:
            temp_df = groupby_return_percentile(dataframe=source, prcntile=prcntile, col=col)
            dataframe = dataframe.merge(temp_df, how='left', on='street')

    # overall_occurance
    temp_df = feat_eng_overall_occurance(dataframe=source)
    dataframe = dataframe.merge(temp_df, how='left', on='street')

    # occurance_per_time_span
    for time_span in source.time_span.unique():
        temp_df = feat_eng_occurance_timespan(dataframe=source, time_span=time_span)
        dataframe = dataframe.merge(temp_df, how='left', on='street')

    # occurance_per_weekday
    for weekday in source.weekday.unique():
        temp_df = feat_eng_occurance_weekday(dataframe=source, weekday=weekday)
        dataframe = dataframe.merge(temp_df, how='left', on='street')

    return dataframe

In [74]:
# Build the feature table from `jams_preprocess` rather than from
# `jams_feat_eng` itself: the pipeline does not modify its input, so this
# cell can be re-run without feeding an already-featured frame back in.
jams_feat_eng = pipeline_feat_eng(jams_preprocess)

In [78]:
# Downcast 64-bit numeric columns to their 32-bit counterparts, floats first
# and integers second, one column at a time.
for wide_dtype, narrow_dtype in (('float64', 'float32'), ('int64', 'int32')):
    for col in jams_feat_eng.select_dtypes(wide_dtype).columns:
        jams_feat_eng[col] = jams_feat_eng[col].astype(narrow_dtype)

In [79]:
# Forward slashes are accepted on Windows as well as Linux/macOS, so the
# output path no longer ties the notebook to a single operating system.
jams_feat_eng.to_parquet('../dataset/jams_bogor_feat_eng.parquet')