In [3]:
import numpy as np
import pandas as pd
import random
import warnings
import yaml

from datetime import datetime, timedelta
from random import randint

In [4]:
random.seed(2611)
warnings.filterwarnings('ignore')

config_path = "../config.yaml"
with open(config_path) as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

# Open Dataset

In [57]:
df_jam = pd.read_parquet('../data/interim/cleaned_aggregate_jams_Bogor.parquet.gzip')
len(set(df_jam['street']))

46

In [6]:
df_irregularities = pd.read_parquet("../data/interim/cleaned_aggregate_irregularities_Bogor.parquet.gzip")
df_irregularities.shape

(2621, 16)

In [8]:
df_weather = pd.read_parquet('../data/external/weather_data_Bogor_2022-07-06-09-00-00_2022-09-04-21-00-00.parquet.gzip')
df_weather.shape

(1432, 11)

In [9]:
df_holiday = pd.read_parquet("../data/external/holiday_2022.parquet.gzip")
df_holiday

Unnamed: 0,holiday,date
0,New Year's Day,2022-01-01
1,Chinese New Year,2022-02-01
2,Isra and Mi'raj,2022-02-28
3,Nyepi,2022-03-03
4,Good Friday,2022-04-15
5,Labor Day,2022-05-01
6,Eid al-Fitr,2022-05-02
7,Eid al-Fitr Holiday,2022-05-03
8,Buddha Day,2022-05-16
9,Ascension Day,2022-05-26


# Jams Data Preprocessing

In [10]:
day = (max(df_jam['time']) - min(df_jam['time'])).days
hour = 24
street = len(set(df_jam['street']))

if day * hour * street == df_jam.shape[0]:
    print("Complete historical data")
else:
    print("Incomplete historical data")
    print("Proportion with complete: {}%".format(
        round((df_jam.shape[0]/(day * hour * street)) * 100, 2)
    ))

Incomplete historical data
Proportion with complete: 54.06%


In [11]:
df_jam = df_jam.loc[:,[
    'time',
    'street',
    'level',
    'median_length',
    'median_delay',
    'median_speed_kmh'
]]
df_jam.dtypes

time                datetime64[ns]
street                      object
level                        int64
median_length              float64
median_delay               float64
median_speed_kmh           float64
dtype: object

In [43]:
speed_constanta = {
    1: [61, 80],
    2: [41, 60],
    3: [21, 40],
    4: [1, 20]
}

def clean_delay_data(data, is_external):
    min_level = min(data['level'])
    if min_level == 5 or is_external:
        return 850 + randint(100, 500)
    else:
        used_data = data[data['level'] == min_level]
        min_data = used_data[used_data['median_speed_kmh'] == min(used_data['median_speed_kmh'])].iloc[0,:]
        gap = min_data['median_speed_kmh'] - 0.0
        delay_close_to_0 = min_data['median_delay']
        constanta = np.mean(
            [row['median_delay']/row['median_speed_kmh'] for _, row in used_data.iterrows()]
        )
        delay_to_0 = constanta * gap
        return delay_close_to_0 + delay_to_0 + randint(1, 10)

def get_related_data(data, timestamp, all_data):
    result = data[data['time'] < timestamp]
    is_external = False
    if result.shape[0] == 0 or min(result['level']) == 5:
        result = data[data['time'] > timestamp].iloc[0:5]
    if min(result['level']) == 5:
        result = all_data[all_data['time'] < timestamp]
        is_external = True
    return result, is_external

def get_median_speed(data):
    min_level = min(data['level'])
    if min_level < 5:
        used_speed = np.mean(data[data['level'] == min_level]['median_speed_kmh'])
        constanta = randint(speed_constanta[min_level][0], speed_constanta[min_level][1])/100
        result = used_speed / constanta
    else:
        result = 15.32 # https://publikasiilmiah.ums.ac.id/bitstream/handle/11617/8159/B79_Robby%20Hartono.pdf
    return result

def get_median_length(data, current_median_speed):
    min_level = min(data['level'])
    used_data = data[data['level'] == min_level]
    constanta = np.mean(
        [row['median_length']/row['median_speed_kmh'] for _, row in used_data.iterrows()]
    )
    return constanta * current_median_speed

def get_median_delay(data, current_median_speed):
    min_level = min(data['level'])
    used_data = data[data['level'] == min_level]
    constanta = np.mean(
        [row['median_delay']/row['median_speed_kmh'] for _, row in used_data.iterrows()]
    )
    return (constanta/(10*min_level)) * current_median_speed # Pake asumsi


In [46]:
lst_street = list(set(df_jam['street']))
start_timestamp = datetime.strptime(config['dataset']['timestamp']['start_timestamp'], '%Y-%m-%d %H:%M:%S.%f')
end_timestamp = datetime.strptime(config['dataset']['timestamp']['end_timestamp'], '%Y-%m-%d %H:%M:%S.%f')

completed_jam = pd.DataFrame(columns=df_jam.columns)

for street in lst_street:
    used_data = df_jam[df_jam['street'] == street]
    used_data.sort_values(by=['time'], 
        ascending=True, 
        inplace=True
    )

    street_data = pd.DataFrame(columns=df_jam.columns)
    current_timestamp = start_timestamp
    
    while current_timestamp <= end_timestamp:
        current_data = used_data[used_data['time'] == current_timestamp]
        related_data, is_external = get_related_data(used_data, current_timestamp, df_jam)
        if current_data.shape[0] == 0:
            median_speed = get_median_speed(related_data)
            median_length = get_median_length(related_data, median_speed)
            median_delay = get_median_delay(related_data, median_speed)
            current_data = pd.DataFrame(data=[[
                current_timestamp,
                street,
                0,
                median_length,
                median_delay,
                median_speed
            ]], columns=df_jam.columns)
        elif current_data['median_delay'].values[0] == -1:
            current_data['median_delay'] = clean_delay_data(related_data, is_external)
        street_data = street_data.append(current_data, ignore_index=True)
        current_timestamp += timedelta(seconds=3600)

    completed_jam = completed_jam.append(
        street_data,
        ignore_index=True
    )


In [47]:
tc = completed_jam[completed_jam['street'] == 'KH Soleh Iskandar (Jalur Lambat)']
tc[tc['level'] == 5]

Unnamed: 0,time,street,level,median_length,median_delay,median_speed_kmh
11624,2022-07-06 09:00:00,KH Soleh Iskandar (Jalur Lambat),5,167.0,1021,0.0
11625,2022-07-06 10:00:00,KH Soleh Iskandar (Jalur Lambat),5,167.0,1273,0.0
11626,2022-07-06 11:00:00,KH Soleh Iskandar (Jalur Lambat),5,167.0,1287,0.0
11627,2022-07-06 12:00:00,KH Soleh Iskandar (Jalur Lambat),5,167.0,1319,0.0
11628,2022-07-06 13:00:00,KH Soleh Iskandar (Jalur Lambat),5,167.0,1178,0.0
...,...,...,...,...,...,...
13072,2022-09-04 17:00:00,KH Soleh Iskandar (Jalur Lambat),5,167.0,936.033051,0.0
13073,2022-09-04 18:00:00,KH Soleh Iskandar (Jalur Lambat),5,167.0,937.033051,0.0
13074,2022-09-04 19:00:00,KH Soleh Iskandar (Jalur Lambat),5,167.0,939.033051,0.0
13075,2022-09-04 20:00:00,KH Soleh Iskandar (Jalur Lambat),5,167.0,932.033051,0.0


In [58]:
cleaned_dataset_path_template = "../data/interim/complete_aggregate_{}_{}.parquet.gzip"

city = 'Bogor'
type = 'jams'

completed_jam.to_parquet(cleaned_dataset_path_template.format(type, city), compression="gzip") 

# Irregularities Data Preprocessing

In [54]:
df_irregularities = df_irregularities.loc[:,[
    'time',
    'street',
    'median_regular_speed',
    'median_delay_seconds'
]]
df_irregularities

Unnamed: 0,time,street,median_regular_speed,median_delay_seconds
0,2022-07-16 15:00:00.000,Achmad Adnawijaya,13.84,791.0
1,2022-08-04 10:00:00.000,Achmad Adnawijaya,24.70,595.0
2,2022-08-04 11:00:00.000,Achmad Adnawijaya,18.74,450.5
3,2022-08-11 18:00:00.000,Achmad Adnawijaya,21.52,907.0
4,2022-08-11 19:00:00.000,Achmad Adnawijaya,21.52,907.0
...,...,...,...,...
2616,2022-08-30 07:00:00.000,Veteran,20.38,568.0
2617,2022-09-02 16:00:00.000,Veteran,24.04,559.5
2618,2022-09-03 16:00:00.000,Veteran,21.85,516.0
2619,2022-09-03 17:00:00.000,Veteran,21.79,541.0


In [55]:
df_irregularities[df_irregularities['street'].isin(list(completed_jam['street']))]

Unnamed: 0,time,street,median_regular_speed,median_delay_seconds
0,2022-07-16 15:00:00.000,Achmad Adnawijaya,13.84,791.0
1,2022-08-04 10:00:00.000,Achmad Adnawijaya,24.70,595.0
2,2022-08-04 11:00:00.000,Achmad Adnawijaya,18.74,450.5
3,2022-08-11 18:00:00.000,Achmad Adnawijaya,21.52,907.0
4,2022-08-11 19:00:00.000,Achmad Adnawijaya,21.52,907.0
...,...,...,...,...
2616,2022-08-30 07:00:00.000,Veteran,20.38,568.0
2617,2022-09-02 16:00:00.000,Veteran,24.04,559.5
2618,2022-09-03 16:00:00.000,Veteran,21.85,516.0
2619,2022-09-03 17:00:00.000,Veteran,21.79,541.0


In [78]:
def get_median_data(data, column):
    if data.shape[0] == 1:
        return data[column].values[0]
    return np.quantile(data.iloc[:-1,:][column], q=0.5)

In [79]:
lst_street = list(set(completed_jam['street']))
start_timestamp = datetime.strptime(config['dataset']['timestamp']['start_timestamp'], '%Y-%m-%d %H:%M:%S.%f')
end_timestamp = datetime.strptime(config['dataset']['timestamp']['end_timestamp'], '%Y-%m-%d %H:%M:%S.%f')

completed_irregularities = pd.DataFrame(columns=df_irregularities.columns)

for street in lst_street:
    street_df = df_irregularities[df_irregularities['street'] == street]

    street_data = pd.DataFrame(columns=df_irregularities.columns)
    current_timestamp = start_timestamp

    while current_timestamp <= end_timestamp:
        irregularities_data = street_df[street_df['time'] == current_timestamp]
        if irregularities_data.shape[0] == 0:
            jam_data = completed_jam[
                (completed_jam['street'] == street) & \
                    (completed_jam['time'] <= current_timestamp)
            ]
            median_regular_speed = get_median_data(jam_data, 'median_speed_kmh')
            median_delay_seconds = get_median_data(jam_data, 'median_delay')
            irregularities_data = pd.DataFrame(data=[[
                current_timestamp,
                street,
                median_regular_speed,
                median_delay_seconds
            ]], columns=df_irregularities.columns)
        street_data = street_data.append(irregularities_data, ignore_index=True)
        current_timestamp += timedelta(seconds=3600)
    
    completed_irregularities = completed_irregularities.append(street_data, ignore_index=True)

In [80]:
completed_irregularities[completed_irregularities['street'] == 'Surya Kencana']

Unnamed: 0,time,street,median_regular_speed,median_delay_seconds
0,2022-07-06 09:00:00,Surya Kencana,9.310000,214.0
1,2022-07-06 10:00:00,Surya Kencana,9.310000,214.0
2,2022-07-06 11:00:00,Surya Kencana,9.770000,222.5
3,2022-07-06 12:00:00,Surya Kencana,10.230000,214.0
4,2022-07-06 13:00:00,Surya Kencana,9.865000,222.5
...,...,...,...,...
1448,2022-09-04 17:00:00,Surya Kencana,20.365369,12.828167
1449,2022-09-04 18:00:00,Surya Kencana,20.361842,12.83957
1450,2022-09-04 19:00:00,Surya Kencana,20.353517,12.839822
1451,2022-09-04 20:00:00,Surya Kencana,20.345192,12.840074


In [81]:
cleaned_dataset_path_template = "../data/interim/complete_aggregate_{}_{}.parquet.gzip"

city = 'Bogor'
type = 'irregularities'

completed_irregularities.to_parquet(cleaned_dataset_path_template.format(type, city), compression="gzip") 

# Merge Dataset

In [5]:
city = 'Bogor'

completed_jam = pd.read_parquet("../data/interim/complete_aggregate_jam_{}.parquet.gzip".format(city))
completed_irregularities = pd.read_parquet("../data/interim/complete_aggregate_irregularities_{}.parquet.gzip".format(city))

In [6]:
completed_jam.dtypes

time                datetime64[ns]
street                      object
level                        int64
median_length              float64
median_delay               float64
median_speed_kmh           float64
dtype: object

In [37]:
completed_irregularities.dtypes

time                    datetime64[ns]
street                          object
median_regular_speed           float64
median_delay_seconds           float64
dtype: object

In [9]:
weather_data = pd.read_parquet("../data/external/weather_data_Bogor.parquet.gzip")
weather_data

Unnamed: 0,timestamp,temp,feels_like,pressure,humidity,dew_point,clouds,wind_speed,wind_deg,weather_type,rain_intensity
0,2022-07-06 09:00:00,301.92,308.31,1013,85,299.14,79,0.76,97,Clouds,0.00
1,2022-07-06 10:00:00,302.51,308.91,1013,80,298.70,92,0.48,67,Clouds,0.00
2,2022-07-06 11:00:00,302.86,309.86,1013,80,299.04,93,0.77,350,Clouds,0.00
3,2022-07-06 12:00:00,303.38,310.38,1012,78,299.11,84,0.40,10,Rain,0.26
4,2022-07-06 13:00:00,302.68,306.65,1011,68,296.15,88,0.77,269,Rain,2.89
...,...,...,...,...,...,...,...,...,...,...,...
1448,2022-09-04 17:00:00,301.38,303.98,1010,68,294.91,100,1.04,343,Rain,0.70
1449,2022-09-04 18:00:00,300.67,303.84,1011,78,296.49,100,0.77,1,Rain,0.16
1450,2022-09-04 19:00:00,300.47,303.73,1012,81,296.92,100,0.60,14,Clouds,0.00
1451,2022-09-04 20:00:00,300.21,303.42,1013,84,297.27,100,0.58,97,Rain,0.14


In [10]:
weather_data = weather_data.loc[:,['timestamp', 'rain_intensity']]
weather_data


Unnamed: 0,timestamp,rain_intensity
0,2022-07-06 09:00:00,0.00
1,2022-07-06 10:00:00,0.00
2,2022-07-06 11:00:00,0.00
3,2022-07-06 12:00:00,0.26
4,2022-07-06 13:00:00,2.89
...,...,...
1448,2022-09-04 17:00:00,0.70
1449,2022-09-04 18:00:00,0.16
1450,2022-09-04 19:00:00,0.00
1451,2022-09-04 20:00:00,0.14


In [11]:
holiday_data = pd.read_parquet("../data/external/holiday_2022.parquet.gzip")
holiday_data

Unnamed: 0,holiday,date
0,New Year's Day,2022-01-01
1,Chinese New Year,2022-02-01
2,Isra and Mi'raj,2022-02-28
3,Nyepi,2022-03-03
4,Good Friday,2022-04-15
5,Labor Day,2022-05-01
6,Eid al-Fitr,2022-05-02
7,Eid al-Fitr Holiday,2022-05-03
8,Buddha Day,2022-05-16
9,Ascension Day,2022-05-26


In [12]:
holiday_data.dtypes

holiday            object
date       datetime64[ns]
dtype: object

In [13]:
final_df = completed_jam.join(
    completed_irregularities.set_index(['time', 'street']),
    on=['time', 'street'],
    how='inner',
    rsuffix='_irregular'
)
final_df

Unnamed: 0,time,street,level,median_length,median_delay,median_speed_kmh,median_regular_speed,median_delay_seconds
0,2022-07-06 09:00:00,Surya Kencana,3,744.000000,214.000000,9.310000,9.310,214.0
1,2022-07-06 10:00:00,Surya Kencana,3,1165.000000,231.000000,10.230000,9.310,214.0
2,2022-07-06 11:00:00,Surya Kencana,2,992.000000,110.000000,12.360000,9.770,222.5
3,2022-07-06 12:00:00,Surya Kencana,3,1165.000000,237.000000,9.500000,10.230,214.0
4,2022-07-06 13:00:00,Surya Kencana,3,1012.000000,385.500000,6.900000,9.865,222.5
...,...,...,...,...,...,...,...,...
66833,2022-09-04 17:00:00,N9 Jalak Harupat,2,1026.000000,71.000000,21.810000,22.845,62.0
66834,2022-09-04 18:00:00,N9 Jalak Harupat,0,1255.031641,7.926650,28.280303,22.840,62.0
66835,2022-09-04 19:00:00,N9 Jalak Harupat,2,1026.000000,63.000000,22.890000,22.845,62.0
66836,2022-09-04 20:00:00,N9 Jalak Harupat,2,675.000000,67.000000,18.540000,22.850,62.0


In [14]:
final_df = final_df.join(
    weather_data.set_index(["timestamp"]),
    on=['time'],
    how='inner',
)

final_df

Unnamed: 0,time,street,level,median_length,median_delay,median_speed_kmh,median_regular_speed,median_delay_seconds,rain_intensity
0,2022-07-06 09:00:00,Surya Kencana,3,744.000000,214.000000,9.310000,9.310000,214.000000,0.0
1453,2022-07-06 09:00:00,RE Martadinata,0,1555.620467,7.893071,25.423729,25.423729,7.893071,0.0
2906,2022-07-06 09:00:00,Binamarga,0,1617.307692,8.365385,29.346154,29.346154,8.365385,0.0
4359,2022-07-06 09:00:00,Jenderal Sudirman,3,461.000000,80.000000,13.350000,13.350000,80.000000,0.0
5812,2022-07-06 09:00:00,Taman Cimanggu Raya,0,2047.916667,9.375000,33.583333,33.583333,9.375000,0.0
...,...,...,...,...,...,...,...,...,...
61025,2022-09-04 21:00:00,Cilebut Raya,0,1439.817561,11.573804,23.706937,19.270000,60.500000,0.0
62478,2022-09-04 21:00:00,Semplak Raya,3,539.000000,152.000000,8.610000,19.470000,79.000000,0.0
63931,2022-09-04 21:00:00,Pahlawan,0,1799.920448,9.599060,32.400000,19.705000,74.000000,0.0
65384,2022-09-04 21:00:00,N9 KH Soleh Iskandar,3,442.000000,108.000000,11.540000,23.987499,76.000000,0.0


In [27]:
final_df.dtypes

time                    datetime64[ns]
street                          object
level                            int64
median_length                  float64
median_delay                   float64
median_speed_kmh               float64
median_regular_speed           float64
median_delay_seconds           float64
rain_intensity                 float64
dtype: object

In [42]:
def get_nearest_holiday_gap(time, data):
    nearest = 365
    for i in range(data.shape[0]):
        curr = np.abs((time - data.iloc[i,1]).days)
        if curr < nearest:
            nearest = np.abs((time - data.iloc[i,1]).days)
    if nearest > 7:
        nearest = -1
    return nearest

In [43]:
final_df['holiday_gap'] = final_df['time'].apply(get_nearest_holiday_gap, args=(holiday_data,))
final_df


Unnamed: 0,time,street,level,median_length,median_delay,median_speed_kmh,median_regular_speed,median_delay_seconds,rain_intensity,holiday_gap
0,2022-07-06 09:00:00,Surya Kencana,3,744.000000,214.000000,9.310000,9.310000,214.000000,0.0,3
1453,2022-07-06 09:00:00,RE Martadinata,0,1555.620467,7.893071,25.423729,25.423729,7.893071,0.0,3
2906,2022-07-06 09:00:00,Binamarga,0,1617.307692,8.365385,29.346154,29.346154,8.365385,0.0,3
4359,2022-07-06 09:00:00,Jenderal Sudirman,3,461.000000,80.000000,13.350000,13.350000,80.000000,0.0,3
5812,2022-07-06 09:00:00,Taman Cimanggu Raya,0,2047.916667,9.375000,33.583333,33.583333,9.375000,0.0,3
...,...,...,...,...,...,...,...,...,...,...
61025,2022-09-04 21:00:00,Cilebut Raya,0,1439.817561,11.573804,23.706937,19.270000,60.500000,0.0,-1
62478,2022-09-04 21:00:00,Semplak Raya,3,539.000000,152.000000,8.610000,19.470000,79.000000,0.0,-1
63931,2022-09-04 21:00:00,Pahlawan,0,1799.920448,9.599060,32.400000,19.705000,74.000000,0.0,-1
65384,2022-09-04 21:00:00,N9 KH Soleh Iskandar,3,442.000000,108.000000,11.540000,23.987499,76.000000,0.0,-1


In [46]:
final_df.reset_index(inplace=True, drop=True)
final_df

Unnamed: 0,time,street,level,median_length,median_delay,median_speed_kmh,median_regular_speed,median_delay_seconds,rain_intensity,holiday_gap
0,2022-07-06 09:00:00,Surya Kencana,3,744.000000,214.000000,9.310000,9.310000,214.000000,0.0,3
1,2022-07-06 09:00:00,RE Martadinata,0,1555.620467,7.893071,25.423729,25.423729,7.893071,0.0,3
2,2022-07-06 09:00:00,Binamarga,0,1617.307692,8.365385,29.346154,29.346154,8.365385,0.0,3
3,2022-07-06 09:00:00,Jenderal Sudirman,3,461.000000,80.000000,13.350000,13.350000,80.000000,0.0,3
4,2022-07-06 09:00:00,Taman Cimanggu Raya,0,2047.916667,9.375000,33.583333,33.583333,9.375000,0.0,3
...,...,...,...,...,...,...,...,...,...,...
66833,2022-09-04 21:00:00,Cilebut Raya,0,1439.817561,11.573804,23.706937,19.270000,60.500000,0.0,-1
66834,2022-09-04 21:00:00,Semplak Raya,3,539.000000,152.000000,8.610000,19.470000,79.000000,0.0,-1
66835,2022-09-04 21:00:00,Pahlawan,0,1799.920448,9.599060,32.400000,19.705000,74.000000,0.0,-1
66836,2022-09-04 21:00:00,N9 KH Soleh Iskandar,3,442.000000,108.000000,11.540000,23.987499,76.000000,0.0,-1


In [51]:
time_series_split = {
    'train': datetime.strptime(config['modeling']['time_series']['train_set'], '%Y-%m-%d %H:%M:%S.%f'),
    'test': datetime.strptime(config['modeling']['time_series']['test_set'], '%Y-%m-%d %H:%M:%S.%f')
}

classification_split = {
    'train': datetime.strptime(config['modeling']['classification']['train_set'], '%Y-%m-%d %H:%M:%S.%f'),
    'valid': datetime.strptime(config['modeling']['classification']['valid_set'], '%Y-%m-%d %H:%M:%S.%f'),
    'test': datetime.strptime(config['modeling']['classification']['test_set'], '%Y-%m-%d %H:%M:%S.%f')
}

In [53]:
def get_time_series_split(time, data):
    if time >= data['train'] and time < data['test']:
        return 'train'
    else:
        return 'test'

def get_classification_split(time, data):
    if time >= data['train'] and time < data['valid']:
        return 'train'
    elif time >= data['valid'] and time < data['test']:
        return 'valid'
    else:
        return 'test'

In [54]:
final_df['time_series_split'] = final_df['time'].apply(get_time_series_split, args=(time_series_split,))
final_df['classification_split'] = final_df['time'].apply(get_classification_split, args=(classification_split,))

In [55]:
final_df

Unnamed: 0,time,street,level,median_length,median_delay,median_speed_kmh,median_regular_speed,median_delay_seconds,rain_intensity,holiday_gap,time_series_split,classification_split
0,2022-07-06 09:00:00,Surya Kencana,3,744.000000,214.000000,9.310000,9.310000,214.000000,0.0,3,train,train
1,2022-07-06 09:00:00,RE Martadinata,0,1555.620467,7.893071,25.423729,25.423729,7.893071,0.0,3,train,train
2,2022-07-06 09:00:00,Binamarga,0,1617.307692,8.365385,29.346154,29.346154,8.365385,0.0,3,train,train
3,2022-07-06 09:00:00,Jenderal Sudirman,3,461.000000,80.000000,13.350000,13.350000,80.000000,0.0,3,train,train
4,2022-07-06 09:00:00,Taman Cimanggu Raya,0,2047.916667,9.375000,33.583333,33.583333,9.375000,0.0,3,train,train
...,...,...,...,...,...,...,...,...,...,...,...,...
66833,2022-09-04 21:00:00,Cilebut Raya,0,1439.817561,11.573804,23.706937,19.270000,60.500000,0.0,-1,test,test
66834,2022-09-04 21:00:00,Semplak Raya,3,539.000000,152.000000,8.610000,19.470000,79.000000,0.0,-1,test,test
66835,2022-09-04 21:00:00,Pahlawan,0,1799.920448,9.599060,32.400000,19.705000,74.000000,0.0,-1,test,test
66836,2022-09-04 21:00:00,N9 KH Soleh Iskandar,3,442.000000,108.000000,11.540000,23.987499,76.000000,0.0,-1,test,test


In [56]:
final_df.to_parquet(
    "../data/processed/final_dataset_Bogor.parquet.gzip",
    index=False,
    compression="gzip"
)