# Configure Dependencies

In [1]:
import calendar
import pandas as pd
import json
import numpy as np
import os
import random
import requests
import yaml

from datetime import datetime, timedelta

In [2]:
config_path = "../config.yaml"
with open(config_path) as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

# Holiday Dataset

In [4]:
df_holiday = pd.read_parquet("../data/external/holiday_2022.parquet.gzip")
df_holiday

Unnamed: 0,holiday,date
0,New Year's Day,2022-01-01
1,Chinese New Year,2022-02-01
2,Isra and Mi'raj,2022-02-28
3,Nyepi,2022-03-03
4,Good Friday,2022-04-15
5,Labor Day,2022-05-01
6,Eid al-Fitr,2022-05-02
7,Eid al-Fitr Holiday,2022-05-03
8,Buddha Day,2022-05-16
9,Ascension Day,2022-05-26


In [4]:
df_holiday.dtypes

holiday            object
date       datetime64[ns]
dtype: object

# Weather Dataset

In [59]:
class WeatherScrapper:

    api_key = config['dataset']['weather']['api_key']
    gmt = config['dataset']['weather']['gmt']
    url_template = "http://api.openweathermap.org/data/3.0/onecall/timemachine?lat={}&lon={}&dt={}&appid={}"

    def convert_timestamp_to_unix(self, timestamp):
        utc_timestamp = timestamp - timedelta(hours=self.gmt)
        return calendar.timegm(timestamp.timetuple()) 
    
    def get_response(self, data, timestamp):
        url = self.url_template.format(
            data['mean_lat'].values[0],
            data['mean_long'].values[0],
            timestamp,
            self.api_key
        )
        return requests.get(url).json()

In [24]:
df_jam_corr = pd.read_parquet("../data/misc/jam_corr.parquet.gzip")
grouped_df_jam_corr = df_jam_corr.groupby(by=['street']).mean()
grouped_df_jam_corr.reset_index(inplace=True)
grouped_df_jam_corr.rename(columns={'index': 'street'}, inplace=True)
grouped_df_jam_corr.head(5)

Unnamed: 0,street,mean_long,mean_lat,var_long,var_lat
0,Achmad Adnawijaya,106.816823,-6.585711,2.552315e-07,4.110984e-06
1,Binamarga,106.810444,-6.602946,2.861442e-06,1.536205e-06
2,Brigjen Saptadji Hadi Prawira,106.767091,-6.57009,7.499105e-06,1.259566e-05
3,Cilebut Raya,106.799611,-6.549396,2.814504e-07,1.477661e-05
4,Ciomas Raya,106.771886,-6.601993,5.340735e-06,1.300086e-07


In [83]:
grouped_df_jam_corr.describe()

Unnamed: 0,mean_long,mean_lat,var_long,var_lat
count,46.0,46.0,46.0,46.0
mean,106.795465,-6.587558,8.040143e-06,1.182446e-05
std,0.02206,0.027023,1.345957e-05,1.976435e-05
min,106.741099,-6.663127,1.192194e-08,2.885991e-08
25%,106.783576,-6.602141,4.09338e-07,1.561889e-06
50%,106.796979,-6.586828,3.607304e-06,5.514204e-06
75%,106.808428,-6.567061,7.919897e-06,1.253747e-05
max,106.850726,-6.549396,6.038437e-05,0.0001163747


In [28]:
start_timestamp = datetime.strptime("2022-07-06 09:00:00.000", '%Y-%m-%d %H:%M:%S.%f')
end_timestamp = datetime.strptime("2022-09-06 00:00:00.000", '%Y-%m-%d %H:%M:%S.%f')

possible_timestamp = []
curr_timestamp = start_timestamp
while curr_timestamp <= end_timestamp:
    possible_timestamp.append(curr_timestamp)
    curr_timestamp += timedelta(hours=1)

print(len(possible_timestamp))

1480


In [68]:
scrapper = WeatherScrapper()

selected_timestamp = random.sample(possible_timestamp, 100)

data = []

for timestamp in selected_timestamp:
    selected_street = random.sample(list(grouped_df_jam_corr['street']), 5)
    unix_timestamp = scrapper.convert_timestamp_to_unix(timestamp)
    timestamp_data = []
    for street in selected_street:
        response = scrapper.get_response(
            data=grouped_df_jam_corr[grouped_df_jam_corr['street'] == street],
            timestamp=unix_timestamp
        )['data'][0]
        timestamp_data.append([
            response['temp'],
            response['feels_like'],
            response['pressure'],
            response['humidity'],
            response['dew_point'],
            response['clouds'],
            response['wind_speed'],
            response['wind_deg']
        ])
    timestamp_data_df = pd.DataFrame(timestamp_data, columns=[
        'temp',
        'feels_like',
        'pressure',
        'humidity',
        'dew_point',
        'clouds',
        'wind_speed',
        'win_deg'
    ])
    data.append(
        [timestamp]+[np.var(timestamp_data_df[col]) for col in timestamp_data_df.columns]
    )

var_df = pd.DataFrame(data, columns=['timestamp']+[col+"_var" for col in timestamp_data_df.columns])
var_df


Unnamed: 0,timestamp,temp_var,feels_like_var,pressure_var,humidity_var,dew_point_var,clouds_var,wind_speed_var,win_deg_var
0,2022-09-05 21:00:00,0.010440,0.012584,0.00,0.0,0.010440,0.00,0.000000e+00,0.00
1,2022-08-20 09:00:00,0.327056,1.640200,0.00,0.0,0.305520,0.00,0.000000e+00,0.00
2,2022-08-22 11:00:00,0.067176,0.692376,7.84,0.0,0.065320,2.56,5.776000e-03,116.64
3,2022-09-01 03:00:00,0.065216,0.284264,11.76,0.0,0.055976,0.00,4.034400e-02,11405.76
4,2022-07-11 16:00:00,0.030616,0.170816,7.84,0.0,0.029384,0.00,2.073600e-02,2043.04
...,...,...,...,...,...,...,...,...,...
95,2022-09-04 04:00:00,0.494040,2.951080,0.00,0.0,0.446256,0.00,0.000000e+00,0.00
96,2022-08-10 08:00:00,0.381024,2.393544,0.00,0.0,0.345784,0.00,0.000000e+00,0.00
97,2022-08-20 14:00:00,0.023040,0.066440,7.84,0.0,0.020744,0.00,6.400000e-05,2540.16
98,2022-07-11 12:00:00,0.323920,0.258856,7.84,36.0,0.651064,0.00,1.232595e-32,0.00


In [73]:
var_df.to_parquet('../data/misc/sample_weather_variance.parquet.gzip',
    index=False,
    compression='gzip'
)

In [72]:
var_df.describe().loc['mean',:]

temp_var             0.127972
feels_like_var       0.539999
pressure_var         4.868800
humidity_var         1.327200
dew_point_var        0.138166
clouds_var           1.126400
wind_speed_var       0.025641
win_deg_var       1071.137600
Name: mean, dtype: float64

Standard Deviation and Variance value on weather related attribute and coordinates on chosen location were small, so we can create assumption that on every location that being choosen, the weather condition is almost same