In [1]:
import pandas as pd
import holidays
from datetime import date

In [2]:
# --- Input locations ----------------------------------------------------
INPUT_DATA_FOLDER = "../data"
REPORTS_DATA_FILE = "ISW_vector.csv"

# --- Preprocessed outputs -----------------------------------------------
# The output folder lives directly under the input data folder.
OUTPUT_FOLDER = f"{INPUT_DATA_FOLDER}/all_data_preprocessed"

# File names inside OUTPUT_FOLDER. Only ALL_MERGED_DATA_FILE (extension-less
# stem, ".parquet" is appended where it is used) is referenced by the cells
# visible in this notebook.
ISW_OUTPUT_DATA_FILE = "all_isw.csv"
ALARMS_OUTPUT_DATA_FILE = "all_alarms.csv"
WEATHER_EVENTS_OUTPUT_DATA_FILE = "all_weather_by_hour.csv"
MERGED_OUTPUT_DATA_FILE = "weather_regions_alarms_merged.csv"
ALL_MERGED_DATA_FILE = "all_merged"

# --- Model artefact naming ----------------------------------------------
# NOTE(review): none of the names below are used in the visible cells.
MODEL_FOLDER = "model"

tfidf_transformer_model = "tfidf_transformer"
count_vectorizer_model = "count_vectorizer"

# Both artefacts are currently at the same version.
tfidf_transformer_version = count_vectorizer_version = "v1"

In [3]:
# Load the merged dataset from the preprocessed-output folder.
merged_parquet_path = f"{OUTPUT_FOLDER}/{ALL_MERGED_DATA_FILE}.parquet"
df = pd.read_parquet(merged_parquet_path)

In [4]:
# Print a summary of the freshly loaded frame (row count, column count, dtypes, memory).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 643944 entries, 0 to 643943
Columns: 1039 entries, day_datetimeEpoch to 999
dtypes: float64(1030), int64(9)
memory usage: 5.0 GB


In [5]:
# Build a datetime 'timestamp' column from the epoch-seconds column.
# The value is kept as a datetime instead of being formatted to a
# 'YYYY-MM-DD HH:MM:SS' string and parsed back afterwards: the string round
# trip produced the same instants at far higher cost on a frame this size.
# .dt.floor('s') reproduces the only effect the string format had, namely
# dropping any sub-second part (presumably absent for hourly epochs - TODO confirm).
df['timestamp'] = pd.to_datetime(df['hour_datetimeEpoch'], unit='s').dt.floor('s')

In [6]:
# Make sure 'timestamp' has a datetime dtype: the lag-feature cell further down
# adds pd.Timedelta offsets to this column and joins on it.
df['timestamp']= pd.to_datetime(df['timestamp'])

In [7]:
# Keep exactly one row per (timestamp, region_id) pair - the first one seen.
# Moving the key columns into the index and back also places them first in
# the column order, which is preserved here.
df = df.set_index(['timestamp', 'region_id'])

# True for every row whose key already appeared earlier in the frame.
# Computed once; the previous unused keep=False mask has been removed.
is_repeated_key = df.index.duplicated(keep='first')
df = df[~is_repeated_key]

# In-place on purpose: restores 'timestamp' and 'region_id' as columns and a
# 0..n-1 index without making another full copy of the frame.
df.reset_index(inplace=True, drop=False)

In [8]:
# Print the frame summary again to see the row and column counts after de-duplication.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 622656 entries, 0 to 622655
Columns: 1040 entries, timestamp to 999
dtypes: datetime64[ns](1), float64(1030), int64(9)
memory usage: 4.8 GB


In [9]:
# Eyeball the event column next to the derived timestamp.
df.loc[:, ['event_all_region', 'timestamp']]

Unnamed: 0,event_all_region,timestamp
0,0.0,2022-02-24 22:00:00
1,0.0,2022-02-24 23:00:00
2,0.0,2022-02-25 00:00:00
3,0.0,2022-02-25 01:00:00
4,0.0,2022-02-25 02:00:00
...,...,...
622651,0.0,2025-03-01 17:00:00
622652,0.0,2025-03-01 18:00:00
622653,0.0,2025-03-01 19:00:00
622654,0.0,2025-03-01 20:00:00


In [10]:
# NOTE(review): repeats the summary printed two cells earlier; the cell in
# between only displays columns, so the frame has not changed since then.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 622656 entries, 0 to 622655
Columns: 1040 entries, timestamp to 999
dtypes: datetime64[ns](1), float64(1030), int64(9)
memory usage: 4.8 GB


In [11]:
# Look-back horizons in hours; the lag-feature cell creates one
# 'event_{h}h_ago' column for each entry.
hours = [
    1, 2, 3,
    6, 12, 18, 24,
]

In [12]:
# Add one 'event_{h}h_ago' column per look-back horizon in `hours`: the value
# of 'event_all_region' for the same region exactly h hours earlier, or 0 when
# no row exists for that region at that time.

# Sort once - the ordering does not depend on h - and renumber rows 0..n-1.
# This is the row order and index the frame ended up with before.
df = df.sort_values(by=['region_id', 'timestamp'], ignore_index=True)

lag_keys = ['region_id', 'timestamp']
row_keys = df[lag_keys]
events = df[lag_keys + ['event_all_region']]

for h in hours:
    lag_col = f'event_{h}h_ago'

    # An event observed at time t is the "h hours ago" value for the row at t + h.
    shifted = events.assign(
        timestamp=events['timestamp'] + pd.Timedelta(hours=h)
    ).rename(columns={'event_all_region': lag_col})

    # Join only the narrow key frame instead of the full, very wide frame:
    # same lookup result without copying every column on each iteration.
    # validate= raises if (region_id, timestamp) is not unique on the lookup
    # side, which would otherwise silently multiply rows.
    lagged = row_keys.merge(shifted, on=lag_keys, how='left', validate='many_to_one')

    # A left merge keeps the left row order, so values line up positionally.
    # Assigning the column (rather than merging into df) also makes this cell
    # safe to re-run: the column is overwritten instead of clashing on name.
    df[lag_col] = lagged[lag_col].fillna(0).astype(int).to_numpy()

In [13]:
# Spot-check the 1-hour lag against the current event value and its keys.
df.loc[:, ['event_1h_ago', 'timestamp', 'region_id', 'event_all_region']]

Unnamed: 0,event_1h_ago,timestamp,region_id,event_all_region
0,0,2022-02-24 22:00:00,1,0.0
1,0,2022-02-24 23:00:00,1,0.0
2,0,2022-02-25 00:00:00,1,0.0
3,0,2022-02-25 01:00:00,1,0.0
4,0,2022-02-25 02:00:00,1,0.0
...,...,...,...,...
622651,0,2025-03-01 17:00:00,25,0.0
622652,0,2025-03-01 18:00:00,25,0.0
622653,0,2025-03-01 19:00:00,25,0.0
622654,0,2025-03-01 20:00:00,25,0.0


Unnamed: 0,event_1h_ago,timestamp,region_id,event_all_region
25944,0,2022-02-24 22:00:00,2,0.0
25945,0,2022-02-24 23:00:00,2,0.0
25946,0,2022-02-25 00:00:00,2,0.0
25947,0,2022-02-25 01:00:00,2,0.0
25948,0,2022-02-25 02:00:00,2,0.0
...,...,...,...,...
51883,0,2025-03-01 17:00:00,2,0.0
51884,0,2025-03-01 18:00:00,2,0.0
51885,0,2025-03-01 19:00:00,2,0.0
51886,0,2025-03-01 20:00:00,2,0.0


In [15]:
# 'timestamp' was only a helper derived from 'hour_datetimeEpoch' for the
# de-duplication and lag joins; remove it before saving.
df.drop(columns=['timestamp'], inplace=True)

In [16]:
# Persist the frame with the new lag columns.
# NOTE(review): this is the same path the notebook loads from in its first data
# cell, so the input file is overwritten and a later run starts from data that
# already contains the lag columns. Consider a separate output name once the
# downstream readers of this file are known.
merged_parquet_path = f"{OUTPUT_FOLDER}/{ALL_MERGED_DATA_FILE}.parquet"
df.to_parquet(merged_parquet_path, engine="pyarrow", index=False)

In [17]:
# Distinct region ids present in the saved frame, in order of first appearance.
pd.unique(df['region_id'])

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 13, 14, 15, 16, 17, 18,
       19, 20, 21, 22, 23, 24, 25])