In [33]:
import pandas as pd
import holidays
from datetime import date

In [34]:
INPUT_DATA_FOLDER = "../data"
REPORTS_DATA_FILE = "ISW_vector.csv"

OUTPUT_FOLDER = "../data/all_data_preprocessed"
ISW_OUTPUT_DATA_FILE = "all_isw.csv"
ALARMS_OUTPUT_DATA_FILE = "all_alarms.csv"
WEATHER_EVENTS_OUTPUT_DATA_FILE = "all_weather_by_hour.csv"
MERGED_OUTPUT_DATA_FILE = "weather_regions_alarms_merged.csv"
ALL_MERGED_DATA_FILE = "all_merged"

MODEL_FOLDER = "model"

tfidf_transformer_model = "tfidf_transformer"
count_vectorizer_model = "count_vectorizer"

tfidf_transformer_version = "v1"
count_vectorizer_version = "v1"

In [35]:
df = pd.read_parquet(f"{OUTPUT_FOLDER}/{ALL_MERGED_DATA_FILE}.parquet")

In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 622057 entries, 0 to 622056
Columns: 1039 entries, day_datetimeEpoch to 999
dtypes: float64(1030), int64(9)
memory usage: 4.8 GB


In [37]:
df['timestamp'] = pd.to_datetime(df['hour_datetimeEpoch'], unit='s').dt.strftime('%Y-%m-%d %H:%M:%S')

In [38]:
df['timestamp']= pd.to_datetime(df['timestamp'])
df = df.set_index(pd.DatetimeIndex(df['timestamp']))

In [39]:
hourly_has_event = df.groupby('region_id')['event_all_region'].resample('h').sum().gt(0).astype(int)
hours_with_events_per_day = hourly_has_event.groupby('region_id').resample('D', level=1).sum()

hours_with_events_per_day = hours_with_events_per_day.reset_index()

hours_with_events_per_day.rename(columns={'event_all_region': 'event_lastDay_region'}, inplace=True)

df['date'] = df['timestamp'].dt.date
hours_with_events_per_day['date'] = hours_with_events_per_day['timestamp'].dt.date

df = df.merge(
    hours_with_events_per_day[['region_id', 'date', 'event_lastDay_region']],
    on=['region_id', 'date'],
    how='left'
)



In [40]:
df[['event_lastDay_region', 'event_all_region', 'region_id']]

Unnamed: 0,event_lastDay_region,event_all_region,region_id
0,0,0.0,3
1,0,0.0,3
2,2,0.0,3
3,2,0.0,3
4,2,0.0,3
...,...,...,...
622052,12,1.0,16
622053,12,1.0,16
622054,12,1.0,16
622055,12,1.0,16


In [41]:
df.event_lastDay_region.unique()

array([ 0,  2, 11, 12,  8,  6,  7,  4,  3,  5, 10, 15, 13, 14,  1,  9, 16,
       17, 23, 19, 24, 18, 22, 21, 20])

In [42]:
df['timestamp'] = pd.to_datetime(df['hour_datetimeEpoch'], unit='s').dt.strftime('%Y-%m-%d %H:%M:%S')
df['timestamp']= pd.to_datetime(df['timestamp'])

In [43]:
df = df.set_index(['timestamp','region_id' ])
duplicates = df.index.duplicated(keep=False)
df = df[~df.index.duplicated(keep='first')]
df.reset_index(inplace=True, drop=False) 

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 596712 entries, 0 to 596711
Columns: 1042 entries, timestamp to event_lastDay_region
dtypes: datetime64[ns](1), float64(1030), int64(10), object(1)
memory usage: 4.6+ GB


In [45]:
df.index

RangeIndex(start=0, stop=596712, step=1)

In [46]:
df[['event_all_region', 'timestamp']]

Unnamed: 0,event_all_region,timestamp
0,0.0,2022-02-24 22:00:00
1,0.0,2022-02-24 23:00:00
2,0.0,2022-02-25 00:00:00
3,0.0,2022-02-25 01:00:00
4,0.0,2022-02-25 02:00:00
...,...,...
596707,1.0,2025-03-01 17:00:00
596708,1.0,2025-03-01 18:00:00
596709,1.0,2025-03-01 19:00:00
596710,1.0,2025-03-01 20:00:00


In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 596712 entries, 0 to 596711
Columns: 1042 entries, timestamp to event_lastDay_region
dtypes: datetime64[ns](1), float64(1030), int64(10), object(1)
memory usage: 4.6+ GB


In [48]:
hours = [1, 2, 3, 6, 12, 18, 24]

In [49]:
for h in hours:
    df = df.sort_values(by=['region_id', 'timestamp'])

    df_shifted = df[['region_id', 'timestamp', 'event_all_region']].copy()
    df_shifted['timestamp'] = df_shifted['timestamp'] + pd.Timedelta(hours=h)
    df_shifted = df_shifted.rename(columns={'event_all_region': f'event_{h}h_ago'})

    df = df.merge(df_shifted, on=['region_id', 'timestamp'], how='left')

    df[f'event_{h}h_ago'] = df[f'event_{h}h_ago'].fillna(0).astype(int)

In [50]:
df[['event_1h_ago','timestamp','region_id','event_all_region', 'event_lastDay_region']]

Unnamed: 0,event_1h_ago,timestamp,region_id,event_all_region,event_lastDay_region
0,0,2022-02-24 22:00:00,2,0.0,0
1,0,2022-02-24 23:00:00,2,0.0,0
2,0,2022-02-25 00:00:00,2,0.0,14
3,0,2022-02-25 01:00:00,2,0.0,14
4,0,2022-02-25 02:00:00,2,0.0,14
...,...,...,...,...,...
596707,0,2025-03-01 17:00:00,25,0.0,6
596708,0,2025-03-01 18:00:00,25,0.0,6
596709,0,2025-03-01 19:00:00,25,0.0,6
596710,0,2025-03-01 20:00:00,25,0.0,6


In [51]:
class UkrainianECBHolidays(holidays.HolidayBase):
    def __init__(self, years=None, **kwargs):
        self.country = "UA"
        super().__init__(years=years, **kwargs)

    def _populate(self, year):
        self[date(year, 1, 1)] = "New Year's Day"
        self[date(year, 1, 7)] = "Orthodox Christmas"
        self[date(year, 3, 8)] = "International Women's Day"
        self[date(year, 5, 1)] = "Labour Day"
        self[date(year, 5, 8)] = "Day of Remembrance and Reconciliation"
        self[date(year, 5, 9)] = "Victory Day"
        self[date(year, 6, 28)] = "Constitution Day of Ukraine"
        self[date(year, 8, 24)] = "Independence Day of Ukraine"
        self[date(year, 12, 25)] = "Christmas (Western)"

class RussianECBHolidays(holidays.HolidayBase):
    def __init__(self, years=None, **kwargs):
        self.country = "RU"
        super().__init__(years=years, **kwargs)

    def _populate(self, year):
        self[date(year, 1, 1)] = "New Year's Day (Russia)"
        self[date(year, 1, 7)] = "Orthodox Christmas (Russia)"
        self[date(year, 2, 23)] = "Defender of the Fatherland Day (Russia)"
        self[date(year, 3, 8)] = "International Women's Day (Russia)"
        self[date(year, 5, 1)] = "Spring and Labor Day (Russia)"
        self[date(year, 5, 9)] = "Victory Day (Russia)"
        self[date(year, 6, 12)] = "Russia Day"
        self[date(year, 11, 4)] = "Unity Day (Russia)"

In [52]:
ua_holidays = UkrainianECBHolidays(years=range(2020, 2032))
ru_holidays = RussianECBHolidays(years=range(2020, 2032))

In [53]:
df['ru_holiday'] = df['date'].apply(lambda x: 1 if x in ru_holidays else 0)
df['ua_holiday'] = df['date'].apply(lambda x: 1 if x in ua_holidays else 0)

df.drop('date', axis=1, inplace=True)

In [54]:
df.drop('timestamp', axis=1, inplace=True)

KeyError: "['date'] not found in axis"

In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 596712 entries, 0 to 596711
Columns: 1049 entries, region_id to ua_holiday
dtypes: float64(1030), int64(19)
memory usage: 4.7 GB


In [56]:
df.to_parquet(f"{OUTPUT_FOLDER}/{ALL_MERGED_DATA_FILE}.parquet", index=False, engine="pyarrow")

In [16]:
df

Unnamed: 0,region_id,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,day_solarradiation,...,997,998,999,event_1h_ago,event_2h_ago,event_3h_ago,event_6h_ago,event_12h_ago,event_18h_ago,event_24h_ago
0,2,1645740000,7.2,-1.0,2.6,-1.8,74.2,0.0,0.00,143.9,...,0.504,0.477,0.611,0,0,0,0,0,0,0
1,2,1645740000,7.2,-1.0,2.6,-1.8,74.2,0.0,0.00,143.9,...,0.504,0.477,0.611,0,0,0,0,0,0,0
2,2,1645740000,7.2,-1.0,2.6,-1.8,74.2,0.0,0.00,143.9,...,0.504,0.477,0.611,0,0,0,0,0,0,0
3,2,1645740000,7.2,-1.0,2.6,-1.8,74.2,0.0,0.00,143.9,...,0.504,0.477,0.611,0,0,0,0,0,0,0
4,2,1645740000,7.2,-1.0,2.6,-1.8,74.2,0.0,0.00,143.9,...,0.504,0.477,0.611,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
596707,25,1740780000,0.6,-2.4,-0.7,-2.8,86.3,2.0,4.17,41.0,...,0.556,0.555,0.734,0,0,0,0,0,1,1
596708,25,1740780000,0.6,-2.4,-0.7,-2.8,86.3,2.0,4.17,41.0,...,0.556,0.555,0.734,0,0,0,0,0,1,1
596709,25,1740780000,0.6,-2.4,-0.7,-2.8,86.3,2.0,4.17,41.0,...,0.556,0.555,0.734,0,0,0,0,0,1,1
596710,25,1740780000,0.6,-2.4,-0.7,-2.8,86.3,2.0,4.17,41.0,...,0.556,0.555,0.734,0,0,0,0,0,1,0
