In [1]:
import pandas as pd
import holidays
from datetime import date

In [2]:
INPUT_DATA_FOLDER = "../data"
REPORTS_DATA_FILE = "ISW_vector.csv"

OUTPUT_FOLDER = "../data/all_data_preprocessed"
ISW_OUTPUT_DATA_FILE = "all_isw.csv"
ALARMS_OUTPUT_DATA_FILE = "all_alarms.csv"
WEATHER_EVENTS_OUTPUT_DATA_FILE = "all_weather_by_hour.csv"
MERGED_OUTPUT_DATA_FILE = "weather_regions_alarms_merged.csv"
ALL_MERGED_DATA_FILE = "all_merged"

MODEL_FOLDER = "model"

tfidf_transformer_model = "tfidf_transformer"
count_vectorizer_model = "count_vectorizer"

tfidf_transformer_version = "v1"
count_vectorizer_version = "v1"

In [3]:
df = pd.read_parquet(f"{OUTPUT_FOLDER}/{ALL_MERGED_DATA_FILE}.parquet")

In [4]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 643944 entries, 0 to 643943
Columns: 1039 entries, day_datetimeEpoch to 999
dtypes: float64(1030), int64(9)
memory usage: 5.0 GB


In [5]:
df[['hour_datetime', 'hour_datetimeEpoch','alarms_in_regions']].head(6)

Unnamed: 0,hour_datetime,hour_datetimeEpoch,alarms_in_regions
0,0.0,1645740000,10.0
1,0.041667,1645743600,10.0
2,0.083333,1645747200,10.0
3,0.125,1645750800,10.0
4,0.166667,1645754400,10.0
5,0.208333,1645758000,10.0


In [6]:
df['timestamp'] = pd.to_datetime(df['hour_datetimeEpoch'], unit='s').dt.strftime('%Y-%m-%d %H:%M:%S')
df['date'] = pd.to_datetime(df['hour_datetimeEpoch'], unit='s').dt.date

In [7]:
df = df.set_index(pd.DatetimeIndex(df['timestamp']))
df.drop_duplicates(subset=['timestamp'], inplace=True, keep='first')
df[['alarms_in_regions', 'event_all_region']].head(6)

Unnamed: 0_level_0,alarms_in_regions,event_all_region
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2022-02-24 22:00:00,10.0,0.0
2022-02-24 23:00:00,10.0,0.0
2022-02-25 00:00:00,10.0,0.0
2022-02-25 01:00:00,10.0,0.0
2022-02-25 02:00:00,10.0,0.0
2022-02-25 03:00:00,10.0,0.0


In [8]:
df['event_lastDay_region'] = df['event_all_region'].resample('D', label='right', closed='left').sum()
df['event_lastDay_region'] = df['event_lastDay_region'].bfill()
df[['event_lastDay_region', 'event_all_region', 'region_id']]

Unnamed: 0_level_0,event_lastDay_region,event_all_region,region_id
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-02-24 22:00:00,0.0,0.0,3
2022-02-24 23:00:00,0.0,0.0,3
2022-02-25 00:00:00,0.0,0.0,3
2022-02-25 01:00:00,2.0,0.0,3
2022-02-25 02:00:00,2.0,0.0,3
...,...,...,...
2025-03-01 17:00:00,,0.0,24
2025-03-01 18:00:00,,0.0,24
2025-03-01 19:00:00,,0.0,24
2025-03-01 20:00:00,,0.0,24


In [9]:
df['event_1h'] = df['alarms_in_regions'].shift(periods=1, freq='h')
df['event_2h'] = df['alarms_in_regions'].shift(periods=2, freq='h')
df['event_3h'] = df['alarms_in_regions'].shift(periods=3, freq='h')
df['event_6h'] = df['alarms_in_regions'].shift(periods=6, freq='h')
df['event_12h'] = df['alarms_in_regions'].shift(periods=12, freq='h')
df['event_18h'] = df['alarms_in_regions'].shift(periods=18, freq='h')
df['event_24h'] = df['alarms_in_regions'].shift(periods=24, freq='h')

In [10]:
df.fillna(0, inplace=True)

In [11]:
class UkrainianECBHolidays(holidays.HolidayBase):
    def __init__(self, years=None, **kwargs):
        self.country = "UA"
        super().__init__(years=years, **kwargs)

    def _populate(self, year):
        self[date(year, 1, 1)] = "New Year's Day"
        self[date(year, 1, 7)] = "Orthodox Christmas"
        self[date(year, 3, 8)] = "International Women's Day"
        self[date(year, 5, 1)] = "Labour Day"
        self[date(year, 5, 8)] = "Day of Remembrance and Reconciliation"
        self[date(year, 5, 9)] = "Victory Day"
        self[date(year, 6, 28)] = "Constitution Day of Ukraine"
        self[date(year, 8, 24)] = "Independence Day of Ukraine"
        self[date(year, 12, 25)] = "Christmas (Western)"

class RussianECBHolidays(holidays.HolidayBase):
    def __init__(self, years=None, **kwargs):
        self.country = "RU"
        super().__init__(years=years, **kwargs)

    def _populate(self, year):
        self[date(year, 1, 1)] = "New Year's Day (Russia)"
        self[date(year, 1, 7)] = "Orthodox Christmas (Russia)"
        self[date(year, 2, 23)] = "Defender of the Fatherland Day (Russia)"
        self[date(year, 3, 8)] = "International Women's Day (Russia)"
        self[date(year, 5, 1)] = "Spring and Labor Day (Russia)"
        self[date(year, 5, 9)] = "Victory Day (Russia)"
        self[date(year, 6, 12)] = "Russia Day"
        self[date(year, 11, 4)] = "Unity Day (Russia)"

In [12]:
ua_holidays = UkrainianECBHolidays(years=range(2020, 2032))
ru_holidays = RussianECBHolidays(years=range(2020, 2032))

In [13]:

df['ru_holiday'] = df['date'].apply(lambda x: 1 if x in ru_holidays else 0)
df['ua_holiday'] = df['date'].apply(lambda x: 1 if x in ua_holidays else 0)

df.drop('date', axis=1, inplace=True)

In [14]:
cols = list(df.columns)

move_after = 'alarms_in_regions'
to_move = ['event_1h', 'event_2h', 'event_3h', 'event_6h', 'event_12h', 'event_18h', 'event_24h', 'ru_holiday', 'ua_holiday', 'event_lastDay_region']

cols = [col for col in cols if col not in to_move]
insert_at = cols.index(move_after) + 1

for i, col in enumerate(to_move):
    cols.insert(insert_at + i, col)

df = df[cols]

In [15]:
df.head(10)

Unnamed: 0_level_0,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,day_solarradiation,day_solarenergy,...,991,992,993,994,995,996,997,998,999,timestamp
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-02-24 22:00:00,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611,2022-02-24 22:00:00
2022-02-24 23:00:00,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611,2022-02-24 23:00:00
2022-02-25 00:00:00,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611,2022-02-25 00:00:00
2022-02-25 01:00:00,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611,2022-02-25 01:00:00
2022-02-25 02:00:00,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611,2022-02-25 02:00:00
2022-02-25 03:00:00,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611,2022-02-25 03:00:00
2022-02-25 04:00:00,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611,2022-02-25 04:00:00
2022-02-25 05:00:00,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611,2022-02-25 05:00:00
2022-02-25 06:00:00,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611,2022-02-25 06:00:00
2022-02-25 07:00:00,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611,2022-02-25 07:00:00


In [16]:
df.drop('timestamp', axis=1, inplace=True)


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 25944 entries, 2022-02-24 22:00:00 to 2025-03-01 21:00:00
Columns: 1049 entries, day_datetimeEpoch to 999
dtypes: float64(1038), int64(11)
memory usage: 207.8 MB


In [18]:
df.head(10)

Unnamed: 0_level_0,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,day_solarradiation,day_solarenergy,...,990,991,992,993,994,995,996,997,998,999
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-02-24 22:00:00,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.524,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611
2022-02-24 23:00:00,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.524,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611
2022-02-25 00:00:00,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.524,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611
2022-02-25 01:00:00,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.524,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611
2022-02-25 02:00:00,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.524,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611
2022-02-25 03:00:00,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.524,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611
2022-02-25 04:00:00,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.524,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611
2022-02-25 05:00:00,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.524,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611
2022-02-25 06:00:00,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.524,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611
2022-02-25 07:00:00,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.524,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611


In [19]:
df.reset_index(inplace=True)

In [20]:
df.drop('timestamp', axis=1, inplace=True)
df.head(10)

Unnamed: 0,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,day_solarradiation,day_solarenergy,...,990,991,992,993,994,995,996,997,998,999
0,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.524,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611
1,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.524,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611
2,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.524,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611
3,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.524,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611
4,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.524,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611
5,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.524,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611
6,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.524,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611
7,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.524,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611
8,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.524,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611
9,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.524,0.702,0.57,0.539,0.521,0.503,0.524,0.504,0.477,0.611


In [21]:
df.to_parquet(f"{OUTPUT_FOLDER}/{ALL_MERGED_DATA_FILE}.parquet", index=False, engine="pyarrow")

In [22]:
df.region_id.unique()

array([ 3, 24])