### Importing Libraries and Loading the Data

In [1]:
import pandas as pd

# Read the four input tables in one pass; the files are read in the same
# order as the names on the left-hand side.
regions, alarms, weather, isw = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/regions.csv",
        "../prepared_data/alarms_prepared.csv",
        "../prepared_data/weather_prepared.csv",
        "../prepared_data/isw_prepared.csv",
    )
)

### Preview the data

In [2]:
# Display the regions lookup table (region name, administrative centre, region_id).
regions

Unnamed: 0,region,center_city_ua,center_city_en,region_alt,region_id
0,АР Крим,Сімферополь,Simferopol,Крим,1
1,Вінницька,Вінниця,Vinnytsia,Вінниччина,2
2,Волинська,Луцьк,Lutsk,Волинь,3
3,Дніпропетровська,Дніпро,Dnipro,Дніпропетровщина,4
4,Донецька,Донецьк,Donetsk,Донеччина,5
5,Житомирська,Житомир,Zhytomyr,Житомирщина,6
6,Закарпатська,Ужгород,Uzhgorod,Закарпаття,7
7,Запорізька,Запоріжжя,Zaporozhye,Запоріжжя,8
8,Івано-Франківська,Івано-Франківськ,Ivano-Frankivsk,Івано-Франківщина,9
9,Київська,Київ,Kyiv,Київщина,10


In [3]:
# Peek at the first rows of the prepared weather data.
weather.head()

Unnamed: 0.1,Unnamed: 0,city_latitude,city_longitude,day_tempmax,day_tempmin,day_temp,day_precipcover,day_moonphase,hour_datetimeEpoch,hour_temp,...,hour_winddir,hour_pressure,hour_visibility,hour_cloudcover,hour_solarradiation,hour_solarenergy,hour_uvindex,hour_conditions,datetime,city
0,0,50.7469,25.3263,4.9,0.7,2.6,4.17,0.77,1645653600,2.4,...,275.6,1020.0,0.0,91.5,0.0,0.1,0.0,Overcast,2022-02-24 00:00:00,Луцьк
1,1,50.7469,25.3263,4.9,0.7,2.6,4.17,0.77,1645657200,2.4,...,280.3,1021.0,0.2,88.2,0.0,0.1,0.0,Partially cloudy,2022-02-24 01:00:00,Луцьк
2,2,50.7469,25.3263,4.9,0.7,2.6,4.17,0.77,1645660800,2.9,...,310.0,1022.0,10.0,100.0,0.0,0.1,0.0,Overcast,2022-02-24 02:00:00,Луцьк
3,3,50.7469,25.3263,4.9,0.7,2.6,4.17,0.77,1645664400,2.3,...,295.1,1021.0,0.1,92.0,0.0,0.1,0.0,Overcast,2022-02-24 03:00:00,Луцьк
4,4,50.7469,25.3263,4.9,0.7,2.6,4.17,0.77,1645668000,1.9,...,305.8,1021.0,0.0,93.8,0.0,0.1,0.0,Overcast,2022-02-24 04:00:00,Луцьк


Before performing the merge, we examine the columns that will be used as keys to ensure they contain **compatible values**. This helps us verify consistency between datasets and avoid mismatches during merging.

In [4]:
# Distinct city names on the weather side; this column is the left key of the merge with `regions`.
weather["city"].unique()

array(['Луцьк', 'Кропивницький', 'Дніпро', 'Київ', 'Херсон', 'Чернівці',
       'Чернігів', 'Одеса', 'Миколаїв', 'Харків', 'Хмельницький',
       'Донецьк', 'Ужгород', 'Запоріжжя', 'Рівне', 'Житомир', 'Тернопіль',
       'Полтава', 'Львів', 'Івано-Франківськ', 'Черкаси', 'Суми',
       'Вінниця'], dtype=object)

In [5]:
# Distinct administrative-centre names on the regions side; this column is the right key of the merge.
regions["center_city_ua"].unique()

array(['Сімферополь', 'Вінниця', 'Луцьк', 'Дніпро', 'Донецьк', 'Житомир',
       'Ужгород', 'Запоріжжя', 'Івано-Франківськ', 'Київ',
       'Кропивницький', 'Луганськ', 'Львів', 'Миколаїв', 'Одеса',
       'Полтава', 'Рівне', 'Суми', 'Тернопіль', 'Харків', 'Херсон',
       'Хмельницький', 'Черкаси', 'Чернівці', 'Чернігів'], dtype=object)

We can see that the `regions` dataset contains two additional cities — **Simferopol** and **Luhansk** — which are not present in the weather or alarm datasets. Since we don't have any related data for these cities, this is not an issue: the merge below is an inner join, so these two regions simply drop out of the result.

All other city names match correctly, so we can safely proceed with the merge using this feature.


In [6]:
# Attach region metadata to every weather row by matching the weather city
# against the region's administrative centre. This is an inner join, so rows
# without a counterpart on the other side are dropped.
weather_reg = weather.merge(
    regions,
    how="inner",
    left_on="city",
    right_on="center_city_ua",
)

In [7]:
# Inspect the merged weather + regions frame.
weather_reg

Unnamed: 0.1,Unnamed: 0,city_latitude,city_longitude,day_tempmax,day_tempmin,day_temp,day_precipcover,day_moonphase,hour_datetimeEpoch,hour_temp,...,hour_solarenergy,hour_uvindex,hour_conditions,datetime,city,region,center_city_ua,center_city_en,region_alt,region_id
0,0,50.7469,25.3263,4.9,0.7,2.6,4.17,0.77,1645653600,2.4,...,0.1,0.0,Overcast,2022-02-24 00:00:00,Луцьк,Волинська,Луцьк,Lutsk,Волинь,3
1,1,50.7469,25.3263,4.9,0.7,2.6,4.17,0.77,1645657200,2.4,...,0.1,0.0,Partially cloudy,2022-02-24 01:00:00,Луцьк,Волинська,Луцьк,Lutsk,Волинь,3
2,2,50.7469,25.3263,4.9,0.7,2.6,4.17,0.77,1645660800,2.9,...,0.1,0.0,Overcast,2022-02-24 02:00:00,Луцьк,Волинська,Луцьк,Lutsk,Волинь,3
3,3,50.7469,25.3263,4.9,0.7,2.6,4.17,0.77,1645664400,2.3,...,0.1,0.0,Overcast,2022-02-24 03:00:00,Луцьк,Волинська,Луцьк,Lutsk,Волинь,3
4,4,50.7469,25.3263,4.9,0.7,2.6,4.17,0.77,1645668000,1.9,...,0.1,0.0,Overcast,2022-02-24 04:00:00,Луцьк,Волинська,Луцьк,Lutsk,Волинь,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
608299,608299,49.5879,34.5517,0.3,-6.9,-3.0,0.00,0.05,1740848400,-1.3,...,0.0,0.0,Overcast,2025-03-01 19:00:00,Полтава,Полтавська,Полтава,Poltava,Полтавщина,16
608300,608300,49.5879,34.5517,0.3,-6.9,-3.0,0.00,0.05,1740852000,-1.8,...,0.0,0.0,Overcast,2025-03-01 20:00:00,Полтава,Полтавська,Полтава,Poltava,Полтавщина,16
608301,608301,49.5879,34.5517,0.3,-6.9,-3.0,0.00,0.05,1740855600,-1.0,...,0.0,0.0,Overcast,2025-03-01 21:00:00,Полтава,Полтавська,Полтава,Poltava,Полтавщина,16
608302,608302,49.5879,34.5517,0.3,-6.9,-3.0,0.00,0.05,1740859200,-1.7,...,0.0,0.0,Overcast,2025-03-01 22:00:00,Полтава,Полтавська,Полтава,Poltava,Полтавщина,16


After merging the datasets, we can see that the resulting DataFrame contains an **unnamed column** (`Unnamed: 0`), which we will drop as it doesn't provide any useful information.

We will also remove the `center_city_en` column, since English city names are not needed for our analysis, and the `center_city_ua` column, as it duplicates the data already available in the `city` column.

These columns do not add value to the modeling process and will be excluded to keep the dataset clean and focused.

In [8]:
# Remove the leftover "Unnamed: 0" column and the two centre-city name columns
# that came in from `regions` (the frame is modified in place).
redundant_cols = ["Unnamed: 0", "center_city_en", "center_city_ua"]
weather_reg.drop(columns=redundant_cols, inplace=True)

In [9]:
# Confirm the three columns are gone from the merged frame.
weather_reg

Unnamed: 0,city_latitude,city_longitude,day_tempmax,day_tempmin,day_temp,day_precipcover,day_moonphase,hour_datetimeEpoch,hour_temp,hour_humidity,...,hour_cloudcover,hour_solarradiation,hour_solarenergy,hour_uvindex,hour_conditions,datetime,city,region,region_alt,region_id
0,50.7469,25.3263,4.9,0.7,2.6,4.17,0.77,1645653600,2.4,89.18,...,91.5,0.0,0.1,0.0,Overcast,2022-02-24 00:00:00,Луцьк,Волинська,Волинь,3
1,50.7469,25.3263,4.9,0.7,2.6,4.17,0.77,1645657200,2.4,87.90,...,88.2,0.0,0.1,0.0,Partially cloudy,2022-02-24 01:00:00,Луцьк,Волинська,Волинь,3
2,50.7469,25.3263,4.9,0.7,2.6,4.17,0.77,1645660800,2.9,88.58,...,100.0,0.0,0.1,0.0,Overcast,2022-02-24 02:00:00,Луцьк,Волинська,Волинь,3
3,50.7469,25.3263,4.9,0.7,2.6,4.17,0.77,1645664400,2.3,86.63,...,92.0,0.0,0.1,0.0,Overcast,2022-02-24 03:00:00,Луцьк,Волинська,Волинь,3
4,50.7469,25.3263,4.9,0.7,2.6,4.17,0.77,1645668000,1.9,87.85,...,93.8,0.0,0.1,0.0,Overcast,2022-02-24 04:00:00,Луцьк,Волинська,Волинь,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
608299,49.5879,34.5517,0.3,-6.9,-3.0,0.00,0.05,1740848400,-1.3,73.09,...,100.0,0.0,0.0,0.0,Overcast,2025-03-01 19:00:00,Полтава,Полтавська,Полтавщина,16
608300,49.5879,34.5517,0.3,-6.9,-3.0,0.00,0.05,1740852000,-1.8,81.17,...,100.0,0.0,0.0,0.0,Overcast,2025-03-01 20:00:00,Полтава,Полтавська,Полтавщина,16
608301,49.5879,34.5517,0.3,-6.9,-3.0,0.00,0.05,1740855600,-1.0,68.31,...,99.6,0.0,0.0,0.0,Overcast,2025-03-01 21:00:00,Полтава,Полтавська,Полтавщина,16
608302,49.5879,34.5517,0.3,-6.9,-3.0,0.00,0.05,1740859200,-1.7,71.36,...,98.2,0.0,0.0,0.0,Overcast,2025-03-01 22:00:00,Полтава,Полтавська,Полтавщина,16


In [10]:
# Inspect the prepared alarms data; judging by the repeated start/end epochs,
# it appears to hold one row per region per hour covered by an alarm.
alarms

Unnamed: 0.1,Unnamed: 0,region_city,all_region,start_epoch,end_epoch,hour_time
0,0,Львівська,1,1645688597,1645696348,2022-02-24 07:00:00
1,1,Львівська,1,1645688597,1645696348,2022-02-24 08:00:00
2,2,Львівська,1,1645688597,1645696348,2022-02-24 09:00:00
3,3,Львівська,1,1645688597,1645696348,2022-02-24 10:00:00
4,4,Чернігівська,1,1645711243,1645722703,2022-02-24 14:00:00
...,...,...,...,...,...,...
179378,179416,Житомирська,1,1740871567,1740883406,2025-03-01 23:00:00
179379,179417,Житомирська,1,1740871567,1740883406,2025-03-02 00:00:00
179380,179418,Житомирська,1,1740871567,1740883406,2025-03-02 01:00:00
179381,179419,Житомирська,1,1740871567,1740883406,2025-03-02 02:00:00


In [11]:
# Discard the leftover "Unnamed: 0" column from the alarms table (in place).
alarms.drop(columns=["Unnamed: 0"], inplace=True)

To clearly distinguish which columns originated from the `alarms` dataset after the merge, we will add the prefix `alarms_` to all relevant column names.

In [12]:
# Prefix every alarms column with "alarms_" so its origin stays obvious after
# the merge. `add_prefix` already returns a new DataFrame and leaves `alarms`
# untouched, so no explicit `.copy()` is needed beforehand.
alarms_copy = alarms.add_prefix("alarms_")

In [13]:
# Distinct region names on the alarms side; this column is a right-hand key of the upcoming merge.
alarms_copy["alarms_region_city"].unique()

array(['Львівська', 'Чернігівська', 'Вінницька', 'Харківська',
       'Тернопільська', 'Київська', 'Рівненська', 'Черкаська', 'Одеська',
       'Запорізька', 'Волинська', 'Житомирська', 'Херсонська',
       'Миколаївська', 'Хмельницька', 'Івано-Франківська',
       'Дніпропетровська', 'Кіровоградська', 'Чернівецька', 'Полтавська',
       'Сумська', 'Донецька', 'Закарпатська'], dtype=object)

In [14]:
# Distinct region names on the weather side; this column is a left-hand key of the upcoming merge.
weather_reg["region"].unique()

array(['Волинська', 'Кіровоградська', 'Дніпропетровська', 'Київська',
       'Херсонська', 'Чернівецька', 'Чернігівська', 'Одеська',
       'Миколаївська', 'Харківська', 'Хмельницька', 'Донецька',
       'Закарпатська', 'Запорізька', 'Рівненська', 'Житомирська',
       'Тернопільська', 'Полтавська', 'Львівська', 'Івано-Франківська',
       'Черкаська', 'Сумська', 'Вінницька'], dtype=object)

We can see that the columns `"region_alt"`, `"region_id"`, and `"city"` are no longer needed in the dataset. This information is **duplicated in other columns**, and we **won’t be using it for any further merging or analysis**.

As a result, these columns will be safely **dropped** to streamline the dataset.


In [15]:
# Only `region` is used as a join key from here on, so the alternative region
# name, the numeric id and the city name are removed (in place).
weather_reg.drop(columns=["region_alt", "region_id", "city"], inplace=True)

In [16]:
# Left-join the alarms onto the weather rows by (region, hour). Every weather
# row is kept; rows with no matching alarm get NaN in the alarms_* columns.
weather_reg_alarms = pd.merge(
    weather_reg,
    alarms_copy,
    how="left",
    left_on=["region", "datetime"],
    right_on=["alarms_region_city", "alarms_hour_time"],
)

### Look at the merged dataset

In [17]:
# Show the join keys from both sides next to each other, plus the all-region
# flag; NaN in the alarms_* columns means no alarm matched that weather row.
weather_reg_alarms[["datetime", "alarms_hour_time", "region", "alarms_region_city", "alarms_all_region"]]

Unnamed: 0,datetime,alarms_hour_time,region,alarms_region_city,alarms_all_region
0,2022-02-24 00:00:00,,Волинська,,
1,2022-02-24 01:00:00,,Волинська,,
2,2022-02-24 02:00:00,,Волинська,,
3,2022-02-24 03:00:00,,Волинська,,
4,2022-02-24 04:00:00,,Волинська,,
...,...,...,...,...,...
634116,2025-03-01 19:00:00,2025-03-01 19:00:00,Полтавська,Полтавська,1.0
634117,2025-03-01 20:00:00,2025-03-01 20:00:00,Полтавська,Полтавська,1.0
634118,2025-03-01 21:00:00,2025-03-01 21:00:00,Полтавська,Полтавська,1.0
634119,2025-03-01 22:00:00,2025-03-01 22:00:00,Полтавська,Полтавська,1.0


We can see that the merge was successful and the data is structured as expected. However, based on our earlier analysis of the alarms dataset, we remember that the column `alarms_all_region` was used to indicate whether an alarm applied to the **entire region**.

For the **Kyiv region**, this requires special handling:
If the `region` is `Київська` and `alarms_all_region` is `0`, it actually refers to the city of **Kyiv**, not the surrounding region.

To correct this, we update the `region` value to `Київ`.

In [18]:
# Rows in the Kyiv region whose alarm was not region-wide are relabelled as
# the city of Kyiv. Rows without an alarm (NaN flag) never match the second mask.
is_kyiv_region = weather_reg_alarms["region"].eq("Київська")
is_city_level_alarm = weather_reg_alarms["alarms_all_region"].eq(0)
weather_reg_alarms.loc[is_kyiv_region & is_city_level_alarm, "region"] = "Київ"

Since the `alarms_all_region` column was only needed to make the correction for Kyiv, and no longer contains any useful information for further analysis, we can now **safely drop it**.

We will also remove other columns that were only necessary for the merging process and are no longer needed in the final dataset. This helps keep our data clean and focused on relevant features.


In [19]:
# Drop the alarm-side join keys and the all-region flag (in place).
merge_helper_cols = ["alarms_hour_time", "alarms_region_city", "alarms_all_region"]
weather_reg_alarms.drop(columns=merge_helper_cols, inplace=True)

In [20]:
# First rows after the cleanup; only the alarm start/end epochs remain from the alarms side.
weather_reg_alarms.head()

Unnamed: 0,city_latitude,city_longitude,day_tempmax,day_tempmin,day_temp,day_precipcover,day_moonphase,hour_datetimeEpoch,hour_temp,hour_humidity,...,hour_visibility,hour_cloudcover,hour_solarradiation,hour_solarenergy,hour_uvindex,hour_conditions,datetime,region,alarms_start_epoch,alarms_end_epoch
0,50.7469,25.3263,4.9,0.7,2.6,4.17,0.77,1645653600,2.4,89.18,...,0.0,91.5,0.0,0.1,0.0,Overcast,2022-02-24 00:00:00,Волинська,,
1,50.7469,25.3263,4.9,0.7,2.6,4.17,0.77,1645657200,2.4,87.9,...,0.2,88.2,0.0,0.1,0.0,Partially cloudy,2022-02-24 01:00:00,Волинська,,
2,50.7469,25.3263,4.9,0.7,2.6,4.17,0.77,1645660800,2.9,88.58,...,10.0,100.0,0.0,0.1,0.0,Overcast,2022-02-24 02:00:00,Волинська,,
3,50.7469,25.3263,4.9,0.7,2.6,4.17,0.77,1645664400,2.3,86.63,...,0.1,92.0,0.0,0.1,0.0,Overcast,2022-02-24 03:00:00,Волинська,,
4,50.7469,25.3263,4.9,0.7,2.6,4.17,0.77,1645668000,1.9,87.85,...,0.0,93.8,0.0,0.1,0.0,Overcast,2022-02-24 04:00:00,Волинська,,


In [21]:
# Order the rows by timestamp, in place. `datetime` is the only sort key, so
# rows of different regions that share a timestamp end up next to each other.
weather_reg_alarms.sort_values("datetime", inplace=True)

Create a new column `is_alarm` to indicate whether an **alarm was active during a specific hour** (`1` if the row matched an alarm, `0` otherwise).

In [22]:
# Flag rows that matched an alarm in the merge: 1 when a start epoch is
# present, 0 when it is missing.
has_alarm_start = ~weather_reg_alarms["alarms_start_epoch"].isna()
weather_reg_alarms["is_alarm"] = has_alarm_start.astype(int)

In [23]:
# Sanity check: `is_alarm` should be 1 exactly where a start epoch is present.
weather_reg_alarms[["datetime", "is_alarm", "alarms_start_epoch"]]

Unnamed: 0,datetime,is_alarm,alarms_start_epoch
0,2022-02-24 00:00:00,0,
180915,2022-02-24 00:00:00,0,
8017,2022-02-24 00:00:00,0,
83511,2022-02-24 00:00:00,0,
108043,2022-02-24 00:00:00,0,
...,...,...,...
595963,2025-03-01 23:00:00,1,1.740872e+09
226494,2025-03-01 23:00:00,1,1.740863e+09
614517,2025-03-01 23:00:00,0,
344645,2025-03-01 23:00:00,1,1.740863e+09


### Creating the Target Variable for Regression

For our regression task, we want the model to predict the **start time of the next alarm** (`alarms_start_epoch`) based on weather conditions and other features.

However, since most rows in the dataset do not correspond to an active alarm, the `alarms_start_epoch` column contains many missing (`NaN`) values. To provide a complete target for each row, we apply the following approach:

#### ✅ Using Backfill to Fill Missing Values

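We fill the missing values in the `alarms_start_epoch` column (and, alongside it, `alarms_end_epoch`) using the **backfill** method: each missing value is replaced by the next available value further down the chronologically sorted data.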

In [24]:
# Backfill the alarm start/end epochs so that a row without an alarm carries
# the values of the next alarm row. The fill is done separately per region:
# the frame is ordered by timestamp only, so rows of different regions are
# interleaved, and a frame-wide bfill would copy alarm times from whichever
# region's row happens to come next. Rows after a region's last alarm stay NaN.
alarm_cols = ['alarms_start_epoch', 'alarms_end_epoch']
weather_reg_alarms[alarm_cols] = (
    weather_reg_alarms
    .groupby("region", sort=False)[alarm_cols]
    .bfill()
)

In [25]:
# Column dtypes and non-null counts after the backfill.
weather_reg_alarms.info()

<class 'pandas.core.frame.DataFrame'>
Index: 634121 entries, 0 to 634120
Data columns (total 31 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   city_latitude        634121 non-null  float64
 1   city_longitude       634121 non-null  float64
 2   day_tempmax          634121 non-null  float64
 3   day_tempmin          634121 non-null  float64
 4   day_temp             634121 non-null  float64
 5   day_precipcover      634121 non-null  float64
 6   day_moonphase        634121 non-null  float64
 7   hour_datetimeEpoch   634121 non-null  int64  
 8   hour_temp            634121 non-null  float64
 9   hour_humidity        634121 non-null  float64
 10  hour_dew             634121 non-null  float64
 11  hour_precip          634121 non-null  float64
 12  hour_precipprob      634121 non-null  float64
 13  hour_snow            634121 non-null  float64
 14  hour_snowdepth       634121 non-null  float64
 15  hour_preciptype      6

In [26]:
# Size of the weather frame before the alarms join — presumably shown for
# comparison with the row count of the merged frame above.
weather_reg.shape

(608304, 28)

In [27]:
# Peek at the prepared ISW data: numeric per-term columns plus an `hour_time` key.
isw.head()

Unnamed: 0,advanced,air,army,artillery,authority,avdiivka,bakhmut,belarus,border,brigade,...,significant,southeast,southwest,state,unit,unspecified,use,wagner,within,hour_time
0,0.0,0.271181,0.0,0.0,0.0,0.0,0.0,0.606153,0.270938,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.424699,0.0,0.0,2022-02-25 00:00:00
1,0.0,0.271181,0.0,0.0,0.0,0.0,0.0,0.606153,0.270938,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.424699,0.0,0.0,2022-02-25 01:00:00
2,0.0,0.271181,0.0,0.0,0.0,0.0,0.0,0.606153,0.270938,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.424699,0.0,0.0,2022-02-25 02:00:00
3,0.0,0.271181,0.0,0.0,0.0,0.0,0.0,0.606153,0.270938,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.424699,0.0,0.0,2022-02-25 03:00:00
4,0.0,0.271181,0.0,0.0,0.0,0.0,0.0,0.606153,0.270938,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.424699,0.0,0.0,2022-02-25 04:00:00


In [28]:
# Left-join the ISW columns onto every weather/alarm row by hour. Rows whose
# timestamp has no ISW entry get NaN in all ISW columns.
final = pd.merge(
    weather_reg_alarms,
    isw,
    how="left",
    left_on=["datetime"],
    right_on=["hour_time"],
)

In [29]:
# First rows of the combined dataset; the ISW columns are NaN for timestamps without an ISW entry.
final.head()

Unnamed: 0,city_latitude,city_longitude,day_tempmax,day_tempmin,day_temp,day_precipcover,day_moonphase,hour_datetimeEpoch,hour_temp,hour_humidity,...,significant,southeast,southwest,state,unit,unspecified,use,wagner,within,hour_time
0,50.7469,25.3263,4.9,0.7,2.6,4.17,0.77,1645653600,2.4,89.18,...,,,,,,,,,,
1,49.2336,28.4486,5.0,0.7,2.8,4.17,0.77,1645653600,2.1,91.76,...,,,,,,,,,,
2,48.5085,32.2656,6.2,-1.3,2.2,0.0,0.77,1645653600,0.0,82.64,...,,,,,,,,,,
3,49.4168,26.9743,4.7,0.2,2.3,8.33,0.77,1645653600,2.2,88.52,...,,,,,,,,,,
4,47.8289,35.1626,8.0,-2.0,3.3,0.0,0.77,1645653600,1.0,80.38,...,,,,,,,,,,


In [30]:
# Remove both timestamp columns that served as join keys (in place).
final.drop(columns=["datetime", "hour_time"], inplace=True)

In [31]:
# Replace every remaining missing value with 0, in place.
# NOTE(review): this applies to all columns, including any alarm epoch values
# still missing after the backfill — confirm 0 is an acceptable placeholder there.
final.fillna(value=0, inplace=True)

In [32]:
# Persist the final dataset; the row index is not written to the file.
final.to_csv(path_or_buf="../prepared_data/final_dataset.csv", index=False)