In [1]:
import pandas as pd
import datetime
import weather_scrap as w
import alerts_in_ua as a
from sklearn.preprocessing import LabelEncoder

### Merging regions and weather

In [2]:
df_regions = pd.read_csv(f"../data/regions.csv")

In [3]:
df_regions.head(5)

Unnamed: 0,region,center_city_ua,center_city_en,region_alt,region_id
0,АР Крим,Сімферополь,Simferopol,Крим,1
1,Вінницька,Вінниця,Vinnytsia,Вінниччина,2
2,Волинська,Луцьк,Lutsk,Волинь,3
3,Дніпропетровська,Дніпро,Dnipro,Дніпропетровщина,4
4,Донецька,Донецьк,Donetsk,Донеччина,5


In [4]:
df_regions["region"].unique()

array(['АР Крим', 'Вінницька', 'Волинська', 'Дніпропетровська',
       'Донецька', 'Житомирська', 'Закарпатська', 'Запорізька',
       'Івано-Франківська', 'Київська', 'Кіровоградська', 'Луганська',
       'Львівська', 'Миколаївська', 'Одеська', 'Полтавська', 'Рівненська',
       'Сумська', 'Тернопільська', 'Харківська', 'Херсонська',
       'Хмельницька', 'Черкаська', 'Чернівецька', 'Чернігівська'],
      dtype=object)

In [5]:
df_regions["region"] = df_regions["region"].apply(lambda x: x + " обл.")

In [6]:
weather_regions = [
    "Vinnytsia, Ukraine",
    "Lutsk, Ukraine",
    "Dnipro, Ukraine",
    "Donetsk, Ukraine",
    "Zhytomyr, Ukraine",
    "Uzhhorod, Ukraine",
    "Zaporizhia, Ukraine",
    "Ivano-Frankivsk, Ukraine",
    "Kyiv, Ukraine",
    "Kropyvnytskyi, Ukraine",
    "Lviv, Ukraine",
    "Mykolaiv, Ukraine",
    "Odessa, Ukraine",
    "Poltava, Ukraine",
    "Rivne, Ukraine",
    "Sumy, Ukraine",
    "Ternopil, Ukraine",
    "Kharkiv, Ukraine",
    "Kherson, Ukraine",
    "Khmelnytskyi, Ukraine",
    "Cherkasy, Ukraine",
    "Chernivtsi, Ukraine",
    "Chernigiv, Ukraine"
]

In [7]:
all_data = []

for region in weather_regions:
    forecast = w.generate_forecast(region)
    
    df_region = pd.DataFrame(forecast)
    all_data.append(df_region)

df_weather = pd.concat(all_data, ignore_index=True)
df_weather.head()

Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,...,hour_snowdepth,hour_preciptype,hour_windgust,hour_windspeed,hour_winddir,hour_pressure,hour_visibility,hour_cloudcover,hour_uvindex,hour_conditions
0,"Вінниця, Україна",2025-04-21,1745182800,24.9,8.5,17.3,2.7,41.1,0.0,0.0,...,0.0,,3.2,2.2,134.2,1012.0,24.1,0.0,0.0,Clear
1,"Вінниця, Україна",2025-04-22,1745269200,25.6,9.9,18.2,3.6,40.1,0.0,0.0,...,0.0,,5.0,4.0,93.8,1013.0,24.1,0.0,0.0,Clear
2,"Вінниця, Україна",2025-04-22,1745269200,25.6,9.9,18.2,3.6,40.1,0.0,0.0,...,0.0,,5.0,2.5,113.2,1013.0,24.1,0.0,0.0,Clear
3,"Вінниця, Україна",2025-04-22,1745269200,25.6,9.9,18.2,3.6,40.1,0.0,0.0,...,0.0,,3.2,2.5,127.5,1013.0,24.1,2.7,0.0,Clear
4,"Вінниця, Україна",2025-04-22,1745269200,25.6,9.9,18.2,3.6,40.1,0.0,0.0,...,0.0,,4.7,3.6,97.8,1014.0,24.1,5.6,0.0,Clear


In [8]:
df_weather.columns

Index(['city_resolvedAddress', 'day_datetime', 'day_datetimeEpoch',
       'day_tempmax', 'day_tempmin', 'day_temp', 'day_dew', 'day_humidity',
       'day_precip', 'day_precipcover', 'day_solarradiation',
       'day_solarenergy', 'day_uvindex', 'day_sunrise', 'day_sunset',
       'day_moonphase', 'hour_datetime', 'hour_datetimeEpoch', 'hour_temp',
       'hour_humidity', 'hour_dew', 'hour_precip', 'hour_precipprob',
       'hour_snow', 'hour_snowdepth', 'hour_preciptype', 'hour_windgust',
       'hour_windspeed', 'hour_winddir', 'hour_pressure', 'hour_visibility',
       'hour_cloudcover', 'hour_uvindex', 'hour_conditions'],
      dtype='object')

In [9]:
df_weather["city"] = df_weather["city_resolvedAddress"].apply(lambda x: x.split(",")[0])
df_weather["city"] = df_weather["city"].replace('Хмельницька область', "Хмельницький")

In [10]:
df_weather.head(5)

Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,...,hour_preciptype,hour_windgust,hour_windspeed,hour_winddir,hour_pressure,hour_visibility,hour_cloudcover,hour_uvindex,hour_conditions,city
0,"Вінниця, Україна",2025-04-21,1745182800,24.9,8.5,17.3,2.7,41.1,0.0,0.0,...,,3.2,2.2,134.2,1012.0,24.1,0.0,0.0,Clear,Вінниця
1,"Вінниця, Україна",2025-04-22,1745269200,25.6,9.9,18.2,3.6,40.1,0.0,0.0,...,,5.0,4.0,93.8,1013.0,24.1,0.0,0.0,Clear,Вінниця
2,"Вінниця, Україна",2025-04-22,1745269200,25.6,9.9,18.2,3.6,40.1,0.0,0.0,...,,5.0,2.5,113.2,1013.0,24.1,0.0,0.0,Clear,Вінниця
3,"Вінниця, Україна",2025-04-22,1745269200,25.6,9.9,18.2,3.6,40.1,0.0,0.0,...,,3.2,2.5,127.5,1013.0,24.1,2.7,0.0,Clear,Вінниця
4,"Вінниця, Україна",2025-04-22,1745269200,25.6,9.9,18.2,3.6,40.1,0.0,0.0,...,,4.7,3.6,97.8,1014.0,24.1,5.6,0.0,Clear,Вінниця


In [11]:
df_weather_reg = pd.merge(df_weather, df_regions, left_on="city",right_on="center_city_ua")

In [12]:
df_weather_reg.head(5)

Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,...,hour_visibility,hour_cloudcover,hour_uvindex,hour_conditions,city,region,center_city_ua,center_city_en,region_alt,region_id
0,"Вінниця, Україна",2025-04-21,1745182800,24.9,8.5,17.3,2.7,41.1,0.0,0.0,...,24.1,0.0,0.0,Clear,Вінниця,Вінницька обл.,Вінниця,Vinnytsia,Вінниччина,2
1,"Вінниця, Україна",2025-04-22,1745269200,25.6,9.9,18.2,3.6,40.1,0.0,0.0,...,24.1,0.0,0.0,Clear,Вінниця,Вінницька обл.,Вінниця,Vinnytsia,Вінниччина,2
2,"Вінниця, Україна",2025-04-22,1745269200,25.6,9.9,18.2,3.6,40.1,0.0,0.0,...,24.1,0.0,0.0,Clear,Вінниця,Вінницька обл.,Вінниця,Vinnytsia,Вінниччина,2
3,"Вінниця, Україна",2025-04-22,1745269200,25.6,9.9,18.2,3.6,40.1,0.0,0.0,...,24.1,2.7,0.0,Clear,Вінниця,Вінницька обл.,Вінниця,Vinnytsia,Вінниччина,2
4,"Вінниця, Україна",2025-04-22,1745269200,25.6,9.9,18.2,3.6,40.1,0.0,0.0,...,24.1,5.6,0.0,Clear,Вінниця,Вінницька обл.,Вінниця,Vinnytsia,Вінниччина,2


In [13]:
df_weather_reg["region"] = df_weather_reg["region"].apply(lambda x: x + " область")

In [14]:
kyiv_reg = df_weather_reg[df_weather_reg["region"] == "Київська область"].copy()
kyiv_reg["region"] = "Київ"
kyiv_reg["region_id"] = 1

df_weather_reg = pd.concat([df_weather_reg, kyiv_reg])

### Merging weather, alarms and regions together

In [15]:
df_alarms = pd.DataFrame(a.get_alerts())
df_alarms.head()

Unnamed: 0,region,status
0,Автономна Республіка Крим,A
1,Волинська область,N
2,Вінницька область,N
3,Дніпропетровська область,N
4,Донецька область,A


In [16]:
df_alarms["status"] = df_alarms["status"].apply(lambda x: 1 if x == 'A' else 0)

In [17]:
df_weather_reg_al = df_weather_reg.merge(df_alarms, how="left", left_on="region", right_on="region")

In [18]:
df_weather_reg_al["day_datetime"] = pd.to_datetime(df_weather_reg_al["day_datetime"])

### Merging vectorized ISW to the rest

In [19]:
df_isw_vect = pd.read_csv("../data/ISW_vector.csv")

In [20]:
df_isw_vect.tail()

Unnamed: 0,date,content,lemma_content,stem_content,keywords
1079,2025-02-27,russian offensive campaign assessment 2025 chr...,russian offensive campaign assessment 2025 chr...,russian offens campaign assess 2025 christina ...,0.439 0.451 0.931 0.459 0.629 0.448 0.432 0.48...
1080,2025-02-28,russian offensive campaign assessment 2025 oli...,russian offensive campaign assessment 2025 oli...,russian offens campaign assess 2025 olivia gib...,0.52 0.512 0.925 0.791 0.511 0.532 0.539 0.507...
1081,2025-03-01,russian offensive campaign assessment 2025 dav...,russian offensive campaign assessment 2025 dav...,russian offens campaign assess 2025 davit gasp...,0.48 0.48 0.451 0.781 0.443 0.7 0.452 0.466 0....
1082,2025-04-17,russian offensive campaign assessment 2025 gra...,russian offensive campaign assessment 2025 gra...,russian offens campaign assess 2025 grace mapp...,0.504 0.518 0.738 0.775 0.493 0.55 0.518 0.495...
1083,2025-04-20,russian offensive campaign assessment 2025 dar...,russian offensive campaign assessment 2025 dar...,russian offens campaign assess 2025 daria novi...,0.513 0.674 0.795 0.485 0.532 0.554 0.519 0.50...


In [21]:
yesterday_str = (datetime.datetime.now() - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
df_isw_vect = df_isw_vect[df_isw_vect['date'] == yesterday_str]

In [22]:
#df_isw_vect["date"] = pd.to_datetime(df_isw_vect["date"])
#df_isw_vect["date_tomorrow_datetime"] = df_isw_vect["date"] + datetime.timedelta(days=1)

In [23]:
#df_ready = df_weather_reg_al.merge(df_isw_vect, how="left", left_on="day_datetime", right_on="date_tomorrow_datetime")
df_isw_vect = pd.concat([df_isw_vect] * len(df_weather_reg_al), ignore_index=True)
df_ready = pd.concat([df_weather_reg_al.reset_index(drop=True), df_isw_vect.reset_index(drop=True)], axis=1)

In [24]:
df_ready.head()

Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,...,center_city_ua,center_city_en,region_alt,region_id,status,date,content,lemma_content,stem_content,keywords
0,"Вінниця, Україна",2025-04-21,1745182800,24.9,8.5,17.3,2.7,41.1,0.0,0.0,...,Вінниця,Vinnytsia,Вінниччина,2,,2025-04-20,russian offensive campaign assessment 2025 dar...,russian offensive campaign assessment 2025 dar...,russian offens campaign assess 2025 daria novi...,0.513 0.674 0.795 0.485 0.532 0.554 0.519 0.50...
1,"Вінниця, Україна",2025-04-22,1745269200,25.6,9.9,18.2,3.6,40.1,0.0,0.0,...,Вінниця,Vinnytsia,Вінниччина,2,,2025-04-20,russian offensive campaign assessment 2025 dar...,russian offensive campaign assessment 2025 dar...,russian offens campaign assess 2025 daria novi...,0.513 0.674 0.795 0.485 0.532 0.554 0.519 0.50...
2,"Вінниця, Україна",2025-04-22,1745269200,25.6,9.9,18.2,3.6,40.1,0.0,0.0,...,Вінниця,Vinnytsia,Вінниччина,2,,2025-04-20,russian offensive campaign assessment 2025 dar...,russian offensive campaign assessment 2025 dar...,russian offens campaign assess 2025 daria novi...,0.513 0.674 0.795 0.485 0.532 0.554 0.519 0.50...
3,"Вінниця, Україна",2025-04-22,1745269200,25.6,9.9,18.2,3.6,40.1,0.0,0.0,...,Вінниця,Vinnytsia,Вінниччина,2,,2025-04-20,russian offensive campaign assessment 2025 dar...,russian offensive campaign assessment 2025 dar...,russian offens campaign assess 2025 daria novi...,0.513 0.674 0.795 0.485 0.532 0.554 0.519 0.50...
4,"Вінниця, Україна",2025-04-22,1745269200,25.6,9.9,18.2,3.6,40.1,0.0,0.0,...,Вінниця,Vinnytsia,Вінниччина,2,,2025-04-20,russian offensive campaign assessment 2025 dar...,russian offensive campaign assessment 2025 dar...,russian offens campaign assess 2025 daria novi...,0.513 0.674 0.795 0.485 0.532 0.554 0.519 0.50...


In [25]:
df_ready.columns

Index(['city_resolvedAddress', 'day_datetime', 'day_datetimeEpoch',
       'day_tempmax', 'day_tempmin', 'day_temp', 'day_dew', 'day_humidity',
       'day_precip', 'day_precipcover', 'day_solarradiation',
       'day_solarenergy', 'day_uvindex', 'day_sunrise', 'day_sunset',
       'day_moonphase', 'hour_datetime', 'hour_datetimeEpoch', 'hour_temp',
       'hour_humidity', 'hour_dew', 'hour_precip', 'hour_precipprob',
       'hour_snow', 'hour_snowdepth', 'hour_preciptype', 'hour_windgust',
       'hour_windspeed', 'hour_winddir', 'hour_pressure', 'hour_visibility',
       'hour_cloudcover', 'hour_uvindex', 'hour_conditions', 'city', 'region',
       'center_city_ua', 'center_city_en', 'region_alt', 'region_id', 'status',
       'date', 'content', 'lemma_content', 'stem_content', 'keywords'],
      dtype='object')

In [26]:
df_ready.rename({"status": "event_all_region"}, axis=1, inplace=True)

In [27]:
df_ready.head()

Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,...,center_city_ua,center_city_en,region_alt,region_id,event_all_region,date,content,lemma_content,stem_content,keywords
0,"Вінниця, Україна",2025-04-21,1745182800,24.9,8.5,17.3,2.7,41.1,0.0,0.0,...,Вінниця,Vinnytsia,Вінниччина,2,,2025-04-20,russian offensive campaign assessment 2025 dar...,russian offensive campaign assessment 2025 dar...,russian offens campaign assess 2025 daria novi...,0.513 0.674 0.795 0.485 0.532 0.554 0.519 0.50...
1,"Вінниця, Україна",2025-04-22,1745269200,25.6,9.9,18.2,3.6,40.1,0.0,0.0,...,Вінниця,Vinnytsia,Вінниччина,2,,2025-04-20,russian offensive campaign assessment 2025 dar...,russian offensive campaign assessment 2025 dar...,russian offens campaign assess 2025 daria novi...,0.513 0.674 0.795 0.485 0.532 0.554 0.519 0.50...
2,"Вінниця, Україна",2025-04-22,1745269200,25.6,9.9,18.2,3.6,40.1,0.0,0.0,...,Вінниця,Vinnytsia,Вінниччина,2,,2025-04-20,russian offensive campaign assessment 2025 dar...,russian offensive campaign assessment 2025 dar...,russian offens campaign assess 2025 daria novi...,0.513 0.674 0.795 0.485 0.532 0.554 0.519 0.50...
3,"Вінниця, Україна",2025-04-22,1745269200,25.6,9.9,18.2,3.6,40.1,0.0,0.0,...,Вінниця,Vinnytsia,Вінниччина,2,,2025-04-20,russian offensive campaign assessment 2025 dar...,russian offensive campaign assessment 2025 dar...,russian offens campaign assess 2025 daria novi...,0.513 0.674 0.795 0.485 0.532 0.554 0.519 0.50...
4,"Вінниця, Україна",2025-04-22,1745269200,25.6,9.9,18.2,3.6,40.1,0.0,0.0,...,Вінниця,Vinnytsia,Вінниччина,2,,2025-04-20,russian offensive campaign assessment 2025 dar...,russian offensive campaign assessment 2025 dar...,russian offens campaign assess 2025 daria novi...,0.513 0.674 0.795 0.485 0.532 0.554 0.519 0.50...


In [28]:
df = df_ready.fillna(df_ready.median(numeric_only=True))
df['hour_preciptype'] = df['hour_preciptype'].astype(str)

In [29]:
df_encoded = pd.get_dummies(df, columns=['hour_preciptype'], prefix='hour_preciptype')

In [30]:
bool_columns = df_encoded.select_dtypes(include=['bool']).columns
df_encoded[bool_columns] = df_encoded[bool_columns].astype(int)

In [31]:
preciptype = [
    "hour_preciptype_['freezingrain']",
    "hour_preciptype_['ice']",
    "hour_preciptype_['rain', 'snow']",
    "hour_preciptype_['rain']",
    "hour_preciptype_['snow']"
]

for col in preciptype:
    if col not in df_encoded.columns:
        df_encoded[col] = 0

In [34]:
todrop = [
    'city_resolvedAddress',
    'day_datetime', 
    'city', 
    'region',
    'center_city_ua', 
    'center_city_en', 
    'region_alt', 
    'date', 
    'content', 
    'lemma_content', 
    'stem_content']

In [35]:
df_encoded = df_encoded.drop(todrop, axis=1)

In [36]:
df_encoded.columns

Index(['day_datetimeEpoch', 'day_tempmax', 'day_tempmin', 'day_temp',
       'day_dew', 'day_humidity', 'day_precip', 'day_precipcover',
       'day_solarradiation', 'day_solarenergy', 'day_uvindex', 'day_sunrise',
       'day_sunset', 'day_moonphase', 'hour_datetime', 'hour_datetimeEpoch',
       'hour_temp', 'hour_humidity', 'hour_dew', 'hour_precip',
       'hour_precipprob', 'hour_snow', 'hour_snowdepth', 'hour_windgust',
       'hour_windspeed', 'hour_winddir', 'hour_pressure', 'hour_visibility',
       'hour_cloudcover', 'hour_uvindex', 'hour_conditions', 'region_id',
       'event_all_region', 'keywords', 'hour_preciptype_None',
       'hour_preciptype_['rain']', 'hour_preciptype_['freezingrain']',
       'hour_preciptype_['ice']', 'hour_preciptype_['rain', 'snow']',
       'hour_preciptype_['snow']'],
      dtype='object')

In [34]:
df_encoded.shape

(598, 40)

In [35]:
temp_df = df_encoded['keywords'].str.split(expand=True)
temp_df = temp_df.astype('float64')
df_encoded_v2 = pd.concat([df_encoded.drop('keywords', axis=1), temp_df], axis=1)

In [36]:
le = LabelEncoder()
df_encoded_v2['hour_conditions'] = le.fit_transform(df['hour_conditions'])

In [37]:
df_encoded_v2['hour_datetime'] = pd.to_datetime(df_encoded_v2['hour_datetime'], format='%H:%M:%S')
df_encoded_v2['day_sunrise'] = pd.to_datetime(df_encoded_v2['day_sunrise'], format='%H:%M:%S')
df_encoded_v2['day_sunset'] = pd.to_datetime(df_encoded_v2['day_sunset'], format='%H:%M:%S')
df_encoded_v2['hour_datetime'] = (df_encoded_v2['hour_datetime'].dt.hour * 3600 +
                                  df_encoded_v2['hour_datetime'].dt.minute * 60 +
                                  df_encoded_v2['hour_datetime'].dt.second) / 86400

df_encoded_v2['day_sunrise'] = (df_encoded_v2['day_sunrise'].dt.hour * 3600 +
                                df_encoded_v2['day_sunrise'].dt.minute * 60 +
                                df_encoded_v2['day_sunrise'].dt.second) / 86400

df_encoded_v2['day_sunset'] = (df_encoded_v2['day_sunset'].dt.hour * 3600 +
                               df_encoded_v2['day_sunset'].dt.minute * 60 +
                               df_encoded_v2['day_sunset'].dt.second) / 86400

In [38]:
df_encoded_v2.head()

Unnamed: 0,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,day_solarradiation,day_solarenergy,...,990,991,992,993,994,995,996,997,998,999
0,1745182800,24.5,8.5,17.3,3.3,42.3,0.0,0.0,286.9,24.9,...,0.563,0.824,0.66,0.643,0.553,0.645,0.624,0.582,0.557,0.719
1,1745182800,24.5,8.5,17.3,3.3,42.3,0.0,0.0,286.9,24.9,...,0.563,0.824,0.66,0.643,0.553,0.645,0.624,0.582,0.557,0.719
2,1745269200,25.5,10.0,18.1,4.3,42.4,0.0,0.0,283.8,24.6,...,,,,,,,,,,
3,1745269200,25.5,10.0,18.1,4.3,42.4,0.0,0.0,283.8,24.6,...,,,,,,,,,,
4,1745269200,25.5,10.0,18.1,4.3,42.4,0.0,0.0,283.8,24.6,...,,,,,,,,,,


In [None]:
#фічі