In [1]:
import pandas as pd
import datetime
import weather_scrap as w
import alerts_in_ua as a
from sklearn.preprocessing import LabelEncoder

### Merging regions and weather

In [2]:
df_regions = pd.read_csv(f"../data/regions.csv")

In [3]:
df_regions.head(5)

Unnamed: 0,region,center_city_ua,center_city_en,region_alt,region_id
0,АР Крим,Сімферополь,Simferopol,Крим,1
1,Вінницька,Вінниця,Vinnytsia,Вінниччина,2
2,Волинська,Луцьк,Lutsk,Волинь,3
3,Дніпропетровська,Дніпро,Dnipro,Дніпропетровщина,4
4,Донецька,Донецьк,Donetsk,Донеччина,5


In [4]:
df_regions["region"] = df_regions["region"].apply(lambda x: x + " обл.")

In [5]:
weather = w.generate_forecast("Lviv, Ukraine")
df_weather = pd.DataFrame(weather)
df_weather.head(5)

Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,...,hour_snowdepth,hour_preciptype,hour_windgust,hour_windspeed,hour_winddir,hour_pressure,hour_visibility,hour_cloudcover,hour_uvindex,hour_conditions
0,"Львів, Україна",2025-04-19,1745010000,18.3,12.8,15.1,10.6,76.2,23.3,37.5,...,0.0,,15.1,5.0,180.3,1013.0,24.1,98.9,0.0,Overcast
1,"Львів, Україна",2025-04-19,1745010000,18.3,12.8,15.1,10.6,76.2,23.3,37.5,...,0.0,,18.4,6.5,174.5,1013.0,24.1,100.0,1.0,Overcast
2,"Львів, Україна",2025-04-19,1745010000,18.3,12.8,15.1,10.6,76.2,23.3,37.5,...,0.0,,18.0,6.5,184.4,1013.0,24.1,100.0,1.0,Overcast
3,"Львів, Україна",2025-04-19,1745010000,18.3,12.8,15.1,10.6,76.2,23.3,37.5,...,0.0,,18.0,5.8,185.1,1012.0,24.1,100.0,1.0,Overcast
4,"Львів, Україна",2025-04-19,1745010000,18.3,12.8,15.1,10.6,76.2,23.3,37.5,...,0.0,,15.8,5.0,167.4,1012.0,24.1,100.0,5.0,Overcast


In [6]:
df_weather.columns

Index(['city_resolvedAddress', 'day_datetime', 'day_datetimeEpoch',
       'day_tempmax', 'day_tempmin', 'day_temp', 'day_dew', 'day_humidity',
       'day_precip', 'day_precipcover', 'day_solarradiation',
       'day_solarenergy', 'day_uvindex', 'day_sunrise', 'day_sunset',
       'day_moonphase', 'hour_datetime', 'hour_datetimeEpoch', 'hour_temp',
       'hour_humidity', 'hour_dew', 'hour_precip', 'hour_precipprob',
       'hour_snow', 'hour_snowdepth', 'hour_preciptype', 'hour_windgust',
       'hour_windspeed', 'hour_winddir', 'hour_pressure', 'hour_visibility',
       'hour_cloudcover', 'hour_uvindex', 'hour_conditions'],
      dtype='object')

In [7]:
df_weather["city"] = df_weather["city_resolvedAddress"].apply(lambda x: x.split(",")[0])
df_weather["city"] = df_weather["city"].replace('Хмельницька область', "Хмельницький")

In [8]:
df_weather.head(5)

Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,...,hour_preciptype,hour_windgust,hour_windspeed,hour_winddir,hour_pressure,hour_visibility,hour_cloudcover,hour_uvindex,hour_conditions,city
0,"Львів, Україна",2025-04-19,1745010000,18.3,12.8,15.1,10.6,76.2,23.3,37.5,...,,15.1,5.0,180.3,1013.0,24.1,98.9,0.0,Overcast,Львів
1,"Львів, Україна",2025-04-19,1745010000,18.3,12.8,15.1,10.6,76.2,23.3,37.5,...,,18.4,6.5,174.5,1013.0,24.1,100.0,1.0,Overcast,Львів
2,"Львів, Україна",2025-04-19,1745010000,18.3,12.8,15.1,10.6,76.2,23.3,37.5,...,,18.0,6.5,184.4,1013.0,24.1,100.0,1.0,Overcast,Львів
3,"Львів, Україна",2025-04-19,1745010000,18.3,12.8,15.1,10.6,76.2,23.3,37.5,...,,18.0,5.8,185.1,1012.0,24.1,100.0,1.0,Overcast,Львів
4,"Львів, Україна",2025-04-19,1745010000,18.3,12.8,15.1,10.6,76.2,23.3,37.5,...,,15.8,5.0,167.4,1012.0,24.1,100.0,5.0,Overcast,Львів


In [9]:
df_weather_reg = pd.merge(df_weather, df_regions, left_on="city",right_on="center_city_ua")

In [10]:
df_weather_reg.head(5)

Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,...,hour_visibility,hour_cloudcover,hour_uvindex,hour_conditions,city,region,center_city_ua,center_city_en,region_alt,region_id
0,"Львів, Україна",2025-04-19,1745010000,18.3,12.8,15.1,10.6,76.2,23.3,37.5,...,24.1,98.9,0.0,Overcast,Львів,Львівська обл.,Львів,Lviv,Львівщина,13
1,"Львів, Україна",2025-04-19,1745010000,18.3,12.8,15.1,10.6,76.2,23.3,37.5,...,24.1,100.0,1.0,Overcast,Львів,Львівська обл.,Львів,Lviv,Львівщина,13
2,"Львів, Україна",2025-04-19,1745010000,18.3,12.8,15.1,10.6,76.2,23.3,37.5,...,24.1,100.0,1.0,Overcast,Львів,Львівська обл.,Львів,Lviv,Львівщина,13
3,"Львів, Україна",2025-04-19,1745010000,18.3,12.8,15.1,10.6,76.2,23.3,37.5,...,24.1,100.0,1.0,Overcast,Львів,Львівська обл.,Львів,Lviv,Львівщина,13
4,"Львів, Україна",2025-04-19,1745010000,18.3,12.8,15.1,10.6,76.2,23.3,37.5,...,24.1,100.0,5.0,Overcast,Львів,Львівська обл.,Львів,Lviv,Львівщина,13


In [11]:
df_weather_reg["region"] = df_weather_reg["region"].apply(lambda x: x + " область")

In [12]:
kyiv_reg = df_weather_reg[df_weather_reg["region"] == "Київська область"].copy()
kyiv_reg["region"] = "Київ"

df_weather_reg = pd.concat([df_weather_reg, kyiv_reg])

### Merging weather, alarms and regions together

In [13]:
df_alarms = pd.DataFrame(a.get_alerts())
df_alarms.head()

Unnamed: 0,region,status
0,Автономна Республіка Крим,A
1,Волинська область,N
2,Вінницька область,N
3,Дніпропетровська область,N
4,Донецька область,N


In [14]:
df_alarms["status"] = df_alarms["status"].apply(lambda x: 1 if x == 'A' else 0)

In [15]:
df_weather_reg_al = df_weather_reg.merge(df_alarms, how="left", left_on="region", right_on="region")

In [16]:
df_weather_reg_al["day_datetime"] = pd.to_datetime(df_weather_reg_al["day_datetime"])

### Merging vectorized ISW to the rest

In [17]:
df_isw_vect = pd.read_csv("../data/ISW_vector.csv")

In [18]:
df_isw_vect.tail()

Unnamed: 0,date,content,lemma_content,stem_content,keywords
1078,2025-02-26,russian offensive campaign assessment 2025 dav...,russian offensive campaign assessment 2025 dav...,russian offens campaign assess 2025 davit gasp...,0.443 0.43 0.808 0.686 0.453 0.453 0.474 0.443...
1079,2025-02-27,russian offensive campaign assessment 2025 chr...,russian offensive campaign assessment 2025 chr...,russian offens campaign assess 2025 christina ...,0.458 0.924 0.447 0.625 0.454 0.482 0.476 0.45...
1080,2025-02-28,russian offensive campaign assessment 2025 oli...,russian offensive campaign assessment 2025 oli...,russian offens campaign assess 2025 olivia gib...,0.507 0.501 0.9 0.777 0.498 0.543 0.516 0.505 ...
1081,2025-03-01,russian offensive campaign assessment 2025 dav...,russian offensive campaign assessment 2025 dav...,russian offens campaign assess 2025 davit gasp...,0.472 0.483 0.796 0.701 0.47 0.466 0.489 0.443...
1082,2025-04-17,russian offensive campaign assessment 2025 gra...,russian offensive campaign assessment 2025 gra...,russian offens campaign assess 2025 grace mapp...,0.506 0.505 0.75 0.789 0.492 0.545 0.529 0.487...


In [19]:
yesterday_str = (datetime.datetime.now() - datetime.timedelta(days=1)).strftime('%Y-%m-%d')
df_isw_vect = df_isw_vect[df_isw_vect['date'] == yesterday_str]

In [20]:
df_isw_vect["date"] = pd.to_datetime(df_isw_vect["date"])
df_isw_vect["date_tomorrow_datetime"] = df_isw_vect["date"] + datetime.timedelta(days=1)

In [21]:
df_ready = df_weather_reg_al.merge(df_isw_vect, how="left", left_on="day_datetime", right_on="date_tomorrow_datetime")

In [22]:
df_ready.columns

Index(['city_resolvedAddress', 'day_datetime', 'day_datetimeEpoch',
       'day_tempmax', 'day_tempmin', 'day_temp', 'day_dew', 'day_humidity',
       'day_precip', 'day_precipcover', 'day_solarradiation',
       'day_solarenergy', 'day_uvindex', 'day_sunrise', 'day_sunset',
       'day_moonphase', 'hour_datetime', 'hour_datetimeEpoch', 'hour_temp',
       'hour_humidity', 'hour_dew', 'hour_precip', 'hour_precipprob',
       'hour_snow', 'hour_snowdepth', 'hour_preciptype', 'hour_windgust',
       'hour_windspeed', 'hour_winddir', 'hour_pressure', 'hour_visibility',
       'hour_cloudcover', 'hour_uvindex', 'hour_conditions', 'city', 'region',
       'center_city_ua', 'center_city_en', 'region_alt', 'region_id', 'status',
       'date', 'content', 'lemma_content', 'stem_content', 'keywords',
       'date_tomorrow_datetime'],
      dtype='object')

In [23]:
df_ready.rename({"status": "event_all_region"}, axis=1, inplace=True)

In [24]:
df_ready.head()

Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,...,center_city_en,region_alt,region_id,event_all_region,date,content,lemma_content,stem_content,keywords,date_tomorrow_datetime
0,"Львів, Україна",2025-04-19,1745010000,18.3,12.8,15.1,10.6,76.2,23.3,37.5,...,Lviv,Львівщина,13,,NaT,,,,,NaT
1,"Львів, Україна",2025-04-19,1745010000,18.3,12.8,15.1,10.6,76.2,23.3,37.5,...,Lviv,Львівщина,13,,NaT,,,,,NaT
2,"Львів, Україна",2025-04-19,1745010000,18.3,12.8,15.1,10.6,76.2,23.3,37.5,...,Lviv,Львівщина,13,,NaT,,,,,NaT
3,"Львів, Україна",2025-04-19,1745010000,18.3,12.8,15.1,10.6,76.2,23.3,37.5,...,Lviv,Львівщина,13,,NaT,,,,,NaT
4,"Львів, Україна",2025-04-19,1745010000,18.3,12.8,15.1,10.6,76.2,23.3,37.5,...,Lviv,Львівщина,13,,NaT,,,,,NaT


In [25]:
df = df_ready.fillna(df_ready.median(numeric_only=True))
df['hour_preciptype'] = df['hour_preciptype'].astype(str)

In [26]:
df_encoded = pd.get_dummies(df, columns=['hour_preciptype'], prefix='hour_preciptype')

In [27]:
bool_columns = df_encoded.select_dtypes(include=['bool']).columns
df_encoded[bool_columns] = df_encoded[bool_columns].astype(int)

In [28]:
preciptype = [
    "hour_preciptype_['freezingrain']",
    "hour_preciptype_['ice']",
    "hour_preciptype_['rain', 'snow']",
    "hour_preciptype_['rain']",
    "hour_preciptype_['snow']"
]

for col in preciptype:
    if col not in df_encoded.columns:
        df_encoded[col] = 0

In [31]:
todrop = [
    'city_resolvedAddress', 
    'day_datetime','city', 
    'region',
    'center_city_ua', 
    'center_city_en', 
    'region_alt',
    'date', 
    'content', 
    'lemma_content', 
    'stem_content',
    'date_tomorrow_datetime']

In [32]:
df_encoded = df_encoded.drop(todrop, axis=1)

In [33]:
df_encoded.columns

Index(['day_datetimeEpoch', 'day_tempmax', 'day_tempmin', 'day_temp',
       'day_dew', 'day_humidity', 'day_precip', 'day_precipcover',
       'day_solarradiation', 'day_solarenergy', 'day_uvindex', 'day_sunrise',
       'day_sunset', 'day_moonphase', 'hour_datetime', 'hour_datetimeEpoch',
       'hour_temp', 'hour_humidity', 'hour_dew', 'hour_precip',
       'hour_precipprob', 'hour_snow', 'hour_snowdepth', 'hour_windgust',
       'hour_windspeed', 'hour_winddir', 'hour_pressure', 'hour_visibility',
       'hour_cloudcover', 'hour_uvindex', 'hour_conditions', 'region_id',
       'event_all_region', 'keywords', 'hour_preciptype_None',
       'hour_preciptype_['rain']', 'hour_preciptype_['freezingrain']',
       'hour_preciptype_['ice']', 'hour_preciptype_['rain', 'snow']',
       'hour_preciptype_['snow']'],
      dtype='object')

In [34]:
df_encoded.shape

(35, 40)

In [None]:
temp_df = df_encoded['keywords'].str.split(expand=True)
temp_df = temp_df.astype('float64')
df_encoded_v2 = pd.concat([df_encoded.drop('keywords', axis=1), temp_df], axis=1)

In [None]:
le = LabelEncoder()
df_encoded_v2['hour_conditions'] = le.fit_transform(df['hour_conditions'])

In [None]:
df_encoded_v2['hour_datetime'] = pd.to_datetime(df_encoded_v2['hour_datetime'], format='%H:%M:%S')
df_encoded_v2['day_sunrise'] = pd.to_datetime(df_encoded_v2['day_sunrise'], format='%H:%M:%S')
df_encoded_v2['day_sunset'] = pd.to_datetime(df_encoded_v2['day_sunset'], format='%H:%M:%S')
df_encoded_v2['hour_datetime'] = (df_encoded_v2['hour_datetime'].dt.hour * 3600 +
                                  df_encoded_v2['hour_datetime'].dt.minute * 60 +
                                  df_encoded_v2['hour_datetime'].dt.second) / 86400

df_encoded_v2['day_sunrise'] = (df_encoded_v2['day_sunrise'].dt.hour * 3600 +
                                df_encoded_v2['day_sunrise'].dt.minute * 60 +
                                df_encoded_v2['day_sunrise'].dt.second) / 86400

df_encoded_v2['day_sunset'] = (df_encoded_v2['day_sunset'].dt.hour * 3600 +
                               df_encoded_v2['day_sunset'].dt.minute * 60 +
                               df_encoded_v2['day_sunset'].dt.second) / 86400

In [None]:
#alarms_in_regions
#погода по всіх областях одночасно
#фічі