In [1]:
import datetime
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

In [3]:
INPUT_DATA_FOLDER = "../../data"
REPORTS_DATA_FILE = "ISW_vector.csv"

OUTPUT_FOLDER = "../../data/all_data_preprocessed"
ISW_OUTPUT_DATA_FILE = "all_isw.csv"
ALARMS_OUTPUT_DATA_FILE = "all_alarms.csv"
WEATHER_EVENTS_OUTPUT_DATA_FILE = "all_weather_by_hour.csv"
MERGED_OUTPUT_DATA_FILE = "weather_regions_alarms_merged.csv"
ALL_MERGED_DATA_FILE = "all_merged"

MODEL_FOLDER = "model"

In [4]:
def isNaN(num):
    return num != num

## Reading data

In [5]:
df_isw = pd.read_csv(f"{INPUT_DATA_FOLDER}/{REPORTS_DATA_FILE}", sep=",")

In [6]:
df_isw.head(5)

Unnamed: 0,date,content,lemma_content,stem_content,keywords
0,2022-02-24,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 30...,0.448 0.451 0.761 0.65 0.423 0.468 0.473 0.426...
1,2022-02-25,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 30...,0.469 0.501 0.748 0.702 0.484 0.489 0.462 0.52...
2,2022-02-26,mason clark george barros and katya stepanenko...,mason clark george barros katya stepanenko 3pm...,mason clark georg barro katya stepanenko 3pm e...,0.37 0.424 0.733 0.349 0.542 0.354 0.387 0.404...
3,2022-02-27,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 4p...,0.454 0.494 0.812 0.681 0.449 0.466 0.515 0.50...
4,2022-02-28,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 33...,0.459 0.454 0.751 0.721 0.444 0.471 0.454 0.50...


## Preparing ISW reports

## Reading models

In [7]:
df_isw.head(5)

Unnamed: 0,date,content,lemma_content,stem_content,keywords
0,2022-02-24,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 30...,0.448 0.451 0.761 0.65 0.423 0.468 0.473 0.426...
1,2022-02-25,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 30...,0.469 0.501 0.748 0.702 0.484 0.489 0.462 0.52...
2,2022-02-26,mason clark george barros and katya stepanenko...,mason clark george barros katya stepanenko 3pm...,mason clark georg barro katya stepanenko 3pm e...,0.37 0.424 0.733 0.349 0.542 0.354 0.387 0.404...
3,2022-02-27,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 4p...,0.454 0.494 0.812 0.681 0.449 0.466 0.515 0.50...
4,2022-02-28,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 33...,0.459 0.454 0.751 0.721 0.444 0.471 0.454 0.50...


In [8]:
df_isw["report_date"] = pd.to_datetime(df_isw["date"])

In [9]:
df_isw['date_tomorrow_datetime'] = df_isw['report_date'].apply(lambda x: x+datetime.timedelta(days=1))

In [10]:
df_isw.drop("date", axis=1, inplace=True)

In [11]:
df_isw.head(5)

Unnamed: 0,content,lemma_content,stem_content,keywords,report_date,date_tomorrow_datetime
0,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 30...,0.448 0.451 0.761 0.65 0.423 0.468 0.473 0.426...,2022-02-24,2022-02-25
1,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 30...,0.469 0.501 0.748 0.702 0.484 0.489 0.462 0.52...,2022-02-25,2022-02-26
2,mason clark george barros and katya stepanenko...,mason clark george barros katya stepanenko 3pm...,mason clark georg barro katya stepanenko 3pm e...,0.37 0.424 0.733 0.349 0.542 0.354 0.387 0.404...,2022-02-26,2022-02-27
3,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 4p...,0.454 0.494 0.812 0.681 0.449 0.466 0.515 0.50...,2022-02-27,2022-02-28
4,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 33...,0.459 0.454 0.751 0.721 0.444 0.471 0.454 0.50...,2022-02-28,2022-03-01


In [12]:
df_isw.to_csv(f"{OUTPUT_FOLDER}/{ISW_OUTPUT_DATA_FILE}", sep=";", index=False)

## Prepare events data

In [13]:
EVENTS_DATA_FILE = "alarms.csv"

In [14]:
df_events = pd.read_csv(f"{INPUT_DATA_FOLDER}/{EVENTS_DATA_FILE}", sep=";")

In [15]:
df_events.head(5)

Unnamed: 0,id,region_id,region_city,all_region,start,end
0,52432,12,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28
1,53292,23,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43
2,52080,3,Вінницька обл.,1,2022-02-24 15:40:42,2022-02-24 16:10:42
3,52857,19,Харківська обл.,1,2022-02-24 20:11:47,2022-02-24 20:59:47
4,52700,18,Тернопільська обл.,1,2022-02-25 01:59:36,2022-02-25 09:00:19


In [16]:
df_events["region_city"].unique()

array(['Львівська обл.', 'Чернігівська обл.', 'Вінницька обл.',
       'Харківська обл.', 'Тернопільська обл.', 'Київ', 'Рівненська обл.',
       'Черкаська обл.', 'Одеська обл.', 'Запорізька обл.',
       'Волинська обл.', 'Житомирська обл.', 'Херсонська обл.',
       'Миколаївська обл.', 'Хмельницька обл.', 'Івано-Франківська обл.',
       'Дніпропетровська обл.', 'Кіровоградська обл.', 'Чернівецька обл.',
       'Полтавська обл.', 'Київська обл.', 'Сумська обл.',
       'Донецька обл.', 'Закарпатська обл.', 'Крим'], dtype=object)

In [17]:
df_events_v2 = df_events.drop(["id","region_id"],axis=1)

In [18]:
df_events_v2.head(5)

Unnamed: 0,region_city,all_region,start,end
0,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28
1,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43
2,Вінницька обл.,1,2022-02-24 15:40:42,2022-02-24 16:10:42
3,Харківська обл.,1,2022-02-24 20:11:47,2022-02-24 20:59:47
4,Тернопільська обл.,1,2022-02-25 01:59:36,2022-02-25 09:00:19


In [19]:
df_events_v2[isNaN(df_events_v2).any(axis=1)].head(5)

Unnamed: 0,region_city,all_region,start,end


In [20]:
df_events_v2["start"] = pd.to_datetime(df_events_v2["start"])
df_events_v2["end"] = pd.to_datetime(df_events_v2["end"])

In [21]:
df_events_v2["start_hour"] = df_events_v2['start'].dt.floor('h')
df_events_v2["end_hour"] = df_events_v2['end'].dt.ceil('h')

In [22]:
df_events_v2["day_date"] = df_events_v2["start"].dt.date

df_events_v2["start_hour_datetimeEpoch"] = df_events_v2['start_hour'].apply(lambda x: int(x.timestamp())  if not isNaN(x) else None)
df_events_v2["end_hour_datetimeEpoch"] = df_events_v2['end_hour'].apply(lambda x: int(x.timestamp())  if not isNaN(x) else None)

df_events_v2.head(10)

Unnamed: 0,region_city,all_region,start,end,start_hour,end_hour,day_date,start_hour_datetimeEpoch,end_hour_datetimeEpoch
0,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800
1,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600
2,Вінницька обл.,1,2022-02-24 15:40:42,2022-02-24 16:10:42,2022-02-24 15:00:00,2022-02-24 17:00:00,2022-02-24,1645714800,1645722000
3,Харківська обл.,1,2022-02-24 20:11:47,2022-02-24 20:59:47,2022-02-24 20:00:00,2022-02-24 21:00:00,2022-02-24,1645732800,1645736400
4,Тернопільська обл.,1,2022-02-25 01:59:36,2022-02-25 09:00:19,2022-02-25 01:00:00,2022-02-25 10:00:00,2022-02-25,1645750800,1645783200
5,Вінницька обл.,1,2022-02-25 04:01:42,2022-02-25 08:35:42,2022-02-25 04:00:00,2022-02-25 09:00:00,2022-02-25,1645761600,1645779600
6,Харківська обл.,1,2022-02-25 04:56:47,2022-02-25 05:40:47,2022-02-25 04:00:00,2022-02-25 06:00:00,2022-02-25,1645761600,1645768800
7,Чернігівська обл.,1,2022-02-25 06:46:43,2022-02-25 06:52:43,2022-02-25 06:00:00,2022-02-25 07:00:00,2022-02-25,1645768800,1645772400
8,Львівська обл.,1,2022-02-25 06:53:17,2022-02-25 07:56:28,2022-02-25 06:00:00,2022-02-25 08:00:00,2022-02-25,1645768800,1645776000
9,Київ,1,2022-02-25 07:19:04,2022-02-25 07:49:04,2022-02-25 07:00:00,2022-02-25 08:00:00,2022-02-25,1645772400,1645776000


In [23]:
df_events_v2[df_events_v2["all_region"]==1].shape

(55788, 9)

In [24]:
df_events_v2[df_events_v2["all_region"]==1].head(5)

Unnamed: 0,region_city,all_region,start,end,start_hour,end_hour,day_date,start_hour_datetimeEpoch,end_hour_datetimeEpoch
0,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800
1,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600
2,Вінницька обл.,1,2022-02-24 15:40:42,2022-02-24 16:10:42,2022-02-24 15:00:00,2022-02-24 17:00:00,2022-02-24,1645714800,1645722000
3,Харківська обл.,1,2022-02-24 20:11:47,2022-02-24 20:59:47,2022-02-24 20:00:00,2022-02-24 21:00:00,2022-02-24,1645732800,1645736400
4,Тернопільська обл.,1,2022-02-25 01:59:36,2022-02-25 09:00:19,2022-02-25 01:00:00,2022-02-25 10:00:00,2022-02-25,1645750800,1645783200


In [25]:
df_events_v2[df_events_v2["all_region"]==0].shape

(0, 9)

In [26]:
df_events_v2[df_events_v2["all_region"]==0].head(5)

Unnamed: 0,region_city,all_region,start,end,start_hour,end_hour,day_date,start_hour_datetimeEpoch,end_hour_datetimeEpoch


In [27]:
df_events_v2.to_csv(f"{OUTPUT_FOLDER}/{ALARMS_OUTPUT_DATA_FILE}", sep=";", index=False)

## Prepare weather

In [28]:
WEATHER_DATA_FILE = "weather_by_hour.csv"

In [29]:
df_weather = pd.read_csv(f"{INPUT_DATA_FOLDER}/{WEATHER_DATA_FILE}", sep=",")
df_weather.head(5)

Unnamed: 0,city_latitude,city_longitude,city_resolvedAddress,city_address,city_timezone,city_tzoffset,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,...,hour_pressure,hour_visibility,hour_cloudcover,hour_solarradiation,hour_solarenergy,hour_uvindex,hour_conditions,hour_icon,hour_source,hour_stations
0,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1020.0,0.0,91.5,0.0,,0.0,Overcast,snow,obs,remote
1,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1021.0,0.2,88.2,0.0,,0.0,Partially cloudy,fog,obs,remote
2,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1022.0,10.0,100.0,,,,Overcast,cloudy,obs,33177099999
3,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1021.0,0.1,92.0,0.0,,0.0,Overcast,fog,obs,remote
4,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1021.0,0.0,93.8,0.0,,0.0,Overcast,cloudy,obs,remote


In [30]:
df_weather["day_datetime"] = pd.to_datetime(df_weather["day_datetime"])

In [31]:
df_weather.shape

(608304, 65)

In [32]:
df_weather.head(10)

Unnamed: 0,city_latitude,city_longitude,city_resolvedAddress,city_address,city_timezone,city_tzoffset,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,...,hour_pressure,hour_visibility,hour_cloudcover,hour_solarradiation,hour_solarenergy,hour_uvindex,hour_conditions,hour_icon,hour_source,hour_stations
0,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1020.0,0.0,91.5,0.0,,0.0,Overcast,snow,obs,remote
1,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1021.0,0.2,88.2,0.0,,0.0,Partially cloudy,fog,obs,remote
2,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1022.0,10.0,100.0,,,,Overcast,cloudy,obs,33177099999
3,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1021.0,0.1,92.0,0.0,,0.0,Overcast,fog,obs,remote
4,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1021.0,0.0,93.8,0.0,,0.0,Overcast,cloudy,obs,remote
5,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1022.5,10.0,100.0,0.0,,0.0,Overcast,cloudy,obs,33177099999;33301099999
6,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1021.0,10.0,100.0,0.0,,0.0,Overcast,cloudy,obs,UKLR;33301099999
7,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1022.0,10.0,100.0,0.0,,0.0,Overcast,cloudy,obs,UKLR;33301099999
8,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1024.2,4.4,100.0,,,,"Snow, Overcast",rain,obs,33177099999;UKLR;33301099999
9,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1024.0,2.0,100.0,15.0,0.1,0.0,Overcast,cloudy,obs,UKLR;33301099999


In [33]:
# exclude
weather_exclude = [
"day_feelslikemax",
"day_feelslikemin",
"day_sunriseEpoch",
"day_sunsetEpoch",
"day_description",
"city_latitude",
"city_longitude",
"city_address",
"city_timezone",
"city_tzoffset",
"day_feelslike",
"day_precipprob",
"day_snow",
"day_snowdepth",
"day_windgust",
"day_windspeed",
"day_winddir",
"day_pressure",
"day_cloudcover",
"day_visibility",
"day_conditions",
"day_icon",
"day_source",
"day_preciptype",
"day_stations",
"hour_icon",
"hour_source",
"hour_stations",
"hour_feelslike", 
"hour_solarradiation", 
"hour_solarenergy"
]

In [34]:
df_weather_v2 = df_weather.drop(weather_exclude, axis=1)

In [35]:
df_weather_v2.head(5)

Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,...,hour_snowdepth,hour_preciptype,hour_windgust,hour_windspeed,hour_winddir,hour_pressure,hour_visibility,hour_cloudcover,hour_uvindex,hour_conditions
0,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.2,['snow'],31.3,15.5,275.6,1020.0,0.0,91.5,0.0,Overcast
1,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.2,['snow'],27.7,14.8,280.3,1021.0,0.2,88.2,0.0,Partially cloudy
2,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.1,['snow'],29.2,14.4,310.0,1022.0,10.0,100.0,,Overcast
3,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.1,['snow'],23.8,13.3,295.1,1021.0,0.1,92.0,0.0,Overcast
4,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.1,['snow'],24.5,13.3,305.8,1021.0,0.0,93.8,0.0,Overcast


In [36]:
df_weather_v2["city"] = df_weather_v2["city_resolvedAddress"].apply(lambda x: x.split(",")[0])
df_weather_v2["city"] = df_weather_v2["city"].replace('Хмельницька область', "Хмельницький")

In [37]:
df_weather_v2.head(5)

Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,...,hour_preciptype,hour_windgust,hour_windspeed,hour_winddir,hour_pressure,hour_visibility,hour_cloudcover,hour_uvindex,hour_conditions,city
0,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,['snow'],31.3,15.5,275.6,1020.0,0.0,91.5,0.0,Overcast,Луцьк
1,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,['snow'],27.7,14.8,280.3,1021.0,0.2,88.2,0.0,Partially cloudy,Луцьк
2,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,['snow'],29.2,14.4,310.0,1022.0,10.0,100.0,,Overcast,Луцьк
3,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,['snow'],23.8,13.3,295.1,1021.0,0.1,92.0,0.0,Overcast,Луцьк
4,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,['snow'],24.5,13.3,305.8,1021.0,0.0,93.8,0.0,Overcast,Луцьк


In [38]:
df_weather_v2.columns

Index(['city_resolvedAddress', 'day_datetime', 'day_datetimeEpoch',
       'day_tempmax', 'day_tempmin', 'day_temp', 'day_dew', 'day_humidity',
       'day_precip', 'day_precipcover', 'day_solarradiation',
       'day_solarenergy', 'day_uvindex', 'day_sunrise', 'day_sunset',
       'day_moonphase', 'hour_datetime', 'hour_datetimeEpoch', 'hour_temp',
       'hour_humidity', 'hour_dew', 'hour_precip', 'hour_precipprob',
       'hour_snow', 'hour_snowdepth', 'hour_preciptype', 'hour_windgust',
       'hour_windspeed', 'hour_winddir', 'hour_pressure', 'hour_visibility',
       'hour_cloudcover', 'hour_uvindex', 'hour_conditions', 'city'],
      dtype='object')

In [39]:
df_weather_v2.shape

(608304, 35)

In [40]:
df_weather_v2.to_csv(f"{OUTPUT_FOLDER}/{WEATHER_EVENTS_OUTPUT_DATA_FILE}", sep=";", index=False)

## merging data

In [41]:
df_regions = pd.read_csv(f"../../data/regions.csv")

In [42]:
df_regions["region"] = df_regions["region"].apply(lambda x: x + " обл.")

In [43]:
df_regions.head(5)

Unnamed: 0,region,center_city_ua,center_city_en,region_alt,region_id
0,АР Крим обл.,Сімферополь,Simferopol,Крим,1
1,Вінницька обл.,Вінниця,Vinnytsia,Вінниччина,2
2,Волинська обл.,Луцьк,Lutsk,Волинь,3
3,Дніпропетровська обл.,Дніпро,Dnipro,Дніпропетровщина,4
4,Донецька обл.,Донецьк,Donetsk,Донеччина,5


In [44]:
df_weather_reg = pd.merge(df_weather_v2, df_regions, left_on="city",right_on="center_city_ua")

In [45]:
df_weather_reg.head(10)

Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,...,hour_visibility,hour_cloudcover,hour_uvindex,hour_conditions,city,region,center_city_ua,center_city_en,region_alt,region_id
0,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.0,91.5,0.0,Overcast,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
1,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.2,88.2,0.0,Partially cloudy,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
2,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,10.0,100.0,,Overcast,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
3,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.1,92.0,0.0,Overcast,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
4,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.0,93.8,0.0,Overcast,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
5,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,10.0,100.0,0.0,Overcast,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
6,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,10.0,100.0,0.0,Overcast,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
7,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,10.0,100.0,0.0,Overcast,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
8,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,4.4,100.0,,"Snow, Overcast",Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
9,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,2.0,100.0,0.0,Overcast,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3


In [46]:
kyiv_reg = df_weather_reg[df_weather_reg["region"] == "Київська обл."].copy()
kyiv_reg["region"] = "Київ"
kyiv_reg["region_id"] = 1

df_weather_reg = pd.concat([df_weather_reg, kyiv_reg])

In [47]:
df_weather_reg.head()

Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,...,hour_visibility,hour_cloudcover,hour_uvindex,hour_conditions,city,region,center_city_ua,center_city_en,region_alt,region_id
0,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.0,91.5,0.0,Overcast,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
1,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.2,88.2,0.0,Partially cloudy,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
2,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,10.0,100.0,,Overcast,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
3,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.1,92.0,0.0,Overcast,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
4,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.0,93.8,0.0,Overcast,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3


In [48]:
df_weather_reg.shape

(634752, 40)

In [49]:
df_weather_v2.shape

(608304, 35)

### Merging weather and events

In [50]:
df_events_v2.dtypes

region_city                         object
all_region                           int64
start                       datetime64[ns]
end                         datetime64[ns]
start_hour                  datetime64[ns]
end_hour                    datetime64[ns]
day_date                            object
start_hour_datetimeEpoch             int64
end_hour_datetimeEpoch               int64
dtype: object

In [51]:
df_events_v2.shape

(55788, 9)

In [52]:
df_events_v2.head(10)

Unnamed: 0,region_city,all_region,start,end,start_hour,end_hour,day_date,start_hour_datetimeEpoch,end_hour_datetimeEpoch
0,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800
1,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600
2,Вінницька обл.,1,2022-02-24 15:40:42,2022-02-24 16:10:42,2022-02-24 15:00:00,2022-02-24 17:00:00,2022-02-24,1645714800,1645722000
3,Харківська обл.,1,2022-02-24 20:11:47,2022-02-24 20:59:47,2022-02-24 20:00:00,2022-02-24 21:00:00,2022-02-24,1645732800,1645736400
4,Тернопільська обл.,1,2022-02-25 01:59:36,2022-02-25 09:00:19,2022-02-25 01:00:00,2022-02-25 10:00:00,2022-02-25,1645750800,1645783200
5,Вінницька обл.,1,2022-02-25 04:01:42,2022-02-25 08:35:42,2022-02-25 04:00:00,2022-02-25 09:00:00,2022-02-25,1645761600,1645779600
6,Харківська обл.,1,2022-02-25 04:56:47,2022-02-25 05:40:47,2022-02-25 04:00:00,2022-02-25 06:00:00,2022-02-25,1645761600,1645768800
7,Чернігівська обл.,1,2022-02-25 06:46:43,2022-02-25 06:52:43,2022-02-25 06:00:00,2022-02-25 07:00:00,2022-02-25,1645768800,1645772400
8,Львівська обл.,1,2022-02-25 06:53:17,2022-02-25 07:56:28,2022-02-25 06:00:00,2022-02-25 08:00:00,2022-02-25,1645768800,1645776000
9,Київ,1,2022-02-25 07:19:04,2022-02-25 07:49:04,2022-02-25 07:00:00,2022-02-25 08:00:00,2022-02-25,1645772400,1645776000


In [53]:
df_regions["region"].unique()

array(['АР Крим обл.', 'Вінницька обл.', 'Волинська обл.',
       'Дніпропетровська обл.', 'Донецька обл.', 'Житомирська обл.',
       'Закарпатська обл.', 'Запорізька обл.', 'Івано-Франківська обл.',
       'Київська обл.', 'Кіровоградська обл.', 'Луганська обл.',
       'Львівська обл.', 'Миколаївська обл.', 'Одеська обл.',
       'Полтавська обл.', 'Рівненська обл.', 'Сумська обл.',
       'Тернопільська обл.', 'Харківська обл.', 'Херсонська обл.',
       'Хмельницька обл.', 'Черкаська обл.', 'Чернівецька обл.',
       'Чернігівська обл.'], dtype=object)

In [54]:
df_events_v2["region_city"].unique()

array(['Львівська обл.', 'Чернігівська обл.', 'Вінницька обл.',
       'Харківська обл.', 'Тернопільська обл.', 'Київ', 'Рівненська обл.',
       'Черкаська обл.', 'Одеська обл.', 'Запорізька обл.',
       'Волинська обл.', 'Житомирська обл.', 'Херсонська обл.',
       'Миколаївська обл.', 'Хмельницька обл.', 'Івано-Франківська обл.',
       'Дніпропетровська обл.', 'Кіровоградська обл.', 'Чернівецька обл.',
       'Полтавська обл.', 'Київська обл.', 'Сумська обл.',
       'Донецька обл.', 'Закарпатська обл.', 'Крим'], dtype=object)

In [55]:
# df_events_v2_sample = df_events_v2.sample(10)
# df_events_v2_sample.shape

events_dict = df_events_v2.to_dict('records')
events_by_hour = []

In [56]:
events_dict[0]

{'region_city': 'Львівська обл.',
 'all_region': 1,
 'start': Timestamp('2022-02-24 07:43:17'),
 'end': Timestamp('2022-02-24 09:52:28'),
 'start_hour': Timestamp('2022-02-24 07:00:00'),
 'end_hour': Timestamp('2022-02-24 10:00:00'),
 'day_date': datetime.date(2022, 2, 24),
 'start_hour_datetimeEpoch': 1645686000,
 'end_hour_datetimeEpoch': 1645696800}

In [57]:
for event in events_dict:
    for d in pd.date_range(start=event["start_hour"], end=event["end_hour"], freq='1h'):
        et = event.copy()
        et["hour_level_event_time"] = d
        events_by_hour.append(et)

In [58]:
df_events_v3 = pd.DataFrame.from_dict(events_by_hour)

In [59]:
df_events_v3["hour_level_event_datetimeEpoch"] = df_events_v3["hour_level_event_time"].apply(lambda x: int(x.timestamp())  if not isNaN(x) else None)

In [60]:
df_events_v3.shape

(179408, 11)

In [61]:
df_events_v3.head(10)

Unnamed: 0,region_city,all_region,start,end,start_hour,end_hour,day_date,start_hour_datetimeEpoch,end_hour_datetimeEpoch,hour_level_event_time,hour_level_event_datetimeEpoch
0,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800,2022-02-24 07:00:00,1645686000
1,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800,2022-02-24 08:00:00,1645689600
2,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800,2022-02-24 09:00:00,1645693200
3,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800,2022-02-24 10:00:00,1645696800
4,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 14:00:00,1645711200
5,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 15:00:00,1645714800
6,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 16:00:00,1645718400
7,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 17:00:00,1645722000
8,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 18:00:00,1645725600
9,Вінницька обл.,1,2022-02-24 15:40:42,2022-02-24 16:10:42,2022-02-24 15:00:00,2022-02-24 17:00:00,2022-02-24,1645714800,1645722000,2022-02-24 15:00:00,1645714800


In [62]:
df_weather_reg.head(5)

Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,...,hour_visibility,hour_cloudcover,hour_uvindex,hour_conditions,city,region,center_city_ua,center_city_en,region_alt,region_id
0,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.0,91.5,0.0,Overcast,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
1,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.2,88.2,0.0,Partially cloudy,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
2,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,10.0,100.0,,Overcast,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
3,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.1,92.0,0.0,Overcast,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
4,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.0,93.8,0.0,Overcast,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3


In [63]:
df_weather_reg.shape

(634752, 40)

In [64]:
df_events_v3.head(10)

Unnamed: 0,region_city,all_region,start,end,start_hour,end_hour,day_date,start_hour_datetimeEpoch,end_hour_datetimeEpoch,hour_level_event_time,hour_level_event_datetimeEpoch
0,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800,2022-02-24 07:00:00,1645686000
1,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800,2022-02-24 08:00:00,1645689600
2,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800,2022-02-24 09:00:00,1645693200
3,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800,2022-02-24 10:00:00,1645696800
4,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 14:00:00,1645711200
5,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 15:00:00,1645714800
6,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 16:00:00,1645718400
7,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 17:00:00,1645722000
8,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 18:00:00,1645725600
9,Вінницька обл.,1,2022-02-24 15:40:42,2022-02-24 16:10:42,2022-02-24 15:00:00,2022-02-24 17:00:00,2022-02-24,1645714800,1645722000,2022-02-24 15:00:00,1645714800


In [65]:
#count = df_events_v3.groupby("hour_level_event_time")['region_city'].nunique()
#df_events_v3['alarms_in_regions'] = df_events_v3["hour_level_event_time"].map(count)

#counts = df_events_v3.groupby(['region_city', 'day_date'])["region_city"].nunique()
#df_events_v3['events_on_day'] = df_events_v3.set_index(['region_city', 'day_date']).index.map(counts)

In [66]:
df_events_v4 = df_events_v3.copy().add_prefix("event_")

In [67]:
df_events_v4.head(10)

Unnamed: 0,event_region_city,event_all_region,event_start,event_end,event_start_hour,event_end_hour,event_day_date,event_start_hour_datetimeEpoch,event_end_hour_datetimeEpoch,event_hour_level_event_time,event_hour_level_event_datetimeEpoch
0,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800,2022-02-24 07:00:00,1645686000
1,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800,2022-02-24 08:00:00,1645689600
2,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800,2022-02-24 09:00:00,1645693200
3,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800,2022-02-24 10:00:00,1645696800
4,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 14:00:00,1645711200
5,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 15:00:00,1645714800
6,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 16:00:00,1645718400
7,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 17:00:00,1645722000
8,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 18:00:00,1645725600
9,Вінницька обл.,1,2022-02-24 15:40:42,2022-02-24 16:10:42,2022-02-24 15:00:00,2022-02-24 17:00:00,2022-02-24,1645714800,1645722000,2022-02-24 15:00:00,1645714800


In [68]:
df_weather_v4 = df_weather_reg.merge(df_events_v4, 
                                     how="left", 
                                     left_on=["region","hour_datetimeEpoch"],
                                     right_on=["event_region_city","event_hour_level_event_datetimeEpoch"])

In [69]:
df_weather_v4.head(10)

Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,...,event_all_region,event_start,event_end,event_start_hour,event_end_hour,event_day_date,event_start_hour_datetimeEpoch,event_end_hour_datetimeEpoch,event_hour_level_event_time,event_hour_level_event_datetimeEpoch
0,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
1,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
2,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
3,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
4,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
5,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
6,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
7,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
8,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
9,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,


In [70]:
df_weather_v4.shape

(656441, 51)

In [71]:
df_weather_v4.to_csv(f"{OUTPUT_FOLDER}/{MERGED_OUTPUT_DATA_FILE}", sep=";", index=False)

In [72]:
df_weather_v4.head(10)

Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,...,event_all_region,event_start,event_end,event_start_hour,event_end_hour,event_day_date,event_start_hour_datetimeEpoch,event_end_hour_datetimeEpoch,event_hour_level_event_time,event_hour_level_event_datetimeEpoch
0,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
1,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
2,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
3,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
4,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
5,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
6,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
7,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
8,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
9,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,


In [73]:
df_weather_v4.shape

(656441, 51)

In [74]:
df_isw.head(10)

Unnamed: 0,content,lemma_content,stem_content,keywords,report_date,date_tomorrow_datetime
0,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 30...,0.448 0.451 0.761 0.65 0.423 0.468 0.473 0.426...,2022-02-24,2022-02-25
1,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 30...,0.469 0.501 0.748 0.702 0.484 0.489 0.462 0.52...,2022-02-25,2022-02-26
2,mason clark george barros and katya stepanenko...,mason clark george barros katya stepanenko 3pm...,mason clark georg barro katya stepanenko 3pm e...,0.37 0.424 0.733 0.349 0.542 0.354 0.387 0.404...,2022-02-26,2022-02-27
3,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 4p...,0.454 0.494 0.812 0.681 0.449 0.466 0.515 0.50...,2022-02-27,2022-02-28
4,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 33...,0.459 0.454 0.751 0.721 0.444 0.471 0.454 0.50...,2022-02-28,2022-03-01
5,frederick w kagan george barros and kateryna s...,frederick w kagan george barros kateryna stepa...,frederick w kagan georg barro kateryna stepane...,0.465 0.497 0.933 0.446 0.638 0.457 0.489 0.45...,2022-03-01,2022-03-02
6,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 43...,0.482 0.48 0.45 0.821 0.743 0.493 0.491 0.46 0...,2022-03-02,2022-03-03
7,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 40...,0.456 0.482 0.831 0.683 0.454 0.491 0.52 0.469...,2022-03-03,2022-03-04
8,fredrick w kagan george barros and kateryna st...,fredrick w kagan george barros kateryna stepan...,fredrick w kagan georg barro kateryna stepanen...,0.476 0.815 0.702 0.44 0.46 0.482 0.463 0.486 ...,2022-03-04,2022-03-05
9,fredrick w kagan george barros and kateryna st...,fredrick w kagan george barros kateryna stepan...,fredrick w kagan georg barro kateryna stepanen...,0.423 0.451 0.913 0.633 0.424 0.438 0.456 0.41...,2022-03-05,2022-03-06


In [75]:
df_isw_v2 = df_isw.drop(["content", "stem_content"], axis=1)

In [76]:
df_isw_v2.head(5)

Unnamed: 0,lemma_content,keywords,report_date,date_tomorrow_datetime
0,mason clark george barros kateryna stepanenko ...,0.448 0.451 0.761 0.65 0.423 0.468 0.473 0.426...,2022-02-24,2022-02-25
1,mason clark george barros kateryna stepanenko ...,0.469 0.501 0.748 0.702 0.484 0.489 0.462 0.52...,2022-02-25,2022-02-26
2,mason clark george barros katya stepanenko 3pm...,0.37 0.424 0.733 0.349 0.542 0.354 0.387 0.404...,2022-02-26,2022-02-27
3,mason clark george barros kateryna stepanenko ...,0.454 0.494 0.812 0.681 0.449 0.466 0.515 0.50...,2022-02-27,2022-02-28
4,mason clark george barros kateryna stepanenko ...,0.459 0.454 0.751 0.721 0.444 0.471 0.454 0.50...,2022-02-28,2022-03-01


In [77]:
df_merged = df_weather_v4.merge(df_isw_v2, how="left", left_on="day_datetime", right_on="date_tomorrow_datetime")

In [78]:
df_merged.head(10)

Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,...,event_end_hour,event_day_date,event_start_hour_datetimeEpoch,event_end_hour_datetimeEpoch,event_hour_level_event_time,event_hour_level_event_datetimeEpoch,lemma_content,keywords,report_date,date_tomorrow_datetime
0,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,NaT,,,,NaT,,,,NaT,NaT
1,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,NaT,,,,NaT,,,,NaT,NaT
2,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,NaT,,,,NaT,,,,NaT,NaT
3,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,NaT,,,,NaT,,,,NaT,NaT
4,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,NaT,,,,NaT,,,,NaT,NaT
5,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,NaT,,,,NaT,,,,NaT,NaT
6,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,NaT,,,,NaT,,,,NaT,NaT
7,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,NaT,,,,NaT,,,,NaT,NaT
8,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,NaT,,,,NaT,,,,NaT,NaT
9,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,NaT,,,,NaT,,,,NaT,NaT


In [79]:
df_merged=df_merged.drop(["city_resolvedAddress", "day_datetime", "city", "region", "event_day_date"], axis=1)
df_merged = df_merged.dropna(subset=df_isw_v2.columns.difference(["date"]))


In [80]:
df_merged = df_merged.dropna(subset=["report_date"])

In [81]:
count = df_merged.groupby("event_hour_level_event_time")["event_region_city"].nunique()
df_merged['alarms_in_regions'] = df_merged["event_hour_level_event_time"].map(count)

In [82]:
df_merged["event_all_region"] = df_merged["event_all_region"].fillna(0)

In [83]:
df_merged.shape

(643944, 51)

In [84]:
df_merged.shape

(643944, 51)

In [85]:
df_merged = df_merged.drop(['lemma_content','event_region_city','event_start','event_end','event_end_hour','event_start_hour','event_end_hour','event_start_hour_datetimeEpoch'], axis = 1)

In [86]:
df_merged = df_merged.drop(['center_city_ua','center_city_en','region_alt','event_hour_level_event_datetimeEpoch','event_end_hour_datetimeEpoch','event_hour_level_event_time','report_date','date_tomorrow_datetime']
, axis = 1)


In [87]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Index: 643944 entries, 24 to 656440
Data columns (total 36 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   day_datetimeEpoch   643944 non-null  int64  
 1   day_tempmax         643944 non-null  float64
 2   day_tempmin         643944 non-null  float64
 3   day_temp            643944 non-null  float64
 4   day_dew             643944 non-null  float64
 5   day_humidity        643944 non-null  float64
 6   day_precip          643944 non-null  float64
 7   day_precipcover     643944 non-null  float64
 8   day_solarradiation  643351 non-null  float64
 9   day_solarenergy     643351 non-null  float64
 10  day_uvindex         643351 non-null  float64
 11  day_sunrise         643944 non-null  object 
 12  day_sunset          643944 non-null  object 
 13  day_moonphase       643944 non-null  float64
 14  hour_datetime       643944 non-null  object 
 15  hour_datetimeEpoch  643944 non-null  i

In [88]:
df = df_merged.fillna(df_merged.median(numeric_only=True))

In [89]:
df_encoded = pd.get_dummies(df, columns=['hour_preciptype'], prefix='hour_preciptype')

In [90]:
bool_columns = df_encoded.select_dtypes(include=['bool']).columns
df_encoded[bool_columns] = df_encoded[bool_columns].astype(int)

In [91]:
df_encoded.columns

Index(['day_datetimeEpoch', 'day_tempmax', 'day_tempmin', 'day_temp',
       'day_dew', 'day_humidity', 'day_precip', 'day_precipcover',
       'day_solarradiation', 'day_solarenergy', 'day_uvindex', 'day_sunrise',
       'day_sunset', 'day_moonphase', 'hour_datetime', 'hour_datetimeEpoch',
       'hour_temp', 'hour_humidity', 'hour_dew', 'hour_precip',
       'hour_precipprob', 'hour_snow', 'hour_snowdepth', 'hour_windgust',
       'hour_windspeed', 'hour_winddir', 'hour_pressure', 'hour_visibility',
       'hour_cloudcover', 'hour_uvindex', 'hour_conditions', 'region_id',
       'event_all_region', 'keywords', 'alarms_in_regions',
       'hour_preciptype_['freezingrain']', 'hour_preciptype_['ice']',
       'hour_preciptype_['rain', 'snow']', 'hour_preciptype_['rain']',
       'hour_preciptype_['snow']'],
      dtype='object')

In [92]:
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 643944 entries, 24 to 656440
Data columns (total 40 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   day_datetimeEpoch                 643944 non-null  int64  
 1   day_tempmax                       643944 non-null  float64
 2   day_tempmin                       643944 non-null  float64
 3   day_temp                          643944 non-null  float64
 4   day_dew                           643944 non-null  float64
 5   day_humidity                      643944 non-null  float64
 6   day_precip                        643944 non-null  float64
 7   day_precipcover                   643944 non-null  float64
 8   day_solarradiation                643944 non-null  float64
 9   day_solarenergy                   643944 non-null  float64
 10  day_uvindex                       643944 non-null  float64
 11  day_sunrise                       643944 non-null  objec

In [93]:
#df_encoded['keywords'].apply(lambda x: [float(num) for num in x.split(" ")])

In [94]:
temp_df = df_encoded['keywords'].str.split(expand=True)
temp_df = temp_df.astype('float64')
df_encoded_v2 = pd.concat([df_encoded.drop('keywords', axis=1), temp_df], axis=1)

In [95]:
df_encoded_v2

Unnamed: 0,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,day_solarradiation,day_solarenergy,...,990,991,992,993,994,995,996,997,998,999
24,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.535,0.708,0.534,0.527,0.533,0.503,0.519,0.501,0.457,0.617
25,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.535,0.708,0.534,0.527,0.533,0.503,0.519,0.501,0.457,0.617
26,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.535,0.708,0.534,0.527,0.533,0.503,0.519,0.501,0.457,0.617
27,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.535,0.708,0.534,0.527,0.533,0.503,0.519,0.501,0.457,0.617
28,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,0.535,0.708,0.534,0.527,0.533,0.503,0.519,0.501,0.457,0.617
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
656436,1740780000,1.1,-1.1,0.0,-2.6,82.9,0.7,8.33,40.6,3.6,...,0.649,0.818,0.649,0.659,0.560,0.639,0.610,0.601,0.572,0.756
656437,1740780000,1.1,-1.1,0.0,-2.6,82.9,0.7,8.33,40.6,3.6,...,0.649,0.818,0.649,0.659,0.560,0.639,0.610,0.601,0.572,0.756
656438,1740780000,1.1,-1.1,0.0,-2.6,82.9,0.7,8.33,40.6,3.6,...,0.649,0.818,0.649,0.659,0.560,0.639,0.610,0.601,0.572,0.756
656439,1740780000,1.1,-1.1,0.0,-2.6,82.9,0.7,8.33,40.6,3.6,...,0.649,0.818,0.649,0.659,0.560,0.639,0.610,0.601,0.572,0.756


In [96]:
unique_values = sorted(df['hour_conditions'].unique())
mapping = {val: idx for idx, val in enumerate(unique_values)}
df_encoded_v2['hour_conditions'] = df['hour_conditions'].map(mapping)

In [97]:
df_encoded_v2['hour_conditions']

24         3
25         3
26         4
27         4
28         4
          ..
656436     3
656437    14
656438     3
656439     3
656440     3
Name: hour_conditions, Length: 643944, dtype: int64

In [98]:
df_encoded_v2.info()

<class 'pandas.core.frame.DataFrame'>
Index: 643944 entries, 24 to 656440
Columns: 1039 entries, day_datetimeEpoch to 999
dtypes: float64(1027), int64(9), object(3)
memory usage: 5.0+ GB


In [99]:
df_encoded_v2['hour_datetime'] = pd.to_datetime(df_encoded_v2['hour_datetime'], format='%H:%M:%S')

In [100]:
df_encoded_v2['day_sunrise'] = pd.to_datetime(df_encoded_v2['day_sunrise'], format='%H:%M:%S')
df_encoded_v2['day_sunset'] = pd.to_datetime(df_encoded_v2['day_sunset'], format='%H:%M:%S')

In [101]:
df_encoded_v2['hour_datetime'] = (df_encoded_v2['hour_datetime'].dt.hour * 3600 +
                                  df_encoded_v2['hour_datetime'].dt.minute * 60 +
                                  df_encoded_v2['hour_datetime'].dt.second) / 86400

df_encoded_v2['day_sunrise'] = (df_encoded_v2['day_sunrise'].dt.hour * 3600 +
                                df_encoded_v2['day_sunrise'].dt.minute * 60 +
                                df_encoded_v2['day_sunrise'].dt.second) / 86400

df_encoded_v2['day_sunset'] = (df_encoded_v2['day_sunset'].dt.hour * 3600 +
                               df_encoded_v2['day_sunset'].dt.minute * 60 +
                               df_encoded_v2['day_sunset'].dt.second) / 86400

In [102]:
df_final = df_encoded_v2

In [103]:
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 643944 entries, 24 to 656440
Columns: 1039 entries, day_datetimeEpoch to 999
dtypes: float64(1030), int64(9)
memory usage: 5.0 GB


In [104]:
df_final.to_parquet(f"{OUTPUT_FOLDER}/{ALL_MERGED_DATA_FILE}.parquet", index=False, engine="pyarrow")

In [105]:
df_final["event_all_region"].value_counts()

event_all_region
0.0    467627
1.0    176317
Name: count, dtype: int64

In [106]:
df_merged["event_all_region"].value_counts()

event_all_region
0.0    467627
1.0    176317
Name: count, dtype: int64

In [107]:
df_encoded["event_all_region"].value_counts()

event_all_region
0.0    467627
1.0    176317
Name: count, dtype: int64