In [162]:
import datetime
import os

import pandas as pd
import pyarrow as pa

In [163]:
#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

In [164]:
# Folder the raw input CSV files are read from, and the ISW reports file inside it.
INPUT_DATA_FOLDER = "../data"
REPORTS_DATA_FILE = "ISW_vector.csv"

# Folder and file names for the preprocessed outputs written by this notebook.
OUTPUT_FOLDER = "../data/all_data_preprocessed"
ISW_OUTPUT_DATA_FILE = "all_isw.csv"
ALARMS_OUTPUT_DATA_FILE = "all_alarms.csv"
WEATHER_EVENTS_OUTPUT_DATA_FILE = "all_weather_by_hour.csv"
# NOTE(review): the two "merged" names are not used by the preparation cells
# that follow; presumably consumed by later merge/export cells - confirm.
MERGED_OUTPUT_DATA_FILE = "weather_regions_alarms_merged.csv"
ALL_MERGED_DATA_FILE = "all_merged"

# NOTE(review): model folder, model names and versions are not referenced by the
# data-preparation cells that follow; presumably used when loading the
# TF-IDF / count-vectorizer artifacts later - confirm.
MODEL_FOLDER = "model"

tfidf_transformer_model = "tfidf_transformer"
count_vectorizer_model = "count_vectorizer"

tfidf_transformer_version = "v1"
count_vectorizer_version = "v1"

In [165]:
def isNaN(num):
    """Report whether ``num`` compares unequal to itself.

    NaN-like values (``float("nan")``, ``pd.NaT``) are the values that are not
    equal to themselves, so the self-comparison doubles as a missing-value test.
    The comparison is returned as-is: a scalar argument yields a single truth
    value, while a pandas object yields the element-wise result of ``!=``.
    """
    differs_from_itself = num != num
    return differs_from_itself

## Reading data

In [166]:
# Load the ISW reports file (comma-separated) from the input data folder.
df_isw = pd.read_csv(
    filepath_or_buffer=f"{INPUT_DATA_FOLDER}/{REPORTS_DATA_FILE}",
    sep=",",
)

In [167]:
# Preview the first five ISW report rows right after loading.
df_isw.head(5)

Unnamed: 0,date,content,lemma_content,stem_content,keywords
0,2022-02-24,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 30...,"{'roc mp': 1.0, 'local time': 0.94, 'operation..."
1,2022-02-25,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 30...,"{'roc mp': 1.0, 'information campaign': 0.978,..."
2,2022-02-26,mason clark george barros and katya stepanenko...,mason clark george barros katya stepanenko 3pm...,mason clark georg barro katya stepanenko 3pm e...,"{'local time': 1.0, 'roc mp': 0.967, 'operatio..."
3,2022-02-27,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 4p...,"{'operational pause': 1.0, 'roc mp': 0.957, 'w..."
4,2022-02-28,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 33...,"{'roc mp': 1.0, 'information campaign': 0.976,..."


## Preparing ISW reports

## Reading models

In [168]:
# Same preview again, as a "before" snapshot for the date handling below.
df_isw.head(5)

Unnamed: 0,date,content,lemma_content,stem_content,keywords
0,2022-02-24,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 30...,"{'roc mp': 1.0, 'local time': 0.94, 'operation..."
1,2022-02-25,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 30...,"{'roc mp': 1.0, 'information campaign': 0.978,..."
2,2022-02-26,mason clark george barros and katya stepanenko...,mason clark george barros katya stepanenko 3pm...,mason clark georg barro katya stepanenko 3pm e...,"{'local time': 1.0, 'roc mp': 0.967, 'operatio..."
3,2022-02-27,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 4p...,"{'operational pause': 1.0, 'roc mp': 0.957, 'w..."
4,2022-02-28,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 33...,"{'roc mp': 1.0, 'information campaign': 0.976,..."


In [169]:
# Parse the textual "date" column into datetimes, stored as "report_date".
df_isw["report_date"] = df_isw["date"].pipe(pd.to_datetime)

In [170]:
# Day after each report, computed with vectorised datetime arithmetic instead of
# a per-row lambda (same values, same datetime64 dtype).
# NOTE(review): presumably used to attach a report to the following day's rows - confirm downstream.
df_isw["date_tomorrow_datetime"] = df_isw["report_date"] + pd.Timedelta(days=1)

In [171]:
# The raw "date" text column is superseded by "report_date". Rebinding instead of
# mutating in place, and ignoring an already-missing column, keeps this cell safe to re-run.
df_isw = df_isw.drop(columns="date", errors="ignore")

In [172]:
# Check the result: "date" is gone, "report_date" and "date_tomorrow_datetime" are present.
df_isw.head(5)

Unnamed: 0,content,lemma_content,stem_content,keywords,report_date,date_tomorrow_datetime
0,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 30...,"{'roc mp': 1.0, 'local time': 0.94, 'operation...",2022-02-24,2022-02-25
1,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 30...,"{'roc mp': 1.0, 'information campaign': 0.978,...",2022-02-25,2022-02-26
2,mason clark george barros and katya stepanenko...,mason clark george barros katya stepanenko 3pm...,mason clark georg barro katya stepanenko 3pm e...,"{'local time': 1.0, 'roc mp': 0.967, 'operatio...",2022-02-26,2022-02-27
3,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 4p...,"{'operational pause': 1.0, 'roc mp': 0.957, 'w...",2022-02-27,2022-02-28
4,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 33...,"{'roc mp': 1.0, 'information campaign': 0.976,...",2022-02-28,2022-03-01


In [173]:
# to_csv does not create missing directories, so make sure the output folder
# exists before the first write (matters on a fresh checkout).
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Persist the prepared ISW reports as a semicolon-separated file without the index.
df_isw.to_csv(f"{OUTPUT_FOLDER}/{ISW_OUTPUT_DATA_FILE}", sep=";", index=False)

## Prepare events data

In [174]:
# File name of the raw air-alarm events inside INPUT_DATA_FOLDER.
EVENTS_DATA_FILE = "alarms.csv"

In [175]:
# Load the raw alarm events; this file is semicolon-separated.
df_events = pd.read_csv(
    filepath_or_buffer=f"{INPUT_DATA_FOLDER}/{EVENTS_DATA_FILE}",
    sep=";",
)

In [176]:
# Preview the raw alarm rows (id, region, start/end timestamps).
df_events.head(5)

Unnamed: 0,id,region_id,region_city,all_region,start,end
0,52432,12,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28
1,53292,23,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43
2,52080,3,Вінницька обл.,1,2022-02-24 15:40:42,2022-02-24 16:10:42
3,52857,19,Харківська обл.,1,2022-02-24 20:11:47,2022-02-24 20:59:47
4,52700,18,Тернопільська обл.,1,2022-02-25 01:59:36,2022-02-25 09:00:19


In [177]:
# Work on a copy without the identifier columns; df_events itself stays untouched.
df_events_v2 = df_events.drop(columns=["id", "region_id"])

In [178]:
# Confirm that "id" and "region_id" are no longer present.
df_events_v2.head(5)

Unnamed: 0,region_city,all_region,start,end
0,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28
1,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43
2,Вінницька обл.,1,2022-02-24 15:40:42,2022-02-24 16:10:42
3,Харківська обл.,1,2022-02-24 20:11:47,2022-02-24 20:59:47
4,Тернопільська обл.,1,2022-02-25 01:59:36,2022-02-25 09:00:19


In [179]:
# Show (up to five) alarm rows that contain a missing value in any column.
df_events_v2.loc[isNaN(df_events_v2).any(axis="columns")].head(n=5)

Unnamed: 0,region_city,all_region,start,end


In [180]:
# Convert the textual alarm boundaries into datetime columns (same column positions).
df_events_v2 = df_events_v2.assign(
    start=lambda frame: pd.to_datetime(frame["start"]),
    end=lambda frame: pd.to_datetime(frame["end"]),
)

In [181]:
# Widen every alarm to whole hours: the start is rounded down and the end is
# rounded up to the nearest hour boundary.
df_events_v2["start_hour"] = df_events_v2["start"].dt.floor(freq="h")
df_events_v2["end_hour"] = df_events_v2["end"].dt.ceil(freq="h")

In [182]:
# Calendar date on which the alarm started (plain datetime.date objects).
df_events_v2["day_date"] = df_events_v2["start"].dt.date

# Whole-second Unix epochs of the hour bounds; missing timestamps map to None.
# NOTE(review): the timestamps are timezone-naive, so .timestamp() reads them as UTC
# (07:00 -> 1645686000), whereas the weather day_datetimeEpoch values look shifted by
# the city tz offset. Confirm both sides share a time base before joining on epochs.
df_events_v2["start_hour_datetimeEpoch"] = df_events_v2["start_hour"].apply(
    lambda ts: None if isNaN(ts) else int(ts.timestamp())
)
df_events_v2["end_hour_datetimeEpoch"] = df_events_v2["end_hour"].apply(
    lambda ts: None if isNaN(ts) else int(ts.timestamp())
)

df_events_v2.head(n=10)

Unnamed: 0,region_city,all_region,start,end,start_hour,end_hour,day_date,start_hour_datetimeEpoch,end_hour_datetimeEpoch
0,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800
1,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600
2,Вінницька обл.,1,2022-02-24 15:40:42,2022-02-24 16:10:42,2022-02-24 15:00:00,2022-02-24 17:00:00,2022-02-24,1645714800,1645722000
3,Харківська обл.,1,2022-02-24 20:11:47,2022-02-24 20:59:47,2022-02-24 20:00:00,2022-02-24 21:00:00,2022-02-24,1645732800,1645736400
4,Тернопільська обл.,1,2022-02-25 01:59:36,2022-02-25 09:00:19,2022-02-25 01:00:00,2022-02-25 10:00:00,2022-02-25,1645750800,1645783200
5,Вінницька обл.,1,2022-02-25 04:01:42,2022-02-25 08:35:42,2022-02-25 04:00:00,2022-02-25 09:00:00,2022-02-25,1645761600,1645779600
6,Харківська обл.,1,2022-02-25 04:56:47,2022-02-25 05:40:47,2022-02-25 04:00:00,2022-02-25 06:00:00,2022-02-25,1645761600,1645768800
7,Чернігівська обл.,1,2022-02-25 06:46:43,2022-02-25 06:52:43,2022-02-25 06:00:00,2022-02-25 07:00:00,2022-02-25,1645768800,1645772400
8,Львівська обл.,1,2022-02-25 06:53:17,2022-02-25 07:56:28,2022-02-25 06:00:00,2022-02-25 08:00:00,2022-02-25,1645768800,1645776000
9,Київ,0,2022-02-25 07:19:04,2022-02-25 07:49:04,2022-02-25 07:00:00,2022-02-25 08:00:00,2022-02-25,1645772400,1645776000


In [183]:
# Number of alarm rows flagged all_region == 1 (presumably region-wide alarms - confirm).
df_events_v2[df_events_v2["all_region"]==1].shape

(54236, 9)

In [184]:
# Sample of the rows with all_region == 1.
df_events_v2[df_events_v2["all_region"]==1].head(5)

Unnamed: 0,region_city,all_region,start,end,start_hour,end_hour,day_date,start_hour_datetimeEpoch,end_hour_datetimeEpoch
0,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800
1,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600
2,Вінницька обл.,1,2022-02-24 15:40:42,2022-02-24 16:10:42,2022-02-24 15:00:00,2022-02-24 17:00:00,2022-02-24,1645714800,1645722000
3,Харківська обл.,1,2022-02-24 20:11:47,2022-02-24 20:59:47,2022-02-24 20:00:00,2022-02-24 21:00:00,2022-02-24,1645732800,1645736400
4,Тернопільська обл.,1,2022-02-25 01:59:36,2022-02-25 09:00:19,2022-02-25 01:00:00,2022-02-25 10:00:00,2022-02-25,1645750800,1645783200


In [185]:
# Number of alarm rows flagged all_region == 0.
df_events_v2[df_events_v2["all_region"]==0].shape

(1552, 9)

In [186]:
# Sample of the rows with all_region == 0 (the first ones shown are for the city "Київ").
df_events_v2[df_events_v2["all_region"]==0].head(5)

Unnamed: 0,region_city,all_region,start,end,start_hour,end_hour,day_date,start_hour_datetimeEpoch,end_hour_datetimeEpoch
9,Київ,0,2022-02-25 07:19:04,2022-02-25 07:49:04,2022-02-25 07:00:00,2022-02-25 08:00:00,2022-02-25,1645772400,1645776000
39,Київ,0,2022-02-26 11:24:04,2022-02-26 11:54:04,2022-02-26 11:00:00,2022-02-26 12:00:00,2022-02-26,1645873200,1645876800
59,Київ,0,2022-02-26 19:56:04,2022-02-26 20:26:04,2022-02-26 19:00:00,2022-02-26 21:00:00,2022-02-26,1645902000,1645909200
62,Київ,0,2022-02-26 20:58:04,2022-02-26 21:28:04,2022-02-26 20:00:00,2022-02-26 22:00:00,2022-02-26,1645905600,1645912800
65,Київ,0,2022-02-26 22:57:04,2022-02-26 23:27:04,2022-02-26 22:00:00,2022-02-27 00:00:00,2022-02-26,1645912800,1645920000


In [187]:
# Persist the prepared alarms as a semicolon-separated file without the index.
df_events_v2.to_csv(
    path_or_buf=f"{OUTPUT_FOLDER}/{ALARMS_OUTPUT_DATA_FILE}",
    sep=";",
    index=False,
)

## Prepare weather

In [188]:
# File name of the raw hourly weather data inside INPUT_DATA_FOLDER.
WEATHER_DATA_FILE = "weather_by_hour.csv"

In [189]:
# Load the hourly weather observations (comma-separated) and preview them.
df_weather = pd.read_csv(
    filepath_or_buffer=f"{INPUT_DATA_FOLDER}/{WEATHER_DATA_FILE}",
    sep=",",
)
df_weather.head(n=5)

Unnamed: 0,city_latitude,city_longitude,city_resolvedAddress,city_address,city_timezone,city_tzoffset,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,...,hour_pressure,hour_visibility,hour_cloudcover,hour_solarradiation,hour_solarenergy,hour_uvindex,hour_conditions,hour_icon,hour_source,hour_stations
0,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1020.0,0.0,91.5,0.0,,0.0,Overcast,snow,obs,remote
1,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1021.0,0.2,88.2,0.0,,0.0,Partially cloudy,fog,obs,remote
2,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1022.0,10.0,100.0,,,,Overcast,cloudy,obs,33177099999
3,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1021.0,0.1,92.0,0.0,,0.0,Overcast,fog,obs,remote
4,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1021.0,0.0,93.8,0.0,,0.0,Overcast,cloudy,obs,remote


In [190]:
# Parse the textual day column into datetimes, replacing it in place.
df_weather["day_datetime"] = pd.to_datetime(arg=df_weather["day_datetime"])

In [191]:
# Size of the raw weather table (rows, columns) before any columns are removed.
df_weather.shape

(608304, 65)

In [192]:
# Preview the weather rows after the day_datetime conversion.
df_weather.head(10)

Unnamed: 0,city_latitude,city_longitude,city_resolvedAddress,city_address,city_timezone,city_tzoffset,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,...,hour_pressure,hour_visibility,hour_cloudcover,hour_solarradiation,hour_solarenergy,hour_uvindex,hour_conditions,hour_icon,hour_source,hour_stations
0,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1020.0,0.0,91.5,0.0,,0.0,Overcast,snow,obs,remote
1,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1021.0,0.2,88.2,0.0,,0.0,Partially cloudy,fog,obs,remote
2,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1022.0,10.0,100.0,,,,Overcast,cloudy,obs,33177099999
3,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1021.0,0.1,92.0,0.0,,0.0,Overcast,fog,obs,remote
4,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1021.0,0.0,93.8,0.0,,0.0,Overcast,cloudy,obs,remote
5,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1022.5,10.0,100.0,0.0,,0.0,Overcast,cloudy,obs,33177099999;33301099999
6,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1021.0,10.0,100.0,0.0,,0.0,Overcast,cloudy,obs,UKLR;33301099999
7,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1022.0,10.0,100.0,0.0,,0.0,Overcast,cloudy,obs,UKLR;33301099999
8,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1024.2,4.4,100.0,,,,"Snow, Overcast",rain,obs,33177099999;UKLR;33301099999
9,50.7469,25.3263,"Луцьк, Луцький район, Україна","Lutsk,Ukraine",Europe/Kiev,2.0,2022-02-24,1645653600,4.9,0.7,...,1024.0,2.0,100.0,15.0,0.1,0.0,Overcast,cloudy,obs,UKLR;33301099999


In [193]:
# Weather columns that are dropped before the data is written out and merged.
weather_exclude = [
    # day-level "feels like" extremes, sun times and free-text description
    "day_feelslikemax",
    "day_feelslikemin",
    "day_sunriseEpoch",
    "day_sunsetEpoch",
    "day_description",
    # location metadata (the resolved address is kept and parsed instead)
    "city_latitude",
    "city_longitude",
    "city_address",
    "city_timezone",
    "city_tzoffset",
    # remaining day-level aggregates and provenance fields
    "day_feelslike",
    "day_precipprob",
    "day_snow",
    "day_snowdepth",
    "day_windgust",
    "day_windspeed",
    "day_winddir",
    "day_pressure",
    "day_cloudcover",
    "day_visibility",
    "day_conditions",
    "day_icon",
    "day_source",
    "day_preciptype",
    "day_stations",
    # hour-level presentation / provenance fields
    "hour_icon",
    "hour_source",
    "hour_stations",
    "hour_feelslike",
]

In [194]:
# Reduced copy of the weather table without the excluded columns.
df_weather_v2 = df_weather.drop(columns=weather_exclude)

In [195]:
# Preview the reduced weather table.
df_weather_v2.head(5)

Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,...,hour_windgust,hour_windspeed,hour_winddir,hour_pressure,hour_visibility,hour_cloudcover,hour_solarradiation,hour_solarenergy,hour_uvindex,hour_conditions
0,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,31.3,15.5,275.6,1020.0,0.0,91.5,0.0,,0.0,Overcast
1,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,27.7,14.8,280.3,1021.0,0.2,88.2,0.0,,0.0,Partially cloudy
2,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,29.2,14.4,310.0,1022.0,10.0,100.0,,,,Overcast
3,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,23.8,13.3,295.1,1021.0,0.1,92.0,0.0,,0.0,Overcast
4,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,24.5,13.3,305.8,1021.0,0.0,93.8,0.0,,0.0,Overcast


In [196]:
# City name = text before the first comma of the resolved address. The .str accessor
# is vectorised and leaves missing addresses as NaN instead of raising inside a lambda.
# One address starts with the region name rather than the city, so it is mapped to the
# city name (presumably to match center_city_ua in the regions table - confirm).
df_weather_v2["city"] = (
    df_weather_v2["city_resolvedAddress"]
    .str.split(",", n=1)
    .str[0]
    .replace("Хмельницька область", "Хмельницький")
)

In [197]:
# Check the newly derived "city" column (last column of the preview).
df_weather_v2.head(5)

Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,...,hour_windspeed,hour_winddir,hour_pressure,hour_visibility,hour_cloudcover,hour_solarradiation,hour_solarenergy,hour_uvindex,hour_conditions,city
0,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,15.5,275.6,1020.0,0.0,91.5,0.0,,0.0,Overcast,Луцьк
1,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,14.8,280.3,1021.0,0.2,88.2,0.0,,0.0,Partially cloudy,Луцьк
2,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,14.4,310.0,1022.0,10.0,100.0,,,,Overcast,Луцьк
3,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,13.3,295.1,1021.0,0.1,92.0,0.0,,0.0,Overcast,Луцьк
4,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,13.3,305.8,1021.0,0.0,93.8,0.0,,0.0,Overcast,Луцьк


In [198]:
# Row count must equal the raw weather table; column count reflects the drop plus "city".
df_weather_v2.shape

(608304, 37)

In [199]:
# Persist the reduced weather table as a semicolon-separated file without the index.
df_weather_v2.to_csv(
    path_or_buf=f"{OUTPUT_FOLDER}/{WEATHER_EVENTS_OUTPUT_DATA_FILE}",
    sep=";",
    index=False,
)

## Merging data

In [200]:
# Region reference table (region name, administrative centre, ids). Read from the
# configured input folder like every other input instead of a hard-coded path.
df_regions = pd.read_csv(f"{INPUT_DATA_FOLDER}/regions.csv")

In [201]:
# Append " обл." so region names use the same spelling as the alarm data
# (e.g. "Львівська обл."). Names that already carry the suffix are left alone,
# so re-running this cell does not produce "... обл. обл.".
df_regions["region"] = df_regions["region"].where(
    df_regions["region"].str.endswith(" обл.", na=False),
    df_regions["region"] + " обл.",
)

In [202]:
# Preview the regions table with the suffixed region names.
df_regions.head(5)

Unnamed: 0,region,center_city_ua,center_city_en,region_alt,region_id
0,АР Крим обл.,Сімферополь,Simferopol,Крим,1
1,Вінницька обл.,Вінниця,Vinnytsia,Вінниччина,2
2,Волинська обл.,Луцьк,Lutsk,Волинь,3
3,Дніпропетровська обл.,Дніпро,Dnipro,Дніпропетровщина,4
4,Донецька обл.,Донецьк,Donetsk,Донеччина,5


In [203]:
# Attach region information to every weather row by matching the weather city with
# the region's administrative centre (inner join, i.e. unmatched cities are dropped).
df_weather_reg = df_weather_v2.merge(
    df_regions,
    how="inner",
    left_on="city",
    right_on="center_city_ua",
)

In [204]:
# Preview the weather rows together with the joined region columns.
df_weather_reg.head(10)

Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,...,hour_solarradiation,hour_solarenergy,hour_uvindex,hour_conditions,city,region,center_city_ua,center_city_en,region_alt,region_id
0,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.0,,0.0,Overcast,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
1,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.0,,0.0,Partially cloudy,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
2,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,,,Overcast,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
3,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.0,,0.0,Overcast,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
4,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.0,,0.0,Overcast,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
5,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.0,,0.0,Overcast,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
6,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.0,,0.0,Overcast,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
7,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.0,,0.0,Overcast,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
8,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,,,"Snow, Overcast",Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
9,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,15.0,0.1,0.0,Overcast,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3


In [205]:
# Shape after the join; compare the row count with df_weather_v2 in the next cell.
df_weather_reg.shape

(608304, 42)

In [206]:
# Shape before the join; an equal row count means no weather row was lost or duplicated.
df_weather_v2.shape

(608304, 37)

### Merging weather and events

In [207]:
# Column types of the prepared alarms (datetime bounds, integer epochs, object day_date).
df_events_v2.dtypes

region_city                         object
all_region                           int64
start                       datetime64[ns]
end                         datetime64[ns]
start_hour                  datetime64[ns]
end_hour                    datetime64[ns]
day_date                            object
start_hour_datetimeEpoch             int64
end_hour_datetimeEpoch               int64
dtype: object

In [208]:
# Number of alarm intervals before they are expanded to one row per hour.
df_events_v2.shape

(55788, 9)

In [209]:
# Reminder of the alarm columns that feed the hourly expansion below.
df_events_v2.head(10)

Unnamed: 0,region_city,all_region,start,end,start_hour,end_hour,day_date,start_hour_datetimeEpoch,end_hour_datetimeEpoch
0,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800
1,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600
2,Вінницька обл.,1,2022-02-24 15:40:42,2022-02-24 16:10:42,2022-02-24 15:00:00,2022-02-24 17:00:00,2022-02-24,1645714800,1645722000
3,Харківська обл.,1,2022-02-24 20:11:47,2022-02-24 20:59:47,2022-02-24 20:00:00,2022-02-24 21:00:00,2022-02-24,1645732800,1645736400
4,Тернопільська обл.,1,2022-02-25 01:59:36,2022-02-25 09:00:19,2022-02-25 01:00:00,2022-02-25 10:00:00,2022-02-25,1645750800,1645783200
5,Вінницька обл.,1,2022-02-25 04:01:42,2022-02-25 08:35:42,2022-02-25 04:00:00,2022-02-25 09:00:00,2022-02-25,1645761600,1645779600
6,Харківська обл.,1,2022-02-25 04:56:47,2022-02-25 05:40:47,2022-02-25 04:00:00,2022-02-25 06:00:00,2022-02-25,1645761600,1645768800
7,Чернігівська обл.,1,2022-02-25 06:46:43,2022-02-25 06:52:43,2022-02-25 06:00:00,2022-02-25 07:00:00,2022-02-25,1645768800,1645772400
8,Львівська обл.,1,2022-02-25 06:53:17,2022-02-25 07:56:28,2022-02-25 06:00:00,2022-02-25 08:00:00,2022-02-25,1645768800,1645776000
9,Київ,0,2022-02-25 07:19:04,2022-02-25 07:49:04,2022-02-25 07:00:00,2022-02-25 08:00:00,2022-02-25,1645772400,1645776000


In [210]:
# Region labels on the regions/weather side, for comparison with the alarm labels in the next cell.
df_regions["region"].unique()

array(['АР Крим обл.', 'Вінницька обл.', 'Волинська обл.',
       'Дніпропетровська обл.', 'Донецька обл.', 'Житомирська обл.',
       'Закарпатська обл.', 'Запорізька обл.', 'Івано-Франківська обл.',
       'Київська обл.', 'Кіровоградська обл.', 'Луганська обл.',
       'Львівська обл.', 'Миколаївська обл.', 'Одеська обл.',
       'Полтавська обл.', 'Рівненська обл.', 'Сумська обл.',
       'Тернопільська обл.', 'Харківська обл.', 'Херсонська обл.',
       'Хмельницька обл.', 'Черкаська обл.', 'Чернівецька обл.',
       'Чернігівська обл.'], dtype=object)

In [211]:
# Region labels on the alarm side; "Київ" and "Крим" have no identically spelled
# counterpart in df_regions["region"].
df_events_v2["region_city"].unique()

array(['Львівська обл.', 'Чернігівська обл.', 'Вінницька обл.',
       'Харківська обл.', 'Тернопільська обл.', 'Київ', 'Рівненська обл.',
       'Черкаська обл.', 'Одеська обл.', 'Запорізька обл.',
       'Волинська обл.', 'Житомирська обл.', 'Херсонська обл.',
       'Миколаївська обл.', 'Хмельницька обл.', 'Івано-Франківська обл.',
       'Дніпропетровська обл.', 'Кіровоградська обл.', 'Чернівецька обл.',
       'Полтавська обл.', 'Київська обл.', 'Сумська обл.',
       'Донецька обл.', 'Закарпатська обл.', 'Крим'], dtype=object)

In [212]:
# For quick experiments, build the records from df_events_v2.sample(10) instead
# of the full frame.

# One dict per alarm interval, plus the (initially empty) list that will hold
# the per-hour records.
events_dict = df_events_v2.to_dict(orient="records")
events_by_hour = list()

In [213]:
# Inspect a single alarm record to see the keys and value types.
events_dict[0]

{'region_city': 'Львівська обл.',
 'all_region': 1,
 'start': Timestamp('2022-02-24 07:43:17'),
 'end': Timestamp('2022-02-24 09:52:28'),
 'start_hour': Timestamp('2022-02-24 07:00:00'),
 'end_hour': Timestamp('2022-02-24 10:00:00'),
 'day_date': datetime.date(2022, 2, 24),
 'start_hour_datetimeEpoch': 1645686000,
 'end_hour_datetimeEpoch': 1645696800}

In [214]:
# Expand each alarm into one record per hour between start_hour and end_hour
# (both ends included), tagging the hour as "hour_level_event_time".
# The list is rebuilt from scratch here rather than appended to, so running this
# cell more than once no longer duplicates every hourly record.
# NOTE(review): end_hour is already rounded up, so an alarm ending at 09:52 also
# yields a 10:00 record - confirm that including the closing hour is intended.
events_by_hour = [
    {**event, "hour_level_event_time": hour}
    for event in events_dict
    for hour in pd.date_range(start=event["start_hour"], end=event["end_hour"], freq='1h')
]

In [215]:
# Turn the per-hour records into a DataFrame (one row per alarm hour).
df_events_v3 = pd.DataFrame(events_by_hour)

In [216]:
# Whole-second Unix epoch of each alarm hour; missing timestamps map to None.
# The timestamps are timezone-naive, so .timestamp() interprets them as UTC.
df_events_v3["hour_level_event_datetimeEpoch"] = df_events_v3["hour_level_event_time"].apply(
    lambda ts: None if isNaN(ts) else int(ts.timestamp())
)

In [217]:
# Number of alarm-hour rows after the expansion.
df_events_v3.shape

(179408, 11)

In [218]:
# Preview: each alarm now repeats once per covered hour.
df_events_v3.head(10)

Unnamed: 0,region_city,all_region,start,end,start_hour,end_hour,day_date,start_hour_datetimeEpoch,end_hour_datetimeEpoch,hour_level_event_time,hour_level_event_datetimeEpoch
0,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800,2022-02-24 07:00:00,1645686000
1,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800,2022-02-24 08:00:00,1645689600
2,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800,2022-02-24 09:00:00,1645693200
3,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800,2022-02-24 10:00:00,1645696800
4,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 14:00:00,1645711200
5,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 15:00:00,1645714800
6,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 16:00:00,1645718400
7,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 17:00:00,1645722000
8,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 18:00:00,1645725600
9,Вінницька обл.,1,2022-02-24 15:40:42,2022-02-24 16:10:42,2022-02-24 15:00:00,2022-02-24 17:00:00,2022-02-24,1645714800,1645722000,2022-02-24 15:00:00,1645714800


In [219]:
# Weather side of the upcoming merge, shown again for reference.
df_weather_reg.head(5)

Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,...,hour_solarradiation,hour_solarenergy,hour_uvindex,hour_conditions,city,region,center_city_ua,center_city_en,region_alt,region_id
0,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.0,,0.0,Overcast,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
1,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.0,,0.0,Partially cloudy,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
2,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,,,Overcast,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
3,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.0,,0.0,Overcast,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3
4,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,0.0,,0.0,Overcast,Луцьк,Волинська обл.,Луцьк,Lutsk,Волинь,3


In [220]:
# Row/column count of the weather side before merging with the alarms.
df_weather_reg.shape

(608304, 42)

In [221]:
# Alarm side of the upcoming merge, shown again for reference.
df_events_v3.head(10)

Unnamed: 0,region_city,all_region,start,end,start_hour,end_hour,day_date,start_hour_datetimeEpoch,end_hour_datetimeEpoch,hour_level_event_time,hour_level_event_datetimeEpoch
0,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800,2022-02-24 07:00:00,1645686000
1,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800,2022-02-24 08:00:00,1645689600
2,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800,2022-02-24 09:00:00,1645693200
3,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800,2022-02-24 10:00:00,1645696800
4,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 14:00:00,1645711200
5,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 15:00:00,1645714800
6,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 16:00:00,1645718400
7,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 17:00:00,1645722000
8,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 18:00:00,1645725600
9,Вінницька обл.,1,2022-02-24 15:40:42,2022-02-24 16:10:42,2022-02-24 15:00:00,2022-02-24 17:00:00,2022-02-24,1645714800,1645722000,2022-02-24 15:00:00,1645714800


In [222]:
# Join key towards the regions/weather data. Alarm labels that are spelled
# differently from df_regions["region"] are mapped onto the regions spelling:
# the city "Київ" is folded into its region, and "Крим" becomes "АР Крим обл."
# (previously "Крим" was left as-is and could never match a region label).
# All other labels already match and pass through unchanged.
df_events_v3["region_merge"] = df_events_v3["region_city"].replace(
    {"Київ": "Київська обл.", "Крим": "АР Крим обл."}
)

# Disabled feature ideas kept for reference:
#count = df_events_v3.groupby("hour_level_event_time")['region_city'].nunique()
#df_events_v3['alarms_in_regions'] = df_events_v3["hour_level_event_time"].map(count)

#counts = df_events_v3.groupby(['region_city', 'day_date'])["region_city"].nunique()
#df_events_v3['events_on_day'] = df_events_v3.set_index(['region_city', 'day_date']).index.map(counts)

In [223]:
# Namespace every alarm column with "event_" so the names cannot clash with the
# weather columns after the merge. add_prefix already returns a new DataFrame,
# so a preceding .copy() would only duplicate the data a second time.
df_events_v4 = df_events_v3.add_prefix("event_")

In [224]:
# Preview the prefixed alarm table (first ten rows).
df_events_v4.head(n=10)

Unnamed: 0,event_region_city,event_all_region,event_start,event_end,event_start_hour,event_end_hour,event_day_date,event_start_hour_datetimeEpoch,event_end_hour_datetimeEpoch,event_hour_level_event_time,event_hour_level_event_datetimeEpoch,event_region_merge
0,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800,2022-02-24 07:00:00,1645686000,Львівська обл.
1,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800,2022-02-24 08:00:00,1645689600,Львівська обл.
2,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800,2022-02-24 09:00:00,1645693200,Львівська обл.
3,Львівська обл.,1,2022-02-24 07:43:17,2022-02-24 09:52:28,2022-02-24 07:00:00,2022-02-24 10:00:00,2022-02-24,1645686000,1645696800,2022-02-24 10:00:00,1645696800,Львівська обл.
4,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 14:00:00,1645711200,Чернігівська обл.
5,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 15:00:00,1645714800,Чернігівська обл.
6,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 16:00:00,1645718400,Чернігівська обл.
7,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 17:00:00,1645722000,Чернігівська обл.
8,Чернігівська обл.,1,2022-02-24 14:00:43,2022-02-24 17:11:43,2022-02-24 14:00:00,2022-02-24 18:00:00,2022-02-24,1645711200,1645725600,2022-02-24 18:00:00,1645725600,Чернігівська обл.
9,Вінницька обл.,1,2022-02-24 15:40:42,2022-02-24 16:10:42,2022-02-24 15:00:00,2022-02-24 17:00:00,2022-02-24,1645714800,1645722000,2022-02-24 15:00:00,1645714800,Вінницька обл.


In [225]:
# Left-join the hourly alarm rows onto the hourly weather rows. A weather row
# receives event_* values only where both the region and the hour epoch match;
# unmatched weather rows keep NaN/NaT in every event_* column.
# NOTE(review): if several alarms cover the same region and hour, the weather
# row is repeated once per alarm - confirm that this fan-out is intended.
df_weather_v4 = pd.merge(
    df_weather_reg,
    df_events_v4,
    how="left",
    left_on=["region", "hour_datetimeEpoch"],
    right_on=["event_region_merge", "event_hour_level_event_datetimeEpoch"],
)

In [226]:
# The helper join key is no longer needed once the merge is done. Rebind the
# name instead of mutating in place, and tolerate the column already being
# absent so that re-running this cell does not raise KeyError.
df_weather_v4 = df_weather_v4.drop(columns=["event_region_merge"], errors="ignore")

In [227]:
# Preview the weather + alarm frame (first ten rows).
df_weather_v4.head(n=10)

Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,...,event_all_region,event_start,event_end,event_start_hour,event_end_hour,event_day_date,event_start_hour_datetimeEpoch,event_end_hour_datetimeEpoch,event_hour_level_event_time,event_hour_level_event_datetimeEpoch
0,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
1,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
2,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
3,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
4,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
5,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
6,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
7,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
8,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
9,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,


In [228]:
# (rows, columns) of the weather frame after the left join with alarm events.
df_weather_v4.shape

(634115, 53)

In [229]:
# Persist the weather + alarm table as a semicolon-separated CSV, without the
# positional index.
df_weather_v4.to_csv(
    path_or_buf=f"{OUTPUT_FOLDER}/{MERGED_OUTPUT_DATA_FILE}",
    index=False,
    sep=";",
)

In [230]:
# Same preview as before the export; writing the CSV does not modify the frame.
df_weather_v4.head(n=10)

Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,...,event_all_region,event_start,event_end,event_start_hour,event_end_hour,event_day_date,event_start_hour_datetimeEpoch,event_end_hour_datetimeEpoch,event_hour_level_event_time,event_hour_level_event_datetimeEpoch
0,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
1,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
2,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
3,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
4,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
5,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
6,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
7,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
8,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,
9,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,,NaT,NaT,NaT,NaT,,,,NaT,


In [231]:
# (rows, columns) re-checked after the CSV export; only an export and a preview
# ran since the previous shape check.
df_weather_v4.shape

(634115, 53)

In [232]:
# Preview the ISW report table (first ten rows) before selecting columns.
df_isw.head(n=10)

Unnamed: 0,content,lemma_content,stem_content,keywords,report_date,date_tomorrow_datetime
0,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 30...,"{'roc mp': 1.0, 'local time': 0.94, 'operation...",2022-02-24,2022-02-25
1,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 30...,"{'roc mp': 1.0, 'information campaign': 0.978,...",2022-02-25,2022-02-26
2,mason clark george barros and katya stepanenko...,mason clark george barros katya stepanenko 3pm...,mason clark georg barro katya stepanenko 3pm e...,"{'local time': 1.0, 'roc mp': 0.967, 'operatio...",2022-02-26,2022-02-27
3,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 4p...,"{'operational pause': 1.0, 'roc mp': 0.957, 'w...",2022-02-27,2022-02-28
4,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 33...,"{'roc mp': 1.0, 'information campaign': 0.976,...",2022-02-28,2022-03-01
5,frederick w kagan george barros and kateryna s...,frederick w kagan george barros kateryna stepa...,frederick w kagan georg barro kateryna stepane...,"{'hour russian': 1.0, 'roc mp': 0.997, 'fomin ...",2022-03-01,2022-03-02
6,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 43...,"{'roc mp': 1.0, 'fomin ’': 0.909, 'operational...",2022-03-02,2022-03-03
7,mason clark george barros and kateryna stepane...,mason clark george barros kateryna stepanenko ...,mason clark georg barro kateryna stepanenko 40...,"{'operational pause': 1.0, 'information campai...",2022-03-03,2022-03-04
8,fredrick w kagan george barros and kateryna st...,fredrick w kagan george barros kateryna stepan...,fredrick w kagan georg barro kateryna stepanen...,"{'russian troop': 1.0, 'fomin ’': 0.945, 'roc ...",2022-03-04,2022-03-05
9,fredrick w kagan george barros and kateryna st...,fredrick w kagan george barros kateryna stepan...,fredrick w kagan georg barro kateryna stepanen...,"{'roc mp': 1.0, 'kyiv kharkiv': 0.98, 'operati...",2022-03-05,2022-03-06


In [233]:
# Keep only the lemmatised text, keywords and date columns; the raw and stemmed
# text variants are not carried into the merged dataset.
df_isw_v2 = df_isw.drop(columns=["content", "stem_content"])

In [234]:
# Preview the trimmed ISW table (first five rows).
df_isw_v2.head(n=5)

Unnamed: 0,lemma_content,keywords,report_date,date_tomorrow_datetime
0,mason clark george barros kateryna stepanenko ...,"{'roc mp': 1.0, 'local time': 0.94, 'operation...",2022-02-24,2022-02-25
1,mason clark george barros kateryna stepanenko ...,"{'roc mp': 1.0, 'information campaign': 0.978,...",2022-02-25,2022-02-26
2,mason clark george barros katya stepanenko 3pm...,"{'local time': 1.0, 'roc mp': 0.967, 'operatio...",2022-02-26,2022-02-27
3,mason clark george barros kateryna stepanenko ...,"{'operational pause': 1.0, 'roc mp': 0.957, 'w...",2022-02-27,2022-02-28
4,mason clark george barros kateryna stepanenko ...,"{'roc mp': 1.0, 'information campaign': 0.976,...",2022-02-28,2022-03-01


In [235]:
# Attach the ISW report columns to every weather/alarm row by calendar day.
# Rows whose day has no report keep NaN in the ISW columns.
# NOTE(review): the join key is `report_date` (the report's own day);
# `date_tomorrow_datetime` is carried along but not used as a key - confirm
# that same-day matching is the intended alignment.
df_merged = pd.merge(
    df_weather_v4,
    df_isw_v2,
    how="left",
    left_on="day_datetime",
    right_on="report_date",
)

In [236]:
# Preview the fully merged frame (first ten rows).
df_merged.head(n=10)

Unnamed: 0,city_resolvedAddress,day_datetime,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,...,event_end_hour,event_day_date,event_start_hour_datetimeEpoch,event_end_hour_datetimeEpoch,event_hour_level_event_time,event_hour_level_event_datetimeEpoch,lemma_content,keywords,report_date,date_tomorrow_datetime
0,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,NaT,,,,NaT,,mason clark george barros kateryna stepanenko ...,"{'roc mp': 1.0, 'local time': 0.94, 'operation...",2022-02-24,2022-02-25
1,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,NaT,,,,NaT,,mason clark george barros kateryna stepanenko ...,"{'roc mp': 1.0, 'local time': 0.94, 'operation...",2022-02-24,2022-02-25
2,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,NaT,,,,NaT,,mason clark george barros kateryna stepanenko ...,"{'roc mp': 1.0, 'local time': 0.94, 'operation...",2022-02-24,2022-02-25
3,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,NaT,,,,NaT,,mason clark george barros kateryna stepanenko ...,"{'roc mp': 1.0, 'local time': 0.94, 'operation...",2022-02-24,2022-02-25
4,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,NaT,,,,NaT,,mason clark george barros kateryna stepanenko ...,"{'roc mp': 1.0, 'local time': 0.94, 'operation...",2022-02-24,2022-02-25
5,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,NaT,,,,NaT,,mason clark george barros kateryna stepanenko ...,"{'roc mp': 1.0, 'local time': 0.94, 'operation...",2022-02-24,2022-02-25
6,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,NaT,,,,NaT,,mason clark george barros kateryna stepanenko ...,"{'roc mp': 1.0, 'local time': 0.94, 'operation...",2022-02-24,2022-02-25
7,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,NaT,,,,NaT,,mason clark george barros kateryna stepanenko ...,"{'roc mp': 1.0, 'local time': 0.94, 'operation...",2022-02-24,2022-02-25
8,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,NaT,,,,NaT,,mason clark george barros kateryna stepanenko ...,"{'roc mp': 1.0, 'local time': 0.94, 'operation...",2022-02-24,2022-02-25
9,"Луцьк, Луцький район, Україна",2022-02-24,1645653600,4.9,0.7,2.6,0.0,83.7,0.118,4.17,...,NaT,,,,NaT,,mason clark george barros kateryna stepanenko ...,"{'roc mp': 1.0, 'local time': 0.94, 'operation...",2022-02-24,2022-02-25


In [237]:
# Trim and filter the merged frame in one re-runnable chain.
df_merged = (
    df_merged
    # Remove the location labels and date columns that are not kept in the
    # final dataset. errors="ignore" makes the cell safe to execute again: on a
    # second run these columns are already gone and a strict drop would raise
    # KeyError.
    .drop(
        columns=["city_resolvedAddress", "day_datetime", "city", "region", "event_day_date"],
        errors="ignore",
    )
    # Discard rows with a missing value in any ISW-derived column, i.e. days
    # that found no report in the left join.
    # NOTE(review): the df_isw_v2 preview shows no "date" column, so excluding
    # it here looks like a no-op - confirm.
    .dropna(subset=df_isw_v2.columns.difference(["date"]))
    # Discard weather rows that matched no alarm (their event_* columns are NaN),
    # so only alarm hours remain in the result.
    .dropna(subset=["event_all_region"])
)

In [238]:
# For each event hour, count how many distinct `event_region_city` values occur
# in the merged frame, then broadcast that count to every row of the same hour.
hour_key = "event_hour_level_event_time"
count = df_merged.groupby(hour_key)["event_region_city"].nunique()
df_merged["alarms_in_regions"] = df_merged[hour_key].map(count)

In [239]:
# (rows, columns) after filtering to rows with both ISW data and an alarm, and
# after adding `alarms_in_regions`.
df_merged.shape

(175551, 53)

In [240]:
# Write the final merged dataset as Parquet via pyarrow, without the index.
df_merged.to_parquet(
    path=f"{OUTPUT_FOLDER}/{ALL_MERGED_DATA_FILE}.parquet",
    engine="pyarrow",
    index=False,
)

In [241]:
# Write the same dataset as a semicolon-separated CSV, without the index.
df_merged.to_csv(
    path_or_buf=f"{OUTPUT_FOLDER}/{ALL_MERGED_DATA_FILE}.csv",
    sep=";",
    index=False,
)

In [242]:
# Read the Parquet export back and display it as a round-trip sanity check.
check = pd.read_parquet(path=f"{OUTPUT_FOLDER}/{ALL_MERGED_DATA_FILE}.parquet")
check

Unnamed: 0,day_datetimeEpoch,day_tempmax,day_tempmin,day_temp,day_dew,day_humidity,day_precip,day_precipcover,day_solarradiation,day_solarenergy,...,event_end_hour,event_start_hour_datetimeEpoch,event_end_hour_datetimeEpoch,event_hour_level_event_time,event_hour_level_event_datetimeEpoch,lemma_content,keywords,report_date,date_tomorrow_datetime,alarms_in_regions
0,1645740000,6.3,-0.3,2.2,-1.2,78.3,0.2,8.33,45.7,3.3,...,2022-02-25 22:00:00,1.645823e+09,1.645826e+09,2022-02-25 21:00:00,1.645823e+09,mason clark george barros kateryna stepanenko ...,"{'roc mp': 1.0, 'information campaign': 0.978,...",2022-02-25,2022-02-26,6
1,1645826400,6.3,-1.5,1.9,-2.7,73.4,0.0,0.00,116.2,8.4,...,2022-02-25 22:00:00,1.645823e+09,1.645826e+09,2022-02-25 22:00:00,1.645826e+09,mason clark george barros katya stepanenko 3pm...,"{'local time': 1.0, 'roc mp': 0.967, 'operatio...",2022-02-26,2022-02-27,5
2,1645826400,6.3,-1.5,1.9,-2.7,73.4,0.0,0.00,116.2,8.4,...,2022-02-26 07:00:00,1.645855e+09,1.645859e+09,2022-02-26 06:00:00,1.645855e+09,mason clark george barros katya stepanenko 3pm...,"{'local time': 1.0, 'roc mp': 0.967, 'operatio...",2022-02-26,2022-02-27,6
3,1645826400,6.3,-1.5,1.9,-2.7,73.4,0.0,0.00,116.2,8.4,...,2022-02-26 07:00:00,1.645855e+09,1.645859e+09,2022-02-26 07:00:00,1.645859e+09,mason clark george barros katya stepanenko 3pm...,"{'local time': 1.0, 'roc mp': 0.967, 'operatio...",2022-02-26,2022-02-27,7
4,1645826400,6.3,-1.5,1.9,-2.7,73.4,0.0,0.00,116.2,8.4,...,2022-02-26 12:00:00,1.645870e+09,1.645877e+09,2022-02-26 10:00:00,1.645870e+09,mason clark george barros katya stepanenko 3pm...,"{'local time': 1.0, 'roc mp': 0.967, 'operatio...",2022-02-26,2022-02-27,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175546,1740780000,0.3,-6.9,-3.0,-5.9,81.2,0.0,0.00,106.1,9.2,...,2025-03-01 19:00:00,1.740848e+09,1.740856e+09,2025-03-01 17:00:00,1.740848e+09,russian offensive campaign assessment 2025 dav...,"{'roc mp': 1.0, 'united state': 0.967, 'kursk ...",2025-03-01,2025-03-02,11
175547,1740780000,0.3,-6.9,-3.0,-5.9,81.2,0.0,0.00,106.1,9.2,...,2025-03-01 19:00:00,1.740848e+09,1.740856e+09,2025-03-01 18:00:00,1.740852e+09,russian offensive campaign assessment 2025 dav...,"{'roc mp': 1.0, 'united state': 0.967, 'kursk ...",2025-03-01,2025-03-02,11
175548,1740780000,0.3,-6.9,-3.0,-5.9,81.2,0.0,0.00,106.1,9.2,...,2025-03-01 19:00:00,1.740848e+09,1.740856e+09,2025-03-01 19:00:00,1.740856e+09,russian offensive campaign assessment 2025 dav...,"{'roc mp': 1.0, 'united state': 0.967, 'kursk ...",2025-03-01,2025-03-02,11
175549,1740780000,0.3,-6.9,-3.0,-5.9,81.2,0.0,0.00,106.1,9.2,...,2025-03-02 02:00:00,1.740859e+09,1.740881e+09,2025-03-01 20:00:00,1.740859e+09,russian offensive campaign assessment 2025 dav...,"{'roc mp': 1.0, 'united state': 0.967, 'kursk ...",2025-03-01,2025-03-02,9
