In [41]:
import datetime
import numpy as np
import pandas as pd

import pickle

from utils import tf_idf

In [42]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [43]:
#INPUT_DATA_FOLDER = "data/2_isw_preprocessed"
#REPORTS_DATA_FILE = "all_days_isw_reports_parsed.csv"

#OUTPUT_FOLDER = "data/4_all_data_preprocessed"
#ISW_OUTPUT_DATA_FILE = "all_isw.csv"
#WEATHER_EVENTS_OUTPUT_DATA_FILE = "all_hourly_weather_events.csv"

#MODEL_FOLDER = "model"

#tfidf_transformer_model = "tfidf_transformer"
#count_vectorizer_model = "count_vectorizer"

#tfidf_transformer_version = "v1"
#count_vectorizer_version = "v1"

In [44]:
def isNaN(num):
    """Report whether ``num`` is a NaN-like value.

    NaN-like scalars (``float('nan')``, ``numpy.nan``, ``pandas.NaT``) compare
    unequal to themselves, whereas ordinary values such as numbers, strings
    and ``None`` compare equal to themselves.

    Returns the result of the comparison ``num != num``.
    """
    differs_from_itself = num != num
    return differs_from_itself

## reading data

In [45]:
df_isw = pd.read_csv("data/0_isw_data_collection/2_isw_preprocessed_before_vectosing.csv")

In [46]:
# Drop the leftover "Unnamed" columns (presumably index columns written by earlier
# to_csv calls - confirm). `columns=` already selects the axis, so `axis=1` is not
# needed; errors="ignore" keeps this cell re-runnable once the columns are gone.
df_isw = df_isw.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')

In [47]:
df_isw.head(5)

Unnamed: 0,date,short_url,title,text_title,full_url,main_html,main_html_v6,report_text_lemm,report_text_stemm
0,2022-02-25,RusCampaignFeb25,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"<div class=""field field-name-body field-type-t...","Mason Clark, George Barros, and Kateryna Step...",mason clark georg barro kateryna stepanenkofe...,mason clark georg barro kateryna stepanenkofe...
1,2022-02-26,RusCampaignFeb26,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"<div class=""field field-name-body field-type-t...","Mason Clark, George Barros, and Katya Stepane...",mason clark georg barro katya stepanenkofebru...,mason clark georg barro katya stepanenkofebru...
2,2022-02-27,RusCampaignFeb27,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"<div class=""field field-name-body field-type-t...","Mason Clark, George Barros, and Kateryna Step...",mason clark georg barro kateryna stepanenkofe...,mason clark georg barro kateryna stepanenkofe...
3,2022-02-28,RusCampaignFeb28,"Russian Offensive Campaign Assessment, Februar...","Russian Offensive Campaign Assessment, Februar...",/backgrounder/russian-offensive-campaign-asses...,"<div class=""field field-name-body field-type-t...","Mason Clark, George Barros, and Kateryna Step...",mason clark georg barro kateryna stepanenkofe...,mason clark georg barro kateryna stepanenkofe...
4,2022-03-01,RusCampaignMar1,"Russian Offensive Campaign Assessment, March 1...","Russian Offensive Campaign Assessment, March 1",/backgrounder/russian-offensive-campaign-asses...,"<div class=""field field-name-body field-type-t...","Frederick W. Kagan, George Barros, and Katery...",frederick kagan georg barro kateryna stepanen...,frederick kagan georg barro kateryna stepanen...


## preparing ISW reports

## reading models

In [48]:
# Load the fitted TF-IDF transformer and count vectorizer from disk.
# The files are opened in `with` blocks so the handles are closed right after loading.
# NOTE: unpickling can execute arbitrary code - only load model files from a trusted source.
with open("models/tfidf_transformer_v1.pkl", "rb") as tfidf_file:
    tfidf = pickle.load(tfidf_file)
with open("models/count_vectorizer_v1.pkl", "rb") as cv_file:
    cv = pickle.load(cv_file)

In [49]:
cv.get_feature_names_out

<bound method CountVectorizer.get_feature_names_out of CountVectorizer(max_df=0.98, min_df=2)>

In [50]:
# Build the `keywords` column from each lemmatized report using the loaded
# TF-IDF transformer and count vectorizer.
# convert_doc_to_vector is a project helper from utils.tf_idf; judging by the
# displayed output it returns a {term: score} dict per report - TODO confirm.
df_isw['keywords'] = df_isw['report_text_lemm'].apply(lambda x: tf_idf.convert_doc_to_vector(x,tfidf,cv))

In [51]:
df_isw.head(5)

Unnamed: 0,date,short_url,title,text_title,full_url,main_html,main_html_v6,report_text_lemm,report_text_stemm,keywords
0,2022-02-25,RusCampaignFeb25,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"<div class=""field field-name-body field-type-t...","Mason Clark, George Barros, and Kateryna Step...",mason clark georg barro kateryna stepanenkofe...,mason clark georg barro kateryna stepanenkofe...,"{'russian': 0.401, 'forc': 0.389, 'februari': ..."
1,2022-02-26,RusCampaignFeb26,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"<div class=""field field-name-body field-type-t...","Mason Clark, George Barros, and Katya Stepane...",mason clark georg barro katya stepanenkofebru...,mason clark georg barro katya stepanenkofebru...,"{'russian': 0.406, 'forc': 0.397, 'februari': ..."
2,2022-02-27,RusCampaignFeb27,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"<div class=""field field-name-body field-type-t...","Mason Clark, George Barros, and Kateryna Step...",mason clark georg barro kateryna stepanenkofe...,mason clark georg barro kateryna stepanenkofe...,"{'russian': 0.437, 'forc': 0.437, 'februari': ..."
3,2022-02-28,RusCampaignFeb28,"Russian Offensive Campaign Assessment, Februar...","Russian Offensive Campaign Assessment, Februar...",/backgrounder/russian-offensive-campaign-asses...,"<div class=""field field-name-body field-type-t...","Mason Clark, George Barros, and Kateryna Step...",mason clark georg barro kateryna stepanenkofe...,mason clark georg barro kateryna stepanenkofe...,"{'februari': 0.444, 'russian': 0.403, 'forc': ..."
4,2022-03-01,RusCampaignMar1,"Russian Offensive Campaign Assessment, March 1...","Russian Offensive Campaign Assessment, March 1",/backgrounder/russian-offensive-campaign-asses...,"<div class=""field field-name-body field-type-t...","Frederick W. Kagan, George Barros, and Katery...",frederick kagan georg barro kateryna stepanen...,frederick kagan georg barro kateryna stepanen...,"{'russian': 0.367, 'kyiv': 0.301, 'forc': 0.28..."


In [52]:
df_isw["date_datetime"] = pd.to_datetime(df_isw["date"])

In [53]:
# Day following the report date. Vectorized datetime arithmetic replaces the
# per-row lambda; missing dates (NaT) stay NaT.
df_isw['date_tomorrow_datetime'] = df_isw['date_datetime'] + pd.Timedelta(days=1)

In [54]:
# Expose the parsed report date under the name `report_date`, then persist the
# enriched reports as a semicolon-separated CSV without the index column.
df_isw = df_isw.rename(columns = {"date_datetime":"report_date"})
df_isw.to_csv("data/0_isw_data_collection/all_isw.csv", sep=";", index=False)

In [55]:
df_isw.head(5)

Unnamed: 0,date,short_url,title,text_title,full_url,main_html,main_html_v6,report_text_lemm,report_text_stemm,keywords,report_date,date_tomorrow_datetime
0,2022-02-25,RusCampaignFeb25,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"<div class=""field field-name-body field-type-t...","Mason Clark, George Barros, and Kateryna Step...",mason clark georg barro kateryna stepanenkofe...,mason clark georg barro kateryna stepanenkofe...,"{'russian': 0.401, 'forc': 0.389, 'februari': ...",2022-02-25,2022-02-26
1,2022-02-26,RusCampaignFeb26,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"<div class=""field field-name-body field-type-t...","Mason Clark, George Barros, and Katya Stepane...",mason clark georg barro katya stepanenkofebru...,mason clark georg barro katya stepanenkofebru...,"{'russian': 0.406, 'forc': 0.397, 'februari': ...",2022-02-26,2022-02-27
2,2022-02-27,RusCampaignFeb27,Russia-Ukraine Warning Update: Russian Offensi...,Russia-Ukraine Warning Update: Russian Offensi...,/backgrounder/russia-ukraine-warning-update-ru...,"<div class=""field field-name-body field-type-t...","Mason Clark, George Barros, and Kateryna Step...",mason clark georg barro kateryna stepanenkofe...,mason clark georg barro kateryna stepanenkofe...,"{'russian': 0.437, 'forc': 0.437, 'februari': ...",2022-02-27,2022-02-28
3,2022-02-28,RusCampaignFeb28,"Russian Offensive Campaign Assessment, Februar...","Russian Offensive Campaign Assessment, Februar...",/backgrounder/russian-offensive-campaign-asses...,"<div class=""field field-name-body field-type-t...","Mason Clark, George Barros, and Kateryna Step...",mason clark georg barro kateryna stepanenkofe...,mason clark georg barro kateryna stepanenkofe...,"{'februari': 0.444, 'russian': 0.403, 'forc': ...",2022-02-28,2022-03-01
4,2022-03-01,RusCampaignMar1,"Russian Offensive Campaign Assessment, March 1...","Russian Offensive Campaign Assessment, March 1",/backgrounder/russian-offensive-campaign-asses...,"<div class=""field field-name-body field-type-t...","Frederick W. Kagan, George Barros, and Katery...",frederick kagan georg barro kateryna stepanen...,frederick kagan georg barro kateryna stepanen...,"{'russian': 0.367, 'kyiv': 0.301, 'forc': 0.28...",2022-03-01,2022-03-02


## prepare events data

In [56]:
#EVENTS_DATA_FOLDER = "data/1_events"
#EVENTS_DATA_FILE = "all_events.csv"

In [57]:
df_events = pd.read_csv("data/alarms_with_features.csv")

In [58]:
df_events.head()

Unnamed: 0.1,Unnamed: 0,id,region_id,region_title,region_city,all_region,start,end,clean_end,intersection_alarm_id,start_time,end_time,day,feature_number_of_region,within_24_hours,feature_number_of_alarms_within_24_hours
0,1218,5257,24,Івано-Франківщина,Калуш,0,2022-03-12 10:13:03,2022-03-12 10:26:06,2022-03-12 10:26:06,,2022-03-12 10:13:03,2022-03-12 10:26:06,12,0,2022-03-11 10:13:03,0
1,19206,45672,24,Івано-Франківщина,Івано-Франківська обл.,1,2022-12-28 11:22:22,2022-12-28 13:20:37,2022-12-28 13:20:37,,2022-12-28 11:22:22,2022-12-28 13:20:37,28,0,2022-12-27 11:22:22,0
2,13297,27633,24,Івано-Франківщина,Івано-Франківська обл.,1,2022-08-21 03:32:58,2022-08-21 04:03:53,2022-08-21 04:03:53,,2022-08-21 03:32:58,2022-08-21 04:03:53,21,0,2022-08-20 03:32:58,0
3,11354,21845,24,Івано-Франківщина,Івано-Франківська обл.,1,2022-07-14 10:20:11,2022-07-14 12:12:23,2022-07-14 12:12:23,,2022-07-14 10:20:11,2022-07-14 12:12:23,14,0,2022-07-13 10:20:11,0
4,3351,5295,24,Івано-Франківщина,Івано-Франківська обл.,1,2022-03-26 21:05:24,2022-03-26 21:42:55,2022-03-26 21:42:55,,2022-03-26 21:05:24,2022-03-26 21:42:55,26,0,2022-03-25 21:05:24,0


In [59]:
df_events_v2 = df_events.drop(["id","region_id"],axis=1)

In [60]:
df_events_v2.head(5)

Unnamed: 0.1,Unnamed: 0,region_title,region_city,all_region,start,end,clean_end,intersection_alarm_id,start_time,end_time,day,feature_number_of_region,within_24_hours,feature_number_of_alarms_within_24_hours
0,1218,Івано-Франківщина,Калуш,0,2022-03-12 10:13:03,2022-03-12 10:26:06,2022-03-12 10:26:06,,2022-03-12 10:13:03,2022-03-12 10:26:06,12,0,2022-03-11 10:13:03,0
1,19206,Івано-Франківщина,Івано-Франківська обл.,1,2022-12-28 11:22:22,2022-12-28 13:20:37,2022-12-28 13:20:37,,2022-12-28 11:22:22,2022-12-28 13:20:37,28,0,2022-12-27 11:22:22,0
2,13297,Івано-Франківщина,Івано-Франківська обл.,1,2022-08-21 03:32:58,2022-08-21 04:03:53,2022-08-21 04:03:53,,2022-08-21 03:32:58,2022-08-21 04:03:53,21,0,2022-08-20 03:32:58,0
3,11354,Івано-Франківщина,Івано-Франківська обл.,1,2022-07-14 10:20:11,2022-07-14 12:12:23,2022-07-14 12:12:23,,2022-07-14 10:20:11,2022-07-14 12:12:23,14,0,2022-07-13 10:20:11,0
4,3351,Івано-Франківщина,Івано-Франківська обл.,1,2022-03-26 21:05:24,2022-03-26 21:42:55,2022-03-26 21:42:55,,2022-03-26 21:05:24,2022-03-26 21:42:55,26,0,2022-03-25 21:05:24,0


In [61]:
# df_events_v2["start_time"] = df_events_v2.apply(lambda x: x["start"] if not isNaN(x["start"]) else x["event_time"] , axis=1)
# df_events_v2["end_time"] = df_events_v2.apply(lambda x: x["end"] if not isNaN(x["end"]) else x["event_time"], axis=1)

In [62]:
df_events_v2["start_time"] = pd.to_datetime(df_events_v2["start"])
df_events_v2["end_time"] = pd.to_datetime(df_events_v2["end"])
#df_events_v2["event_time"] = pd.to_datetime(df_events_v2["event_time"])

In [63]:
# Hourly bounds of each alarm: start rounded down, end rounded up to the full hour.
# The lower-case "h" alias is used because upper-case "H" is deprecated since pandas 2.2.
# NOTE(review): because the end is rounded up and the hourly range built further down
# includes its end point, an alarm ending at 10:26 also covers the 11:00 slot -
# confirm this is intended.
df_events_v2["start_hour"] = df_events_v2["start_time"].dt.floor("h")
df_events_v2["end_hour"] = df_events_v2["end_time"].dt.ceil("h")
#df_events_v2["event_hour"] = df_events_v2['event_time'].dt.round('H')

In [64]:
# Fill missing hourly bounds from `event_hour` when that column is available.
# The events file loaded here has no `event_hour` column (its derivation is
# commented out), so the previous row-wise fallback raised KeyError as soon as a
# bound was missing. Without the column, missing bounds are simply left as NaT.
if "event_hour" in df_events_v2.columns:
    for bound_col in ("start_hour", "end_hour"):
        df_events_v2[bound_col] = df_events_v2[bound_col].fillna(df_events_v2["event_hour"])

In [65]:
df_events_v2["day_date"] = df_events_v2["start_time"].dt.date

# Seconds since 1970-01-01 for the hourly bounds, computed directly on the naive
# timestamps. strftime('%s') is a platform-specific extension (it fails on Windows)
# and its result depends on the machine's local time zone; this arithmetic is
# portable and reproduces the values recorded in the original run.
# Missing bounds (NaT) become NaN.
unix_epoch = pd.Timestamp("1970-01-01")
one_second = pd.Timedelta(seconds=1)
df_events_v2["start_hour_datetimeEpoch"] = (df_events_v2["start_hour"] - unix_epoch) // one_second
df_events_v2["end_hour_datetimeEpoch"] = (df_events_v2["end_hour"] - unix_epoch) // one_second

df_events_v2.head(10)

Unnamed: 0.1,Unnamed: 0,region_title,region_city,all_region,start,end,clean_end,intersection_alarm_id,start_time,end_time,day,feature_number_of_region,within_24_hours,feature_number_of_alarms_within_24_hours,start_hour,end_hour,day_date,start_hour_datetimeEpoch,end_hour_datetimeEpoch
0,1218,Івано-Франківщина,Калуш,0,2022-03-12 10:13:03,2022-03-12 10:26:06,2022-03-12 10:26:06,,2022-03-12 10:13:03,2022-03-12 10:26:06,12,0,2022-03-11 10:13:03,0,2022-03-12 10:00:00,2022-03-12 11:00:00,2022-03-12,1647079200,1647082800
1,19206,Івано-Франківщина,Івано-Франківська обл.,1,2022-12-28 11:22:22,2022-12-28 13:20:37,2022-12-28 13:20:37,,2022-12-28 11:22:22,2022-12-28 13:20:37,28,0,2022-12-27 11:22:22,0,2022-12-28 11:00:00,2022-12-28 14:00:00,2022-12-28,1672225200,1672236000
2,13297,Івано-Франківщина,Івано-Франківська обл.,1,2022-08-21 03:32:58,2022-08-21 04:03:53,2022-08-21 04:03:53,,2022-08-21 03:32:58,2022-08-21 04:03:53,21,0,2022-08-20 03:32:58,0,2022-08-21 03:00:00,2022-08-21 05:00:00,2022-08-21,1661050800,1661058000
3,11354,Івано-Франківщина,Івано-Франківська обл.,1,2022-07-14 10:20:11,2022-07-14 12:12:23,2022-07-14 12:12:23,,2022-07-14 10:20:11,2022-07-14 12:12:23,14,0,2022-07-13 10:20:11,0,2022-07-14 10:00:00,2022-07-14 13:00:00,2022-07-14,1657792800,1657803600
4,3351,Івано-Франківщина,Івано-Франківська обл.,1,2022-03-26 21:05:24,2022-03-26 21:42:55,2022-03-26 21:42:55,,2022-03-26 21:05:24,2022-03-26 21:42:55,26,0,2022-03-25 21:05:24,0,2022-03-26 21:00:00,2022-03-26 22:00:00,2022-03-26,1648328400,1648332000
5,18082,Івано-Франківщина,Івано-Франківська обл.,1,2022-11-17 08:43:21,2022-11-17 10:59:10,2022-11-17 10:59:10,,2022-11-17 08:43:21,2022-11-17 10:59:10,17,0,2022-11-16 08:43:21,0,2022-11-17 08:00:00,2022-11-17 11:00:00,2022-11-17,1668672000,1668682800
6,3350,Івано-Франківщина,Івано-Франківська обл.,1,2022-03-26 21:04:47,2022-03-26 21:42:24,2022-03-26 21:42:24,,2022-03-26 21:04:47,2022-03-26 21:42:24,26,0,2022-03-25 21:04:47,0,2022-03-26 21:00:00,2022-03-26 22:00:00,2022-03-26,1648328400,1648332000
7,238,Івано-Франківщина,Калуш,0,2022-03-03 01:52:41,2022-03-03 02:45:57,2022-03-03 02:45:57,,2022-03-03 01:52:41,2022-03-03 02:45:57,3,0,2022-03-02 01:52:41,0,2022-03-03 01:00:00,2022-03-03 03:00:00,2022-03-03,1646269200,1646276400
8,1219,Івано-Франківщина,Івано-Франківськ,0,2022-03-12 10:14:24,2022-03-12 10:25:32,2022-03-12 10:25:32,,2022-03-12 10:14:24,2022-03-12 10:25:32,12,0,2022-03-11 10:14:24,0,2022-03-12 10:00:00,2022-03-12 11:00:00,2022-03-12,1647079200,1647082800
9,13259,Івано-Франківщина,Івано-Франківська обл.,1,2022-08-20 14:37:27,2022-08-20 15:13:37,2022-08-20 15:13:37,,2022-08-20 14:37:27,2022-08-20 15:13:37,20,0,2022-08-19 14:37:27,0,2022-08-20 14:00:00,2022-08-20 16:00:00,2022-08-20,1661004000,1661011200


In [66]:
df_events_v2.shape

(19933, 19)

## prepare weather

In [67]:
#WEATHER_DATA_FOLDER = "data/1_weather"
#WEATHER_DATA_FILE = "all_weather_by_hour.csv"

In [None]:
df_weather = pd.read_csv("data/0_raw_other_data/all_weather_by_hour.csv")
df_weather["day_datetime"] = pd.to_datetime(df_weather["day_datetime"])

In [None]:
df_weather.shape

In [None]:
df_weather.head(15)

In [None]:
# len(clmns)

In [None]:
# exclude
weather_exclude = [
"day_feelslikemax",
"day_feelslikemin",
"day_sunriseEpoch",
"day_sunsetEpoch",
"day_description",
"city_latitude",
"city_longitude",
"city_address",
"city_timezone",
"city_tzoffset",
"day_feelslike",
"day_precipprob",
"day_snow",
"day_snowdepth",
"day_windgust",
"day_windspeed",
"day_winddir",
"day_pressure",
"day_cloudcover",
"day_visibility",
"day_severerisk",
"day_conditions",
"day_icon",
"day_source",
"day_preciptype",
"day_stations",
"hour_icon",
"hour_source",
"hour_stations",
"hour_feelslike"
]

In [None]:
# new_list = [x for x in clmns if (x not in weather_exclude)]
# new_list

In [None]:
df_weather_v2 = df_weather.drop(weather_exclude, axis=1)

In [None]:
# City name = text before the first comma of the resolved address.
# The .str accessor is vectorized and passes a missing address through as NaN
# instead of raising AttributeError on a non-string value.
df_weather_v2["city"] = df_weather_v2["city_resolvedAddress"].str.split(",").str[0]
# One location resolves to an oblast name rather than a city; map it to the city
# name (presumably so it matches `center_city_ua` in the regions table - confirm).
df_weather_v2["city"] = df_weather_v2["city"].replace('Хмельницька область', "Хмельницький")

In [None]:
df_weather_v2.head(5)

In [None]:
df_weather_v2.shape

## merging data

In [None]:
df_regions = pd.read_csv("data/0_raw_other_data/regions.csv")

In [None]:
df_regions.head(5)

In [None]:
# Attach the region attributes to every hourly weather row by matching `city`
# against `center_city_ua`. pd.merge defaults to an inner join, so weather rows
# without a matching region are dropped - compare the shapes printed below to
# see whether any rows were lost.
df_weather_reg = pd.merge(df_weather_v2, df_regions, left_on="city",right_on="center_city_ua")

In [None]:
df_weather_reg.head(10)

In [None]:
df_weather_reg.shape

In [None]:
df_weather_v2.shape

### Merging weather and events

In [None]:
# df_events_v2["start_hour_datetimeEpoch"] = df_events_v2['start_hour'].apply(lambda x: int(x.strftime('%s'))  if not isNaN(x) else 0)
# df_events_v2["end_hour_datetimeEpoch"] = df_events_v2['end_hour'].apply(lambda x: int(x.strftime('%s'))  if not isNaN(x) else 0)

In [None]:
df_events_v2.dtypes

In [None]:
df_events_v2.shape

In [None]:
df_events_v2.head(10)

In [None]:
# df_events_v2_sample = df_events_v2.sample(10)
# df_events_v2_sample.shape

events_dict = df_events_v2.to_dict('records')
events_by_hour = []

In [None]:
events_dict[0]

In [None]:
# Expand every alarm into one record per hour between its start_hour and end_hour
# (both ends included); each record is a copy of the alarm plus the hour slot.
# The list is rebuilt from scratch here so that re-running this cell does not
# append the same records a second time. "1h" replaces the "1H" alias, which is
# deprecated since pandas 2.2.
events_by_hour = [
    {**event, "hour_level_event_time": hour_slot}
    for event in events_dict
    for hour_slot in pd.date_range(start=event["start_hour"], end=event["end_hour"], freq="1h")
]

In [None]:
# One row per (alarm, hour) record collected in events_by_hour.
df_events_v3 = pd.DataFrame(events_by_hour)

In [None]:
# Seconds since 1970-01-01 for each hourly slot, computed directly on the naive
# timestamps. strftime('%s') is a platform-specific extension (it fails on Windows)
# and depends on the machine's local time zone; this form is portable and
# vectorized. Missing values (NaT) become NaN.
df_events_v3["hour_level_event_datetimeEpoch"] = (
    df_events_v3["hour_level_event_time"] - pd.Timestamp("1970-01-01")
) // pd.Timedelta(seconds=1)

In [None]:
df_events_v3.shape

In [None]:
df_events_v3.head(15)

In [None]:
df_weather_reg.head(5)

In [None]:
df_weather_reg.shape

In [None]:
df_events_v3.head(10)

In [None]:
df_events_v4 = df_events_v3.copy().add_prefix('event_')

In [None]:
df_weather_reg.head()

In [None]:
df_weather_reg.to_csv("df_weather_reg.csv")
df_events_v4.to_csv("df_events_v4.csv")

In [None]:
#df_weather_v4 = df_weather_reg.merge(df_events_v4, 
#                                   how="left", 
#                                   left_on=["region_alt","hour_datetimeEpoch"],
#                                   right_on=["event_region_title","event_hour_level_event_datetimeEpoch"])

In [None]:
df_weather_v4 = pd.read_csv('data/df_weather_v4.csv')