In [4]:
# import libraries

import datetime
import pickle
import nltk
import re
import scipy
import pandas as pd
from scipy import sparse
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
#from utils import get_weather
from utils import text_processing

### Get and preprocess ISW files

In [5]:
# get article from yesterday
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)

yesterday_day = yesterday.day
yesterday_month = yesterday.month
yesterday_year = yesterday.year

In [6]:
file = text_processing.get_article_from_yesterday(yesterday_day,yesterday_month,yesterday_year)

In [7]:
data = text_processing.read_html(file)

In [8]:
def preprocess_all_text(data):
    pattern = "\[(\d+)\]"
    data['main_html_v1'] = data['main_html'].apply(lambda x: re.sub(pattern,"",str(x)))
    data['main_html_v2'] = data['main_html_v1'].apply(lambda x: re.sub(r'http(\S+.*\s)',"",x))
    data['main_html_v3'] = data['main_html_v2'].apply(lambda x: re.sub(r'2022|2023|©2022|©2023|\xa0|\n',"",x))
    data['main_html_v4'] = data['main_html_v3'].apply(lambda x: BeautifulSoup(x).text)
    data['main_html_v5'] = data['main_html_v4'].apply(lambda x: text_processing.remove_names_and_dates(x))
    
    return data

In [9]:
data_preprocessed = preprocess_all_text(data)

In [10]:
data_preprocessed = data_preprocessed.drop(['main_html_v1','main_html_v2','main_html_v3','main_html_v4'],axis=1)

In [11]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dimai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
data_preprocessed['report_text_lemm'] = data_preprocessed['main_html_v5'].apply(lambda x: text_processing.preprocess(x,"lemm"))
data_preprocessed['report_text_stemm'] = data_preprocessed['main_html_v5'].apply(lambda x: text_processing.preprocess(x,"stemm"))

In [13]:
data_preprocessed.head()

Unnamed: 0,date,short_url,title,text_title,full_url,main_html,main_html_v5,report_text_lemm,report_text_stemm
0,2023-04-30,russian_offensive_campaign_assessment_April_30...,"Russian Offensive Campaign Assessment, April 3...","Russian Offensive Campaign Assessment, April 3...",/backgrounder/russian-offensive-campaign-asses...,"[[[ , <p align=""center""><strong><br/></strong>...","Russian Offensive Campaign Assessment, April ...",russian offens campaign ass april thirty rile...,russian offen campaign assess april thirti ri...


In [14]:
docs = data_preprocessed['report_text_lemm'].tolist()

### ISW vectorize

In [15]:
cv = CountVectorizer()
word_count_vector = cv.fit_transform(docs)

word_count_vector.shape

tfidf_transformer = TfidfTransformer(smooth_idf = True, use_idf = True)
tfidf_transformer.fit(word_count_vector)

tf_idf_vector = tfidf_transformer.transform(word_count_vector)

In [16]:
tf_idf_vector

<1x664 sparse matrix of type '<class 'numpy.float64'>'
	with 664 stored elements in Compressed Sparse Row format>

In [17]:
feature_names = cv.get_feature_names_out()
tf_idf_vector

<1x664 sparse matrix of type '<class 'numpy.float64'>'
	with 664 stored elements in Compressed Sparse Row format>

In [18]:
data_preprocessed['keywords'] = data_preprocessed['report_text_stemm'].apply(lambda x: text_processing.convert_doc_to_vector(x,feature_names,tf_idf_vector))

In [19]:
data_preprocessed['keywords'][0]

{'command': 0.497,
 'russian': 0.432,
 'putin': 0.272,
 'gerasimov': 0.243,
 'forc': 0.201,
 'militari': 0.184,
 'like': 0.166,
 'wagner': 0.136,
 'teplinski': 0.124,
 'gener': 0.118}

In [20]:
data_preprocessed.head()

Unnamed: 0,date,short_url,title,text_title,full_url,main_html,main_html_v5,report_text_lemm,report_text_stemm,keywords
0,2023-04-30,russian_offensive_campaign_assessment_April_30...,"Russian Offensive Campaign Assessment, April 3...","Russian Offensive Campaign Assessment, April 3...",/backgrounder/russian-offensive-campaign-asses...,"[[[ , <p align=""center""><strong><br/></strong>...","Russian Offensive Campaign Assessment, April ...",russian offens campaign ass april thirty rile...,russian offen campaign assess april thirti ri...,"{'command': 0.497, 'russian': 0.432, 'putin': ..."


#### Part of script: Final preprocessing

In [21]:
data_preprocessed["date_datetime"] = pd.to_datetime(data_preprocessed["date"])
data_preprocessed['date_tomorrow_datetime'] = data_preprocessed['date_datetime'].apply(lambda x: x+datetime.timedelta(days=1))
data_preprocessed = data_preprocessed.rename(columns = {"date_datetime":"report_date"})

In [22]:
data_preprocessed.head()

Unnamed: 0,date,short_url,title,text_title,full_url,main_html,main_html_v5,report_text_lemm,report_text_stemm,keywords,report_date,date_tomorrow_datetime
0,2023-04-30,russian_offensive_campaign_assessment_April_30...,"Russian Offensive Campaign Assessment, April 3...","Russian Offensive Campaign Assessment, April 3...",/backgrounder/russian-offensive-campaign-asses...,"[[[ , <p align=""center""><strong><br/></strong>...","Russian Offensive Campaign Assessment, April ...",russian offens campaign ass april thirty rile...,russian offen campaign assess april thirti ri...,"{'command': 0.497, 'russian': 0.432, 'putin': ...",2023-04-30,2023-05-01


In [23]:
data_vectorised = tf_idf_vector.toarray()
vectors_df = pd.DataFrame(data_vectorised)
vectors_df['date'] = pd.to_datetime(today)


In [24]:
vectors_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,655,656,657,658,659,660,661,662,663,date
0,0.023685,0.023685,0.011842,0.005921,0.017764,0.005921,0.005921,0.005921,0.005921,0.005921,...,0.005921,0.005921,0.005921,0.011842,0.011842,0.011842,0.005921,0.011842,0.005921,2023-05-01


In [25]:
df_isw_short = data_preprocessed[['date','report_text_lemm','keywords','date_tomorrow_datetime']]

In [26]:
df_isw_short.head()

Unnamed: 0,date,report_text_lemm,keywords,date_tomorrow_datetime
0,2023-04-30,russian offens campaign ass april thirty rile...,"{'command': 0.497, 'russian': 0.432, 'putin': ...",2023-05-01


### Get weather

In [27]:
import urllib.request
import sys
import datetime
import json
import pytz
import os
import scipy
import pandas as pd
from dotenv import load_dotenv

load_dotenv()
API_KEY = os.getenv('WEATHER_API_TOKEN')

DIR_REGIONS = "data/0_raw_other_data/regions.csv"
SAVED_FORCASTS = "data/1_weather_for_12_hours"

df_regions = pd.read_csv(DIR_REGIONS)

def save_file(data,city, date):
    data_object = json.dumps(data)

    # open file for writing, "w"
    f = open(f"{SAVED_FORCASTS}/{city}_{date}.json","w")

    # write json object to file
    f.write(data_object)

    # close file
    f.close()


def read_file(path):
    f = open(path)

    # returns JSON object as
    # a dictionary
    data = json.load(f)


    # Closing file
    f.close()
    return data

def get_weather(city, date):

    path = f"{SAVED_FORCASTS}/{city}_{date}.json"
    if (os.path.exists(path)):
        jsonData = read_file(path)
        return jsonData
    location = f"{city},Ukraine"
    url = f'https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/{location}/{date}?key={API_KEY}&include=hours&unitGroup=metric&contentType=json'
    try:
      ResultBytes = urllib.request.urlopen(url)

      # Parse the results as JSON
      jsonData = json.load(ResultBytes)


    except urllib.error.HTTPError  as e:
      ErrorInfo= e.read().decode()
      print('Error code: ', e.code, ErrorInfo)
      sys.exit()
    except  urllib.error.URLError as e:
      ErrorInfo= e.read().decode()
      print('Error code: ', e.code,ErrorInfo)
      sys.exit()
    save_file(jsonData,city, date)
    return jsonData



def get_next_date(date):
    return (date+datetime.timedelta(days=1)).strftime("%Y-%m-%d")


def get_df_weather(jsonData):
    df_data_day = pd.DataFrame(jsonData['days'])
    df_data_day = df_data_day[df_data_day.columns[0:33]].add_prefix('day_')
    hours_forecast=jsonData['days'][0]['hours']
    df_weather_hours = pd.DataFrame(hours_forecast).add_prefix('hour_')
    df_weather_hours['hour_int']=pd.to_datetime(df_weather_hours['hour_datetime']).dt.hour
    df_weather_hours['key'] = 1
    df_data_day['key'] = 1
    df_weather_final = pd.merge(df_data_day,df_weather_hours, on='key')
    return df_weather_final


def get_weather_for_12_hours(city,date):
    jsonData = get_weather(city, date.strftime("%Y-%m-%d"))
    current_hour = int(date.strftime("%H"))
    weather_all_data_day1 = get_df_weather(jsonData)
    hours_needed = (weather_all_data_day1['hour_int']>=current_hour)&(weather_all_data_day1['hour_int']<=(current_hour+12))
    weather_all_data_day1=weather_all_data_day1[hours_needed]
    df_weather_final = weather_all_data_day1
    hours_left=12-weather_all_data_day1.shape[0]
    if(hours_left>0):
        jsonData = get_weather(city, get_next_date(date))
        weather_all_data_day2 = get_df_weather(jsonData)
        hours_needed_2 = ((weather_all_data_day2['hour_int']<=hours_left))
        weather_all_data_day2=weather_all_data_day2[hours_needed_2]
        df_weather_final = pd.concat([weather_all_data_day1, weather_all_data_day2], axis=0)
    df_weather_final['city']=city
    df_final = pd.merge(df_weather_final,df_regions,left_on="city",right_on="center_city_en")


    return df_final




### Predict

In [28]:
tfidf = pickle.load(open("models/tfidf_transformer_v1.pkl","rb"))
cv = pickle.load(open("models/count_vectorizer_v1.pkl","rb"))
model = pickle.load(open("models/training_models/4_rf_3.1f.pkl","rb"))

tfidf_vector = scipy.sparse.load_npz('data/matrix/tfidf_vector_train.npz')

  tfidf = pickle.load(open("models/tfidf_transformer_v1.pkl","rb"))
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [30]:
cities = ['Крим','Вінниччина','Волинь','Дніпропетровщина','Донеччина','Житомирщина','Закарпаття', 'Запоріжжя',
          'Івано-Франківщина','Київщина','Кіровоградщина','Луганщина','Львівщина','Миколаївщина','Одещина','Полтавщина',
          'Рівненщина','Сумщина','Тернопільщина','Харківщина','Херсонщина','Хмельниччина','Черкащина',
          'Буковина','Чернігівщина']

date = today = datetime.date.today()
result = {}
for city in cities:

    df_weather_final = get_weather_for_12_hours(city,date)

    # merge
    df_weather_final['key']=1
    df_isw_short['key']=1
    df_all = df_weather_final.merge(df_isw_short, how = 'left', left_on = 'key', right_on = 'key')

    # drop
    to_drop=['key','date','date_tomorrow_datetime','keywords','report_text_lemm']
    if 'sunrise' in df_all.columns:
        exceptions = ['sunset','sunrise']
        to_drop.extend(exceptions)
    df_weather_matrix_v1 = df_all.drop(to_drop, axis = 1)

    # final dataset
    df_weather_matrix_v1= df_weather_matrix_v1[['day_tempmax', 'day_tempmin', 'day_temp', 'day_dew', 'day_humidity',
           'day_precip', 'day_precipcover', 'day_solarradiation',
           'day_solarenergy', 'day_uvindex', 'hour_temp', 'hour_humidity',
           'hour_dew', 'hour_precip', 'hour_precipprob', 'hour_snow',
           'hour_snowdepth', 'hour_windgust', 'hour_windspeed', 'hour_winddir',
           'hour_pressure', 'hour_visibility', 'hour_cloudcover',
           'hour_solarradiation', 'hour_uvindex', 'hour_severerisk','region_id','hour_datetimeEpoch']]

    cv_vector_model = cv.transform(df_all['report_text_lemm'].values.astype('U'))
    #TF_IDF_MODEL = tfidf.transform(cv_vector_model)

    df_weather_matrix_v1_csr = scipy.sparse.csr_matrix(df_weather_matrix_v1)
    df_all_data_csr = scipy.sparse.hstack((df_weather_matrix_v1_csr, tfidf_vector), format='csr')

    #predict
    predicted = model.predict(df_all_data_csr)
    current_time = pd.Timestamp.now()

    hours = []
    for i in range(12):
        hour = current_time + datetime.timedelta(hours=i)
        hours.append(hour) 

    result[city] = dict(zip(hours, predicted))
    

UnicodeEncodeError: 'ascii' codec can't encode characters in position 54-57: ordinal not in range(128)

In [None]:
result

In [None]:
result = pd.DataFrame(result)
result.to_csv('results.txt', sep='\t', index=False)