## Import libraries

In [31]:
# import libraries

import datetime
import pickle
import nltk
import re
import scipy
import pytz
import pandas as pd
from scipy import sparse
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from utils import get_weather
from utils import text_processing

## Get and preprocess ISW files

In [32]:
# get article from yesterday

today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)

yesterday_day = yesterday.day
yesterday_month = yesterday.month
yesterday_year = yesterday.year

In [33]:
file = text_processing.get_article_from_yesterday(yesterday_day,yesterday_month,yesterday_year)

In [34]:
data = text_processing.read_html(file)

In [35]:
def preprocess_all_text(data):
    pattern = "\[(\d+)\]"
    data['main_html_v1'] = data['main_html'].apply(lambda x: re.sub(pattern,"",str(x)))
    data['main_html_v2'] = data['main_html_v1'].apply(lambda x: re.sub(r'http(\S+.*\s)',"",x))
    data['main_html_v3'] = data['main_html_v2'].apply(lambda x: re.sub(r'2022|2023|©2022|©2023|\xa0|\n',"",x))
    data['main_html_v4'] = data['main_html_v3'].apply(lambda x: BeautifulSoup(x).text)
    data['main_html_v5'] = data['main_html_v4'].apply(lambda x: text_processing.remove_names_and_dates(x))
    
    return data

In [36]:
data_preprocessed = preprocess_all_text(data)

In [37]:
data_preprocessed = data_preprocessed.drop(['main_html_v1','main_html_v2','main_html_v3','main_html_v4'],axis=1)

In [38]:
# Uncommit for first time download

# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dimai\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [39]:
# lemmatization and stemming algorithms to get the unique words

data_preprocessed['report_text_lemm'] = data_preprocessed['main_html_v5'].apply(lambda x: text_processing.preprocess(x,"lemm"))
data_preprocessed['report_text_stemm'] = data_preprocessed['main_html_v5'].apply(lambda x: text_processing.preprocess(x,"stemm"))

In [40]:
data_preprocessed.head()

Unnamed: 0,date,short_url,title,text_title,full_url,main_html,main_html_v5,report_text_lemm,report_text_stemm
0,2023-04-30,russian_offensive_campaign_assessment_April_30...,"Russian Offensive Campaign Assessment, April 3...","Russian Offensive Campaign Assessment, April 3...",/backgrounder/russian-offensive-campaign-asses...,"[[[ , <p align=""center""><strong><br/></strong>...","Russian Offensive Campaign Assessment, April ...",russian offens campaign ass april thirty rile...,russian offen campaign assess april thirti ri...


In [41]:
docs = data_preprocessed['report_text_lemm'].tolist()

## ISW vectorize

In [42]:
# apply vectorizing and tf-idf algorithm

cv = CountVectorizer()
word_count_vector = cv.fit_transform(docs)

word_count_vector.shape

tfidf_transformer = TfidfTransformer(smooth_idf = True, use_idf = True)
tfidf_transformer.fit(word_count_vector)

tf_idf_vector = tfidf_transformer.transform(word_count_vector)

In [43]:
tf_idf_vector

<1x664 sparse matrix of type '<class 'numpy.float64'>'
	with 664 stored elements in Compressed Sparse Row format>

In [44]:
# extract the most valuable words in article

feature_names = cv.get_feature_names_out()
tf_idf_vector
data_preprocessed['keywords'] = data_preprocessed['report_text_stemm'].apply(lambda x: text_processing.convert_doc_to_vector(x,feature_names,tf_idf_vector))

In [45]:
data_preprocessed['keywords'][0]

{'command': 0.497,
 'russian': 0.432,
 'putin': 0.272,
 'gerasimov': 0.243,
 'forc': 0.201,
 'militari': 0.184,
 'like': 0.166,
 'wagner': 0.136,
 'teplinski': 0.124,
 'gener': 0.118}

In [46]:
data_preprocessed.head()

Unnamed: 0,date,short_url,title,text_title,full_url,main_html,main_html_v5,report_text_lemm,report_text_stemm,keywords
0,2023-04-30,russian_offensive_campaign_assessment_April_30...,"Russian Offensive Campaign Assessment, April 3...","Russian Offensive Campaign Assessment, April 3...",/backgrounder/russian-offensive-campaign-asses...,"[[[ , <p align=""center""><strong><br/></strong>...","Russian Offensive Campaign Assessment, April ...",russian offens campaign ass april thirty rile...,russian offen campaign assess april thirti ri...,"{'command': 0.497, 'russian': 0.432, 'putin': ..."


## Part of script: Final preprocessing and merging

In [47]:
data_preprocessed["date_datetime"] = pd.to_datetime(data_preprocessed["date"])
data_preprocessed['date_tomorrow_datetime'] = data_preprocessed['date_datetime'].apply(lambda x: x+datetime.timedelta(days=1))
data_preprocessed = data_preprocessed.rename(columns = {"date_datetime":"report_date"})

In [48]:
data_preprocessed.head()

Unnamed: 0,date,short_url,title,text_title,full_url,main_html,main_html_v5,report_text_lemm,report_text_stemm,keywords,report_date,date_tomorrow_datetime
0,2023-04-30,russian_offensive_campaign_assessment_April_30...,"Russian Offensive Campaign Assessment, April 3...","Russian Offensive Campaign Assessment, April 3...",/backgrounder/russian-offensive-campaign-asses...,"[[[ , <p align=""center""><strong><br/></strong>...","Russian Offensive Campaign Assessment, April ...",russian offens campaign ass april thirty rile...,russian offen campaign assess april thirti ri...,"{'command': 0.497, 'russian': 0.432, 'putin': ...",2023-04-30,2023-05-01


In [49]:
data_vectorised = tf_idf_vector.toarray()
vectors_df = pd.DataFrame(data_vectorised)
vectors_df['date'] = pd.to_datetime(today)


In [50]:
vectors_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,655,656,657,658,659,660,661,662,663,date
0,0.023685,0.023685,0.011842,0.005921,0.017764,0.005921,0.005921,0.005921,0.005921,0.005921,...,0.005921,0.005921,0.005921,0.011842,0.011842,0.011842,0.005921,0.011842,0.005921,2023-05-01


In [51]:
df_isw_short = data_preprocessed[['date','report_text_lemm','keywords','date_tomorrow_datetime']]

In [52]:
df_isw_short.head()

Unnamed: 0,date,report_text_lemm,keywords,date_tomorrow_datetime
0,2023-04-30,russian offens campaign ass april thirty rile...,"{'command': 0.497, 'russian': 0.432, 'putin': ...",2023-05-01


## Get weather and make predictions for all regions

In [60]:
# load necessary models

tfidf = pickle.load(open("models/tfidf_transformer_v1.pkl","rb"))
cv = pickle.load(open("models/count_vectorizer_v1.pkl","rb"))
# model = pickle.load(open("models/training_models/4_rf_3.1f.pkl","rb"))

# model for server which weight less
model = pickle.load(open("models/training_models/4_logreg_1.5f.pkl","rb"))

  tfidf = pickle.load(open("models/tfidf_transformer_v1.pkl","rb"))
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [62]:
cities = ['Vinnytsia','Simferopol','Lutsk','Dnipro','Donetsk','Zhytomyr','Uzhgorod','Zaporozhye','Ivano-Frankivsk','Kyiv',
          'Kropyvnytskyi', 'Luhansk','Lviv','Mykolaiv','Odesa','Poltava','Rivne','Sumy','Ternopil','Kharkiv','Kherson',
          'Khmelnytskyi','Cherkasy','Chernivtsi','Chernihiv']

date = datetime.datetime.now(pytz.timezone('Europe/Kyiv'))

result = {}

# for loop generate weather, merged it with isw vector and make a prediction for all regions in list above

for city in cities:

    df_weather_complete = get_weather.get_weather_for_12_hours(city,date)

    # merge
    df_weather_complete['key']=1
    df_isw_short['key']=1
    df_all = df_weather_complete.merge(df_isw_short, how = 'left', left_on = 'key', right_on = 'key')

    # drop
    to_drop=['key','date','date_tomorrow_datetime','keywords','report_text_lemm']
    if 'sunrise' in df_all.columns:
        exceptions = ['sunset','sunrise']
        to_drop.extend(exceptions)
    df_weather_matrix = df_all.drop(to_drop, axis = 1)

    # final dataset
    df_weather_matrix['Unnamed: 0'] = 0
    df_weather_matrix= df_weather_matrix[['Unnamed: 0', 'day_datetimeEpoch', 'day_tempmax', 'day_tempmin', 'day_temp', 'day_dew', 'day_humidity', 'day_precip', 'day_precipcover', 'day_solarradiation', 'day_solarenergy', 'day_uvindex', 'day_moonphase', 'hour_datetimeEpoch', 'hour_temp', 'hour_humidity', 'hour_dew', 'hour_precipprob', 'hour_snow', 'hour_snowdepth', 'hour_windgust', 'hour_windspeed', 'hour_winddir', 'hour_pressure', 'hour_visibility', 'hour_cloudcover', 'hour_severerisk', 'region_id']]

    cv_vector_model = cv.transform(df_all['report_text_lemm'].values.astype('U'))
    tf_idf_model = tfidf.transform(cv_vector_model)

    # merge final dataset with tfidf vector
    df_weather_matrix_csr = scipy.sparse.csr_matrix(df_weather_matrix)
    df_all_data_csr = scipy.sparse.hstack((df_weather_matrix_csr, tf_idf_model), format='csr')

    #predict
    predicted = model.predict(df_all_data_csr)
    current_time = pd.Timestamp.now()
    hours = []
    
    # store results in dictionary, after extract 
    for i in range(12):
        hour = date + datetime.timedelta(hours=i)
        hour_rounded = hour.replace(minute=0, second=0, microsecond=0)
        hours.append(hour_rounded.strftime('%Y-%m-%d %H:%M'))

    result[city] = dict(zip(hours, predicted))

#Show result
result

  df_weather_hours['hour_int']=pd.to_datetime(df_weather_hours['hour_datetime']).dt.hour
  df_weather_hours['hour_int']=pd.to_datetime(df_weather_hours['hour_datetime']).dt.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_isw_short['key']=1
  df_weather_hours['hour_int']=pd.to_datetime(df_weather_hours['hour_datetime']).dt.hour
  df_weather_hours['hour_int']=pd.to_datetime(df_weather_hours['hour_datetime']).dt.hour
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_isw_short['key']=1
  df_weather_hours['hour_int']=pd.to_datetime(df_weather_hours['hour_d

{'Vinnytsia': {'2023-05-02 17:00': 0,
  '2023-05-02 18:00': 0,
  '2023-05-02 19:00': 0,
  '2023-05-02 20:00': 0,
  '2023-05-02 21:00': 0,
  '2023-05-02 22:00': 0,
  '2023-05-02 23:00': 0,
  '2023-05-03 00:00': 0,
  '2023-05-03 01:00': 0,
  '2023-05-03 02:00': 0,
  '2023-05-03 03:00': 0,
  '2023-05-03 04:00': 0},
 'Simferopol': {'2023-05-02 17:00': 0,
  '2023-05-02 18:00': 0,
  '2023-05-02 19:00': 0,
  '2023-05-02 20:00': 0,
  '2023-05-02 21:00': 0,
  '2023-05-02 22:00': 0,
  '2023-05-02 23:00': 0,
  '2023-05-03 00:00': 0,
  '2023-05-03 01:00': 0,
  '2023-05-03 02:00': 0,
  '2023-05-03 03:00': 0,
  '2023-05-03 04:00': 0},
 'Lutsk': {'2023-05-02 17:00': 0,
  '2023-05-02 18:00': 0,
  '2023-05-02 19:00': 0,
  '2023-05-02 20:00': 0,
  '2023-05-02 21:00': 0,
  '2023-05-02 22:00': 0,
  '2023-05-02 23:00': 0,
  '2023-05-03 00:00': 0,
  '2023-05-03 01:00': 0,
  '2023-05-03 02:00': 0,
  '2023-05-03 03:00': 0,
  '2023-05-03 04:00': 0},
 'Dnipro': {'2023-05-02 17:00': 0,
  '2023-05-02 18:00': 0,
 

##     Save results

In [63]:
result = pd.DataFrame(result)
VERSION = "2"
result.to_csv(f'data/results/results_{VERSION}.txt', sep='\t', index=False)