In [2]:
# import libraries

import datetime
import pickle
import nltk
import re
import scipy
import pandas as pd
from scipy import sparse
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from utils import get_weather
from utils import text_processing

### Get and preprocess ISW files

In [3]:
# Work out yesterday's calendar date; its day / month / year parts are what
# the article-fetching helper in the next cell is called with.
today = datetime.date.today()
yesterday = today - datetime.timedelta(days=1)

yesterday_day, yesterday_month, yesterday_year = (
    yesterday.day,
    yesterday.month,
    yesterday.year,
)

In [4]:
# Fetch yesterday's ISW article through the project helper.
# NOTE(review): `file` is whatever the helper returns (presumably a path to the
# saved HTML) — confirm against utils/text_processing.
file = text_processing.get_article_from_yesterday(yesterday_day,yesterday_month,yesterday_year)

In [5]:
# Parse the fetched article; later cells index `data` by column name
# (e.g. data['main_html']), so it is used as a pandas DataFrame.
data = text_processing.read_html(file)

In [6]:
def preprocess_all_text(data, parser="html.parser"):
    """Clean the raw report HTML in ``data['main_html']`` step by step.

    Each stage is stored in its own column so intermediate results can be
    inspected:

    * ``main_html_v1`` - numeric citation markers such as ``[12]`` removed
    * ``main_html_v2`` - text matching the URL pattern removed
    * ``main_html_v3`` - the literal years 2022/2023, copyright stamps,
      non-breaking spaces and newlines removed
    * ``main_html_v4`` - HTML tags stripped with BeautifulSoup
    * ``main_html_v5`` - result of ``text_processing.remove_names_and_dates``

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``main_html`` column. It is modified in place.
    parser : str, optional
        Parser name handed to BeautifulSoup. Pinning it keeps the output
        independent of which optional parsers happen to be installed and
        avoids ``GuessedAtParserWarning``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the five columns above added.
    """
    # Raw strings: "\[" and "\d" are invalid escapes in a normal str literal.
    citation_marker = re.compile(r"\[(\d+)\]")
    # NOTE(review): ".*\s" is greedy, so this removes everything from "http"
    # up to the last whitespace on the same line, not just the URL. Left as is
    # because changing it would alter the text the downstream models see.
    url_tail = re.compile(r'http(\S+.*\s)')
    # NOTE(review): only the years 2022 and 2023 are covered here.
    boilerplate = re.compile(r'2022|2023|©2022|©2023|\xa0|\n')

    # str(x): main_html cells are not guaranteed to be plain strings.
    data['main_html_v1'] = data['main_html'].apply(lambda x: citation_marker.sub("", str(x)))
    data['main_html_v2'] = data['main_html_v1'].apply(lambda x: url_tail.sub("", x))
    data['main_html_v3'] = data['main_html_v2'].apply(lambda x: boilerplate.sub("", x))
    data['main_html_v4'] = data['main_html_v3'].apply(lambda x: BeautifulSoup(x, parser).text)
    data['main_html_v5'] = data['main_html_v4'].apply(text_processing.remove_names_and_dates)

    return data

In [7]:
# Run the cleaning pipeline. preprocess_all_text adds its columns to the frame
# it receives and returns that same frame, so `data_preprocessed` and `data`
# refer to one object at this point.
data_preprocessed = preprocess_all_text(data)

In [8]:
# Keep only the final cleaned text (main_html_v5); the intermediate stages are
# no longer needed.
data_preprocessed = data_preprocessed.drop(
    columns=['main_html_v1', 'main_html_v2', 'main_html_v3', 'main_html_v4']
)

In [9]:
# One-time NLTK resource downloads, kept commented out.
# NOTE(review): presumably required by text_processing.preprocess on a fresh
# environment — uncomment and run once if the next cell fails with a LookupError.
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

In [10]:
# Produce the two normalised variants of the cleaned report text by calling the
# project preprocessor once per mode ("lemm" and "stemm").
data_preprocessed['report_text_lemm'] = data_preprocessed['main_html_v5'].apply(
    text_processing.preprocess, args=("lemm",)
)
data_preprocessed['report_text_stemm'] = data_preprocessed['main_html_v5'].apply(
    text_processing.preprocess, args=("stemm",)
)

In [11]:
# Preview the frame after the lemmatised / stemmed text columns were added.
data_preprocessed.head()

Unnamed: 0,date,short_url,title,text_title,full_url,main_html,main_html_v5,report_text_lemm,report_text_stemm
0,2023-04-28,russian_offensive_campaign_assessment_April_28...,"Russian Offensive Campaign Assessment, April 2...","Russian Offensive Campaign Assessment, April 2...",/backgrounder/russian-offensive-campaign-asses...,"[[[ , <p align=""center""><strong><br/></strong>...","Russian Offensive Campaign Assessment, April ...",russian offens campaign ass april twenty eigh...,russian offen campaign assess april twenti ei...


In [12]:
# Plain Python list of the lemmatised report texts; this is the corpus the
# vectoriser below is fitted on.
docs = data_preprocessed['report_text_lemm'].tolist()

### Vectorize the ISW report

In [13]:
# Bag-of-words counts for `docs`, then TF-IDF weighting fitted on those same
# counts.
# NOTE(review): the output below shows a single document (1 x 697); with one
# document every term gets the same IDF, so the weights reduce to normalised
# term frequencies — confirm that is what the keyword step expects.
cv = CountVectorizer()
word_count_vector = cv.fit_transform(docs)

tfidf_transformer = TfidfTransformer(use_idf=True, smooth_idf=True)
tf_idf_vector = tfidf_transformer.fit_transform(word_count_vector)

In [14]:
# Display the sparse matrix repr (shape, dtype, number of stored values).
tf_idf_vector

<1x697 sparse matrix of type '<class 'numpy.float64'>'
	with 697 stored elements in Compressed Sparse Row format>

In [15]:
# Vocabulary terms learned by `cv`, used below to label the TF-IDF columns.
feature_names = cv.get_feature_names_out()
# Display the TF-IDF matrix repr again (it has not changed since the previous cell).
tf_idf_vector

<1x697 sparse matrix of type '<class 'numpy.float64'>'
	with 697 stored elements in Compressed Sparse Row format>

In [16]:
# Attach a keyword -> weight mapping per report (see the dict shown in the next
# cell's output), built by the project helper from the vocabulary and TF-IDF matrix.
# NOTE(review): the helper is given the *stemmed* text, while `feature_names`
# and `tf_idf_vector` come from a vectoriser fitted on the *lemmatised* docs —
# confirm the mismatch is intentional.
data_preprocessed['keywords'] = data_preprocessed['report_text_stemm'].apply(lambda x: text_processing.convert_doc_to_vector(x,feature_names,tf_idf_vector))

In [17]:
# Inspect the keyword mapping of the row labelled 0.
data_preprocessed['keywords'][0]

{'russian': 0.66,
 'forc': 0.242,
 'twenty': 0.23,
 'ukrainian': 0.2,
 'april': 0.177,
 'russia': 0.165,
 'eight': 0.136,
 'drone': 0.118,
 'continu': 0.118,
 'oblast': 0.112}

In [18]:
# Preview the frame with the new `keywords` column.
data_preprocessed.head()

Unnamed: 0,date,short_url,title,text_title,full_url,main_html,main_html_v5,report_text_lemm,report_text_stemm,keywords
0,2023-04-28,russian_offensive_campaign_assessment_April_28...,"Russian Offensive Campaign Assessment, April 2...","Russian Offensive Campaign Assessment, April 2...",/backgrounder/russian-offensive-campaign-asses...,"[[[ , <p align=""center""><strong><br/></strong>...","Russian Offensive Campaign Assessment, April ...",russian offens campaign ass april twenty eigh...,russian offen campaign assess april twenti ei...,"{'russian': 0.66, 'forc': 0.242, 'twenty': 0.2..."


#### Part of script: Final preprocessing

In [19]:
# Parse the report date and derive the following day.
# `report_date` is assigned directly (the original created `date_datetime` and
# renamed it afterwards, which produced a duplicate `report_date` column when
# the cell was run a second time). Column order and values are unchanged.
data_preprocessed["report_date"] = pd.to_datetime(data_preprocessed["date"])
# Vectorised datetime arithmetic; no row-by-row apply needed.
data_preprocessed['date_tomorrow_datetime'] = (
    data_preprocessed["report_date"] + datetime.timedelta(days=1)
)

In [20]:
# Preview the frame with the `report_date` and `date_tomorrow_datetime` columns.
data_preprocessed.head()

Unnamed: 0,date,short_url,title,text_title,full_url,main_html,main_html_v5,report_text_lemm,report_text_stemm,keywords,report_date,date_tomorrow_datetime
0,2023-04-28,russian_offensive_campaign_assessment_April_28...,"Russian Offensive Campaign Assessment, April 2...","Russian Offensive Campaign Assessment, April 2...",/backgrounder/russian-offensive-campaign-asses...,"[[[ , <p align=""center""><strong><br/></strong>...","Russian Offensive Campaign Assessment, April ...",russian offens campaign ass april twenty eigh...,russian offen campaign assess april twenti ei...,"{'russian': 0.66, 'forc': 0.242, 'twenty': 0.2...",2023-04-28,2023-04-29


In [21]:
# Dense copy of the TF-IDF matrix as a frame (one column per vocabulary term),
# stamped with today's date.
data_vectorised = tf_idf_vector.toarray()
vectors_df = pd.DataFrame(data_vectorised).assign(date=pd.to_datetime(today))


In [22]:
# Preview the dense TF-IDF frame.
vectors_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,688,689,690,691,692,693,694,695,696,date
0,0.023585,0.011793,0.005896,0.005896,0.017689,0.005896,0.005896,0.005896,0.005896,0.017689,...,0.005896,0.011793,0.011793,0.005896,0.017689,0.017689,0.005896,0.023585,0.029482,2023-04-29


In [23]:
# Columns of the ISW report that the prediction step needs.
# .copy() makes this an independent frame: a later cell adds a column to it,
# which on a bare column subset triggers pandas' SettingWithCopyWarning.
df_isw_short = data_preprocessed[
    ['date', 'report_text_lemm', 'keywords', 'date_tomorrow_datetime']
].copy()

In [24]:
# Preview the trimmed ISW frame.
df_isw_short.head()

Unnamed: 0,date,report_text_lemm,keywords,date_tomorrow_datetime
0,2023-04-28,russian offens campaign ass april twenty eigh...,"{'russian': 0.66, 'forc': 0.242, 'twenty': 0.2...",2023-04-29


### Predict

In [None]:
def _load_pickle(path):
    """Unpickle the object stored at ``path`` and close the file afterwards."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# SECURITY: unpickling executes arbitrary code — only load artefacts produced by
# this project. scikit-learn also warns when the pickles were written by a
# different library version (see the warning in this cell's output).
tfidf = _load_pickle("models/tfidf_transformer_v1.pkl")
cv = _load_pickle("models/count_vectorizer_v1.pkl")
model = _load_pickle("models/training_models/4_rf_3.1f.pkl")

# TF-IDF matrix saved alongside the models.
tfidf_vector = scipy.sparse.load_npz('data/matrix/tfidf_vector_train.npz')

  tfidf = pickle.load(open("models/tfidf_transformer_v1.pkl","rb"))
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [None]:
# Regions to forecast; each name is passed verbatim to the weather helper.
cities = ['Крим','Вінниччина','Волинь','Дніпропетровщина','Донеччина','Житомирщина','Закарпаття', 'Запоріжжя',
          'Івано-Франківщина','Київщина','Кіровоградщина','Луганщина','Львівщина','Миколаївщина','Одещина','Полтавщина',
          'Рівненщина','Сумщина','Тернопільщина','Харківщина','Херсонщина','Хмельниччина','Черкащина',
          'Буковина','Чернігівщина']

# Weather columns fed to the model, in this order; every other merged column
# (key, date, sunrise/sunset, ...) is simply not selected.
# NOTE(review): presumably this must match the column order used at training
# time — confirm before changing it.
weather_feature_columns = [
    'day_tempmax', 'day_tempmin', 'day_temp', 'day_dew', 'day_humidity',
    'day_precip', 'day_precipcover', 'day_solarradiation',
    'day_solarenergy', 'day_uvindex', 'hour_temp', 'hour_humidity',
    'hour_dew', 'hour_precip', 'hour_precipprob', 'hour_snow',
    'hour_snowdepth', 'hour_windgust', 'hour_windspeed', 'hour_winddir',
    'hour_pressure', 'hour_visibility', 'hour_cloudcover',
    'hour_solarradiation', 'hour_uvindex', 'hour_severerisk', 'region_id', 'hour_datetimeEpoch',
]

# Number of hourly predictions labelled per region.
HOURS_AHEAD = 12

today = datetime.date.today()
date = today

# Build the hourly labels ONCE. Taking the timestamp inside the loop gave every
# region slightly different keys, so the combined table had a separate set of
# rows per region, filled with NaN for all the others.
current_time = pd.Timestamp.now()
hours = [current_time + datetime.timedelta(hours=i) for i in range(HOURS_AHEAD)]

# Constant join key on a copy, so df_isw_short itself is not modified and the
# work is not repeated for every region.
isw_with_key = df_isw_short.assign(key=1)

result = {}
for city in cities:
    df_weather_final = get_weather.get_weather_for_12_hours(city, date)

    # Pair every weather row with the ISW report row(s) via the constant key.
    df_all = df_weather_final.assign(key=1).merge(isw_with_key, how='left', on='key')

    df_weather_matrix_v1 = df_all[weather_feature_columns]

    # Text features of the CURRENT report, one row per weather row, produced
    # with the fitted vectoriser / transformer. The original stacked the
    # training matrix `tfidf_vector` here, whose rows do not correspond to
    # these weather rows.
    cv_vector_model = cv.transform(df_all['report_text_lemm'].values.astype('U'))
    report_tfidf = tfidf.transform(cv_vector_model)

    df_weather_matrix_v1_csr = scipy.sparse.csr_matrix(df_weather_matrix_v1)
    df_all_data_csr = scipy.sparse.hstack((df_weather_matrix_v1_csr, report_tfidf), format='csr')

    # predict
    predicted = model.predict(df_all_data_csr)

    # zip stops at the shorter sequence, so at most HOURS_AHEAD values are kept.
    result[city] = dict(zip(hours, predicted))
    

In [None]:
# Inspect the raw predictions: {region: {timestamp: predicted value}}.
result

In [None]:
# Tabulate the predictions: one column per region; the inner dict keys of
# `result` (the forecast timestamps) become the index.
result = pd.DataFrame(result)
# Keep the index in the export: it holds the timestamps, and without it the
# rows of the file cannot be tied to an hour.
result.to_csv('results.txt', sep='\t', index=True, index_label='hour')