In [None]:
import pandas as pd
import numpy as np
import string
from datetime import datetime as dt
from dateutil.relativedelta import relativedelta

import numpy as np
import matplotlib
# matplotlib.use("agg")
import matplotlib.pyplot as plt


import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from nltk.stem import SnowballStemmer
# Fetch NLTK resources; these calls run every time this cell is executed.
# NOTE(review): 'punkt' is presumably the model behind nltk.word_tokenize used
# further down; 'stopwords' and 'averaged_perceptron_tagger' are not referenced
# in the visible cells — confirm they are still needed.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def get_year_month(date_time_str):
    """Return the first seven characters of ``date_time_str``.

    For strings that start with a zero-padded ``YYYY-MM`` prefix this is the
    year-month key. No parsing or validation is performed; shorter strings
    are returned unchanged.
    """
    year_month_width = 7  # len('YYYY-MM')
    return date_time_str[:year_month_width]

def normalize(arr):
    """Centre ``arr``, scale it by its standard deviation and by ``sqrt(len(arr))``.

    The result has zero mean and unit Euclidean norm. ``np.std`` is used with
    its default settings (population standard deviation), so a constant input
    leads to a division by zero and non-finite values.
    """
    centered = arr - np.mean(arr)
    standardized = centered / np.std(arr)
    return standardized / np.sqrt(len(arr))

def group_monthly(kremlin_data):
    """Aggregate per-item texts into one record per calendar month.

    Parameters
    ----------
    kremlin_data : pandas.DataFrame
        One column per item, with (at least) the rows ``'datetime'``,
        ``'header'``, ``'summary'`` and ``'content'``. The ``'datetime'``
        values must be strings starting with ``YYYY-MM``.

    Returns
    -------
    pandas.DataFrame
        One column per month between the earliest and the latest month
        (inclusive, months without items included) and the rows ``'count'``,
        ``'header'``, ``'summary'``, ``'content'``. Each text row holds the
        texts of that month's items, every one prefixed with a single space,
        in the column order of ``kremlin_data``.

    Raises
    ------
    ValueError
        If the earliest or latest ``'datetime'`` prefix is not ``%Y-%m``.
    """
    text_fields = ('header', 'summary', 'content')

    # Month key of every item, computed once and reused for counting,
    # for the date range and for grouping.
    item_months = kremlin_data.loc['datetime'].apply(lambda x: x[:7])
    month_counts = item_months.value_counts().to_dict()

    # strptime validates the prefix; the month range itself is plain integer
    # arithmetic on a "months since year 0" serial number.
    dt_min = dt.strptime(min(item_months), '%Y-%m')
    dt_max = dt.strptime(max(item_months), '%Y-%m')
    first_serial = dt_min.year * 12 + dt_min.month - 1
    last_serial = dt_max.year * 12 + dt_max.month - 1
    dt_list = []
    for serial in range(first_serial, last_serial + 1):
        year, month_index = divmod(serial, 12)
        dt_list.append('{:04d}-{:02d}'.format(year, month_index + 1))

    # Collect the pieces in lists and join once per field: repeated `+=` on a
    # string stored in a dict re-copies the accumulated text on every item.
    collected = {month: {field: [] for field in text_fields} for month in dt_list}
    for item, month in item_months.items():
        record = kremlin_data[item]
        for field in text_fields:
            collected[month][field].append(record[field])

    monthly_kremlin = {}
    for month in dt_list:
        entry = {'count': month_counts.get(month, 0)}
        for field in text_fields:
            # Every piece keeps its leading space, matching ' ' + text accumulation.
            entry[field] = ''.join(' ' + text for text in collected[month][field])
        monthly_kremlin[month] = entry

    return pd.DataFrame.from_dict(monthly_kremlin)
    
def calculate_freq_per_month(phrase_dict):
    """Count how often every phrase occurs in every month.

    Parameters
    ----------
    phrase_dict : dict
        Maps a month key to the list of (hashable) phrases seen in that month.

    Returns
    -------
    pandas.DataFrame
        One column per key of ``phrase_dict`` and one row per distinct phrase
        found in any month; cells hold occurrence counts (0 when the phrase
        is absent from that month). Row order follows set iteration order
        and is therefore not sorted.
    """
    # The vocabulary is the same for every month, so build it once instead of
    # re-creating the set (twice) on each loop iteration.
    vocabulary = {phrase for phrases in phrase_dict.values() for phrase in phrases}

    monthly_words_decomposition = {}
    for month, phrases in phrase_dict.items():
        counts = dict.fromkeys(vocabulary, 0)
        for phrase in phrases:
            counts[phrase] += 1
        monthly_words_decomposition[month] = counts
    return pd.DataFrame.from_dict(monthly_words_decomposition)

def plot_word_correlation(arr_a, arr_b, arr_corr, word, save_name):
    """Save a three-panel figure: losses, word counts and their correlation.

    The x axis has ``len(arr_b)`` points spaced 1/12 apart (years, if the
    samples are monthly). ``arr_a`` is right-aligned on that axis, ``arr_b``
    and ``arr_corr`` are left-aligned; unfilled positions stay zero.

    Panel titles are Ukrainian runtime strings: "combat losses statistics",
    'root "<word>"' and "correlation function".

    The figure is written to ``save_name`` and then cleared and closed.
    """
    x = np.arange(0., len(arr_b)) / 12
    fig, axes = plt.subplots(
        nrows=3,
        ncols=1,
        constrained_layout=True,
        figsize=(9, 9),
        dpi=150,
    )

    losses = np.zeros_like(x)
    losses[-len(arr_a):] = arr_a          # right-aligned
    counts = np.zeros_like(x)
    counts[:len(arr_b)] = arr_b           # left-aligned
    correlation = np.zeros_like(x)
    correlation[:len(arr_corr)] = arr_corr

    # (series, extra positional plot args, title) for each panel, top to bottom.
    panels = (
        (losses, ('r',), 'Статистика бойових втрат'),
        (counts, (), 'Корінь "' + word + '"'),
        (correlation, (), 'кореляційна функція'),
    )
    for ax, (series, fmt, title) in zip(axes, panels):
        ax.plot(x, series, *fmt)
        ax.set_title(title)
        ax.set_xticks(np.arange(0, max(x), 1))
        ax.grid()

    fig.savefig(save_name)
    fig.clear()
    plt.close(fig)
    
def NormData(arr = np.array(0)):
    """Return the z-score of ``arr``: ``(arr - mean) / std``.

    ``np.std`` is called with its defaults (population standard deviation).
    A constant input has zero deviation and yields non-finite values.
    """
    return (arr - np.mean(arr)) / np.std(arr)

def PearsonCorr(Base = np.array(0), part = np.array(0)):
    """Slide ``part`` along ``Base`` and correlate it with every window.

    For each offset ``ii`` the window ``Base[ii:ii + len(part)]`` and ``part``
    are z-scored with ``NormData`` and their dot product is divided by
    ``len(part) - 1``.

    Parameters
    ----------
    Base : sequence
        The long series.
    part : sequence
        The template, no longer than ``Base``.

    Returns
    -------
    numpy.ndarray or str
        A float64 array with ``len(Base) - len(part)`` values, or the string
        ``'Invalid input data'`` when ``Base`` is empty.

    Notes
    -----
    Every window is accumulated with a float64 dot product. The previous
    ``np.float128`` accumulator is not available on every platform and was
    only in effect for the first window.

    NOTE(review): the last possible alignment (``ii == len(Base) - len(part)``)
    is not evaluated, and ``NormData`` uses the population standard deviation
    while the divisor is ``len(part) - 1``, so a perfect match scores
    ``Nn / (Nn - 1)`` rather than 1. Both are kept for backward compatibility.
    """
    if len(Base) == 0:
        return 'Invalid input data'

    ptmp = NormData(part)
    Nn = len(part)
    n_windows = len(Base) - Nn
    res = np.zeros(n_windows)
    for ii in range(n_windows):
        tmp0 = NormData(Base[ii:ii + Nn])
        res[ii] = np.dot(ptmp, tmp0) / (Nn - 1)
    return res

def fftCorr(Base = np.array(0), part = np.array(0)):
    """Cross-correlate ``part`` against ``Base`` using the FFT.

    Both inputs are z-scored with ``NormData`` (``Base`` as a whole, not per
    window). ``part`` is zero-padded to ``len(Base)``, the circular
    cross-correlation is obtained as ``ifft(fft(Base) * conj(fft(part)))``
    and divided by ``len(part)``.

    Parameters
    ----------
    Base : sequence
        The long series.
    part : sequence
        The template; must not be longer than ``Base``.

    Returns
    -------
    numpy.ndarray
        ``len(Base) - len(part) + 1`` values, one per lag at which the
        template fits inside ``Base`` without wrapping around.

    Raises
    ------
    ValueError
        If ``part`` is longer than ``Base``.
    """
    n_base = len(Base)
    n_part = len(part)
    if n_part > n_base:
        raise ValueError(
            'part (length {}) must not be longer than Base (length {})'.format(n_part, n_base)
        )

    part0 = np.zeros(n_base)
    part0[:n_part] = NormData(part)
    normBase = NormData(Base)

    spectrum = np.fft.fft(normBase) * np.conj(np.fft.fft(part0))
    res = np.real(np.fft.ifft(spectrum)) / n_part

    # Slice by an explicit lag count: `res[:-n_part + 1]` turns into `res[:0]`
    # (an empty array) when the template has a single sample.
    return res[:n_base - n_part + 1]

In [None]:
# Load the losses table and cut every cell down to its 'YYYY-MM' prefix.
defenders_losses = pd.read_json('results/killed_defenders.json').applymap(get_year_month)
# Entries per month in chronological order; computed once and reused for both
# the CSV export and the plot.
defenders_monthly_counts = defenders_losses['https://uk.wikipedia.org'].value_counts().sort_index()
defenders_monthly_counts.to_csv('defenders.csv')
defenders_monthly_counts.plot.line()

In [None]:
# Load the Kremlin dataset; its 'datetime' row holds one timestamp string per column.
kremlin_data = pd.read_json('results/kremlin_data.json')
# Optional CSV export of the monthly counts (disabled):
# kremlin_data.loc['datetime'].apply(lambda x: x[:7]).value_counts().to_csv('kremlin_freg.csv')
# Items per month, in chronological order, drawn as a line chart.
(
    kremlin_data.loc['datetime']
    .apply(get_year_month)
    .value_counts()
    .sort_index()
    .plot.line()
)

In [None]:
# Aggregate the items per calendar month (the result has one column per month).
monthly_kremlin = group_monthly(kremlin_data)
# Display the per-month 'count' row as a NumPy array.
monthly_kremlin.loc['count'].to_numpy()

In [None]:
# Per-month n-gram frequencies of the 'content' text. With n = 1 these are
# single stemmed words; change the `ngrams(..., 1)` argument for bigrams etc.
additional_punctuation = '«»'
all_punct = string.punctuation + additional_punctuation
monthly_header = monthly_kremlin.loc['content'].apply(
    lambda w: list(ngrams(nltk.word_tokenize(w.lower()), 1))
).to_dict()
monthly_header_stemmed = {}
ps = SnowballStemmer('russian')
for key in monthly_header:
    tmp = []
    for item in monthly_header[key]:
        # Skip the n-gram if any of its tokens is made up of punctuation only.
        # A plain `val in all_punct` is a substring test and lets
        # multi-character tokens such as '...', '--' or the tokenizer's
        # quote tokens through as "words".
        if not any(all(ch in all_punct for ch in val) for val in item):
            tmp.append(tuple([ps.stem(word) for word in item]))
    monthly_header_stemmed[key] = tmp
# Rows are stemmed n-grams, columns are months sorted chronologically.
header_monthly_words_decomposition = calculate_freq_per_month(
    monthly_header_stemmed
).sort_index(axis=1)
header_monthly_words_decomposition

In [None]:
# For every stemmed word: total count plus the extreme values of its
# correlation with the monthly losses series.
count_words = {}
defend = defenders_losses['https://uk.wikipedia.org'].value_counts().sort_index().to_numpy()
# The losses series is the same for every word, so normalise it once
# instead of once per loop iteration.
defend_normalized = normalize(defend)
for word in header_monthly_words_decomposition.T:
    words = header_monthly_words_decomposition.loc[word].sort_index().to_numpy()
#     if sum(words) < 30:
#         continue
    corr_tmp = fftCorr(
        Base=normalize(words),
        part=defend_normalized
    )
    count_words[' '.join(word)] = {'sum': words.sum(), 'max': corr_tmp.max(), 'min': corr_tmp.min()}
#     sv_name = 'results/corr-{:0>3}_count-{:0>6}_root-{}.png'.format(
#         np.round(int(corr_tmp.max()*100)),
#         sum(words),
#         '_'.join(word)
#     )
    
#     plot_word_correlation(defend, words, corr_tmp, '_'.join(word), sv_name)
# One row per word with the columns 'sum', 'max' and 'min'.
count_words = pd.DataFrame.from_dict(count_words).T

In [None]:
# Write the per-word summary table to CSV.
count_words.to_csv('results/01_words_corr_content_all.csv')