In [16]:
import pandas as pd
import numpy as np
import string
from datetime import datetime as dt
from dateutil.relativedelta import relativedelta

import numpy as np
import matplotlib
matplotlib.use("agg")
import matplotlib.pyplot as plt

import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from nltk.stem import SnowballStemmer
# Fetch the NLTK resources when this cell runs; NLTK prints an
# "already up-to-date" message when a package is present locally.
# NOTE(review): 'punkt' is presumably what nltk.word_tokenize relies on;
# 'stopwords' and 'averaged_perceptron_tagger' are not referenced by the
# code visible in this notebook - confirm they are still needed.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def get_year_month(date_time_str):
    """Return the first seven characters of ``date_time_str``.

    For timestamps that start with ``YYYY-MM`` (the layout ``group_monthly``
    later parses with ``'%Y-%m'``) this is the year-month key. Shorter
    strings are returned unchanged.
    """
    year_month_width = len('YYYY-MM')
    return date_time_str[0:year_month_width]

def normalize(arr):
    """Centre ``arr``, scale it by its standard deviation and by sqrt(len).

    The mean is subtracted, the result is divided by ``np.std(arr)`` and
    then by ``sqrt(len(arr))``. A constant input has zero standard
    deviation, so the division then yields NaN/inf values.
    """
    centred = arr - np.mean(arr)
    unit_variance = centred / np.std(arr)
    return unit_variance / np.sqrt(len(arr))

def group_monthly(kremlin_data):
    """Aggregate per-item records into one column per calendar month.

    ``kremlin_data`` is read as a frame with one column per item and the
    rows ``'datetime'``, ``'header'`` and ``'content'``; the datetime values
    are strings whose first seven characters parse as ``'%Y-%m'``.

    Returns a DataFrame with one column for every month between the
    earliest and the latest item (months without items included) and the
    rows ``'count'`` (number of items), ``'header'`` and ``'content'``
    (all texts of the month concatenated, each preceded by one space).
    """
    datetimes = kremlin_data.loc['datetime']
    items_per_month = datetimes.apply(get_year_month).value_counts().to_dict()

    # Build the gap-free list of month keys spanning the whole data set.
    first_month = dt.strptime(get_year_month(min(datetimes)), '%Y-%m')
    last_month = dt.strptime(get_year_month(max(datetimes)), '%Y-%m')
    n_months = (last_month.year - first_month.year) * 12 + (last_month.month - first_month.month) + 1
    month_keys = [
        (first_month + relativedelta(months=+offset)).strftime("%Y-%m")
        for offset in range(n_months)
    ]

    monthly_kremlin = {}
    for month in month_keys:
        monthly_kremlin[month] = {'count': items_per_month.get(month, 0), 'header': '', 'content': ''}

    # Iterating a DataFrame yields its column labels, i.e. one item each.
    for column in kremlin_data:
        record = kremlin_data[column]
        bucket = monthly_kremlin[get_year_month(record['datetime'])]
        bucket['header'] += ' ' + record['header']
        bucket['content'] += ' ' + record['content']

    return pd.DataFrame.from_dict(monthly_kremlin)
    
def calculate_freq_per_month(phrase_dict):
    """Count how often every phrase occurs under each key of ``phrase_dict``.

    Parameters
    ----------
    phrase_dict : dict
        Maps a key (a month label in this notebook) to a list of hashable
        phrases.

    Returns
    -------
    pandas.DataFrame
        One column per key of ``phrase_dict`` and one row per distinct
        phrase found anywhere in the input; each cell is the number of
        occurrences of that phrase under that key (0 when absent).
    """
    # The vocabulary is the same for every key, so build it once instead of
    # re-creating the set twice per iteration.
    vocabulary = {phrase for phrases in phrase_dict.values() for phrase in phrases}
    monthly_words_decomposition = {}
    for key, phrases in phrase_dict.items():
        counts = dict.fromkeys(vocabulary, 0)
        for phrase in phrases:
            counts[phrase] += 1
        monthly_words_decomposition[key] = counts
    return pd.DataFrame.from_dict(monthly_words_decomposition)

def plot_word_correlation(arr_a, arr_b, word, n_meet, corr_coef, save_name):
    """Save a two-panel line chart of two series to ``save_name``.

    The upper panel draws ``arr_a`` in red under a fixed Ukrainian title;
    the lower panel draws ``arr_b`` under a title assembled from ``word``,
    ``n_meet`` and ``corr_coef``. Both panels share x = 1..len(arr_a);
    ``arr_b`` is right-aligned on that axis and zero-padded on the left
    when it is shorter than ``arr_a``.

    Parameters
    ----------
    arr_a, arr_b : sequence of numbers
        Series for the upper and lower panel. ``arr_b`` must not be longer
        than ``arr_a``.
    word : str
        Label inserted into the lower panel title.
    n_meet : number
        Occurrence count inserted into the lower panel title.
    corr_coef : number
        Value printed before the ``%`` sign in the lower panel title.
    save_name : str
        Path passed to ``Figure.savefig``.
    """
    x = np.arange(1, 1 + len(arr_a))
    fig, (ax0, ax1) = plt.subplots(nrows=2, ncols=1, constrained_layout=True, figsize=(16, 9), dpi=150)

    try:
        # Use float buffers: np.zeros_like(x) would inherit the integer
        # dtype of x and silently truncate fractional input values.
        a = np.zeros(len(x), dtype=float)
        if len(arr_a):
            a[-len(arr_a):] = arr_a
        b = np.zeros(len(x), dtype=float)
        # Guard the empty case: b[-0:] addresses the whole buffer and
        # assigning an empty series to it raises a broadcast error.
        if len(arr_b):
            b[-len(arr_b):] = arr_b

        # NOTE(review): ticks cover 0..len(x)-1 while the data sits at
        # 1..len(x), so the last point has no tick - kept as in the original.
        ticks = np.arange(0, len(x), 1)

        ax0.plot(x, a, 'r')
        ax0.set_title('Статистика бойових втрат')
        ax0.set_xticks(ticks)
        ax0.grid()

        ax1.plot(x, b)
        ax1.set_title('Трійка коренів "'+ word + '" зустрілася ' + str(n_meet) + ' разів; кореляція ' + str(corr_coef) + '%')
        ax1.set_xticks(ticks)
        ax1.grid()

        fig.savefig(save_name)
    finally:
        # Always release the figure; this function is called once per
        # n-gram and leaked figures accumulate in memory.
        fig.clear()
        plt.close(fig)
    
def NormData(arr=np.array(0)):
    """Return the z-score of ``arr``: ``(arr - mean) / std``.

    ``np.std`` is called with its defaults. An input without spread (a
    constant series, a single value, or the 0-d default argument) has zero
    standard deviation, so the division yields NaN/inf.
    """
    return (arr - np.mean(arr)) / np.std(arr)

def PearsonCorr(Base = np.array(0), part = np.array(0)):
    """Slide ``part`` along ``Base`` and correlate it with each window.

    Every window ``Base[ii:ii + len(part)]`` and ``part`` itself are
    z-scored with ``NormData``; the value for the window is the dot product
    of the two z-scored vectors divided by ``len(part) - 1``.

    Parameters
    ----------
    Base : sequence of numbers
        The longer series that is scanned.
    part : sequence of numbers
        The template compared against every window of ``Base``.

    Returns
    -------
    numpy.ndarray or str
        Array with ``len(Base) - len(part)`` values, or the string
        ``'Invalid input data'`` when ``Base`` is empty.

    Notes
    -----
    The accumulator used to be ``np.float128``, which does not exist on
    every platform; the inner product is now taken with ``np.dot``.

    NOTE(review): only ``len(Base) - len(part)`` windows are evaluated, so
    the last full window (starting at ``len(Base) - len(part)``) is skipped,
    whereas ``fftCorr`` returns ``len(Base) - len(part) + 1`` lags. The
    length is left unchanged here for existing callers - confirm intent.

    NOTE(review): ``NormData`` uses ``np.std`` with its default (population)
    normalisation while the sum is divided by ``len(part) - 1``, so a window
    identical to ``part`` scores ``Nn / (Nn - 1)`` rather than 1 - confirm.
    """
    if len(Base) == 0:
        return 'Invalid input data'

    ptmp = NormData(part)
    Nn = len(part)
    n_windows = len(Base) - Nn
    res = np.zeros(n_windows)
    for ii in range(n_windows):
        tmp0 = NormData(Base[ii:ii + Nn])
        res[ii] = np.dot(ptmp, tmp0) / (Nn - 1)
    return res

def fftCorr(Base = np.array(0), part = np.array(0)):
    """Cross-correlate ``part`` against ``Base`` using the FFT.

    Both inputs are z-scored with ``NormData`` (``Base`` as a whole, not per
    window). The z-scored ``part`` is zero-padded to ``len(Base)``, the
    circular cross-correlation is computed as
    ``ifft(fft(Base) * conj(fft(part)))`` and divided by ``len(part)``.

    Parameters
    ----------
    Base : sequence of numbers
        The longer series.
    part : sequence of numbers
        The template; must not be longer than ``Base``.

    Returns
    -------
    numpy.ndarray
        The first ``len(Base) - len(part) + 1`` lags, i.e. the shifts at
        which the zero-padded template does not wrap around the end of
        ``Base``.
    """
    n_base = len(Base)
    n_part = len(part)

    part0 = np.zeros(n_base)
    normPart = NormData(part)
    part0[:len(normPart)] = normPart[:]
    normBase = NormData(Base)

    spectrum = np.fft.fft(normBase) * np.conj(np.fft.fft(part0))
    res = np.real(np.fft.ifft(spectrum)) / n_part

    # Explicit positive stop index: the former ``res[:-len(part)+1]``
    # degenerates to ``res[:0]`` (an empty array) when len(part) == 1.
    return res[:n_base - n_part + 1]

[nltk_data] Downloading package stopwords to /home/vasyl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/vasyl/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vasyl/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
# Load the losses records and reduce every cell to its first seven
# characters (see get_year_month).
defenders_losses = (
    pd.read_json('results/killed_defenders.json')
    .applymap(get_year_month)
)

# Number of records per month for the uk.wikipedia.org column, in
# chronological order, drawn as a line chart.
(
    defenders_losses['https://uk.wikipedia.org']
    .value_counts()
    .sort_index()
    .plot.line()
)

<matplotlib.axes._subplots.AxesSubplot at 0x7f7e427e9cf8>

In [3]:
# Load the scraped items; the 'datetime' row holds one timestamp per item.
ukr_president_data = pd.read_json('results/ukr_president_data.json')

# Export how many items share each exact timestamp.
(
    ukr_president_data.loc['datetime']
    .value_counts()
    .to_csv('ukr_president_data.csv')
)

# Plot the number of items per month in chronological order.
(
    ukr_president_data.loc['datetime']
    .apply(get_year_month)
    .value_counts()
    .sort_index()
    .plot.line()
)

<matplotlib.axes._subplots.AxesSubplot at 0x7f7e427e9cf8>

In [4]:
# Merge the individual items into one column per calendar month
# (rows: 'count', 'header', 'content').
monthly_ukr_president_data = group_monthly(ukr_president_data)
# Display the number of items published in each month.
monthly_ukr_president_data.loc['count'].to_numpy()

array([54, 94, 129, 81, 118, 143, 119, 112, 100, 106, 104, 125, 96],
      dtype=object)

In [17]:
# Length of the n-grams (in tokens) and the characters treated as punctuation.
words_in_phrase = 3
additional_punctuation = '«»'
all_punct = string.punctuation + additional_punctuation

# Month -> list of n-grams over the lower-cased, tokenised 'content' text.
monthly_header = {
    month: list(ngrams(nltk.word_tokenize(text.lower()), words_in_phrase))
    for month, text in monthly_ukr_president_data.loc['content'].items()
}

# Month -> n-grams without punctuation tokens, every token reduced to its stem.
# NOTE(review): `token in all_punct` is a substring test on a string, so
# multi-character tokens such as '...' are not recognised as punctuation.
ps = SnowballStemmer('russian')
monthly_header_stemmed = {
    month: [
        tuple(ps.stem(token) for token in phrase)
        for phrase in phrases
        if not any(token in all_punct for token in phrase)
    ]
    for month, phrases in monthly_header.items()
}

# Rows: stemmed n-grams; columns: months in ascending order.
header_monthly_words_decomposition = calculate_freq_per_month(
    monthly_header_stemmed
).sort_index(axis=1)
header_monthly_words_decomposition

Unnamed: 0,Unnamed: 1,Unnamed: 2,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12,2020-01,2020-02,2020-03,2020-04,2020-05
нас,очен,полезн,0,0,0,0,0,0,1,0,0,0,0,0,0
независим,внешн,финансов,0,0,0,0,0,0,0,0,1,0,0,0,0
что,имел,шанс,0,0,0,0,0,0,0,0,1,0,0,0,0
будет,изменя,комплектац,0,0,1,0,0,0,0,0,0,0,0,0,0
для,поддержан,форм,0,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
выдвинут,пут,праймериз,0,0,0,0,0,0,0,0,0,0,0,0,1
проход,в,вильнюс,0,0,0,0,0,0,1,0,0,0,0,0,0
продукт,в,буфет,0,0,0,0,0,0,1,0,0,0,0,0,0
собствен,из,фонд,0,0,0,0,0,0,0,0,0,0,0,0,1


In [6]:
# Number of month columns in the n-gram table.
header_monthly_words_decomposition.shape[1]

13

In [18]:
# Correlate every frequent stemmed n-gram with the monthly losses series,
# save one comparison figure per n-gram and collect the scores.
count_words = {}
defend = defenders_losses['https://uk.wikipedia.org'].value_counts().sort_index().to_numpy()

# Keep only as many trailing months of the losses series as the n-gram
# table has month columns.
# NOTE(review): this assumes the last months of both data sets coincide - confirm.
n_months = header_monthly_words_decomposition.shape[1]
defend_part = defend[-n_months:]
# Loop-invariant, so normalise the losses series once.
defend_part_normalized = normalize(defend_part)

# Iterate over the row labels directly instead of transposing the whole frame.
for word in header_monthly_words_decomposition.index:
    words = header_monthly_words_decomposition.loc[word].sort_index().to_numpy()
    n_meet = sum(words)
    # Skip n-grams that occur fewer than 10 times in total.
    if n_meet < 10:
        continue
    corr_tmp = fftCorr(
        Base=normalize(words),
        part=defend_part_normalized
    )
    # Reduce the lag array to a scalar once. The former `if corr_tmp > 0`
    # tested the array itself, which raises as soon as fftCorr returns
    # more than one lag.
    corr_max = corr_tmp.max()
    # Percentage truncated toward zero, exactly as int() did before; the
    # surrounding np.round() calls were no-ops on an int.
    corr_pct = int(corr_max * 100)
    count_words[' '.join(word)] = {'sum': n_meet, 'max': corr_max}
    # 'p' marks a positive correlation, 'n' a zero or negative one.
    sign_suffix = 'p' if corr_max > 0 else 'n'
    sv_name = 'results/corr-{}_count-{:0>6}_root-{}.png'.format(
        '{:0>3}{}'.format(abs(corr_pct), sign_suffix),
        n_meet,
        '_'.join(word).replace('/', '_slash_')
    )

    plot_word_correlation(defend_part, words, '_'.join(word), n_meet, corr_pct, sv_name)
# One row per n-gram with the columns 'sum' and 'max'.
count_words = pd.DataFrame.from_dict(count_words).T

In [None]:
# Persist the per-n-gram table built in the previous cell (total count and
# maximum correlation) as CSV.
count_words.to_csv('results/02_ukr_president_words_corr_content_10.csv')