In [2]:
import pandas as pd
import numpy as np
import string
from datetime import datetime as dt
from dateutil.relativedelta import relativedelta

import numpy as np
import matplotlib
matplotlib.use("agg")
import matplotlib.pyplot as plt


import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from nltk.stem import SnowballStemmer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def get_year_month(date_time_str):
    return date_time_str[:7]

def normalize(arr):
    return (arr - np.mean(arr)) / np.std(arr) / np.sqrt(len(arr))

def group_monthly(kremlin_data):
    tmp = kremlin_data.loc['datetime'].apply(lambda x: x[:7]).value_counts().to_dict()
    dt_min = dt.strptime(min(kremlin_data.loc['datetime'])[:7], '%Y-%m')
    dt_max = dt.strptime(max(kremlin_data.loc['datetime'])[:7], '%Y-%m')
    dt_list = [dt.strftime(dt_min+relativedelta(months=+x), "%Y-%m") for x in range((dt_max.year-dt_min.year)*12+(dt_max.month-dt_min.month)+1)]
    monthly_kremlin = {item:{'count': tmp.get(item, 0), 'header': '', 'summary': '', 'content': ''} for item in dt_list}
    for item in kremlin_data:
        monthly_kremlin[get_year_month(kremlin_data[item]['datetime'])]['header'] += ' ' + kremlin_data[item]['header']
        monthly_kremlin[get_year_month(kremlin_data[item]['datetime'])]['summary'] += ' ' + kremlin_data[item]['summary']
        monthly_kremlin[get_year_month(kremlin_data[item]['datetime'])]['content'] += ' ' + kremlin_data[item]['content']
    
    return pd.DataFrame.from_dict(monthly_kremlin)
    
def calculate_freq_per_month(phrase_dict):
    words = [val for sublist in phrase_dict.values() for val in sublist]
    monthly_words_decomposition = {}
    for key in phrase_dict:
        tmp = dict(zip(set(words), [0] * len(set(words))))
        for word in phrase_dict[key]:
            tmp[word] += 1
        monthly_words_decomposition[key] = tmp
    return pd.DataFrame.from_dict(monthly_words_decomposition)

def plot_word_correlation(arr_a, arr_b, arr_corr, word, n_meet, corr_coef, save_name):
    x = np.arange(0., len(arr_b))/12
    fig, (ax0, ax1, ax2) = plt.subplots(nrows=3, ncols=1, constrained_layout=True,figsize=(16, 9),dpi=150)
    
    a = np.zeros_like(x)
    a[-len(arr_a):] = arr_a
    b = np.zeros_like(x)
    b[:len(arr_b)] = arr_b
    c = np.zeros_like(x)
    c[:len(arr_corr)] = arr_corr
    
    ax0.plot(x, a, 'r')
    ax0.set_title('Статистика бойових втрат')
    ax0.set_xticks(np.arange(0, max(x), 1))
    ax0.grid()

    ax1.plot(x, b)
    ax1.set_title('Корінь "'+ word + '" зустрівся ' + str(n_meet) + ' разів; кореляція ' + str(corr_coef) + '%')
    ax1.set_xticks(np.arange(0, max(x), 1))
    ax1.grid()

    ax2.plot(x, c)
    ax2.set_title('кореляційна функція')
    ax2.set_xticks(np.arange(0, max(x), 1))
    ax2.grid()

    fig.savefig(save_name)
    fig.clear()
    plt.close(fig)
    
def NormData(arr = np.array(0)):
    arrSD = np.std(arr)
    arrAM = np.mean(arr)
    tmp = (arr - arrAM) / arrSD
    return tmp

def PearsonCorr(Base = np.array(0), part = np.array(0)):
    if len(Base) == 0:
        return 'Invalid input data'
    else:
        tm = np.float128(0)
        ptmp = NormData(part)
        Nn = len(part)
        res = np.zeros(len(Base)-Nn)
        for ii in range(len(Base)-Nn):
            tmp0 = NormData(Base[ii:ii + Nn])
            for jj in range(Nn):
                tm = tm + ptmp[jj]*tmp0[jj]
            res[ii] = tm / (Nn-1)
            tm = 0.0
        return res

def fftCorr(Base = np.array(0), part = np.array(0)):
    part0 = np.zeros(len(Base))
    normPart = NormData(part)
    part0[:len(normPart)] = normPart[:]
    normBase = NormData(Base)
    nBsp = np.fft.fft(normBase)
    nP0sp = np.fft.fft(part0)
    nBsp = nBsp * np.conj(nP0sp)
    res = np.real(np.fft.ifft(nBsp))/len(part)
#     return res[:-len(part)+1]
    return res[:]

[nltk_data] Downloading package stopwords to /home/vasyl/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/vasyl/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vasyl/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
defenders_losses = pd.read_json('results/killed_defenders.json').applymap(get_year_month)
defenders_losses['https://uk.wikipedia.org'].value_counts().sort_index().to_csv('defenders.csv')
defenders_losses['https://uk.wikipedia.org'].value_counts().sort_index().plot.line()

<matplotlib.axes._subplots.AxesSubplot at 0x7fbb90d24550>

In [4]:
kremlin_data = pd.read_json('results/kremlin_data.json')
# kremlin_data.loc['datetime'].apply(lambda x: x[:7]).value_counts().to_csv('kremlin_freg.csv')
kremlin_data.loc['datetime'].apply(get_year_month).value_counts().sort_index().plot.line()

<matplotlib.axes._subplots.AxesSubplot at 0x7fbb90d24550>

In [5]:
monthly_kremlin = group_monthly(kremlin_data)
monthly_kremlin.loc['count'].to_numpy()

array([2, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 2, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 2, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0, 2, 3, 1, 1, 0, 1, 0, 2, 1,
       0, 1, 1, 1, 1, 2, 2, 4, 0, 1, 1, 3, 0, 0, 1, 10, 0, 8, 10, 2, 0, 1,
       3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 61, 66, 93, 20, 27, 53, 51,
       111, 30, 22, 25, 56, 28, 45, 63, 24, 24, 33, 61, 115, 16, 41, 53,
       49, 50, 57, 125, 38, 28, 55, 59, 126, 55, 58, 69, 97, 61, 110, 108,
       21, 28, 48, 73, 111, 29, 27, 35, 47, 66, 116, 93, 65, 33, 57, 70,
       93, 52, 43, 36, 66, 31, 49, 110, 5, 25, 51, 43, 93, 11, 40, 56, 69,
       54, 72, 47, 10, 17, 49, 46, 98, 7, 35, 50, 62, 45, 42, 70, 8, 18,
       35, 66, 94, 19, 39, 66, 61, 53, 44, 57, 16, 15, 32, 52, 85, 12, 39,
       42, 48, 34, 37, 90, 23, 18, 46, 56, 77, 30, 38, 26, 61, 31, 83,
       101, 31, 9, 52, 51, 62, 5, 15, 40, 33, 40, 44, 54, 30, 9, 25, 35,
       72, 30, 30, 40, 58, 19], dtype=object)

In [6]:
words_in_phrase = 1
additional_punctuation = '«»'
all_punct = string.punctuation + additional_punctuation
monthly_header = monthly_kremlin.loc['content'].apply(
    lambda w: list(ngrams(nltk.word_tokenize(w.lower()), words_in_phrase))
).to_dict()
monthly_header_stemmed = {}
ps = SnowballStemmer('russian')
for key in monthly_header:
    tmp = []
    for item in monthly_header[key]:
        if not any(val in all_punct for val in item):
            tmp.append(tuple([ps.stem(word) for word in item]))
    monthly_header_stemmed[key] = tmp
header_monthly_words_decomposition = pd.DataFrame.sort_index(
    calculate_freq_per_month(monthly_header_stemmed),
    axis=1
)
header_monthly_words_decomposition

Unnamed: 0,2000-01,2000-02,2000-03,2000-04,2000-05,2000-06,2000-07,2000-08,2000-09,2000-10,...,2019-08,2019-09,2019-10,2019-11,2019-12,2020-01,2020-02,2020-03,2020-04,2020-05
среднерыночн,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
укреп,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
дистанцион,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,3,2
198–199,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
наказуем,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
команд-победительниц,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
подъезд,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
искажен,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
павел,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
count_words = {}
defend = defenders_losses['https://uk.wikipedia.org'].value_counts().sort_index().to_numpy()
for word in header_monthly_words_decomposition.T:
    words = header_monthly_words_decomposition.loc[word].sort_index().to_numpy()
    if sum(words) < 30:
        continue
    corr_tmp = fftCorr(
        Base=normalize(words),
        part=normalize(defend)
    )
    count_words[' '.join(word)] = {'sum': sum(words), 'max': corr_tmp.max(), 'min': corr_tmp.min()}
    sv_name = 'results/corr-{:0>3}_count-{:0>6}_root-{}.png'.format(
        np.round(int(corr_tmp.max()*100)),
        sum(words),
        '_'.join(word)
    )
    
    plot_word_correlation(defend, words, corr_tmp, '_'.join(word), sum(words), np.round(int(corr_tmp.max()*100)), sv_name)
count_words = pd.DataFrame.from_dict(count_words).T

In [None]:
count_words.to_csv('results/01_words_corr_content_all.csv')