## Importing libs & exploration

In [30]:
import json
import os
import random
import numpy as np
import pandas as pd
import gensim
import nltk

from gensim.models import CoherenceModel
from multiprocess import Pool
from tqdm import tqdm 
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.stem.snowball import RussianStemmer
from string import punctuation

In [31]:
SOCIAL_MEDIA = 'RBK'

with open(f'data/{SOCIAL_MEDIA}/result.json') as json_data:
    data = json.load(json_data)

In [32]:
data['messages'][10:15]

[{'id': 42838,
  'type': 'message',
  'date': '2022-02-24T05:30:10',
  'date_unixtime': '1645673410',
  'edited': '2022-03-01T14:45:29',
  'edited_unixtime': '1646138729',
  'from': 'РБК',
  'from_id': 'channel1099860397',
  'file': '(File not included. Change data exporting settings to download.)',
  'thumbnail': '(File not included. Change data exporting settings to download.)',
  'media_type': 'video_file',
  'mime_type': 'video/mp4',
  'duration_seconds': 41,
  'width': 1010,
  'height': 480,
  'text': 'Обращение Путина с сообщением о начале специальной военной операции.'},
 {'id': 42839,
  'type': 'message',
  'date': '2022-02-24T05:32:59',
  'date_unixtime': '1645673579',
  'edited': '2022-02-24T05:33:02',
  'edited_unixtime': '1645673582',
  'from': 'РБК',
  'from_id': 'channel1099860397',
  'text': 'Киевский аэропорт «Жуляны» отменил все рейсы - УНИАН'},
 {'id': 42840,
  'type': 'message',
  'date': '2022-02-24T05:37:44',
  'date_unixtime': '1645673864',
  'edited': '2022-02-24

In [33]:
all_types = np.unique([t['type'] for m in data['messages'] for t in m['text'] if type(t) is dict])
all_types

array(['bold', 'hashtag', 'italic', 'link', 'mention', 'phone',
       'text_link', 'underline'], dtype='<U9')

In [34]:
all_email = np.unique([t['text'] for m in data['messages'] for t in m['text'] if type(t) is dict and t['type'] == 'email'])
all_email

array([], dtype=float64)

In [35]:
from demoji import replace 


BAD_TEXT_TYPES = ['email', 'mention', 'strikethrough', 'hashtag', 'code', 'link']
UKR_CHARS = ['ї', 'є', 'і', 'ʼ', 'ґ']
FOREIGN_AGENT_MESSAGE_START = 'ДАННОЕ'
SUBSCRIBE_MESSAGE = 'УС > Подписаться'
UKRAINE24_BANNER_MESSAGE = 'Украина 24/7'


def clean_str(input_str):
    output_str = replace(input_str.strip().replace("\n", "."), '')
    
    if FOREIGN_AGENT_MESSAGE_START in output_str:
        return ''
    
    output_str = output_str.replace('\u200b', ' ') \
                           .replace('\xa0', ' ') \
                           .replace(SUBSCRIBE_MESSAGE, '') \
                           .replace(UKRAINE24_BANNER_MESSAGE, '') \
                           .strip() + ' '
    
    return output_str
    

def create_datasets_from_news_sources(news_sources_list):
    
    for news_source in tqdm(news_sources_list):
        extracted_data = {
            'date': [],
            'text': []
        }

        ukr_messages_inds = []
        
        with open(f'data/{news_source}/result.json') as json_data:
            data = json.load(json_data)
            
        for i, m in enumerate(data['messages']):
            texts = m['text']
            text = ''

            if isinstance(texts, str):
                text += clean_str(texts)
            else:
                for t in texts:
                    sub_text = ''

                    if type(t) is dict and t['type'] not in BAD_TEXT_TYPES and len(t['text']) > 0 and 'http' not in t['text']:
                        sub_text = t['text']

                    if type(t) is str and len(t) > 0:
                        sub_text = t

                    sub_text = clean_str(sub_text)

                    if len(sub_text) == 0:
                        continue

                    # if sub_text[-1] not in ['!', '?', '.', ',', ':', ';']:
                    #     sub_text += ' '

                    text += sub_text

            if text != '' and len(text) > 25:
                extracted_data['date'].append(m['date'])
                extracted_data['text'].append(text.strip())

            if any(c in UKR_CHARS for c in text):
                ukr_messages_inds.append(len(extracted_data['text']) - 1)

        extracted_data['date'] = np.delete(extracted_data['date'], ukr_messages_inds)
        extracted_data['text'] = np.delete(extracted_data['text'], ukr_messages_inds)

        df = pd.DataFrame(extracted_data)
        df['date'] = pd.to_datetime(df['date'])
        df.to_csv(f'./data/{news_source}/dataset.csv', index=False)

In [63]:
SOCIAL_MEDIA_SOURCES = [d for d in os.listdir('data') if d[0] != '.']
create_datasets_from_news_sources(SOCIAL_MEDIA_SOURCES)

In [73]:
sizes = []
for news_source in SOCIAL_MEDIA_SOURCES:
    if news_source in ['Ukraine24', 'UkraineNow', 'ShockedUkraine', 'UNIAN']:
        df = pd.read_csv(f'./data/{news_source}/dataset.csv')
        sizes.append(df.shape[0])
        print(f"{news_source}: \t size {df.shape[0]}")

Ukraine24: 	 size 2754
UkraineNow: 	 size 22209
ShockedUkraine: 	 size 7179
UNIAN: 	 size 23574


In [56]:
relative_sizes = np.array(sizes) / np.sum(sizes) * 100

In [74]:
relative_sizes, np.sum(sizes)

(array([ 2.00821077, 14.54530871, 16.19475415,  5.31001845,  0.54106477,
         7.75866469,  0.38793323, 23.14765526,  5.23491107,  1.00191779,
        17.19010916,  6.67945193]),
 55716)

In [68]:
33507, 72865, 8556

(33507, 72865, 8556)