In [None]:
import os
import ast
import pandas as pd
from tqdm import tqdm
import datetime
from src.helpers import load_csv, read_txt, get_project_dir
from src.plotters import custom_bar_plot, custom_line_plots
from src.aggregators import SentimentsRate, SentimentsRateMulti
import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides pandas deprecation and
# chained-assignment warnings — consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [None]:
# Every data directory used by this notebook is resolved relative to the
# project root returned by get_project_dir().
PROJECT_PATH = get_project_dir()

# Root data folder: holds sources.txt and types.xlsx.
PATH_TO_GROUPS = os.path.join(PROJECT_PATH, 'data')
# Preprocessed article table (main.csv).
PATH_TO_MAIN = os.path.join(PROJECT_PATH, 'data/preprocessed')
# Titles with sentiment labels (p_title_marked.csv).
PATH_TO_TITLE = os.path.join(PROJECT_PATH, 'data/marked')
# Lemma vocabulary (lemma_voc_title.csv).
PATH_TO_VOCAB = os.path.join(PROJECT_PATH, 'data/vocabulary')

# Loading

## main

In [26]:
# Load the article-level table in chunks, keeping only the columns that the
# grouping cells below read.
# NOTE(review): n_rows is a hard-coded row count for this specific file —
# confirm it still matches main.csv if the data is regenerated.
main = load_csv(
    path=PATH_TO_MAIN,
    filename='main.csv',
    columns=['type', 'date_parsed', 'source', 'title_id'],
    chunksize=1000000,
    n_rows=19291934,
    ignore_index=False,
    prefix='',
)

CHUNKS:   0%|          | 0/20 [00:00<?, ?it/s]

In [27]:
# The year is the first four characters of the raw 'date_parsed' string.
# This must run while 'date_parsed' still holds strings.
main['year'] = list(
    tqdm(
        (int(raw_date[:4]) for raw_date in main['date_parsed']),
        total=len(main),
    )
)

100%|██████████| 19291933/19291933 [00:12<00:00, 1596416.85it/s]


In [None]:
# Truncate every publication date to its month: keep the first two
# '/'-separated fields of the raw string and parse them as '%Y/%m', so each
# row ends up with the timestamp of the first day of its month.
#
# The dtype guard makes the cell idempotent. It overwrites its own input
# column, so without the guard a second execution would try to split
# Timestamp objects and raise.
#
# Parsing is vectorised with pd.to_datetime instead of calling
# datetime.strptime once per row, which matters for a table of this size.
if not pd.api.types.is_datetime64_any_dtype(main['date_parsed']):
    year_month = (
        main['date_parsed']
        .str.split('/', n=2)   # at most three pieces; only the first two are used
        .str[:2]
        .str.join('/')
    )
    main['date_parsed'] = pd.to_datetime(year_month, format='%Y/%m')

## title

In [28]:
# Load the labelled titles: 'id' is the key that main['title_id'] refers to
# (see text_col in the grouping cells) and 'label' is the sentiment label.
# NOTE(review): n_rows is a hard-coded row count for this specific file.
title = load_csv(
    path=PATH_TO_TITLE,
    filename='p_title_marked.csv',
    columns=['id', 'label'],
    chunksize=1000000,
    n_rows=4388764,
    ignore_index=False,
    prefix='',
)

CHUNKS:   0%|          | 0/5 [00:00<?, ?it/s]

In [30]:
# Index the titles by their id. Guarded so that re-running the cell, when
# 'id' has already been moved into the index, does not raise a KeyError.
if 'id' in title.columns:
    title = title.set_index('id')

tqdm.pandas()

# Turn the stringified lemma lists back into Python objects.
# The loading cell requests only ['id', 'label'], so 'lemmatized' may not be
# present at all; nothing else in this notebook reads it, so the step is
# skipped in that case instead of failing on a fresh kernel.
# NOTE(review): if the lemmas are needed, add 'lemmatized' to the columns
# requested from load_csv.
if 'lemmatized' in title.columns:
    # Only strings are parsed, so values that were already converted by an
    # earlier run of this cell pass through unchanged.
    title['lemmatized'] = title['lemmatized'].progress_apply(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )

100%|██████████| 4388764/4388764 [01:10<00:00, 62553.42it/s]


## vocabulary

In [31]:
# Load the lemma vocabulary (lemma and id columns).
# n_rows=None here: the recorded output of this cell shows a
# "counting rows..." pass before the chunks are read.
vocab = load_csv(
    path=PATH_TO_VOCAB,
    filename='lemma_voc_title.csv',
    columns=['lemma', 'id'],
    chunksize=1000,
    n_rows=None,
    ignore_index=False,
    prefix='',
)

counting rows...: 187884it [00:00, 370802.58it/s]


CHUNKS:   0%|          | 0/188 [00:00<?, ?it/s]

In [32]:
# Collapse the vocabulary table into an id -> lemma lookup.
# For a repeated id the last row wins.
vocab = {
    lemma_id: lemma
    for lemma_id, lemma in zip(vocab['id'], vocab['lemma'])
}

## sources

In [34]:
# Source names, sorted so that the positional flags below line up with them.
sources = sorted(read_txt(path=PATH_TO_GROUPS, filename='sources.txt'))

# One flag per source, in the sorted order above. Judging by the name and by
# the 0 -> 'pro-russian' relabelling done in the grouping section, 1 marks a
# source that is NOT pro-Russian.
is_not_pro_rus = [0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0]

# zip() silently truncates to the shorter input, so a changed sources.txt
# would otherwise drop or mislabel sources without any error.
if len(sources) != len(is_not_pro_rus):
    raise ValueError(
        'sources.txt lists {} sources but {} group flags are defined'.format(
            len(sources), len(is_not_pro_rus)
        )
    )

# Rebind `sources` to a name -> flag mapping; the plotting section relies on
# its keys.
sources = dict(zip(sources, is_not_pro_rus))

tqdm.pandas()
# Rows whose source is not a string (e.g. missing values) get None; a string
# that is not a known source still raises KeyError.
main['is_pro_opp'] = main['source'].progress_apply(
    lambda x: sources[x] if isinstance(x, str) else None
)

100%|██████████| 19291933/19291933 [00:17<00:00, 1118115.14it/s]


## types

In [56]:
# Read the type table, keyed and ordered by 'type'; it is passed as
# mapping_df when grouping by type below.
types = (
    pd.read_excel(os.path.join(PATH_TO_GROUPS, 'types.xlsx'))
    .set_index('type')
    .sort_index()
)

# Grouping

In [None]:
# Sentiment-label rates per article type. `types` is supplied as the mapping
# table and the number of groups is capped at 19.
grouped_by_type = SentimentsRate().find(
    main_data=main,
    text_data=title,
    col_to_group='type',
    text_col='title_id',
    labels_col='label',
    mapping_df=types,
    max_groups=19,
    scaling=None,
)

In [None]:
# Sentiment-label rates per news source (no mapping table, no group cap).
grouped_by_source = SentimentsRate().find(
    main_data=main,
    text_data=title,
    col_to_group='source',
    text_col='title_id',
    labels_col='label',
    mapping_df=None,
    max_groups=None,
    scaling=None,
)

In [None]:
# Sentiment-label rates per publication year, using the 'year' column derived
# from 'date_parsed' earlier in the notebook.
grouped_by_year = SentimentsRate().find(
    main_data=main,
    text_data=title,
    col_to_group='year',
    text_col='title_id',
    labels_col='label',
    mapping_df=None,
    max_groups=None,
    scaling=None,
)

In [None]:
# Sentiment-label rates per source group. The binary 'is_pro_opp' flag is
# exposed as a 'group' column and then relabelled with readable names for
# the bar chart.
grouped_by_sourceGroup = (
    SentimentsRate()
    .find(
        main_data=main,
        text_data=title,
        col_to_group='is_pro_opp',
        text_col='title_id',
        labels_col='label',
        mapping_df=None,
        max_groups=None,
        scaling=None,
    )
    .rename(columns={'is_pro_opp': 'group'})
)
grouped_by_sourceGroup['group'] = grouped_by_sourceGroup['group'].replace(
    {0: 'pro-russian', 1: 'pro-opposite'}
)

In [None]:
# Sentiment rates for every (source, year) combination.
grouped_by_year_and_source = SentimentsRateMulti().find(
    main_data=main,
    text_data=title,
    cols_to_group=['source', 'year'],
    text_col='title_id',
    labels_col='label',
    scaling=None,
)

In [None]:
# Sentiment rates for every (source, date_parsed) combination; this frame
# feeds the per-source line plots in the Figures section.
grouped = SentimentsRateMulti().find(
    main_data=main,
    text_data=title,
    cols_to_group=['source', 'date_parsed'],
    text_col='title_id',
    labels_col='label',
    scaling=None,
)

# Figures

In [None]:
# Negativity rate over time per source, restricted to dates from 2014-01-01
# onwards. The sources are passed in three index ranges: [0, 9), [9, 18)
# and [18, 26).
custom_line_plots(
    ids_start=[0, 9, 18],
    ids_stop=[9, 18, 26],
    groups=list(sources.keys()),
    data=grouped[grouped['date_parsed'] >= datetime.datetime(2014, 1, 1)],
    x='date_parsed',
    y='neg_rate',
    hue='source',
    fs=18,
    lw=5,
    x_label='date parsed',
    y_label='Negativity Rate',
    k=1,
    aspect_ratio=0.3,
    weight=0.9,
    reverse=True,
)

In [None]:
# Rate per article type, split by sentiment label.
custom_bar_plot(
    data=grouped_by_type,
    x='type',
    y='rate',
    hue='label',
    fs=18,
    aspect_ratio=0.4,
    x_rotation=45,
)

In [None]:
# Rate per year, split by sentiment label; x values are the years 2014–2023.
custom_bar_plot(
    data=grouped_by_year,
    x='year',
    y='rate',
    hue='label',
    fs=18,
    aspect_ratio=0.4,
    x_vals=list(range(2014, 2024)),
)

In [None]:
# Rate per news source, split by sentiment label.
custom_bar_plot(
    data=grouped_by_source,
    x='source',
    y='rate',
    hue='label',
    fs=18,
    aspect_ratio=0.4,
    x_rotation=60,
)

In [None]:
# Rate per source group, split by sentiment label, with values shown.
custom_bar_plot(
    data=grouped_by_sourceGroup,
    x='group',
    y='rate',
    hue='label',
    fs=18,
    aspect_ratio=0.4,
    x_rotation=0,
    show_values=True,
)