In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import gc
import numpy as np

In [None]:
# Load the corpus DataFrame from a gzip-compressed pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# from a trusted source.
df = pd.read_pickle('../data/reddit_corpus_balanced_filtered.gzip', compression='gzip')

In [None]:
# Theme -> sub-reddits that belong to it. A sub-reddit may appear in more than
# one theme (r/CovidVaccinated is listed under both SARS-CoV-2 and Vaccines).
theme_groups = {
    'SARS-CoV-2': [
        'r/CovidVaccinated', 'r/Masks4All', 'r/NoLockdownsNoMasks', 'r/EndTheLockdowns',
        'r/COVID19', 'r/COVID19positive', 'r/CoronavirusCanada', 'r/CoronavirusRecession',
        'r/CoronavirusUK', 'r/CoronavirusUS', 'r/Coronavirus', 'r/LockdownSkepticism',
    ],
    'Vaccines': [
        'r/CovidVaccinated', 'r/VACCINES', 'r/vaxxhappened', 'r/AntiVaxxers',
        'r/antivax', 'r/TrueAntiVaccination', 'r/DebateVaccine', 'r/DebateVaccines',
    ],
    'Abortion': [
        'r/AskProchoice', 'r/prochoice', 'r/insaneprolife', 'r/prolife',
        'r/ProLifeLibertarians', 'r/Abortiondebate', 'r/abortion',
    ],
    'womens-and-mens-rights': [
        'r/Feminism', 'r/feminisms', 'r/RadicalFeminism', 'r/RadicalFeminismUSA',
        'r/MRActivism', 'r/MensRights', 'r/antifeminists', 'r/feminismformen',
        'r/masculism', 'r/GenderCritical', 'r/Egalitarianism',
    ],
    'Gun-control': [
        'r/Firearms', 'r/GunsAreCool', 'r/liberalgunowners', 'r/progun',
        'r/guncontrol', 'r/GunDebates', 'r/GunResearch', 'r/gunpolitics',
    ],
    'Climate-change': ['r/climateskeptics', 'r/GlobalClimateChange', 'r/climate', 'r/climatechange'],
    '5G': ['r/5GDebate'],
    'general-political-debate': [
        'r/JoeBiden', 'r/LeftistsForMen', 'r/Liberal', 'r/LockdownCriticalLeft',
        'r/democrats', 'r/Conservative', 'r/ConservativesOnly', 'r/conservatives',
        'r/Republican', 'r/RepublicanValues', 'r/politics', 'r/uspolitics',
    ],
}

# Stance of a sub-reddit towards the topic of its theme. The key 'anit' is
# spelled this way on purpose: the plotting code looks it up under this name.
bias = {
    'pro': [
        'r/CovidVaccinated', 'r/Masks4All', 'r/CovidVaccinated', 'r/VACCINES',
        'r/vaxxhappened', 'r/AskProchoice', 'r/prochoice', 'r/insaneprolife',
        'r/Firearms', 'r/GunsAreCool', 'r/liberalgunowners', 'r/progun',
    ],
    'anit': [
        'r/NoLockdownsNoMasks', 'r/EndTheLockdowns', 'r/AntiVaxxers', 'r/antivax',
        'r/TrueAntiVaccination', 'r/prolife', 'r/ProLifeLibertarians', 'r/guncontrol',
    ],
    'unbiased': [
        'r/COVID19', 'r/COVID19positive', 'r/CoronavirusCanada', 'r/CoronavirusRecession',
        'r/CoronavirusUK', 'r/CoronavirusUS', 'r/Coronavirus', 'r/LockdownSkepticism',
        'r/DebateVaccine', 'r/DebateVaccines', 'r/Abortiondebate', 'r/abortion',
        'r/GunDebates', 'r/GunResearch', 'r/gunpolitics',
    ],
}

# NOTE: r/ProLifeLibertarians is missing any labeled data.

In [None]:
# Count the political-bias and factuality labels per sub-reddit, then roll the
# per-sub-reddit counters up into one counter per theme for the plots below.
_POLITICS_KEYS = ('count', 'LEFT_CENTER', 'LEFT', 'LEAST_BIASED', 'RIGHT_CENTER', 'SATIRE',
                  'PRO_SCIENCE', 'RIGHT', 'EXTREME_LEFT', 'CONSPIRACY_PSEUDOSCIENCE',
                  'EXTREME_RIGHT', 'PRO_RUSSIAN_PROPAGANDA')
_NEWS_KEYS = ('count', 'HIGH', 'VERY_HIGH', 'MOSTLY_FACTUAL', 'MIXED', 'VERY_LOW', 'LOW')


def _fresh_counters():
    """Return a new pair of zeroed label counters, one dict per label kind."""
    return {'politics': dict.fromkeys(_POLITICS_KEYS, 0),
            'news': dict.fromkeys(_NEWS_KEYS, 0)}


sub_reddit_dic = {}

for documents in df['documents']:
    for doc_id, text, date, sub_reddit, labels in documents:
        # Documents without any label contribute nothing (and create no entry).
        if len(labels) == 0:
            continue
        counters = sub_reddit_dic.setdefault(sub_reddit, _fresh_counters())
        for label in labels:
            # label[2] holds the political-bias values (only the first one is
            # counted), label[3] the factuality rating.
            pol_bias, news_bias = label[2][0], label[3]
            if len(pol_bias) > 0:
                counters['politics'][pol_bias] += 1
                counters['politics']['count'] += 1
            if len(news_bias) > 0:
                counters['news'][news_bias] += 1
                counters['news']['count'] += 1

theme_dic = {}

for theme, members in theme_groups.items():
    rolled_up = theme_dic[theme] = _fresh_counters()
    for sub_reddit in members:
        per_sub = sub_reddit_dic.get(sub_reddit)
        if per_sub is None:
            # Sub-reddit had no labeled document at all.
            continue
        for kind in ('politics', 'news'):
            for name, value in per_sub[kind].items():
                rolled_up[kind][name] += value

In [None]:
# Drop the notebook's reference to the corpus DataFrame and run a garbage
# collection pass so its memory can be reclaimed before plotting.
# The name `df` is undefined after this cell.
del df # Free up some RAM
gc.collect();

In [None]:
# Overview plots: one grouped bar chart per label kind, comparing all themes.
# One colour per theme (8 themes).
cmap = ['orangered', 'lime', 'aqua', 'violet', 'gold', 'grey', 'blue', 'darkmagenta']


def plot_theme_overview(kind, categories, tick_labels, title, out_path):
    """Plot the relative label distribution of every theme as grouped bars.

    Parameters
    ----------
    kind : str
        Which counter of ``theme_dic[theme]`` to plot: ``'news'`` or ``'politics'``.
    categories : list of str
        Label names to show on the x-axis, in display order.
    tick_labels : list of str
        Text drawn under each category (same length as ``categories``).
    title : str
        Figure title.
    out_path : str
        File the figure is saved to before it is shown.
    """
    themes = list(theme_dic)
    width = 1 / len(themes)
    step = 1.5 * width                       # distance between neighbouring bars
    shift = step * (len(themes) - 1) / 2     # centres each bar cluster on its tick
    positions = range(len(categories))

    plt.figure(figsize=(16, 8))
    max_y = 0
    for ind, theme in enumerate(themes):
        data = theme_dic[theme][kind]
        n = data['count']
        # A theme without a single labeled link has count == 0; draw empty
        # bars for it instead of failing with a ZeroDivisionError.
        heights = [data.get(c, 0) / n if n else 0 for c in categories]
        max_y = max(max_y, max(heights))
        plt.bar(x=[i * 3 + ind * step - shift for i in positions],
                height=heights, width=width, color=cmap[ind % len(cmap)])

    plt.xticks([i * 3 for i in positions], tick_labels)
    plt.legend(themes)
    plt.ylabel("relative percentage")
    # One tick every 10%, up to one step above the tallest bar (capped at 100%).
    n_ticks = min(11, int(max_y * 10) + 2)
    plt.yticks([i / 10 for i in range(n_ticks)], [str(i * 10) + '%' for i in range(n_ticks)])
    plt.title(title)
    plt.savefig(out_path)
    plt.show()


# News (factuality) distribution.
news_categories = ['VERY_HIGH', 'HIGH', 'MOSTLY_FACTUAL', 'MIXED', 'LOW', 'VERY_LOW']
plot_theme_overview('news', news_categories, news_categories,
                    "News bias in all groups", "./overview/news.pdf")

# Political distribution. Every second tick label is pushed down one line so
# the long category names do not overlap.
political_categories = ['EXTREME_LEFT', 'LEFT', 'LEFT_CENTER', 'LEAST_BIASED', 'RIGHT_CENTER', 'RIGHT',
                        'EXTREME_RIGHT', 'CONSPIRACY_PSEUDOSCIENCE', 'PRO_RUSSIAN_PROPAGANDA', 'PRO_SCIENCE']
plot_theme_overview('politics', political_categories,
                    [name if pos % 2 == 0 else " \n" + name for pos, name in enumerate(political_categories)],
                    "Political bias in all groups", "./overview/political.pdf")

In [None]:
# One figure per theme group and label kind, with one bar per sub-reddit.

# custom colormap. len = 12, since the biggest category consists out of 12 sub-reddits
cmap = ['orangered', 'darkmagenta', 'aqua', 'violet', 'gold', 'grey', 'blue', 'lime', 'steelblue', 'silver', 'deeppink', 'olivedrab']


def plot_group_breakdown(kind, categories, excluded, title_prefix, file_suffix):
    """Plot, for every theme group, the label distribution of its sub-reddits.

    Parameters
    ----------
    kind : str
        Which counter of ``sub_reddit_dic[sub]`` to plot: ``'politics'`` or ``'news'``.
    categories : list of str
        Label names to show on the x-axis, in display order.
    excluded : list of str
        Label names that are left out of the normalisation total, so the
        shown categories add up to 100%.
    title_prefix : str
        Text put in front of the group name in the figure title.
    file_suffix : str
        Appended to ``'./categories/' + group`` to form the output file name.
    """
    positions = range(len(categories))
    for group in theme_groups:
        # Don't iterate over sub-reddits with no labeled data.
        sub_reddits_in_group = [s for s in theme_groups[group] if s in sub_reddit_dic]
        n_subs = len(sub_reddits_in_group)
        width = 1 / n_subs if n_subs else 1
        step = 1.5 * width                   # distance between neighbouring bars
        shift = step * (n_subs - 1) / 2      # centres each bar cluster on its tick

        plt.figure(figsize=(16, 8))
        max_y = 0
        for ind, sub_reddit in enumerate(sub_reddits_in_group):
            data = sub_reddit_dic[sub_reddit][kind]
            # normalization
            n = data['count'] - sum(data[name] for name in excluded)
            # All labels of a sub-reddit may fall into the excluded categories
            # (or be empty for this kind); draw empty bars instead of dividing by zero.
            heights = [data.get(c, 0) / n if n else 0 for c in categories]
            # max height for y-axis labels
            max_y = max(max_y, max(heights))
            plt.bar(x=[i * 3 + ind * step - shift for i in positions],
                    height=heights, width=width, color=cmap[ind % len(cmap)])

        plt.xticks([i * 3 for i in positions], [c.replace('_', ' ') for c in categories])
        plt.legend(sub_reddits_in_group)
        plt.title(title_prefix + group, fontsize=20)
        plt.ylabel('relative percentage', fontsize=16)
        # One tick every 10%, up to one step above the tallest bar (capped at 100%).
        n_ticks = min(11, int(max_y * 10) + 2)
        plt.yticks([i / 10 for i in range(n_ticks)], [str(i * 10) + '%' for i in range(n_ticks)])
        plt.savefig('./categories/' + group + file_suffix)
        plt.show()


# POLITICAL BIAS
# Only the left-right spectrum is shown; the three non-spectrum categories are
# ignored and removed from the normalisation total.
plot_group_breakdown(
    'politics',
    ['EXTREME_LEFT', 'LEFT', 'LEFT_CENTER', 'LEAST_BIASED', 'RIGHT_CENTER', 'RIGHT', 'EXTREME_RIGHT'],
    ['CONSPIRACY_PSEUDOSCIENCE', 'PRO_RUSSIAN_PROPAGANDA', 'PRO_SCIENCE'],
    'Political tendencies for sub-reddits in group: ',
    '_politics.pdf',
)

# FACTUAL STRUCTURE
plot_group_breakdown(
    'news',
    ['VERY_HIGH', 'HIGH', 'MOSTLY_FACTUAL', 'MIXED', 'LOW', 'VERY_LOW'],
    [],
    'News tendencies for sub-reddits in group: ',
    '_news.pdf',
)

In [None]:
# Count, per sub-reddit, all posts and the posts carrying real / fake news
# labels; then aggregate those counters per theme and over all themed sub-reddits.

# `df` is deleted further up to free RAM for the plots. Reload it if it is
# gone so this cell also works on "Restart Kernel -> Run All".
if 'df' not in globals():
    df = pd.read_pickle('../data/reddit_corpus_balanced_filtered.gzip', compression='gzip')


def _empty_post_counts():
    """Return a new zeroed post counter."""
    return {'total_posts': 0, 'real_news': 0, 'fake_news': 0}


all_post_dic = {}

for documents in df['documents']:
    for doc_id, text, date, sub_reddit, labels in documents:
        counts = all_post_dic.setdefault(sub_reddit, _empty_post_counts())
        counts['total_posts'] += 1
        # Every post with at least one label counts as labeled. (The former
        # `len(labels) == 1` test treated posts with several labels as unlabeled.)
        if len(labels) > 0:
            # If any fakenews in post mark post as fake
            # If counting everything: 'real_news': 102024, 'fake_news': 10557
            # If any label is fake_news label as fake_news: 'real_news': 69182, 'fake_news': 8884
            if any(label[1] == 1 for label in labels):
                counts['fake_news'] += 1
            else:
                counts['real_news'] += 1

all_post_dic['TOTAL'] = _empty_post_counts()
counted_in_total = set()
for theme in theme_groups:
    all_post_dic[theme] = _empty_post_counts()
    for sub_reddit in theme_groups[theme]:
        # A themed sub-reddit that does not occur in the corpus gets a zero
        # entry, so this loop and later lookups do not fail with a KeyError.
        sub_counts = all_post_dic.setdefault(sub_reddit, _empty_post_counts())
        for key, value in sub_counts.items():
            all_post_dic[theme][key] += value
        # Some sub-reddits belong to two themes (e.g. r/CovidVaccinated);
        # add each of them to TOTAL only once.
        if sub_reddit not in counted_in_total:
            counted_in_total.add(sub_reddit)
            for key, value in sub_counts.items():
                all_post_dic['TOTAL'][key] += value

In [None]:
# Display the counters stored under the 'TOTAL' key.
all_post_dic['TOTAL']

In [None]:
# Print the column sums of `num_docs`, `fn_amounts` and `rn_amounts`.
# `df` is deleted further up to free RAM for the plots. Reload it if it is
# gone so this cell also works on "Restart Kernel -> Run All".
if 'df' not in globals():
    df = pd.read_pickle('../data/reddit_corpus_balanced_filtered.gzip', compression='gzip')

print('num_docs', sum(df['num_docs']))
print('fn', sum(df['fn_amounts']))
print('rn', sum(df['rn_amounts']))

In [None]:
def disable_box(ax):
    """Hide all four spines (the frame) of the axes `ax`."""
    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(False)
    
def make_dist_bar(n, key, title, all_post_dic, subs, ax, cmap):
    """Draw one stacked horizontal bar with each sub-reddit's share of `key`.

    Segments are ordered from the largest to the smallest share. Segments
    larger than 10% get an x-tick with the sub-reddit name and, if the
    sub-reddit is listed in the global `bias` dict, three rows of markers:
    '+' for 'pro', '_' for 'anit' and 'o' for 'unbiased'.

    Parameters
    ----------
    n : number
        Total that the shares are relative to. With ``n == 0`` every share is 0.
    key : str
        Counter to plot, e.g. 'total_posts', 'real_news' or 'fake_news'.
    title : str
        Title of the axes.
    all_post_dic : dict
        Maps each sub-reddit name to a dict containing `key`.
    subs : list of str
        Sub-reddits to show; the position in this list selects the colour.
    ax : matplotlib axes
        Axes to draw on. Its figure is resized to 15 x 8 inches.
    cmap : list of str
        Colours; the index wraps around if there are more sub-reddits than colours.
    """
    ax.set_title(title)

    # Fix the colour by position in `subs`, so a sub-reddit keeps its colour
    # in every bar regardless of the sort order.
    col_ind = {sub: ind for ind, sub in enumerate(subs)}
    ordered = sorted(col_ind, key=lambda sub: all_post_dic[sub][key], reverse=True)

    step = 0.05  # horizontal distance between two markers
    x_ticks = {}
    cur_n = 0
    for sub in ordered:
        # Nothing to distribute when the total is 0; avoid a ZeroDivisionError.
        percentage = all_post_dic[sub][key] / n if n else 0
        colour = cmap[col_ind[sub] % len(cmap)]
        ax.barh(0, percentage, left=cur_n, color=colour, edgecolor=colour, alpha=0.5)

        if percentage > 0.1:
            x_ticks[sub] = cur_n + percentage / 2

            if sub in bias['pro']:
                m = '+'
            elif sub in bias['anit']:
                m = '_'
            elif sub in bias['unbiased']:
                m = 'o'
            else:
                m = None

            if m is not None:
                n_points = int(percentage / step)
                # Slack left over after fitting n_points steps, split evenly on both sides.
                left = (percentage - step * n_points) / 2
                # Build the positions from an explicit count. np.arange(start, stop, 0.05)
                # returns one element fewer whenever the share is a multiple of the
                # step, which no longer matches the length of the y-lists below.
                inner_x = cur_n + step + left + step * np.arange(n_points - 1)
                outer_x = cur_n + step / 2 + left + step * np.arange(n_points)
                ax.scatter(inner_x, [0] * (n_points - 1), marker=m, s=45, c='black')
                ax.scatter(outer_x, [0.2] * n_points, marker=m, s=45, c='black')
                ax.scatter(outer_x, [-0.2] * n_points, marker=m, s=45, c='black')
        cur_n += percentage

    ax.set_xticks(list(x_ticks.values()))
    ax.set_xticklabels(list(x_ticks.keys()))
    #ax.set_yticks([])
    ax.figure.set_size_inches(15, 8)

def make_dist_plot(group, subs):
    """Show five stacked horizontal bars summarising the posts of one group.

    The rows are: labeled vs. unlabeled posts, real vs. fake news among the
    labeled posts, and the share of every sub-reddit in all posts, in the
    real-news posts and in the fake-news posts of the group.

    Parameters
    ----------
    group : str
        Key of the group's counters in the global `all_post_dic`.
    subs : list of str
        Sub-reddits of the group; each must have an entry in `all_post_dic`.
    """
    data = all_post_dic[group]
    n = data['total_posts']
    rn = data['real_news']
    fn = data['fake_news']
    labeled_n = rn + fn

    def share(part, whole):
        # A group can be empty or have no labeled posts; report a share of 0
        # instead of failing with a ZeroDivisionError.
        return part / whole if whole else 0

    fig, ax = plt.subplots(5, 1, figsize=(12, 8))
    plt.subplots_adjust(hspace=1.5, wspace=0)
    for a in ax:
        disable_box(a)

    labeled = share(labeled_n, n)
    unlabeled = share(n - labeled_n, n)
    ax[0].set_title('Overall distribution of labeled and unlabeled posts')
    ax[0].barh(0, labeled, color='red', left=0)
    ax[0].barh(0, unlabeled, color='steelblue', left=labeled)
    # Put each tick in the middle of its segment.
    ax[0].set_xticks([0.5 * labeled, 0.5 * unlabeled + labeled])
    ax[0].set_xticklabels(['Labeled Posts - {:.3f}%'.format(share(100 * labeled_n, n)),
                           'Unlabeled Posts - {:.3f}%'.format(share(100 * (n - labeled_n), n))])
    ax[0].set_yticks([])

    real = share(rn, labeled_n)
    fake = share(fn, labeled_n)
    ax[1].set_title('Distribution of real and fake news in labeled posts')
    ax[1].barh(0, real, color='orange', left=0)
    ax[1].barh(0, fake, color='indianred', left=real)
    ax[1].set_xticks([0.5 * real, 0.5 * fake + real])
    ax[1].set_xticklabels(['Real News Posts - {:.3f}%'.format(share(100 * rn, labeled_n)),
                           'Fake News Posts - {:.3f}%'.format(share(100 * fn, labeled_n))])
    ax[1].set_yticks([])

    # custom colormap. len = 12, since the biggest category consists out of 12 sub-reddits
    cmap = ['orangered', 'darkmagenta', 'aqua', 'violet', 'gold', 'grey', 'blue', 'lime', 'steelblue', 'silver', 'deeppink', 'olivedrab']

    # Per-sub-reddit shares. The first row is relative to ALL posts of the
    # group (labeled or not), hence its title.
    rows = [
        (n, 'total_posts', 'Distribution of all posts in groups', ax[2]),
        (rn, 'real_news', 'Distribution of real news in groups', ax[3]),
        (fn, 'fake_news', 'Distribution of fake news in groups', ax[4]),
    ]
    for total, key, title, axis in rows:
        if total:
            make_dist_bar(total, key, title, all_post_dic, subs, axis, cmap)
        else:
            # Nothing to distribute: keep the titled, empty row.
            axis.set_title(title)
            axis.set_xticks([])

    #plt.savefig('./categories/' + group + '_dist.pdf', bbox_inches='tight')
    plt.show()

In [None]:
# One distribution figure per theme group.
for group, subs in theme_groups.items():
    make_dist_plot(group, subs)

In [None]:
# Sanity check: print the group's counters and those of its sub-reddits, and
# display the summed fake-news count of the sub-reddits.
print(all_post_dic['general-political-debate'])
s = 0
for sub in theme_groups['general-political-debate']:
    sub_counts = all_post_dic[sub]
    print(sub_counts)
    s += sub_counts['fake_news']
s

In [None]:
# Scratch check: which values does np.arange return for a float step?
# NOTE(review): NumPy documents that with a non-integer step the length of the
# result can be off by one because of floating-point rounding.
np.arange(0.2, 0.4, 0.05)