In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import gc

In [None]:
# Load the corpus from a gzip-compressed pickle file. Later cells read the
# 'documents' column of this object row by row.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
df = pd.read_pickle('../data/reddit_corpus_balanced_filtered.gzip', compression='gzip')

In [None]:
# Mapping from theme name to the sub-reddits that belong to that theme.
# 'r/CovidVaccinated' is listed under both 'SARS-CoV-2' and 'Vaccines'.
theme_groups = {
    'SARS-CoV-2': [
        'r/CovidVaccinated', 'r/Masks4All', 'r/NoLockdownsNoMasks', 'r/EndTheLockdowns',
        'r/COVID19', 'r/COVID19positive', 'r/CoronavirusCanada', 'r/CoronavirusRecession',
        'r/CoronavirusUK', 'r/CoronavirusUS', 'r/Coronavirus', 'r/LockdownSkepticism',
    ],
    'Vaccines': [
        'r/CovidVaccinated', 'r/VACCINES', 'r/vaxxhappened', 'r/AntiVaxxers',
        'r/antivax', 'r/TrueAntiVaccination', 'r/DebateVaccine', 'r/DebateVaccines',
    ],
    'Abortion': [
        'r/AskProchoice', 'r/prochoice', 'r/insaneprolife', 'r/prolife',
        'r/ProLifeLibertarians', 'r/Abortiondebate', 'r/abortion',
    ],
    'womens-and-mens-rights': [
        'r/Feminism', 'r/feminisms', 'r/RadicalFeminism', 'r/RadicalFeminismUSA',
        'r/MRActivism', 'r/MensRights', 'r/antifeminists', 'r/feminismformen',
        'r/masculism', 'r/GenderCritical', 'r/Egalitarianism',
    ],
    'Gun-control': [
        'r/Firearms', 'r/GunsAreCool', 'r/liberalgunowners', 'r/progun',
        'r/guncontrol', 'r/GunDebates', 'r/GunResearch', 'r/gunpolitics',
    ],
    'Climate-change': [
        'r/climateskeptics', 'r/GlobalClimateChange', 'r/climate', 'r/climatechange',
    ],
    '5G': ['r/5GDebate'],
    'general-political-debate': [
        'r/JoeBiden', 'r/LeftistsForMen', 'r/Liberal', 'r/LockdownCriticalLeft',
        'r/democrats', 'r/Conservative', 'r/ConservativesOnly', 'r/conservatives',
        'r/Republican', 'r/RepublicanValues', 'r/politics', 'r/uspolitics',
    ],
}

# NOTE: r/ProLifeLibertarians is missing any labelled data.

In [None]:
# Extract the label counts and sort them into dictionaries for the later plots.
#
# sub_reddit_dic[sub_reddit] = {'politics': {'count': n, <label>: n, ...},
#                               'news':     {'count': n, <label>: n, ...}}
# Only sub-reddits with at least one labelled document get an entry.
sub_reddit_dic = {}

# Only the 'documents' column is needed, so iterate it directly instead of
# materialising every row with iterrows().
# Each document is unpacked as (doc_id, text, date, sub_reddit, labels).
for documents in df['documents']:
    for _doc_id, _text, _date, sub_reddit, labels in documents:
        if len(labels) == 0:
            continue
        if sub_reddit not in sub_reddit_dic:
            sub_reddit_dic[sub_reddit] = {'politics': {'count': 0}, 'news': {'count': 0}}
        politics_counts = sub_reddit_dic[sub_reddit]['politics']
        news_counts = sub_reddit_dic[sub_reddit]['news']
        for label in labels:
            # NOTE(review): label[2][0] is read as the political-bias value and
            # label[3] as the news value -- confirm against the corpus schema.
            pol_bias = label[2][0]
            news_bias = label[3]
            if len(pol_bias) > 0:
                politics_counts[pol_bias] = politics_counts.get(pol_bias, 0) + 1
                politics_counts['count'] += 1
            if len(news_bias) > 0:
                news_counts[news_bias] = news_counts.get(news_bias, 0) + 1
                news_counts['count'] += 1

# Sum the per-sub-reddit counters into one counter per theme group.
theme_dic = {}

for theme in theme_groups:
    # Pre-seed the known labels with 0 so every theme exposes the same keys,
    # even when none of its sub-reddits carries that label.
    theme_dic[theme] = {
        'politics': {'count': 0, 'LEFT_CENTER': 0, 'LEFT': 0, 'LEAST_BIASED': 0, 'RIGHT_CENTER': 0, 'SATIRE': 0, 'PRO_SCIENCE': 0, 'RIGHT': 0, 'EXTREME_LEFT': 0, 'CONSPIRACY_PSEUDOSCIENCE': 0, 'EXTREME_RIGHT': 0, 'PRO_RUSSIAN_PROPAGANDA': 0},
        'news': {'count': 0, 'HIGH': 0, 'VERY_HIGH': 0, 'MOSTLY_FACTUAL': 0, 'MIXED': 0, 'VERY_LOW': 0, 'LOW': 0},
    }
    for sub_reddit in theme_groups[theme]:
        # Sub-reddits without any labelled document have no entry; skip them.
        if sub_reddit not in sub_reddit_dic:
            continue
        for kind in ('politics', 'news'):
            totals = theme_dic[theme][kind]
            for key, value in sub_reddit_dic[sub_reddit][kind].items():
                # .get() keeps the aggregation from failing with a KeyError when
                # the data contains a label that is not in the pre-seeded list.
                totals[key] = totals.get(key, 0) + value

In [None]:
# The counters built above are all the later cells need, so drop the reference
# to the DataFrame and trigger a garbage collection to free up some RAM.
# Re-running this cell without reloading `df` first raises a NameError.
del df # Free up some RAM
# The trailing semicolon suppresses the notebook output of gc.collect().
gc.collect();

In [None]:
# Grouped bar charts of the label distribution per theme group: one single
# figure for the news labels and one for the political labels.
cmap = ['orangered', 'lime', 'aqua', 'violet', 'gold', 'grey', 'blue', 'darkmagenta']
width = 1/len(theme_dic)


def plot_theme_distribution(kind, categories, title, out_path, stagger_labels=False):
    """Draw, save and show one grouped bar chart with one bar series per theme.

    Reads the module-level ``theme_dic``, ``cmap`` and ``width``.

    kind: 'politics' or 'news' -- which counter of every theme to plot.
    categories: label names, in the order they are placed on the x axis.
    title: title of the figure.
    out_path: path the figure is saved to.
    stagger_labels: if True, every second tick label is pushed one line down
        so that long label names do not overlap.
    """
    plt.figure(figsize=(16,8))
    max_y = 0
    for ind, theme in enumerate(theme_dic):
        data = theme_dic[theme][kind]
        n = data['count']
        # A theme without any labelled document has count 0; plot zeros
        # instead of failing with a ZeroDivisionError.
        heights = [data.get(c, 0)/n if n else 0 for c in categories]
        max_y = max(max_y, max(heights))
        # Categories are 3 units apart; the bars of the themes are centred
        # around each category position.
        positions = [i*3+ind*(1.5*width)-(1.5*width*(len(theme_dic)-1)/2) for i in range(len(categories))]
        # The label is attached to the bars themselves so the legend cannot
        # get out of step with what was drawn.
        plt.bar(x=positions, height=heights, width=width, color=cmap[ind % len(cmap)], label=theme)
    if stagger_labels:
        tick_labels = [c if pos % 2 == 0 else " \n"+c for pos, c in enumerate(categories)]
    else:
        tick_labels = categories
    plt.xticks([i*3 for i in range(len(categories))], tick_labels)
    plt.legend()
    plt.ylabel("relative percentage")
    # One tick every 10%, up to one step above the tallest bar (capped at 100%).
    n_ticks = min(11, int(max_y*10) + 2)
    plt.yticks([i/10 for i in range(n_ticks)], [str(i*10)+'%' for i in range(n_ticks)])
    plt.title(title)
    plt.savefig(out_path)
    plt.show()


# Plot of news distribution in categories. Single plot.
categories = ['VERY_HIGH','HIGH','MOSTLY_FACTUAL','MIXED','LOW','VERY_LOW']
plot_theme_distribution('news', categories, "News bias in all groups", "./overview/news.pdf")

# Plot of political distribution in categories. Single plot.
categories = ['EXTREME_LEFT','LEFT','LEFT_CENTER','LEAST_BIASED','RIGHT_CENTER','RIGHT','EXTREME_RIGHT','CONSPIRACY_PSEUDOSCIENCE','PRO_RUSSIAN_PROPAGANDA', 'PRO_SCIENCE']
plot_theme_distribution('politics', categories, "Political bias in all groups", "./overview/political.pdf", stagger_labels=True)

In [None]:
# Display the aggregated counters of the '5G' theme.
theme_dic['5G']

In [None]:
# One figure per theme group: political-label distribution of each sub-reddit
# of that group, drawn as grouped bars.

cmap = ['orangered', 'darkmagenta', 'aqua', 'violet', 'gold', 'grey', 'blue', 'lime', 'steelblue', 'silver', 'deeppink', 'olivedrab']
# Defined once up front so the tick code below never depends on the inner loop
# having run (every sub-reddit of a group may be skipped).
categories = ['EXTREME_LEFT','LEFT','LEFT_CENTER','LEAST_BIASED','RIGHT_CENTER','RIGHT','EXTREME_RIGHT','CONSPIRACY_PSEUDOSCIENCE','PRO_RUSSIAN_PROPAGANDA','PRO_SCIENCE']

for group in theme_groups:
    sub_reddits = theme_groups[group]
    # max(..., 1) keeps an empty group from dividing by zero.
    width = 1/max(len(sub_reddits), 1)
    plt.figure(figsize=(16,8))
    max_y = 0
    any_plotted = False
    for ind, sub_reddit in enumerate(sub_reddits):
        # Sub-reddits without labelled data have no counters; skip them but
        # keep their slot (and colour) so the layout stays stable.
        if sub_reddit not in sub_reddit_dic:
            continue
        data = sub_reddit_dic[sub_reddit]['politics']
        n = data['count']
        heights = [data.get(c, 0)/n if n else 0 for c in categories]
        max_y = max(max_y, max(heights))
        positions = [i*3+ind*(1.5*width)-(1.5*width*(len(sub_reddits)-1)/2) for i in range(len(categories))]
        # Label every bar series with its own sub-reddit. Passing the full list
        # to plt.legend() assigns names by position, so every series drawn after
        # a skipped sub-reddit would get the wrong name.
        plt.bar(x=positions, height=heights, width=width, color=cmap[ind % len(cmap)], label=sub_reddit)
        any_plotted = True
    # Every second tick label is pushed one line down so long names do not overlap.
    plt.xticks([i*3 for i in range(len(categories))], [c if pos % 2 == 0 else " \n"+c for pos, c in enumerate(categories)])
    if any_plotted:
        plt.legend()
    plt.title("Political tendencies for sub-reddits in group: " + group, fontsize=20)
    plt.ylabel("relative percentage", fontsize=16)
    # One tick every 10%, up to one step above the tallest bar (capped at 100%).
    n_ticks = min(11, int(max_y*10) + 2)
    plt.yticks([i/10 for i in range(n_ticks)], [str(i*10)+'%' for i in range(n_ticks)])
    plt.savefig("./categories/" + group + ".pdf")
    plt.show()