In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the Reddit corpus from a gzip-compressed pandas pickle.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
df = pd.read_pickle(
    '../data/reddit_corpus_balanced_filtered.gzip',
    compression='gzip',
)

In [None]:
# Subreddits grouped by discussion theme. A subreddit may belong to more than
# one theme (e.g. 'r/CovidVaccinated' is listed under two of them).
theme_groups = {
    'SARS-CoV-2': [
        'r/CovidVaccinated', 'r/Masks4All', 'r/NoLockdownsNoMasks',
        'r/EndTheLockdowns', 'r/COVID19', 'r/COVID19positive',
        'r/CoronavirusCanada', 'r/CoronavirusRecession', 'r/CoronavirusUK',
        'r/CoronavirusUS', 'r/Coronavirus', 'r/LockdownSkepticism',
    ],
    'Vaccines': [
        'r/CovidVaccinated', 'r/VACCINES', 'r/vaxxhappened', 'r/AntiVaxxers',
        'r/antivax', 'r/TrueAntiVaccination', 'r/DebateVaccine',
        'r/DebateVaccines',
    ],
    'Abortion': [
        'r/AskProchoice', 'r/prochoice', 'r/insaneprolife', 'r/prolife',
        'r/ProLifeLibertarians', 'r/Abortiondebate', 'r/abortion',
    ],
    'womens-and-mens-rights': [
        'r/Feminism', 'r/feminisms', 'r/RadicalFeminism',
        'r/RadicalFeminismUSA', 'r/MRActivism', 'r/MensRights',
        'r/antifeminists', 'r/feminismformen', 'r/masculism',
        'r/GenderCritical', 'r/Egalitarianism',
    ],
    'Gun-control': [
        'r/Firearms', 'r/GunsAreCool', 'r/liberalgunowners', 'r/progun',
        'r/guncontrol', 'r/GunDebates', 'r/GunResearch', 'r/gunpolitics',
    ],
    'Climate-change': [
        'r/climateskeptics', 'r/GlobalClimateChange', 'r/climate',
        'r/climatechange',
    ],
    '5G': ['r/5GDebate'],
    'general-political-debate': [
        'r/JoeBiden', 'r/LeftistsForMen', 'r/Liberal',
        'r/LockdownCriticalLeft', 'r/democrats', 'r/Conservative',
        'r/ConservativesOnly', 'r/conservatives', 'r/Republican',
        'r/RepublicanValues', 'r/politics', 'r/uspolitics',
    ],
}

In [None]:
# Numeric score assigned to each bias label; labels mapped to None carry no
# score. Pair order is kept so that iteration order of the dict is unchanged.
map_biases = dict([
    ('', None),
    ('LEFT_CENTER', -0.5),
    ('RIGHT', 1),
    ('EXTREME_RIGHT', 1.5),
    ('LEFT', -1),
    ('RIGHT_CENTER', 0.5),
    ('PRO_SCIENCE', 1),
    ('CONSPIRACY_PSEUDOSCIENCE', -1),
    ('SATIRE', None),
    ('PRO_RUSSIAN_PROPAGANDA', None),
    ('LEAST_BIASED', 0),
    ('EXTREME_LEFT', -1.5),
])
# Why is PRO_SCIENCE mapped to 1 and CONSPIRACY_PSEUDOSCIENCE to -1?
# If there are a lot of pro-science posts, there wouldn't be a difference from a lot of right-biased posts.

# Labels that place a source on the left/right political axis.
political_bias = [
    'LEFT_CENTER', 'RIGHT', 'EXTREME_RIGHT', 'LEFT',
    'RIGHT_CENTER', 'LEAST_BIASED', 'EXTREME_LEFT',
]

# Labels describing a source's stance towards science.
science_bias = ['PRO_SCIENCE', 'CONSPIRACY_PSEUDOSCIENCE']

# Remaining labels, including the empty label.
other_bias = ['', 'SATIRE', 'PRO_RUSSIAN_PROPAGANDA']

# Numeric score assigned to each factual-reporting label.
map_factual = {
    'VERY_LOW': -1.5,
    'LOW': -1,
    'MOSTLY_FACTUAL': 0.5,
    'VERY_HIGH': 1.5,
    'HIGH': 1,
    'MIXED': -0.5,
}

In [None]:
# Count, per subreddit, how often each political-bias label and each
# factual-reporting label occurs across the labelled documents. Every counter
# also keeps a running total under the 'sum' key.
bias_counter = {}
factual_counter = {}

for documents in df['documents']:
    for doc_id, text, date, sub_reddit, labels in documents:
        # Documents without any label contribute nothing.
        if len(labels) == 0:
            continue

        # First labelled document of a subreddit: start both counters at zero.
        if sub_reddit not in bias_counter:
            bias_counter[sub_reddit] = dict.fromkeys(
                ['sum', 'LEFT_CENTER', 'LEFT', 'LEAST_BIASED', 'RIGHT_CENTER',
                 'RIGHT', 'EXTREME_LEFT', 'EXTREME_RIGHT'],
                0,
            )
            factual_counter[sub_reddit] = dict.fromkeys(
                ['sum', 'HIGH', 'VERY_HIGH', 'MOSTLY_FACTUAL', 'MIXED',
                 'VERY_LOW', 'LOW'],
                0,
            )

        pol_counts = bias_counter[sub_reddit]
        fact_counts = factual_counter[sub_reddit]

        for label in labels:
            pol_bias = label[2][0]
            fact_bias = label[3]

            # Only labels on the political axis are counted as political bias.
            if len(pol_bias) > 0 and pol_bias in political_bias:
                pol_counts[pol_bias] += 1
                pol_counts['sum'] += 1

            # Any non-empty factual label is counted.
            if len(fact_bias) > 0:
                fact_counts[fact_bias] += 1
                fact_counts['sum'] += 1
        

In [None]:
# Reduce each subreddit's label counts to two weighted averages: one over the
# political-bias scores and one over the factual-reporting scores. A subreddit
# whose total count is zero gets None for that factor.
political_factors = {}
factual_factors = {}

for members in theme_groups.values():
    for sub in members:
        # Subreddits that never appeared with a label have no counters.
        if sub not in bias_counter:
            continue

        pol_counts = bias_counter[sub]
        fact_counts = factual_counter[sub]

        pol_weighted = sum(
            count * map_biases[name]
            for name, count in pol_counts.items()
            if name in political_bias
        )
        pol_total = pol_counts['sum']
        pol_score = pol_weighted / pol_total if pol_total else None

        fact_weighted = sum(
            count * map_factual[name]
            for name, count in fact_counts.items()
            if name != 'sum'
        )
        fact_total = fact_counts['sum']
        fact_score = fact_weighted / fact_total if fact_total else None

        political_factors[sub] = pol_score
        factual_factors[sub] = fact_score

In [None]:
# Show both factor dictionaries, one per line.
print(political_factors, factual_factors, sep='\n')

In [None]:
# Marker colours, indexed by a subreddit's position within its theme group.
cmap = ['orangered', 'darkmagenta', 'aqua', 'violet', 'gold', 'grey', 'blue', 'lime', 'steelblue', 'silver', 'deeppink', 'olivedrab']

# One figure per theme group: each subreddit is a single marker placed at
# (political factor, factual factor). The figure is saved as a PDF and shown.
for group in theme_groups:
    fig = plt.figure(figsize=(10,10))
    plotted_any = False
    for ind, sub in enumerate(theme_groups[group]):
        pol = political_factors.get(sub)
        fac = factual_factors.get(sub)
        # Skip subreddits without counters, and those whose factor is None
        # (zero label count) — they have no position on the plot.
        if pol is None or fac is None:
            continue
        # The label is attached to the artist itself so the legend cannot drift
        # out of sync when subreddits are skipped. The colour index wraps so a
        # group longer than the palette does not raise IndexError.
        plt.scatter(pol, fac, color=cmap[ind % len(cmap)], marker='x', linewidths=3, label=sub)
        plotted_any = True
    plt.yticks([-1.5,-1,-0.5,0,0.5,1,1.5],['VERY LOW', 'LOW', 'MIXED', 'UNBIASED', 'MOSTLY FACTUAL', 'HIGH', 'VERY HIGH'])
    plt.xticks([-1.5,-1,-0.5,0,0.5,1,1.5],['EXTREME LEFT','LEFT','LEFT CENTER','LEAST BIASED','RIGHT CENTER','RIGHT','EXTREME RIGHT'])
    plt.xlim(-1.6, 1.6)
    plt.ylim(-1.6, 1.6)
    if plotted_any:
        # Legend entries come from the per-artist labels set above.
        plt.legend()
    plt.savefig('./categories/' + group + '_pol_and_fac_bias.pdf')
    plt.show()
    # Release the figure explicitly; figures created in a loop otherwise pile up.
    plt.close(fig)