In [None]:
import os
import pickle
import torch
import tensorflow as tf
import pandas as pd
import numpy as np
from tqdm import tqdm
from tensorboard.plugins import projector

In [None]:
# Load the precomputed post embeddings. Later cells use `embeddings` as a dict
# keyed by document id whose values expose `.numpy()`.
# NOTE: pickle.load can execute arbitrary code while deserialising, so this
# file must come from a trusted source.
with open('../data/post_emb.pkl', 'rb') as emb_file:
    embeddings = pickle.load(emb_file)

In [None]:
# Load the gzip-compressed, pickled corpus frame. Later cells read its
# 'documents' column, one collection of document tuples per row.
df = pd.read_pickle(
    '../data/reddit_corpus_balanced_filtered.gzip',
    compression='gzip',
)

In [None]:
# Collect per-document metadata and drop embeddings without a usable label.
#
# A document is kept only when it carries exactly one label entry whose third
# field has a non-empty first element. For every kept document, id_to_info
# stores:
#   's' -> the document's subreddit
#   'n' -> 'Fake' when the label's second field equals 1, otherwise 'Real'
#   'p' -> the first element of the label's third field
#          (presumably the political-bias string; verify against the corpus)
#
# NOTE: this cell mutates `embeddings` in place; re-run the loading cell first
# if the unfiltered embeddings are needed again.
id_to_info = {}

# Iterate the 'documents' column directly instead of going through iterrows().
for documents in tqdm(df['documents'], total=len(df)):
    for doc_id, text, date, sub_reddit, labels in documents:
        if len(labels) == 1 and len(labels[0][2][0]) > 0:
            label = labels[0]
            id_to_info[doc_id] = {
                's': sub_reddit,
                'n': 'Fake' if label[1] == 1 else 'Real',
                'p': label[2][0],
            }
        else:
            # Unusable label: make sure no embedding survives for this document.
            embeddings.pop(doc_id, None)

# Remove embeddings whose document never appeared in the corpus with a usable
# label. Iterate over a snapshot of the keys: deleting from a dict while
# iterating its live key view raises
# "RuntimeError: dictionary changed size during iteration".
for key in list(embeddings):
    if key not in id_to_info:
        del embeddings[key]

In [None]:
# Theme name -> list of subreddits belonging to that theme.
theme_groups = {
    'SARS-CoV-2': [
        'r/CovidVaccinated', 'r/Masks4All', 'r/NoLockdownsNoMasks',
        'r/EndTheLockdowns', 'r/COVID19', 'r/COVID19positive',
        'r/CoronavirusCanada', 'r/CoronavirusRecession', 'r/CoronavirusUK',
        'r/CoronavirusUS', 'r/Coronavirus', 'r/LockdownSkepticism',
        'r/NoNewNormal',
    ],
    'Vaccines': [
        'r/CovidVaccinated', 'r/VACCINES', 'r/vaxxhappened', 'r/AntiVaxxers',
        'r/antivax', 'r/TrueAntiVaccination', 'r/DebateVaccine',
        'r/DebateVaccines',
    ],
    'Abortion': [
        'r/AskProchoice', 'r/prochoice', 'r/insaneprolife', 'r/prolife',
        'r/ProLifeLibertarians', 'r/Abortiondebate', 'r/abortion',
    ],
    'womens-and-mens-rights': [
        'r/Feminism', 'r/feminisms', 'r/RadicalFeminism',
        'r/RadicalFeminismUSA', 'r/MRActivism', 'r/MensRights',
        'r/antifeminists', 'r/feminismformen', 'r/masculism',
        'r/GenderCritical', 'r/Egalitarianism',
    ],
    'Gun-control': [
        'r/Firearms', 'r/GunsAreCool', 'r/liberalgunowners', 'r/progun',
        'r/guncontrol', 'r/GunDebates', 'r/GunResearch', 'r/gunpolitics',
    ],
    'Climate-change': [
        'r/climateskeptics', 'r/GlobalClimateChange', 'r/climate',
        'r/climatechange',
    ],
    '5G': ['r/5GDebate'],
    'general-political-debate': [
        'r/JoeBiden', 'r/LeftistsForMen', 'r/Liberal',
        'r/LockdownCriticalLeft', 'r/democrats', 'r/Conservative',
        'r/ConservativesOnly', 'r/conservatives', 'r/Republican',
        'r/RepublicanValues', 'r/politics', 'r/uspolitics', 'r/Impeach_Trump',
    ],
}

# Subreddit -> theme lookup. A subreddit listed under several themes
# ('r/CovidVaccinated' is in both 'SARS-CoV-2' and 'Vaccines') ends up mapped
# to the theme defined last, because later assignments overwrite earlier ones.
inverse_theme_groups = {}
for theme, members in theme_groups.items():
    for sub in members:
        inverse_theme_groups[sub] = theme

In [None]:
# One tab-separated metadata row per embedding, in the dict's iteration order:
#   "<news>-News, <bias> bias, <subreddit>" \t <news> \t <bias> \t <theme>
# A subreddit missing from inverse_theme_groups raises KeyError here.
labels = [
    "\t".join((
        info['n'] + "-News, " + info['p'] + " bias, " + info['s'],
        info['n'],
        info['p'],
        inverse_theme_groups[info['s']],
    ))
    for info in (id_to_info[doc_id] for doc_id in embeddings)
]
# Stack the embedding vectors in the same order as the metadata rows.
tensors = np.array([vector.numpy() for vector in embeddings.values()])

log_dir = './embeddings/subreddits'

# Create the output directory; an already existing directory is fine.
os.makedirs(log_dir, exist_ok=True)

# Write the metadata file: a header line followed by one row per embedding.
# NOTE(review): the last column is headed "Subreddit" but holds the theme
# group looked up in inverse_theme_groups; the subreddit itself only appears
# inside the first column.
with open(os.path.join(log_dir, 'metadata.tsv'), "w", encoding='utf-8') as metadata_file:
    metadata_file.write("Index\tNews\tPolitical\tSubreddit\n")
    metadata_file.writelines(row + "\n" for row in labels)

# Wrap the embedding matrix in a variable so it can be checkpointed. Every row
# is kept, and row order matches the metadata rows written above.
weights = tf.Variable(tensors)
# Save the variable under the attribute name `embedding`; that name determines
# the tensor name referenced in the projector config below.
checkpoint = tf.train.Checkpoint(embedding=weights)
checkpoint.save(os.path.join(log_dir, "embedding.ckpt"))

# Point the projector at the checkpointed tensor and its metadata file.
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
# The checkpointed tensor name is the attribute name suffixed by
# `/.ATTRIBUTES/VARIABLE_VALUE`.
embedding.tensor_name = 'embedding/.ATTRIBUTES/VARIABLE_VALUE'
embedding.metadata_path = 'metadata.tsv'
projector.visualize_embeddings(log_dir, config)

In [None]:
# Rebuild the subreddit -> theme lookup from theme_groups. This recomputes the
# same mapping that the theme-definition cell already builds; a subreddit
# listed under several themes keeps the theme that comes last.
inverse_theme_groups = {}
for theme, members in theme_groups.items():
    for sub in members:
        inverse_theme_groups[sub] = theme