In [None]:
import pandas as pd
import torch
import matplotlib.pyplot as plt
import os
import tensorflow as tf
import numpy as np
import pickle
from tensorboard.plugins import projector

In [None]:
# Subreddits grouped by debate theme. The insertion order of the themes is
# significant: it is the order in which themes are visited when the reverse
# lookup below is built.
theme_groups = {
    'SARS-CoV-2': ['r/CovidVaccinated', 'r/Masks4All', 'r/NoLockdownsNoMasks', 'r/EndTheLockdowns', 'r/COVID19', 'r/COVID19positive', 'r/CoronavirusCanada', 'r/CoronavirusRecession', 'r/CoronavirusUK', 'r/CoronavirusUS', 'r/Coronavirus', 'r/LockdownSkepticism', 'r/NoNewNormal'],
    'Vaccines': ['r/CovidVaccinated', 'r/VACCINES', 'r/vaxxhappened', 'r/AntiVaxxers', 'r/antivax', 'r/TrueAntiVaccination', 'r/DebateVaccine', 'r/DebateVaccines'],
    'Abortion': ['r/AskProchoice', 'r/prochoice', 'r/insaneprolife', 'r/prolife', 'r/ProLifeLibertarians', 'r/Abortiondebate', 'r/abortion'],
    'womens-and-mens-rights': ['r/Feminism', 'r/feminisms', 'r/RadicalFeminism', 'r/RadicalFeminismUSA', 'r/MRActivism', 'r/MensRights', 'r/antifeminists', 'r/feminismformen', 'r/masculism', 'r/GenderCritical', 'r/Egalitarianism'],
    'Gun-control': ['r/Firearms', 'r/GunsAreCool', 'r/liberalgunowners', 'r/progun', 'r/guncontrol', 'r/GunDebates', 'r/GunResearch', 'r/gunpolitics'],
    'Climate-change': ['r/climateskeptics', 'r/GlobalClimateChange', 'r/climate', 'r/climatechange'],
    '5G': ['r/5GDebate'],
    'general-political-debate': ['r/JoeBiden', 'r/LeftistsForMen', 'r/Liberal', 'r/LockdownCriticalLeft', 'r/democrats', 'r/Conservative', 'r/ConservativesOnly', 'r/conservatives', 'r/Republican', 'r/RepublicanValues', 'r/politics', 'r/uspolitics', 'r/Impeach_Trump'],
}

# Reverse lookup: subreddit name -> theme. A subreddit listed under more than
# one theme keeps the theme that appears last in theme_groups, so
# 'r/CovidVaccinated' (listed under 'SARS-CoV-2' and 'Vaccines') maps to
# 'Vaccines'.
inverse_theme_groups = {
    sub_name: theme_name
    for theme_name, sub_names in theme_groups.items()
    for sub_name in sub_names
}

In [None]:
# Build one feature vector per user: the LIWC features followed by the
# personality features. Every finished entry is a one-element list holding
# the concatenated tensor.
# NOTE(review): pd.read_pickle unpickles arbitrary objects; only load these
# files from a trusted source.
users_embeddings = {}
liwc_frame = pd.read_pickle('../data/new_static_LIWC_features.pkl')
for index, row in liwc_frame.iterrows():
    users_embeddings[index] = torch.tensor(row.values)

personality_frame = pd.read_pickle('../data/new_static_personality_features.pkl')
users_with_personality = set()
for index, row in personality_frame.iterrows():
    # Raises KeyError when a user has personality features but no LIWC row.
    liwc_vector = users_embeddings[index]
    users_embeddings[index] = [torch.cat((liwc_vector, torch.tensor(row.values)))]
    users_with_personality.add(index)

# A user that only appears in the LIWC frame would otherwise remain a bare,
# shorter tensor instead of a one-element list, which does not match the
# shape of the merged entries. Drop such users and report how many were
# removed.
incomplete_users = [user for user in users_embeddings if user not in users_with_personality]
for user in incomplete_users:
    del users_embeddings[user]
if incomplete_users:
    print('Dropped', len(incomplete_users), 'user(s) without personality features')

df = pd.read_pickle('../data/reddit_corpus_balanced_filtered.gzip', compression='gzip')

In [None]:
user_subreddits = pd.read_pickle('../data/user_subreddits.pkl')


def _bucket_factor(value):
    """Return ``value`` rounded to the nearest integer as a string, capped at '4'.

    Any value that is not strictly below 4 is reported as '4'.
    """
    return str(round(value)) if value < 4 else '4'


def _dominant_theme(subreddit_counts):
    """Return the theme with the largest summed count for one user.

    ``subreddit_counts`` maps subreddit names to a numeric count. Each count
    is added to the theme of its subreddit (via ``inverse_theme_groups``).
    On a tie the theme listed first in ``theme_groups`` wins.

    Raises KeyError if a subreddit is not listed in ``inverse_theme_groups``.
    """
    totals = dict.fromkeys(theme_groups, 0)
    for sub in subreddit_counts:
        totals[inverse_theme_groups[sub]] += subreddit_counts[sub]
    # max() returns the first key that reaches the highest total, which is
    # the same choice a stable descending sort would put in first place.
    return max(totals, key=totals.get)


# Keep only the first row of each user and index by user id, so that every
# lookup below is a direct index access instead of a boolean scan over the
# whole frame for each user.
first_rows = df.drop_duplicates(subset='user_id').set_index('user_id')

# Per-user labels, all stored as strings. A user missing from df raises
# KeyError.
label_info = {}
for user in users_embeddings:
    label_info[user] = {
        'fake_news': str(first_rows.at[user, 'fake_news_spreader']),
        'pb_factor': _bucket_factor(first_rows.at[user, 'pb_factor']),
        'factual_factor': _bucket_factor(first_rows.at[user, 'factual_factor']),
        'group': _dominant_theme(user_subreddits[user]),
    }

In [None]:
# One metadata row per user: the user id followed by its four labels,
# separated by tabs.
label_fields = ('fake_news', 'pb_factor', 'factual_factor', 'group')
labels = [
    '\t'.join([user] + [label_info[user][field] for field in label_fields])
    for user in users_embeddings
]
# Each dictionary value is indexed with [0] to reach the tensor; the rows are
# produced in the same order as the metadata rows above.
tensors = np.array([wrapped[0].numpy() for wrapped in users_embeddings.values()])

log_dir = './embeddings/user_features/'
os.makedirs(log_dir, exist_ok=True)

# Write the metadata file: a header line, then one line per user.
metadata_file_path = os.path.join(log_dir, 'metadata.tsv')
with open(metadata_file_path, "w", encoding='utf-8') as metadata_file:
    metadata_file.write("User\tNews\tpb_f\tfac_f\tgroup\n")
    metadata_file.writelines("{}\n".format(row) for row in labels)

# Store the feature matrix in a checkpoint under the attribute name
# `embedding`.
weights = tf.Variable(tensors)
checkpoint = tf.train.Checkpoint(embedding=weights)
checkpoint.save(os.path.join(log_dir, "embedding.ckpt"))

# Projector configuration: point at the checkpointed tensor and its metadata.
config = projector.ProjectorConfig()
embedding = config.embeddings.add()
# The tensor name is the checkpoint attribute name suffixed with
# `/.ATTRIBUTES/VARIABLE_VALUE`.
embedding.tensor_name = 'embedding/.ATTRIBUTES/VARIABLE_VALUE'
embedding.metadata_path = 'metadata.tsv'
projector.visualize_embeddings(log_dir, config)

In [None]:
# Print the shell command that opens the saved projector data in TensorBoard.
# log_dir is a relative path starting with './', so it is resolved with
# os.path.abspath; concatenating it onto abspath('.') would produce
# '<cwd>./embeddings/...', which is not a valid location.
path = os.path.abspath(log_dir).replace('\\', '/')
print("tensorboard --logdir", path)

In [None]:
# Sanity check: print the dominant theme of the first user in user_subreddits.
user_subreddits = pd.read_pickle('../data/user_subreddits.pkl')
for user in user_subreddits:
    activity = user_subreddits[user]
    theme_totals = {
        'SARS-CoV-2': 0,
        'Vaccines': 0,
        'Abortion': 0,
        'womens-and-mens-rights': 0,
        'Gun-control': 0,
        'Climate-change': 0,
        '5G': 0,
        'general-political-debate': 0,
    }
    for sub in activity:
        theme_totals[inverse_theme_groups[sub]] += activity[sub]

    # max() yields the first theme that reaches the highest total, i.e. the
    # entry a stable descending sort would place first.
    print(user, max(theme_totals, key=theme_totals.get))
    # Only the first user is inspected.
    break

In [None]:
# Display the label dictionary stored for one specific user id.
label_info['ddf62a134f09db7a6056190d594bd41ff2f4cf04faf5db32406bbec473f46935']