In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.ticker import FuncFormatter
from tqdm import tqdm

In [None]:
# Load the gzip-compressed pickle of the corpus into a DataFrame.
# NOTE(review): unpickling executes arbitrary code embedded in the file, so
# this must only ever be pointed at a trusted, locally produced artefact.
df = pd.read_pickle('../data/reddit_corpus_balanced_filtered.gzip', compression='gzip')

In [None]:
# Per-user summary of every post that carries more than one link label.
# Only users owning at least one such post end up in the mapping.
multi_links = {}

for _, user_row in tqdm(df.iterrows(), desc='Getting multi link posts', total=len(df)):

    multi_posts = {}
    # Tally of link labels inside multi-link posts, indexed by label[1]
    # (the later cells treat label[1] == 1 as a fake-news link).
    link_counts = [0, 0]

    for doc_id, _text, _date, _sub_reddit, labels in user_row['documents']:
        # posts with zero or one label are not multi-link posts
        if len(labels) <= 1:
            continue

        multi_posts[doc_id] = labels
        for label in labels:
            link_counts[label[1]] += 1

    # nothing to record for users without multi-link posts
    if not multi_posts:
        continue

    multi_links[user_row['user_id']] = {
        'fn': user_row['fake_news_spreader'],
        'amounts': user_row['amounts'],
        'multi-posts': multi_posts,
        'multi-post_amounts': link_counts,
        'num_docs': user_row['num_docs'],
    }
    

In [None]:
# Which users would be left without any fn posts (i.e. turn into
# rn-spreaders) if their multi-link posts were removed?
changed_users = {}

for user, entry in multi_links.items():

    amounts = entry['amounts']

    # users without fn posts are rn spreaders already, nothing can change
    if amounts[1] == 0:
        continue

    # number of multi-link posts that contain at least one fn link
    fn_multi_posts = sum(
        1
        for labels in entry['multi-posts'].values()
        if any(label[1] == 1 for label in labels)
    )

    # every fn post of this user is a multi-link post
    if amounts[1] - fn_multi_posts == 0:
        changed_users[user] = amounts + (entry['num_docs'], 0)

# in total 198 users are left with no more fn posts
len(changed_users)

In [None]:
# Show the collected users: user id -> amounts tuple extended by (num_docs, 0).
changed_users

In [None]:
# Each value in changed_users is the user's amounts tuple followed by
# (num_docs, 0); transposing yields one row per quantity across all users.
info = np.array(list(changed_users.values())).T

fig, ax = plt.subplots()
# separate axes to the right of the plot that will hold the colour bar
cax = fig.add_axes([0.95, 0.1, 0.05, 0.8])

# colour encodes the total number of posts on a logarithmic scale
plot = ax.scatter(info[0], info[1], marker='x', c=np.log(info[2]), s=60)
ax.set_xscale('log')
ax.set_yscale('log')
# derive the user count from the data so the title cannot go stale
ax.set_title(
    'All {} users that changed to rn-spreaders\n'
    'when disregarding multi-link posts'.format(len(changed_users))
)
ax.set_ylabel('Number of fn-post with multilinks')
ax.set_xlabel('Number of rn-posts');

# The colour values are log(num_docs); a formatter maps every tick back to the
# (rounded) post count. Unlike fixed tick labels this stays correct whichever
# tick positions the locator picks and avoids truncating e.g. 54.6 to 54.
cbar = fig.colorbar(
    plot,
    cax=cax,
    orientation='vertical',
    format=FuncFormatter(lambda value, _pos: str(int(round(np.exp(value))))),
)
cbar.set_label('Number of total posts', rotation=270, labelpad=18)

fig.savefig('mulit-link-changes.pdf', bbox_inches='tight')

In [None]:
# Looking at all changed users (no restriction on how many fn multi-link
# posts they have): what is the distribution of fn- and rn-links in their
# multi-link posts?
num_rn_links = []
num_fn_links = []

for user in changed_users:

    # second entry of every link label over all multi-link posts of the user
    link_flags = [
        label[1]
        for labels in multi_links[user]['multi-posts'].values()
        for label in labels
    ]

    # a flag of 1 counts as a fake-news link, everything else as real news
    fn = sum(1 for flag in link_flags if flag == 1)
    rn = len(link_flags) - fn

    num_fn_links.append(fn)
    num_rn_links.append(rn)

# Draw on a dedicated figure instead of whatever axes happen to be current,
# so the result does not depend on the state left behind by earlier cells.
links_fig, links_ax = plt.subplots()
links_ax.scatter(num_rn_links, num_fn_links, marker='x', s=60)
# NOTE(review): a log axis cannot display users with zero rn-links; confirm
# that no such user exists or switch to a scale that can show zero.
links_ax.set_xscale('log')
links_ax.set_title('Relation of fn- and rn-links\nin multi link posts of changed users')
links_ax.set_ylabel('Number of fn-links')
links_ax.set_xlabel('Number of rn-links')
links_fig.savefig('links-of-channged-users.pdf', bbox_inches='tight')