In [None]:
import pandas as pd
# import spacy
import matplotlib.pyplot as plt
from tqdm import tqdm

# Disable all pipeline components that are not needed, to speed up the computation
#nlp = spacy.load('en_core_web_sm', disable=['parser', 'lemmatizer', 'tok2vec', 'attribute_ruler', 'senter', 'tagger'])

In [None]:
# Load the gzip-compressed, pickled corpus DataFrame.
# read_pickle unpickles the file, so only point it at files you produced yourself.
df = pd.read_pickle(
    '../data/reddit_corpus_balanced_filtered.gzip',
    compression='gzip',
)

In [None]:
# Count named-entity labels per document and per user.
# Resulting layout: named_entities[user_id][doc_id][label] -> count for one document,
#                   named_entities[user_id]['all'][label]  -> count over all of the user's documents.
# NOTE(review): `nlp` is not defined by the visible cells -- the spaCy import and the
# `spacy.load(...)` call in the first cell are commented out and must be re-enabled
# before this cell can run.
named_entities = {}
for _, data in tqdm(df.iterrows(), desc='Evaluating Named Entities', total=len(df)):
    documents = data['documents']
    user_id = data['user_id']

    user_counts = {'all': {}}
    named_entities[user_id] = user_counts

    # Each document is a 5-tuple; only its id and text are used here.
    for doc_id, text, _date, _sub_reddit, _labels in documents:
        parsed = nlp(text)

        # Per-document tally: entity label -> number of occurrences.
        doc_counts = {}
        user_counts[doc_id] = doc_counts
        for entity in parsed.ents:
            label = entity.label_
            doc_counts[label] = doc_counts.get(label, 0) + 1

        # Fold the document tally into the user's running total.
        user_totals = user_counts['all']
        for label, count in doc_counts.items():
            user_totals[label] = user_totals.get(label, 0) + count

In [None]:
import pickle

# Write the `named_entities` dict to disk; the following cell reads it back
# from the same path.
with open('../data/named_entities.pickel', mode='wb') as cache_file:
    pickle.dump(named_entities, cache_file)

In [None]:
import pickle

# Read back the entity counts written by the previous cell.
# pickle.load can execute arbitrary code, so only load files created by this notebook.
with open('../data/named_entities.pickel', mode='rb') as cache_file:
    named_entities = pickle.load(cache_file)

## Plots

### Post level

In [None]:
def _add_counts(totals, doc_counts):
    """Add one document's entity counts to a running tally (in place).

    totals: dict mapping entity label -> count, with the extra key 'TOTAL'
        holding the sum over all labels.
    doc_counts: dict mapping entity label -> count for a single document.
    """
    for label, count in doc_counts.items():
        totals[label] = totals.get(label, 0) + count
        totals['TOTAL'] += count


# Post-level tallies: entity counts of posts, split by the post's own label.
rn_entities = {'TOTAL': 0}
fn_entities = {'TOTAL': 0}

# User-level tallies: entity counts of every post, split by the author's
# `fake_news_spreader` flag.
fn_users = {'TOTAL': 0}
rn_users = {'TOTAL': 0}

for _, data in tqdm(df.iterrows(), total=len(df)):
    documents = data['documents']
    user_id = data['user_id']
    fn = data['fake_news_spreader']

    # A flag of 0 goes to the real-news tally, anything else to the fake-news tally.
    user_totals = rn_users if fn == 0 else fn_users

    for doc_id, text, date, sub_reddit, labels in documents:
        doc_counts = named_entities[user_id][doc_id]

        # Every document counts towards its author's class.
        _add_counts(user_totals, doc_counts)

        # Only documents carrying exactly one label count at post level.
        # NOTE(review): presumably labels[0][1] == 0 marks a real-news post --
        # verify against the corpus schema.
        if len(labels) == 1:
            post_totals = rn_entities if labels[0][1] == 0 else fn_entities
            _add_counts(post_totals, doc_counts)


In [None]:
# Grouped bar chart: share of each named-entity label among all entities found
# in real-news posts (blue) versus fake-news posts (red).

# Entity labels seen in either class, without the 'TOTAL' bookkeeping key.
entity_labels = (set(rn_entities) | set(fn_entities)) - {'TOTAL'}
# Most frequent in real-news posts first. `.get(..., 0)` keeps labels that occur
# in only one class from raising KeyError; the label itself breaks ties so the
# bar order does not depend on set iteration order.
entities = sorted(entity_labels, key=lambda label: (-rn_entities.get(label, 0), label))

# A class without any counted entity has TOTAL == 0; divide by 1 instead so its
# bars get height 0 rather than raising ZeroDivisionError.
rn_total = rn_entities['TOTAL'] or 1
fn_total = fn_entities['TOTAL'] or 1

# Explicit figure/axes so the chart never lands on a figure left over from
# another cell.
fig, ax = plt.subplots()
for i, e in enumerate(entities):
    # Two bars per label, centred on x = 2*i and offset to either side.
    ax.bar(x=2*i-0.3, width=0.45, height=rn_entities.get(e, 0)/rn_total, color='steelblue')
    ax.bar(x=2*i+0.3, width=0.45, height=fn_entities.get(e, 0)/fn_total, color='darkred')

ax.set_xticks(list(range(0, 2*len(entities), 2)))
ax.set_xticklabels(list(entities), rotation=90)
ax.legend(['Real News Posts', 'Fake News Posts'])
ax.set_ylabel('Density of Named Entity')

# Y ticks every 2.5 percentage points from 2.5% to 25%, labelled as percentages.
tick_steps = range(25, 260, 25)
ax.set_yticks([step/1000 for step in tick_steps])
ax.set_yticklabels(['{:2}%'.format(step/10) for step in tick_steps])
ax.set_ylim(0, 0.275)
ax.set_title('Named Entities in Real- and Fake-News Posts')

fig.savefig('../visualization/overview/NER_posts.pdf', bbox_inches='tight')
fig.savefig('../visualization/overview/NER_posts.png', bbox_inches='tight')

In [None]:
# Grouped bar chart: share of each named-entity label among all entities found
# in posts of real-news users (blue) versus fake-news users (red).

# The bar order is taken from `entities`, the list built by the post-level plot
# cell, so both figures list the entity labels in the same order.
# NOTE(review): labels that occur only in the user-level tallies are therefore
# not drawn.

# A class without any counted entity has TOTAL == 0; divide by 1 instead so its
# bars get height 0 rather than raising ZeroDivisionError.
rn_user_total = rn_users['TOTAL'] or 1
fn_user_total = fn_users['TOTAL'] or 1

# Explicit figure/axes so the chart never lands on a figure left over from
# another cell.
fig, ax = plt.subplots()
for i, e in enumerate(entities):
    # `.get(e, 0)` draws a zero-height bar for a label one user class never produced
    # instead of raising KeyError.
    ax.bar(x=2*i-0.3, width=0.45, height=rn_users.get(e, 0)/rn_user_total, color='steelblue')
    ax.bar(x=2*i+0.3, width=0.45, height=fn_users.get(e, 0)/fn_user_total, color='darkred')

ax.set_xticks(list(range(0, 2*len(entities), 2)))
ax.set_xticklabels(list(entities), rotation=90)
ax.legend(['Real News Users', 'Fake News Users'])
ax.set_ylabel('Density of Named Entity')

# Y ticks every 2.5 percentage points from 2.5% to 25%, labelled as percentages.
tick_steps = range(25, 260, 25)
ax.set_yticks([step/1000 for step in tick_steps])
ax.set_yticklabels(['{:2}%'.format(step/10) for step in tick_steps])
ax.set_ylim(0, 0.275)
ax.set_title('Named Entities from Real- and Fake-News Spreaders')

fig.savefig('../visualization/overview/NER_users.pdf', bbox_inches='tight')
fig.savefig('../visualization/overview/NER_users.png', bbox_inches='tight')