In [None]:
import pandas as pd
#import spacy
from tqdm import tqdm
import matplotlib.pyplot as plt

# Disable all unused pipeline components to speed up processing
#nlp = spacy.load('en_core_web_sm', disable=['ner', 'parser', 'lemmatizer', 'tok2vec', 'attribute_ruler', 'senter'])

In [None]:
# Read the gzip-compressed pickle holding the corpus table; the cells below
# iterate over its rows (columns used: 'user_id', 'documents', 'fake_news_spreader').
df = pd.read_pickle(
    '../data/reddit_corpus_balanced_filtered.gzip',
    compression='gzip',
)

In [None]:
%%time
# Count part-of-speech tags per document and per user.
#
# Resulting layout:
#   user_features[user_id][doc_id] -> {'tokens': <len(doc)>, <POS tag>: <count>, ...}
#   user_features[user_id]['all']  -> the same counters summed over the user's documents
#
# NOTE(review): `nlp` is not defined in any visible cell -- the `spacy.load(...)` line in the
# first cell is commented out -- so this cell raises NameError on a fresh kernel until that
# line is restored (or the cached pickle is loaded instead of re-running this cell).
# NOTE(review): that commented-out call also disables 'tok2vec' and 'attribute_ruler';
# confirm `token.pos_` is still populated with those components off.
user_features = {}
for _, data in tqdm(df.iterrows(), desc='Counting POS tags', total=len(df)):
    user_id = data['user_id']

    # Running totals for this user; stored under the reserved key 'all'.
    user_totals = {}
    user_features[user_id] = {'all': user_totals}

    # Each document is unpacked as a 5-tuple; only the id and the text are used here.
    for doc_id, text, date, sub_reddit, labels in data['documents']:
        doc = nlp(text)

        # 'tokens' is the document length and later serves as the density denominator.
        doc_counts = {'tokens': len(doc)}
        for token in doc:
            doc_counts[token.pos_] = doc_counts.get(token.pos_, 0) + 1
        user_features[user_id][doc_id] = doc_counts

        # Fold this document's counters (including 'tokens') into the user totals.
        for name, count in doc_counts.items():
            user_totals[name] = user_totals.get(name, 0) + count

# Show a small preview (aggregate counters of the first three users) instead of
# dumping the whole nested dict into the cell output.
{uid: feats['all'] for uid, feats in list(user_features.items())[:3]}

In [None]:
import pickle

# Write the per-user feature counters to disk so they can be reloaded later.
# (The '.pickel' spelling is kept on purpose: the loading cell reads the same path.)
with open('../data/linguistic_features.pickel', 'wb') as out_file:
    pickle.dump(user_features, out_file)

In [None]:
import pickle

# Read the feature counters back from the pickle file on disk.
# NOTE: unpickling can execute arbitrary code -- only load files you produced yourself.
with open('../data/linguistic_features.pickel', 'rb') as in_file:
    user_features = pickle.load(in_file)

In [None]:
# Corpus-wide feature totals, split two ways:
#   *_users -- by the row's 'fake_news_spreader' flag (1 -> fn_users, anything else -> rn_users)
#   *_posts -- by the first label of each labelled document (1 -> fn_posts, otherwise rn_posts)
fn_users, rn_users = {}, {}
fn_posts, rn_posts = {}, {}

for _, record in tqdm(df.iterrows(), total=len(df)):
    uid = record['user_id']
    # Every document of this user feeds the same user-level bucket.
    user_bucket = fn_users if record['fake_news_spreader'] == 1 else rn_users

    for post in record['documents']:
        # post[0] is the document id, post[4] its list of labels.
        post_labels = post[4]
        if len(post_labels) != 0:
            # Only the first label's second field decides the post-level bucket.
            post_bucket = fn_posts if post_labels[0][1] == 1 else rn_posts
        else:
            # Unlabelled documents count towards the user-level totals only.
            post_bucket = None

        for feat, amount in user_features[uid][post[0]].items():
            user_bucket[feat] = user_bucket.get(feat, 0) + amount
            if post_bucket is not None:
                post_bucket[feat] = post_bucket.get(feat, 0) + amount

In [None]:
# Grouped bar chart: share of all tokens taken by each feature, for real-news vs.
# fake-news users. Saved to 'ling_feat_users.pdf'.

# Union of the feature names seen in either group; 'tokens' is the denominator, not a feature.
feature_names = (set(fn_users) | set(rn_users)) - {'tokens'}
# Order by real-news count, descending. The name breaks ties so the x-axis order is
# reproducible (set iteration order is not), and `.get(..., 0)` keeps a feature that
# occurs in only one group from raising KeyError.
entities = sorted(feature_names, key=lambda x: (-rn_users.get(x, 0), x))
for i, e in enumerate(entities):
    # Each feature owns a 2-unit slot on the x-axis: real bar left of centre, fake bar right.
    plt.bar(x=2*i-0.3, width=0.45, height=rn_users.get(e, 0)/rn_users['tokens'], color='steelblue')
    plt.bar(x=2*i+0.3, width=0.45, height=fn_users.get(e, 0)/fn_users['tokens'], color='darkred')
plt.xticks(list(range(0, 2*len(entities), 2)), entities, rotation=90)
# The two labels attach to the first two bar containers, i.e. the real (blue) and
# fake (red) bar of the first feature.
plt.legend(['Real News Users', 'Fake News Users'])
plt.ylabel('Density of Linguistic Features')
# Ticks every 2.5 percentage points (0.025 ... 0.25), labelled as percentages.
plt.yticks([i/1000 for i in range(25, 260, 25)], ['{:2}%'.format(i/10) for i in range(25, 260, 25)])
plt.ylim(0, 0.174)
plt.title('Linguistic Features in Real- and Fake-News Users')
plt.savefig('ling_feat_users.pdf', bbox_inches='tight')

In [None]:
# Grouped bar chart: share of all tokens taken by each feature, for real-news vs.
# fake-news posts. Saved to 'ling_feat_posts.pdf'.

# Union of the feature names seen in either group; 'tokens' is the denominator, not a feature.
feature_names = (set(fn_posts) | set(rn_posts)) - {'tokens'}
# Order by this figure's own real-news counts (rn_posts, not the user-level rn_users dict),
# descending. The name breaks ties so the x-axis order is reproducible, and `.get(..., 0)`
# keeps a feature that occurs in only one group from raising KeyError.
entities = sorted(feature_names, key=lambda x: (-rn_posts.get(x, 0), x))
for i, e in enumerate(entities):
    # Each feature owns a 2-unit slot on the x-axis: real bar left of centre, fake bar right.
    plt.bar(x=2*i-0.3, width=0.45, height=rn_posts.get(e, 0)/rn_posts['tokens'], color='steelblue')
    plt.bar(x=2*i+0.3, width=0.45, height=fn_posts.get(e, 0)/fn_posts['tokens'], color='darkred')
plt.xticks(list(range(0, 2*len(entities), 2)), entities, rotation=90)
# The two labels attach to the first two bar containers, i.e. the real (blue) and
# fake (red) bar of the first feature.
plt.legend(['Real News Posts', 'Fake News Posts'])
plt.ylabel('Density of Linguistic Features')
# Ticks every 2.5 percentage points (0.025 ... 0.25), labelled as percentages.
plt.yticks([i/1000 for i in range(25, 260, 25)], ['{:2}%'.format(i/10) for i in range(25, 260, 25)])
plt.ylim(0, 0.199)
plt.title('Linguistic Features in Real- and Fake-News Posts')
plt.savefig('ling_feat_posts.pdf', bbox_inches='tight')

In [None]:
# Show the feature order used on the x-axis of the most recently run plotting cell.
entities

In [None]:
# Show the raw summed counts for users whose 'fake_news_spreader' flag is not 1
# ('tokens' is the total token count used as the density denominator).
rn_users