In [None]:
import pandas as pd

In [None]:
# Load the gzip-compressed, pickled Reddit corpus into a DataFrame.
# NOTE: this goes through pickle, so only read files from a trusted source.
df = pd.read_pickle(
    '../data/reddit_corpus_balanced_filtered.gzip',
    compression='gzip',
)

In [None]:
# Automatic labels provided by Ezzeddine: the set of doc ids listed in
# three_models_predicted_correctly.csv.
correct_auto = set(
    pd.read_csv('../data/three_models_predicted_correctly.csv')['doc_id']
)

In [None]:
# Set of doc ids of all posts in the test set.
test_set = set(
    pd.read_csv('../data/all_test_posts.csv')['doc_id']
)

In [None]:
# Per user: fraction of their test-set posts that also appear in `correct_auto`.
# Users without a single post in the test set get the sentinel value -1.
cc_factor = []
for documents in df['documents']:
    # The first element of each document entry is matched against the doc ids.
    tested_ids = {doc[0] for doc in documents} & test_set

    if tested_ids:
        cc_factor.append(len(tested_ids & correct_auto) / len(tested_ids))
    else:
        cc_factor.append(-1)
df['cc'] = cc_factor

In [None]:
# Keep only users with at least one test-set post (cc == -1 marks "none"),
# then order the remaining users from highest to lowest cc.
has_test_posts = df['cc'] >= 0
df = df.loc[has_test_posts].sort_values(by='cc', ascending=False)

In [None]:
# Candidate fn-spreaders, selected in two steps:
#   1. keep the 500 users with the highest fn_rn_ratio,
#   2. of those, keep the 250 with the highest factual_factor.
# NOTE(review): the previous comment described "top 1k users with best cc", but this
# selection ranks by fn_rn_ratio, keeps 500, and (unlike the rn cell) applies no
# fake_news_spreader filter -- confirm which of the two is intended.
fn_spreaders = (
    df
    .sort_values(by='fn_rn_ratio', ascending=False)
    .head(500)
    .sort_values(by='factual_factor', ascending=False)
    .head(250)
)
fn_spreaders

In [None]:
# Candidate rn-spreaders: among users flagged fake_news_spreader == 0, take the
# first 1000 rows in the current order of df (sorted by cc, descending, in an
# earlier cell), then keep the 250 with the lowest factual_factor.
rn_spreaders = (
    df[df['fake_news_spreader'] == 0]
    .head(1000)
    .sort_values(by='factual_factor', ascending=True)
    .head(250)
)

In [None]:
# Final list: the fn-spreaders followed by the rn-spreaders (up to 250 each),
# keeping the original index labels.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0.
final_list = pd.concat([fn_spreaders, rn_spreaders])
final_list

In [None]:
label_columns = ['user_id', 'post_id', 'auto_label', 'subreddit', 'text']

# Numeric rank for each factuality label; only used to order a user's posts.
fact_map = {'VERY_LOW': -3, 'LOW': -2, 'MIXED': -1,
            'MOSTLY_FACTUAL': 1, 'HIGH': 2, 'VERY_HIGH': 3}

# Getting one post per user.
# Rows are collected in a plain list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0, and appending row by row copies the
# whole frame on every iteration.
label_rows = []
for _, user in final_list.iterrows():
    # Only posts whose id is in `correct_auto` are eligible.
    candidates = [doc for doc in user['documents'] if doc[0] in correct_auto]
    if not candidates:
        # No eligible post for this user: the user contributes no row.
        continue

    # Users flagged fake_news_spreader == 0 get their highest-ranked post,
    # all other users their lowest-ranked one (rank taken from fact_map).
    highest_first = bool(user['fake_news_spreader'] == 0)
    ranked = sorted(candidates,
                    key=lambda doc: fact_map[doc[4][0][3]],
                    reverse=highest_first)
    chosen = ranked[0]

    label_rows.append({'user_id': user['user_id'], 'post_id': chosen[0],
                       'auto_label': chosen[4][0][1],
                       'subreddit': chosen[3], 'text': chosen[1]})

# Passing `columns` keeps the column order and yields an empty frame with the
# right columns if no row was collected.
label_frame = pd.DataFrame(label_rows, columns=label_columns)

In [None]:
# Display the assembled labelling table (one row per selected post).
label_frame

In [None]:
# Write the table to manual_labeling.csv in the current working directory.
# index=False is not passed, so the DataFrame index is written as the first
# (unnamed) column.
label_frame.to_csv('manual_labeling.csv')