In [1]:
import numpy as np
import pandas as pd
import random
from neeka_lib import *
from reddit_helper import *
from sklearn.cluster import KMeans

In [2]:
# Parameters
MIN_VOTES_PER_USER = 16  # passed as min_usr_votes to filter_to_multiple_votes
MIN_VOTES_PER_POST = 16  # passed as min_post_votes to filter_to_multiple_votes
N_GRAPH_ITERATIONS = 99  # number of iterate_graph / resolve_graph rounds
VECTOR_LENGTH = 300 # length of user/post embeddings
SUBREDDIT_NAMES = ['politics', 'news', 'worldnews']

# Seed the global random number generators once, up front, so that a
# Restart Kernel -> Run All reproduces the same random draws. The notebook
# uses np.random.choice for the initial user vectors and DataFrame.sample
# (which falls back to NumPy's global state when no random_state is given).
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

In [3]:
# Load every vote record from the tab-separated votes dump
votes_path = 'reddit_votes_data/44_million_reddit_votes/44_million_votes.txt'
all_votes_df = pd.read_csv(votes_path, sep='\t')

In [4]:
# Load every submission record and key the table by SUBMISSION_ID
submissions_path = 'reddit_votes_data/submission_info/submission_info.txt'
all_submissions_df = pd.read_csv(submissions_path, sep='\t').set_index('SUBMISSION_ID')

In [5]:
# Keep only the submissions posted in the configured subreddits
submissions_df = all_submissions_df[all_submissions_df['SUBREDDIT'].isin(SUBREDDIT_NAMES)]

# Keep only the votes that were cast in those subreddits AND on one of the
# submissions kept above. The votes table is matched against the
# 'r/'-prefixed form of each subreddit name.
r_subreddit_names = [f'r/{name}' for name in SUBREDDIT_NAMES]
in_target_subreddit = all_votes_df['SUBREDDIT'].isin(r_subreddit_names)
on_kept_submission = all_votes_df['SUBMISSION_ID'].isin(submissions_df.index)
# Both conditions are applied together. Previously the second filter started
# again from all_votes_df, which silently discarded the subreddit filter.
votes_df = all_votes_df[in_target_subreddit & on_kept_submission]

In [6]:
# Apply the per-user and per-post minimum vote thresholds from the parameters cell
votes_df = filter_to_multiple_votes(
    votes_df,
    min_usr_votes=MIN_VOTES_PER_USER,
    min_post_votes=MIN_VOTES_PER_POST,
)

In [7]:
# Every distinct username that still has votes after filtering
pol_users = votes_df['USERNAME'].unique()
n_users = len(pol_users)

# Per user: one random vector of +1/-1 entries and one all-zero INTERMEDIARY vector
vectors = pd.Series([np.random.choice([1, -1], VECTOR_LENGTH) for _ in range(n_users)])
intermediary_vectors = pd.Series([np.zeros(VECTOR_LENGTH) for _ in range(n_users)])

# Assemble the per-user table and key it by username
users_df = pd.DataFrame({
    'USERS': pol_users,
    'VECTOR': vectors,
    'INTERMEDIARY': intermediary_vectors,
}).set_index('USERS')

In [8]:
# Reset the user table before iterating the graph (helper comes from the star imports).
# NOTE(review): presumably this re-initializes the VECTOR / INTERMEDIARY columns in
# place -- confirm against the helper's source.
reset_users(users_df)

In [9]:
# Form the user embeddings: run N_GRAPH_ITERATIONS rounds, each one an
# iterate_graph pass over the votes followed by a resolve_graph pass over the users
for _round in range(N_GRAPH_ITERATIONS):
    iterate_graph(votes_df, users_df)
    resolve_graph(users_df)

          17500 / 17547         

In [10]:
# Split the users into two clusters with k-means - this is needed for testing
user_matrix = np.stack(users_df['VECTOR'].to_numpy())
kmeans = KMeans(n_clusters=2, random_state=0, n_init='auto').fit(user_matrix)
group_labels = kmeans.labels_

# Report how many users landed in each cluster
group_ids, group_counts = np.unique(group_labels, return_counts=True)
for cluster_id, cluster_size in zip(group_ids, group_counts):
    print(f'{cluster_id}: {cluster_size}')

0: 3322
1: 4636


In [11]:
# Tag every user with its cluster's group name - this is needed for testing
users_df['GROUP'] = "NO_GROUP"
for label in (0, 1):
    # rows whose k-means label equals `label` get 'GROUP_0' / 'GROUP_1'
    users_df.loc[group_labels == label, 'GROUP'] = f'GROUP_{label}'

In [12]:
# Initialize posts_df with one row per distinct submission that has votes,
# indexed by POST_ID.
# Series.unique() returns values in order of first appearance, so the row order
# is the same on every run (and matches how the user list is built from
# votes_df['USERNAME'].unique()). The previous list(set(...)) gave an arbitrary
# order that can change between kernel sessions for string IDs, which in turn
# changes what positional operations such as DataFrame.sample pick.
posts_df = pd.DataFrame({'POST_ID': votes_df['SUBMISSION_ID'].unique()}).set_index('POST_ID')
reset_post_stats(posts_df)

In [13]:
# Calculate some base statistics for each post from the user table and the votes.
# NOTE(review): the helper returns nothing that is used here, so it presumably
# writes its results into posts_df in place -- confirm against the helper's source.
calculate_post_stats(users_df, votes_df, posts_df)

In [14]:
# Calculate the Neeka score for each post.
# NOTE(review): the return value is not used, so the scores are presumably
# stored on posts_df in place -- confirm against the helper's source.
neeka_score_calculation(posts_df)

In [15]:
# Calculate percentile rank, restricted to posts where both GROUP1_X and
# GROUP0_X are present (neither is NaN)
x_index = posts_df['GROUP1_X'].notna() & posts_df['GROUP0_X'].notna()
posts_x_df = posts_df[x_index].copy()  # independent copy; posts_df itself is not modified here
calculate_percentile_rank(posts_x_df)

# Results

### Statistics

In [16]:
# Compute the test metrics on the posts that have values for both groups
# (posts_x_df), then print them.
results = calculate_test_results(posts_x_df)
print_test_results(results)


    simple_agreement: 50.34 %
    neeka_agreement: 61.94 %
    agreement_change: 11.6 %

    simple_apolar: 47.4 %
    neeka_apolar: 60.03 %
    apolar_change: 12.63 %

    simple_neutrality: 79.75 %
    neeka_neutrality: 89.99 %
    neutrality_change: 10.24 %

    Overall:
    simple_quality: 59.16 %
    neeka_quality: 70.65 %
    quality_change: 11.49 %

    #######################################

    simple_group0_bias: 28.88 %
    simple_group1_bias: 49.13 %
    
    neeka_group0_bias: 39.03 %
    neeka_group1_bias: 49.04 %
    


### Most Promoted/Demoted ALL POSTS 

In [17]:
# Use every post (no sub-sampling) for this report; submissions_df is passed
# alongside, presumably so the helper can look up titles -- confirm in helper source.
sample = posts_df
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "USA Powerlifting bans all trans women from competing as women"
2. "China will not change its position that Taiwan belongs to it and the world will only ever recognize that there is “one China”, Beijing said on Sunday after President Tsai Ing-wen won re-election and said she would not submit to China’s threats."
3. "Apple CEO becomes chairman of China university board"
4. "Doctor treating Paris coronavirus patients says virus ‘less serious’ than SARS"
5. "Prince Andrew Says He Doesn't Regret His 'Very Useful' Relationship With Jeffrey Epstein"
6. "Fallout from Russia's mysterious missile disaster suggests a nuclear reactor blew up"
7. "Huge Hong Kong protest against extradition bill"
8. "'Lying has become a norm': Hong Kong police falsely accused protesters of blocking ambulances, democrats say."
9. "The head of Interpol disappears whilst visiting China"
10. "Israeli scientists find way to

### Most Promoted/Demoted SAMPLE A

In [18]:
# SAMPLE A: a random sample of up to 1000 posts.
# The size is clamped to len(posts_df) because DataFrame.sample raises ValueError
# when asked for more rows than exist (sampling without replacement), which would
# happen if stricter filtering left fewer than 1000 posts.
# random_state is fixed so re-running this cell on the same posts_df draws the
# same sample; each sample cell uses a different seed so the samples differ.
sample = posts_df.sample(n=min(1000, len(posts_df)), random_state=1)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "The head of Interpol disappears whilst visiting China"
2. "France declares first two confirmed cases of coronavirus"
3. "Verizon throttled 'unlimited' data of Calif. fire department during wildfire"
4. "Tokyo 2020 Olympics officially postponed until 2021"
5. "Jail video of Jeffrey Epstein’s first suicide attempt was deleted, prosecutors reveal"
6. "Phoenix officer to be fired for pulling gun on parents over doll taken by 4-year-old"
7. "BBC News: Suspending Parliament was unlawful, court rules"
8. "Russians are meddling in the Democratic primary. Is anyone paying attention?"
9. "Soldiers deploying to Middle East not allowed to take cell phones"
10. "UK consulate worker says arrested Hong Kong protesters are being tortured in Shenzhen"
-
Most Promoted by **simple-consensus** over Neeka (most demoted by Neeka):
1. "Churchill's grandson slams Trump for skipping cemetery visit because of weat

### Most Promoted/Demoted SAMPLE B

In [19]:
# SAMPLE B: a random sample of up to 1000 posts.
# The size is clamped to len(posts_df) because DataFrame.sample raises ValueError
# when asked for more rows than exist (sampling without replacement), which would
# happen if stricter filtering left fewer than 1000 posts.
# random_state is fixed so re-running this cell on the same posts_df draws the
# same sample; each sample cell uses a different seed so the samples differ.
sample = posts_df.sample(n=min(1000, len(posts_df)), random_state=2)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Father of Guatemalan girl who died in US custody has 'no complaints' about her treatment, consul says"
2. "#PresidentCuomo trends as governor’s star status rises over coronavirus response"
3. "Phoenix officer to be fired for pulling gun on parents over doll taken by 4-year-old"
4. "TURKEY ATTACKS KURDS, SDF IN IRAQ, SYRIA AS U.S. WITHDRAWS"
5. "Facebook faces $35 billion class-action lawsuit regarding misuse of Facial Recognition data."
6. "A 14-year-old confessed to killing all five of his family members in an Alabama home, authorities say"
7. "Iran ‘hides 500,000 virus cases,’ says member country’s anti-virus taskforce"
8. "Deutsche Bank Executive Who Signed Off On Trump Loans Kills Self At Age 55"
9. "Four Google employees fired, including staffer who helped organize labor protests"
10. "'Captain Crozier! Captain Crozier!': Videos show sailors sending off ousted USS Roosevelt commander

### Most Promoted/Demoted SAMPLE C

In [20]:
# SAMPLE C: a random sample of up to 1000 posts.
# The size is clamped to len(posts_df) because DataFrame.sample raises ValueError
# when asked for more rows than exist (sampling without replacement), which would
# happen if stricter filtering left fewer than 1000 posts.
# random_state is fixed so re-running this cell on the same posts_df draws the
# same sample; each sample cell uses a different seed so the samples differ.
sample = posts_df.sample(n=min(1000, len(posts_df)), random_state=3)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Missing Romanian teenager 'begged' police to 'stay on the line':"Please stay with me on the line, I'm really scared," Alexandra Macesanu told a police officer while crying during her third and last call, according to a transcript released on Facebook by her uncle"
2. "Protestors Rally Outside Blizzcon 2019 with 'Winnie the Pooh' Costumes"
3. "Martin Shkreli Placed in Solitary Confinement After Allegedly Running Company Behind Bars: Report"
4. "F1 Legend Niki Lauda dies aged 70"
5. "#PresidentCuomo trends as governor’s star status rises over coronavirus response"
6. "Apple fined for slowing down old iPhones"
7. "Three fired over Nazi salute photo with West Virginia corrections employees"
8. "Trump suggests he could serve more than eight years as president over 'stolen time' of Russia investigation"
9. ""When all Kashmiris will be killed, only then the world will realise our sufferings," sa