In [1]:
import numpy as np
import pandas as pd
import random
from neeka_lib import *
from reddit_helper import *
from sklearn.cluster import KMeans

In [2]:
# Parameters
# Tunable settings for the whole notebook; the cells below read them by name.
MIN_VOTES_PER_USER = 4  # passed as min_usr_votes to filter_to_multiple_votes
MIN_VOTES_PER_POST = 4  # passed as min_post_votes to filter_to_multiple_votes
N_GRAPH_ITERATIONS = 3  # number of iterate_graph / resolve_graph rounds
VECTOR_LENGTH = 300 # length of user/post embeddings
SUBREDDIT_NAMES = ['politics', 'news', 'worldnews']  # submissions are kept only if their SUBREDDIT is listed here

In [3]:
# Load the complete vote log (tab-separated text file) into memory.
all_votes_df = pd.read_csv(
    'reddit_votes_data/44_million_reddit_votes/44_million_votes.txt',
    sep='\t',
)

In [4]:
# Load the submission metadata (tab-separated) and key it by SUBMISSION_ID
# so that later cells can look submissions up through the index.
all_submissions_df = pd.read_csv(
    'reddit_votes_data/submission_info/submission_info.txt',
    sep='\t',
).set_index('SUBMISSION_ID')

In [5]:
# Keep only the submissions that belong to the configured subreddits.
submissions_df = all_submissions_df[all_submissions_df['SUBREDDIT'].isin(SUBREDDIT_NAMES)]

# 'r/<name>' spellings of the configured subreddits.
# Not needed for the filter below; kept in case a later cell refers to it.
r_subreddit_names = [f'r/{NAME}' for NAME in SUBREDDIT_NAMES]

# Keep only the votes that were cast on one of those submissions.
# NOTE: this cell used to filter the votes by r_subreddit_names first, but that
# result was immediately overwritten by this filter on all_votes_df, so it never
# affected votes_df. The dead pass over the full vote table has been removed;
# the resulting votes_df is unchanged.
votes_df = all_votes_df[all_votes_df['SUBMISSION_ID'].isin(submissions_df.index)]

In [6]:
# Apply the per-user / per-post minimum vote thresholds from the parameters cell.
# NOTE(review): filter_to_multiple_votes comes from a star import; presumably it
# drops users and posts below the thresholds — confirm in the helper module.
votes_df = filter_to_multiple_votes(
    votes_df,
    min_usr_votes=MIN_VOTES_PER_USER,
    min_post_votes=MIN_VOTES_PER_POST,
)

In [7]:
# Every distinct username that still has votes after filtering gets one row.
pol_users = votes_df['USERNAME'].unique()
n_users = len(pol_users)

# Per user: a random +1/-1 starting vector and an all-zero intermediary vector,
# each stored as its own array of length VECTOR_LENGTH.
# NOTE(review): np.random is not seeded here, so the starting vectors differ
# between runs.
vectors = pd.Series([np.random.choice([1, -1], VECTOR_LENGTH) for _ in range(n_users)])
intermediary_vectors = pd.Series([np.zeros(VECTOR_LENGTH) for _ in range(n_users)])

# Assemble the user table and index it by username.
users_df = pd.DataFrame(
    {'USERS': pol_users, 'VECTOR': vectors, 'INTERMEDIARY': intermediary_vectors}
).set_index('USERS')

In [8]:
# Reset users_df through the imported helper before the graph iterations start.
# NOTE(review): reset_users is defined outside this notebook (star import);
# presumably it re-initialises the VECTOR / INTERMEDIARY columns — confirm.
reset_users(users_df)

In [9]:
# Iterate graph to form user embeddings
# Runs N_GRAPH_ITERATIONS rounds; each round calls iterate_graph on the votes
# and users, then resolve_graph on the users.
# NOTE(review): presumably iterate_graph accumulates into INTERMEDIARY and
# resolve_graph folds that back into VECTOR — confirm in the helper module.
for _ in range(N_GRAPH_ITERATIONS):
    iterate_graph(votes_df, users_df)
    resolve_graph(users_df)

          71000 / 71048         

In [10]:
# Split the users into two clusters with k-means on their embedding vectors;
# the resulting labels define the user groups needed for testing.
user_matrix = np.stack(users_df['VECTOR'].to_numpy())
kmeans = KMeans(n_clusters=2, random_state=0, n_init='auto').fit(user_matrix)
group_labels = kmeans.labels_

# Print the number of users assigned to each cluster.
group_ids, group_counts = np.unique(group_labels, return_counts=True)
for grpid, grpcnt in zip(group_ids, group_counts):
    print(f'{grpid}: {grpcnt}')

0: 7366
1: 9870


In [11]:
# Tag each user with the name of their k-means cluster (needed for testing).
# Every user starts as "NO_GROUP"; rows with cluster label 0 or 1 are then
# overwritten with the matching group name.
users_df['GROUP'] = "NO_GROUP"
for cluster_label in (0, 1):
    users_df.loc[group_labels == cluster_label, 'GROUP'] = f'GROUP_{cluster_label}'

In [12]:
# Initialise posts_df with one row per voted-on submission, indexed by POST_ID.
# Series.unique() returns the ids in order of first appearance, so the row order
# is the same on every run; list(set(...)) gives no ordering guarantee. This
# also matches how the user list is derived from votes_df.
posts_df = pd.DataFrame({'POST_ID': votes_df['SUBMISSION_ID'].unique()}).set_index('POST_ID')

# Let the imported helper prepare the statistics columns on the empty table.
# NOTE(review): reset_post_stats is defined outside this notebook — confirm
# which columns it creates.
reset_post_stats(posts_df)

In [13]:
# calculate some base statistics for each post
# The helper receives the users, the votes and the posts table.
# NOTE(review): presumably it writes its results into posts_df in place (the
# return value is not used here) — confirm in the helper module.
calculate_post_stats(users_df, votes_df, posts_df)

In [14]:
# calculate the Neeka score for each post
# NOTE(review): the return value is not used, so the helper presumably stores
# the score on posts_df in place — confirm in the helper module.
neeka_score_calculation(posts_df)

In [15]:
# Percentile ranks are computed only for posts where both GROUP1_X and GROUP0_X
# are present (neither is NaN).
x_index = posts_df['GROUP1_X'].notna() & posts_df['GROUP0_X'].notna()

# Work on an independent copy so the ranking helper does not touch posts_df.
posts_x_df = posts_df.loc[x_index].copy()
calculate_percentile_rank(posts_x_df)

# Results

### Statistics

In [16]:
# Compute the test statistics on the posts that have values for both groups
# (posts_x_df) and print them with the imported helper.
results = calculate_test_results(posts_x_df)
print_test_results(results)


    simple_agreement: 52.24 %
    neeka_agreement: 60.96 %
    agreement_change: 8.71 %

    simple_apolar: 28.23 %
    neeka_apolar: 36.53 %
    apolar_change: 8.3 %

    simple_neutrality: 76.78 %
    neeka_neutrality: 86.53 %
    neutrality_change: 9.75 %

    Overall:
    simple_quality: 52.42 %
    neeka_quality: 61.34 %
    quality_change: 8.92 %

    #######################################

    simple_group0_bias: 37.11 %
    simple_group1_bias: 60.33 %
    
    neeka_group0_bias: 46.13 %
    neeka_group1_bias: 59.6 %
    


### Most Promoted/Demoted ALL POSTS 

In [17]:
# No sampling here: the whole posts table is used. `sample` is just another
# name for posts_df (not a copy).
sample = posts_df
# submissions_df is passed alongside the posts.
# NOTE(review): presumably it supplies the titles shown in the listing — confirm.
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "US school 'sorry' for foster care threat over lunch debt"
2. "Four rockets hit Iraqi military base near Baghdad airport"
3. "Nearly 175 Saudi military aviation students grounded in U.S. after base shooting"
4. "Hungary Withdraws From European Singing Competition Because 'It's Too Gay'"
5. "India to buy US$1 billion of US crude for the first time ever"
6. "Students called a suicide hotline listed on their ID cards. It was sex hotline instead."
7. "Man Shot Dead by UK Police Wanted Girlfriend to Behead Her Parents"
8. "Trump signs executive order to support moon mining, tap asteroid resources"
9. "A Russian military ammo depot that blew up earlier this week just exploded again"
10. "Judge rules law enforcement can’t force suspects to unlock their iPhone with Face ID or Touch ID"
-
Most Promoted by **simple-consensus** over Neeka (most demoted by Neeka):
1. "Greta Thunberg is Time's 2019 Per

### Most Promoted/Demoted SAMPLE A

In [18]:
# Draw up to 1000 random posts for this listing.
# - n is capped at the table size, because DataFrame.sample raises ValueError
#   when asked for more rows than exist (without replacement).
# - random_state is fixed so the same posts are drawn on every run.
sample = posts_df.sample(n=min(1000, len(posts_df)), random_state=0)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Kevin Spacey accuser Ari Behn, ex of Norwegian princess, died by suicide, manager says"
2. "eSports player comments on his ban for supporting Hong Kiong: “Today, what I have lost in Hearthstone is four years of time, but if Hong Kong loses it would be the matter of a lifetime.""
3. "Arnold Schwarzenegger said in a press conference that the U.S. Supreme Court's ruling legalizing same-sex marriage nationwide was "the right decision" – and he rebuffed those politicians "not having the balls" to lead"
4. "India Warned by Australian Cyber Officials Against Using Huawei: Reports. | Australia in 2018 became the first country to ban Huawei from supplying equipment for a 5G mobile network."
5. "A Million People Are Jailed at China's Gulags. I Managed to Escape. Here's What Really Goes on Inside"
6. "John Kerry on jet lag: "When I'm flying, I usually take an Ambien and listen to one of my own speec

### Most Promoted/Demoted SAMPLE B

In [19]:
# Draw up to 1000 random posts for this listing.
# - n is capped at the table size, because DataFrame.sample raises ValueError
#   when asked for more rows than exist (without replacement).
# - random_state is fixed so the same posts are drawn on every run.
sample = posts_df.sample(n=min(1000, len(posts_df)), random_state=1)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Trump Met Brazilian Official Who Tested Positive for Coronavirus"
2. "The drug industry wants us to think Martin Shkreli is a rogue CEO. He isn’t."
3. "Teen's TikTok video about China's Muslim camps goes viral"
4. "Bernie Sanders slams Joe Biden on trade before Michigan rally in Detroit"
5. "Federal judge pushes back against Trump criticism of handling of Roger Stone case"
6. "Saturday Morning Political Cartoon Thread"
7. "The Biggest Social Media Operation You’ve Never Heard Of Is Run Out of Cyprus by Russians"
8. "The "What happened in your state last week?" Megathread, Week 47"
9. "70 cases of COVID-19 at French schools days after re-opening"
10. "Judge restores Obama-era drilling ban in Arctic"
-
Most Promoted by **simple-consensus** over Neeka (most demoted by Neeka):
1. "Dan Crenshaw is worried that universal background checks might keep him from arming his friends - It is unclear w

### Most Promoted/Demoted SAMPLE C

In [20]:
# Draw up to 1000 random posts for this listing.
# - n is capped at the table size, because DataFrame.sample raises ValueError
#   when asked for more rows than exist (without replacement).
# - random_state is fixed so the same posts are drawn on every run.
sample = posts_df.sample(n=min(1000, len(posts_df)), random_state=2)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Rich Sportswriter Cheers Rioters Burning Poor Areas but Calls Them ‘Animals’ When They Get To His"
2. "US active duty face poor living conditions in on base housing."
3. "Support for Trump is fading among active-duty troops, new poll shows"
4. "North Korea calls Donald Trump 'senile' and 'bereft of reason'"
5. "Donald Trump incorrect that U.S. dollar is strongest it's ever been"
6. "U.N. says it has credible reports that China holds million Uighurs in secret camps"
7. "Gwyneth Paltrow called out by NASA for selling a ten pack of body-stickers for $60 that "re-balance energy.""
8. "Couple donates bug collection worth $10m, a goldmine for researchers: Collection will help scientists piece together a large branch of insects’ family tree and be a resource for scientists who study natural controls on the environment"
9. "Video Played at Trump Resort Portrays Him Murdering Media, Opponents"
10.