In [1]:
import numpy as np
import pandas as pd
import random
from neeka_lib import *
from reddit_helper import *
from sklearn.cluster import KMeans

In [2]:
# Parameters
MIN_VOTES_PER_USER = 8
MIN_VOTES_PER_POST = 8
N_GRAPH_ITERATIONS = 19
VECTOR_LENGTH = 300 # length of user/post embeddings
SUBREDDIT_NAMES = ['politics', 'news', 'worldnews']

In [3]:
# get ALL votes
all_votes_df = pd.read_csv('reddit_votes_data/44_million_reddit_votes/44_million_votes.txt',sep='\t')

In [4]:
# get ALL submissions
all_submissions_df = pd.read_csv('reddit_votes_data/submission_info/submission_info.txt',sep='\t')
all_submissions_df.set_index('SUBMISSION_ID',inplace=True)

In [5]:
# get all submissions in specified subreddits submissions
submissions_df = all_submissions_df[all_submissions_df['SUBREDDIT'].isin(SUBREDDIT_NAMES)]

# get all submissions in specified subreddits submissions
r_subreddit_names = [f'r/{NAME}' for NAME in SUBREDDIT_NAMES]
votes_df = all_votes_df[all_votes_df['SUBREDDIT'].isin(r_subreddit_names)]
votes_df = all_votes_df[all_votes_df['SUBMISSION_ID'].isin(submissions_df.index)]

In [6]:
votes_df = filter_to_multiple_votes(votes_df, min_usr_votes=MIN_VOTES_PER_USER, min_post_votes=MIN_VOTES_PER_POST)

In [7]:
len(votes_df)

1174890

In [8]:
votes_df = filter_to_multiple_votes(votes_df, min_usr_votes=MIN_VOTES_PER_USER, min_post_votes=MIN_VOTES_PER_POST)
len(votes_df)

1174890

In [9]:
# get all politics users 
pol_users = votes_df['USERNAME'].unique()

# create our dataframe of users
vectors = pd.Series([ np.random.choice([1,-1],VECTOR_LENGTH) for _ in range(len(pol_users)) ])
intermediary_vectors = pd.Series([ np.zeros(VECTOR_LENGTH) for _ in range(len(pol_users)) ])

users_df = pd.DataFrame({'USERS':pol_users,'VECTOR':vectors, 'INTERMEDIARY':intermediary_vectors})
users_df.set_index('USERS',inplace=True)

In [10]:
reset_users(users_df)

In [11]:
# Iterate graph to form user embeddings
for _ in range(N_GRAPH_ITERATIONS):
    iterate_graph(votes_df, users_df)
    resolve_graph(users_df)

          35600 / 35608         

In [12]:
# Perform k-means clustering to group users - this is needed for testing
kmeans = KMeans(n_clusters=2, random_state=0, n_init='auto')
kmeans.fit(  np.stack(users_df['VECTOR'].to_numpy())  )
group_labels = kmeans.labels_
group_ids, group_counts = np.unique(group_labels, return_counts=True)
for grpid, grpcnt in zip(group_ids,group_counts):
    print(f'{grpid}: {grpcnt}')

0: 4597
1: 8059


In [13]:
# Put users into groups - this is needed for testing
users_df['GROUP'] = "NO_GROUP"
users_df.loc[group_labels == 0,'GROUP'] = 'GROUP_0'
users_df.loc[group_labels == 1,'GROUP'] = 'GROUP_1' 

In [14]:
# initialize the posts_df dataframe
posts_df = pd.DataFrame({'POST_ID':list(set(votes_df['SUBMISSION_ID']))})
posts_df.set_index('POST_ID',inplace=True)
reset_post_stats(posts_df)

In [15]:
# calculate some base statistics for each post
calculate_post_stats(users_df, votes_df, posts_df)

In [16]:
# calvulate the Neeka score for each post
neeka_score_calculation(posts_df)

In [17]:
# calculate percentile rank 
x_index = np.logical_not(np.logical_or( posts_df['GROUP1_X'].isna(), posts_df['GROUP0_X'].isna())) 
posts_x_df = posts_df[x_index].copy()
calculate_percentile_rank(posts_x_df)

# Results

### Statistics

In [18]:
results = calculate_test_results(posts_x_df)
print_test_results(results)


    simple_agreement: 46.21 %
    neeka_agreement: 65.27 %
    agreement_change: 19.06 %

    simple_apolar: 33.42 %
    neeka_apolar: 54.0 %
    apolar_change: 20.58 %

    simple_neutrality: 66.32 %
    neeka_neutrality: 93.9 %
    neutrality_change: 27.58 %

    Overall:
    simple_quality: 48.65 %
    neeka_quality: 71.06 %
    quality_change: 22.41 %

    #######################################

    simple_group0_bias: 24.84 %
    simple_group1_bias: 58.52 %
    
    neeka_group0_bias: 45.05 %
    neeka_group1_bias: 51.15 %
    


### Most Promoted/Demoted ALL POSTS 

In [19]:
sample = posts_df
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Apple and Samsung fined for deliberately slowing down phones"
2. "FBI: Nation-state actors have breached two US municipalities"
3. "A former neighbor of Joe Biden's accuser Tara Reade has come forward to corroborate her sexual assault account, saying Reade discussed the allegations in detail in the mid-1990s"
4. "Spain: prosecutors claim attack not rape as victim ‘did not fight back’ - Six men accused of sexual abuse of 14-year-old girl in abandoned factory in Manresa"
5. "Rapper Pop Smoke killed in home invasion"
6. "Not a conspiracy anymore"
7. "Feb. 11 Is 'The Day We Fight Back' Against NSA Surveillance | More than 5,300 web-based companies and other organizations, including Reddit, Imgur, Tumblr, Mozilla the Electronic Frontier Foundation and the ACLU, have joined forces to protest National Security Agency surveillance on Feb. 11."
8. "Banksy says that most "Banksy" works are fake"
9.

### Most Promoted/Demoted SAMPLE A

In [20]:
sample = posts_df.sample(1000)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Iceland sentences 26 corrupt bankers to 74 years in prison"
2. "Woman who partied while children died in hot car to serve 40 years in prison"
3. "Shell Workers Had To Attend Trump Speech To Be Paid, Were Ordered Not To Protest: Report"
4. "My name is Bill Browder, I’m the founder and CEO of Hermitage Capital Management, head of the Global Magnitsky Justice Campaign and the author of the New York Times bestseller - Red Notice. I am also Putin’s number one enemy. AMA"
5. "Montreal police tracked journalist's iPhone: “I was living in the fiction that police officers wouldn’t dare do that, and in the fiction that judges were protecting journalists -- and hence the public -- against this type of police intrusion,” Lagace said. “Clearly, I was naive.”"
6. "FCC chairman warns about China's 'leverage' over NBA: Imagine what Beijing can do with 5G networks"
7. "Canadians who smoke marijuana legall

### Most Promoted/Demoted SAMPLE B

In [21]:
sample = posts_df.sample(1000)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "World War II paratrooper Donald Malarky of Easy Company has passed away at 96"
2. "Shinzo Abe says there are issues with WHO, and Japan will review its funding after the pandemic"
3. "FCC chairman warns about China's 'leverage' over NBA: Imagine what Beijing can do with 5G networks"
4. "Paul Manafort to be sentenced Thursday; faces 25 years in prison"
5. "Boris Johnson's Conservative Party wins UK election with commanding majority, Sky News projects"
6. "Bill Barr, finally under fire: Will Trump's henchman survive a DOJ insurrection?"
7. "'Trump Is Getting a Little Bit Nervous,' Says Sanders Campaign as GOP Plots Anti-Bernie 'Victims of Socialism' Videos | "The president's campaign is quickly realizing that the only way to attack someone who spent his entire life standing with the working class is to lie.""
8. "Susan Collins’ disapproval rating spikes in new poll ahead of targeted 2020 re

### Most Promoted/Demoted SAMPLE C

In [22]:
sample = posts_df.sample(1000)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Spain: prosecutors claim attack not rape as victim ‘did not fight back’ - Six men accused of sexual abuse of 14-year-old girl in abandoned factory in Manresa"
2. "San Diego lab discovers COVID-19 vaccine in 3 hours | Inovio Pharmaceuticals created a vaccine that is going through pre-clinical trials. "We have an algorithm which we designed, and we put the DNA sequence into our algorithm and came up with the vaccine in that short amount of time""
3. "Shell Workers Had To Attend Trump Speech To Be Paid, Were Ordered Not To Protest: Report"
4. "Stacey Abrams Will Not Run for President in 2020, Focusing Instead on Fighting Voter Suppression"
5. "Couple jailed for genital mutilation of daughter"
6. "Ontario introduces 'pay transparency' bill to require all publicly advertised job postings to include a salary rate or range, bar employers from asking about past compensation and prohibit reprisal 