In [1]:
import numpy as np
import pandas as pd
import random
from neeka_lib import *
from reddit_helper import *
from sklearn.cluster import KMeans

In [2]:
# Tunable parameters for this run
MIN_VOTES_PER_USER = 8    # passed as min_usr_votes when filtering votes
MIN_VOTES_PER_POST = 8    # passed as min_post_votes when filtering votes
N_GRAPH_ITERATIONS = 13   # number of iterate/resolve passes over the graph
VECTOR_LENGTH = 300       # length of user/post embeddings
SUBREDDIT_NAMES = [
    'politics',
    'news',
    'worldnews',
]

In [3]:
# Load the complete vote table from its tab-separated text file.
all_votes_df = pd.read_csv(
    'reddit_votes_data/44_million_reddit_votes/44_million_votes.txt',
    sep='\t',
)

In [4]:
# Load the complete submission table (tab-separated) and index it by
# SUBMISSION_ID so submissions can be looked up by id.
all_submissions_df = pd.read_csv(
    'reddit_votes_data/submission_info/submission_info.txt',
    sep='\t',
).set_index('SUBMISSION_ID')

In [5]:
# Submissions that were posted in one of the selected subreddits.
submissions_df = all_submissions_df[all_submissions_df['SUBREDDIT'].isin(SUBREDDIT_NAMES)]

# Votes that were cast in one of the selected subreddits. The vote table is
# matched against the 'r/<name>' form of each subreddit name.
r_subreddit_names = [f'r/{name}' for name in SUBREDDIT_NAMES]
votes_df = all_votes_df[all_votes_df['SUBREDDIT'].isin(r_subreddit_names)]

# Narrow the already-filtered votes to submissions we have info for.
# This must filter votes_df, not all_votes_df: re-filtering the full table
# would silently throw away the subreddit filter applied just above.
votes_df = votes_df[votes_df['SUBMISSION_ID'].isin(submissions_df.index)]

# Fail loudly if the two filters disagree (e.g. the subreddit labels in the
# vote table are not in 'r/<name>' form) instead of continuing with no data.
assert not votes_df.empty, (
    "No votes left after filtering by subreddit and submission id; "
    "check SUBREDDIT_NAMES against the SUBREDDIT labels in the vote table"
)

In [6]:
# Apply the per-user and per-post minimum vote thresholds from the
# parameters cell (the filtering itself is done by the imported helper).
votes_df = filter_to_multiple_votes(
    votes_df,
    min_usr_votes=MIN_VOTES_PER_USER,
    min_post_votes=MIN_VOTES_PER_POST,
)

In [7]:
# Unique usernames that appear in the filtered votes (across every selected
# subreddit, not only politics).
pol_users = votes_df['USERNAME'].unique()

# Dedicated seeded generator so the random starting vectors are the same on
# every run; it does not touch NumPy's global random state.
# NOTE(review): if reset_users() re-randomizes VECTOR, it needs seeding too.
init_rng = np.random.default_rng(0)

# Per user: one random +1/-1 vector of length VECTOR_LENGTH, and one
# all-zero vector of the same length used as intermediary storage.
vectors = pd.Series([ init_rng.choice([1,-1],VECTOR_LENGTH) for _ in range(len(pol_users)) ])
intermediary_vectors = pd.Series([ np.zeros(VECTOR_LENGTH) for _ in range(len(pol_users)) ])

# Assemble the user table, indexed by username.
users_df = pd.DataFrame(
    {'USERS':pol_users,'VECTOR':vectors, 'INTERMEDIARY':intermediary_vectors}
).set_index('USERS')

In [8]:
# Reset the user table before iterating the graph.
# NOTE(review): reset_users comes from a star import; exactly which columns it
# resets (and whether it re-randomizes VECTOR) is not visible here — confirm.
reset_users(users_df)

In [9]:
# Form the user embeddings: run N_GRAPH_ITERATIONS passes, each one calling
# iterate_graph on the votes and users, then resolve_graph on the users.
for iteration in range(N_GRAPH_ITERATIONS):
    iterate_graph(votes_df, users_df)
    resolve_graph(users_df)

          35600 / 35608         

In [10]:
# Split the users into two clusters with k-means on their embedding vectors.
# The resulting groups are needed for testing.
user_matrix = np.stack(users_df['VECTOR'].to_numpy())
kmeans = KMeans(n_clusters=2, random_state=0, n_init='auto').fit(user_matrix)
group_labels = kmeans.labels_

# Print how many users were assigned to each cluster id.
group_ids, group_counts = np.unique(group_labels, return_counts=True)
for cluster_id, cluster_size in zip(group_ids, group_counts):
    print(f'{cluster_id}: {cluster_size}')

0: 4614
1: 8042


In [11]:
# Label every user with their k-means cluster - this is needed for testing.
# Everyone starts as "NO_GROUP"; users in cluster 0 / 1 are then relabelled
# 'GROUP_0' / 'GROUP_1'.
users_df['GROUP'] = "NO_GROUP"
for cluster_id in (0, 1):
    users_df.loc[group_labels == cluster_id, 'GROUP'] = f'GROUP_{cluster_id}'

In [12]:
# Initialize posts_df with one row per distinct voted-on submission, indexed
# by POST_ID. Series.unique() keeps ids in order of first appearance, so the
# row order is the same on every run; going through set() would make it
# depend on Python's per-process string hash randomization.
posts_df = pd.DataFrame({'POST_ID': votes_df['SUBMISSION_ID'].unique()}).set_index('POST_ID')
reset_post_stats(posts_df)

In [13]:
# Calculate some base statistics for each post from the users and their votes.
# NOTE(review): calculate_post_stats is a star-imported helper and appears to
# fill posts_df in place (its return value is not used) — confirm which
# columns it writes.
calculate_post_stats(users_df, votes_df, posts_df)

In [14]:
# Calculate the Neeka score for each post.
# NOTE(review): the return value is not used, so the helper presumably writes
# its result into posts_df — confirm in the helper module.
neeka_score_calculation(posts_df)

In [15]:
# Percentile rank is computed only for posts where both GROUP0_X and GROUP1_X
# are present (neither is NaN); work on a copy so posts_df is left untouched.
x_index = posts_df['GROUP1_X'].notna() & posts_df['GROUP0_X'].notna()
posts_x_df = posts_df.loc[x_index].copy()
calculate_percentile_rank(posts_x_df)

# Results

### Statistics

In [16]:
# Compute the test statistics on the posts that have both group values
# (posts_x_df) and print them.
results = calculate_test_results(posts_x_df)
print_test_results(results)


    simple_agreement: 46.46 %
    neeka_agreement: 65.13 %
    agreement_change: 18.67 %

    simple_apolar: 33.43 %
    neeka_apolar: 53.55 %
    apolar_change: 20.12 %

    simple_neutrality: 66.5 %
    neeka_neutrality: 92.94 %
    neutrality_change: 26.45 %

    Overall:
    simple_quality: 48.8 %
    neeka_quality: 70.54 %
    quality_change: 21.75 %

    #######################################

    simple_group0_bias: 25.11 %
    simple_group1_bias: 58.61 %
    
    neeka_group0_bias: 44.66 %
    neeka_group1_bias: 51.72 %
    


### Most Promoted/Demoted ALL POSTS 

In [17]:
# Use every post (no sampling) and list the ones whose ranking changed most;
# submissions_df is passed alongside the posts, presumably to look up titles.
sample = posts_df
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Apple and Samsung fined for deliberately slowing down phones"
2. "FBI: Nation-state actors have breached two US municipalities"
3. "A former neighbor of Joe Biden's accuser Tara Reade has come forward to corroborate her sexual assault account, saying Reade discussed the allegations in detail in the mid-1990s"
4. "Spain: prosecutors claim attack not rape as victim ‘did not fight back’ - Six men accused of sexual abuse of 14-year-old girl in abandoned factory in Manresa"
5. "Feb. 11 Is 'The Day We Fight Back' Against NSA Surveillance | More than 5,300 web-based companies and other organizations, including Reddit, Imgur, Tumblr, Mozilla the Electronic Frontier Foundation and the ACLU, have joined forces to protest National Security Agency surveillance on Feb. 11."
6. "Rapper Pop Smoke killed in home invasion"
7. "Not a conspiracy anymore"
8. "Houston Chronicle calls on O'Rourke to end White 

### Most Promoted/Demoted SAMPLE A

In [18]:
# Sample A: up to 1000 random posts. The draw is seeded so it is repeatable
# for a given posts_df row order, and the size is capped at the number of
# posts so small datasets do not make sample() raise.
sample = posts_df.sample(n=min(1000, len(posts_df)), random_state=1)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Photos Emerge From Kashmir, a Land on Lockdown. Indian photographers managed to work around a communication blockade to publish their images"
2. "The Pentagon awarded Lockheed Martin $946 million on behalf of the Kingdom of Saudi Arabia for the defense giants’ THAAD missile defense system"
3. "Chinese schools have begun enforcing "smart uniforms" embedded with computer chips to monitor student movements and prevent them from skipping classes. As students enter the school, the time and date is recorded along with a short video that parents can access via a mobile app."
4. "Mormon Senator goes ahead with medical marijuana bill despite his church's objection."
5. "Anger grows as Puerto Rico misses power restoration deadline"
6. "The Liz and Bernie show: Progressive duo shows us why they're electable |Warren and Sanders lay waste to moderate foes, and to the false dichotomy between "electable

### Most Promoted/Demoted SAMPLE B

In [19]:
# Sample B: up to 1000 random posts. The draw is seeded so it is repeatable
# for a given posts_df row order, and the size is capped at the number of
# posts so small datasets do not make sample() raise.
sample = posts_df.sample(n=min(1000, len(posts_df)), random_state=2)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Emmanuel Macron to introduce new tax on expensive jewellery, supercars and luxury yachts"
2. "The bushfires in Australia are so big they're generating their own weather systems — 'pyrocumulonimbus' thunderstorms that can start more fires."
3. "US police arrest 36-year-old nurse after patient in a vegetative state gave birth"
4. "China Killing Prisoners To Harvest Organs For Transplant: BMC Report Accuses China Of ‘Falsifying’ Data"
5. "Tibetan students and government workers are banned from participating in religious observances, as Chinese authorities continue efforts to separate Tibetans from their cultural heritage and identity."
6. "Chinese Passengers refuse to board flight with Wuhan natives after recognizing their accents, causing a 5-hour standoff at Nagoya Airport"
7. "Questioning Tara Reade’s story doesn’t make one a rape apologist: On Joe Biden and #MeToo"
8. "CBS News poll: Maj

### Most Promoted/Demoted SAMPLE C

In [20]:
# Sample C: up to 1000 random posts. The draw is seeded so it is repeatable
# for a given posts_df row order, and the size is capped at the number of
# posts so small datasets do not make sample() raise.
sample = posts_df.sample(n=min(1000, len(posts_df)), random_state=3)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Rapper Pop Smoke killed in home invasion"
2. "In a break from tradition, I am endorsing all 12 Democratic candidates"
3. "Texas Accepts Mexico's Offer of Harvey Relief"
4. "CDC has stopped disclosing number of Coronavirus tests and cases in the U. S."
5. "CBS News poll: Majority of Americans and Democrats approve of Trump impeachment inquiry"
6. "Fox News' Shepard Smith departs network, steps down as chief news anchor"
7. "OK Boomers, Wake Up! — Sanders is the person you used to be but forgot about."
8. "South Dakota will require "In God We Trust" signs in all public schools"
9. "Marion "Suge" Knight sentenced to 28 years in prison"
10. "Atlanta student found dead filed a police report on unwanted kissing days before she vanished, authorities say"
-
Most Promoted by **simple-consensus** over Neeka (most demoted by Neeka):
1. "Millennials support socialism because they want to make America