In [1]:
import numpy as np
import pandas as pd
import random
from neeka_lib import *
from reddit_helper import *
from sklearn.cluster import KMeans

In [2]:
# Parameters
# Minimum-vote thresholds; passed to filter_to_multiple_votes as
# min_usr_votes / min_post_votes respectively.
MIN_VOTES_PER_USER = 8
MIN_VOTES_PER_POST = 8
# Number of iterate_graph + resolve_graph passes run to form the user embeddings.
N_GRAPH_ITERATIONS = 7
VECTOR_LENGTH = 300 # length of user/post embeddings
# Subreddits to analyse; matched against the SUBREDDIT column of the
# submissions table as-is, and with an 'r/' prefix against the votes table.
SUBREDDIT_NAMES = ['politics', 'news', 'worldnews']

In [3]:
# Load the complete, tab-separated vote log into memory.
all_votes_df = pd.read_csv(
    "reddit_votes_data/44_million_reddit_votes/44_million_votes.txt",
    sep="\t",
)

In [4]:
# Load the tab-separated metadata for every submission and key the table
# by SUBMISSION_ID.
all_submissions_df = pd.read_csv(
    "reddit_votes_data/submission_info/submission_info.txt",
    sep="\t",
).set_index("SUBMISSION_ID")

In [5]:
# Keep only the submissions that were posted in the configured subreddits.
submissions_df = all_submissions_df[all_submissions_df['SUBREDDIT'].isin(SUBREDDIT_NAMES)]

# Keep only the votes cast in those subreddits; the votes table is matched
# against the 'r/'-prefixed form of each name.
r_subreddit_names = [f'r/{NAME}' for NAME in SUBREDDIT_NAMES]
votes_df = all_votes_df[all_votes_df['SUBREDDIT'].isin(r_subreddit_names)]
# Narrow the subreddit-filtered votes to submissions we have metadata for.
# This must filter votes_df, not all_votes_df: re-filtering the full table
# would throw away the subreddit filter applied on the previous line.
votes_df = votes_df[votes_df['SUBMISSION_ID'].isin(submissions_df.index)]

In [6]:
# Apply the per-user / per-post minimum-vote thresholds. The exact filtering
# semantics live in filter_to_multiple_votes, which is not defined in this notebook.
# NOTE: votes_df is rebound, so re-running this cell filters the already-filtered frame.
votes_df = filter_to_multiple_votes(
    votes_df,
    min_usr_votes=MIN_VOTES_PER_USER,
    min_post_votes=MIN_VOTES_PER_POST,
)

In [7]:
# Seed the global RNGs so the random starting embeddings are identical on
# every Restart & Run All. Both generators are seeded because the imported
# helpers may draw from either one (their source is not visible here).
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# every distinct username present in the current votes table
pol_users = votes_df['USERNAME'].unique()

# one random +1/-1 starting vector and one zeroed intermediary vector per user
vectors = pd.Series([ np.random.choice([1,-1],VECTOR_LENGTH) for _ in range(len(pol_users)) ])
intermediary_vectors = pd.Series([ np.zeros(VECTOR_LENGTH) for _ in range(len(pol_users)) ])

# create our dataframe of users, indexed by username
users_df = pd.DataFrame(
    {'USERS':pol_users,'VECTOR':vectors, 'INTERMEDIARY':intermediary_vectors}
).set_index('USERS')

In [8]:
# Hand the freshly built user table to the reset helper. The return value is
# discarded, so presumably users_df is modified in place — TODO confirm in the helper module.
reset_users(users_df)

In [9]:
# Form the user embeddings: each pass runs iterate_graph over the votes and
# then resolve_graph on the user table, N_GRAPH_ITERATIONS times in total.
for iteration_idx in range(N_GRAPH_ITERATIONS):
    iterate_graph(votes_df, users_df)
    resolve_graph(users_df)

          35600 / 35608         

In [10]:
# Cluster users into two groups by their embedding vectors - this is needed for testing.
# Stack the per-user vectors into a single 2-D array for scikit-learn.
user_matrix = np.stack(users_df['VECTOR'].to_numpy())
kmeans = KMeans(n_clusters=2, random_state=0, n_init='auto')
kmeans.fit(user_matrix)
group_labels = kmeans.labels_

# Report how many users fell into each cluster.
group_ids, group_counts = np.unique(group_labels, return_counts=True)
for cluster_id, cluster_size in zip(group_ids, group_counts):
    print(f'{cluster_id}: {cluster_size}')

0: 4677
1: 7979


In [11]:
# Label every user with its k-means cluster - this is needed for testing.
# Rows whose label matches neither cluster keep the "NO_GROUP" placeholder.
users_df['GROUP'] = "NO_GROUP"
for cluster_id, group_name in ((0, 'GROUP_0'), (1, 'GROUP_1')):
    users_df.loc[group_labels == cluster_id, 'GROUP'] = group_name

In [12]:
# initialize the posts_df dataframe with one row per voted-on submission.
# Series.unique() returns ids in order of first appearance, so the row order is
# the same on every run. Going through set() gave an arbitrary order that can
# change between kernel sessions (e.g. for string ids, whose hashes are
# randomized per process), which made later sampling/ordering unrepeatable.
posts_df = pd.DataFrame({'POST_ID': votes_df['SUBMISSION_ID'].unique()}).set_index('POST_ID')
reset_post_stats(posts_df)

In [13]:
# calculate some base statistics for each post
# (the return value is discarded, so presumably posts_df is filled in place — TODO confirm in the helper module)
calculate_post_stats(users_df, votes_df, posts_df)

In [14]:
# calculate the Neeka score for each post
# (the return value is discarded, so presumably posts_df is updated in place — TODO confirm in the helper module)
neeka_score_calculation(posts_df)

In [15]:
# Percentile ranks are computed only for posts where both GROUP1_X and
# GROUP0_X are present (non-NaN).
x_index = posts_df[['GROUP1_X', 'GROUP0_X']].notna().all(axis=1)
# Work on an independent copy so the ranking helper does not touch posts_df.
posts_x_df = posts_df.loc[x_index].copy()
calculate_percentile_rank(posts_x_df)

# Results

### Statistics

In [16]:
# Compute the evaluation metrics over posts_x_df (the posts that have both
# GROUP0_X and GROUP1_X values) and print the formatted report.
results = calculate_test_results(posts_x_df)
print_test_results(results)


    simple_agreement: 47.78 %
    neeka_agreement: 64.86 %
    agreement_change: 17.09 %

    simple_apolar: 34.45 %
    neeka_apolar: 52.87 %
    apolar_change: 18.43 %

    simple_neutrality: 67.2 %
    neeka_neutrality: 89.77 %
    neutrality_change: 22.56 %

    Overall:
    simple_quality: 49.81 %
    neeka_quality: 69.17 %
    quality_change: 19.36 %

    #######################################

    simple_group0_bias: 25.77 %
    simple_group1_bias: 58.57 %
    
    neeka_group0_bias: 43.16 %
    neeka_group1_bias: 53.4 %
    


### Most Promoted/Demoted ALL POSTS 

In [17]:
# Use every post (no sub-sampling) for the promoted/demoted listing;
# submissions_df is passed alongside so the helper has the submission metadata.
sample = posts_df
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Apple and Samsung fined for deliberately slowing down phones"
2. "FBI: Nation-state actors have breached two US municipalities"
3. "A former neighbor of Joe Biden's accuser Tara Reade has come forward to corroborate her sexual assault account, saying Reade discussed the allegations in detail in the mid-1990s"
4. "Feb. 11 Is 'The Day We Fight Back' Against NSA Surveillance | More than 5,300 web-based companies and other organizations, including Reddit, Imgur, Tumblr, Mozilla the Electronic Frontier Foundation and the ACLU, have joined forces to protest National Security Agency surveillance on Feb. 11."
5. "Putin approves law targeting journalists as 'foreign agents'"
6. "Iceland sentences 26 corrupt bankers to 74 years in prison"
7. "Major tech firms urge U.S. to retain net neutrality rules"
8. "Houston Chronicle calls on O'Rourke to end White House bid, run for Senate"
9. "Vinnie Paul Dea

### Most Promoted/Demoted SAMPLE A

In [18]:
# SAMPLE A: a random subset of up to 1000 posts. The fixed random_state makes
# the same rows get picked on every run (given the same posts_df row order),
# and capping the size avoids a ValueError when fewer than 1000 posts exist.
sample = posts_df.sample(min(1000, len(posts_df)), random_state=1)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "N. Macedonia becomes 30th NATO member."
2. "Australian intelligence determined China was responsible for a cyber-attack on its national parliament and three largest political parties before the general election in May, five people with direct knowledge of the matter told Reuters."
3. "Shep Smith Out at Fox News Amid Trump Tension"
4. "IRS, FBI Search Home of Baltimore Mayor Catherine Pugh, Baltimore City Hall"
5. "Shaq Defends Free Speech on NBA Opening Night: ‘Daryl Morey Was Right’"
6. "60 students without vaccines kept out of school in Fargo, West Fargo"
7. "'Medicare-for-all' gets unexpected surge of support, even in red states"
8. "Trump Has Dropped the Pretense of Playing by the Rules of Democracy"
9. "Democratic Primaries: Great opportunity to test ranked choice voting on a national level"
10. "Questioning Tara Reade’s story doesn’t make one a rape apologist: On Joe Biden and #MeTo

### Most Promoted/Demoted SAMPLE B

In [19]:
# SAMPLE B: a second random subset of up to 1000 posts. It uses its own fixed
# random_state so it differs from the other samples yet is the same on every
# run (given the same posts_df row order); the size cap avoids a ValueError
# when fewer than 1000 posts exist.
sample = posts_df.sample(min(1000, len(posts_df)), random_state=2)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Photos Emerge From Kashmir, a Land on Lockdown. Indian photographers managed to work around a communication blockade to publish their images"
2. "Canada's Bianca Andreescu defeats Serena Williams to win U.S. Open"
3. "Woman who stood on ship's railing for selfie barred for life from cruises"
4. "Shell Workers Had To Attend Trump Speech To Be Paid, Were Ordered Not To Protest: Report"
5. "Rape convict exonerated 36 years later"
6. "Remains of 3 people found during renovations at the Alamo"
7. "Shinzo Abe says there are issues with WHO, and Japan will review its funding after the pandemic"
8. "NSA whistleblower Edward Snowden nominated for Nobel Peace Prize"
9. "Magic mushroom ingredient psilocybin could be key to treating depression - studies | Immediate reduction in depression and anxiety for up to eight months seen in patients with advanced cancer given a single dose of psilocybin"
10. "

### Most Promoted/Demoted SAMPLE C

In [20]:
# SAMPLE C: a third random subset of up to 1000 posts. It uses its own fixed
# random_state so it differs from the other samples yet is the same on every
# run (given the same posts_df row order); the size cap avoids a ValueError
# when fewer than 1000 posts exist.
sample = posts_df.sample(min(1000, len(posts_df)), random_state=3)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Not a conspiracy anymore"
2. "Alberta veterinarians vote to ban declawing, ear cropping, tail docking surgeries: ‘It’s inhumane’"
3. "US police arrest 36-year-old nurse after patient in a vegetative state gave birth"
4. "Comey: ‘If this were a case about somebody other than the president, they'd already have been indicted’"
5. "Greta Thunberg leaves US with simple climate crisis message: vote"
6. "In Year-End Address, Dying Healthcare Activist Ady Barkan Reminds Public Joe Biden Only 2020 Democrat Not to Meet With Him"
7. "Canadians who smoke marijuana legally, or work or invest in the industry, will be barred from the U.S.: Customs and Border Protection official"
8. "A Georgia death row inmate has asked to be executed by firing squad"
9. "Trump signs executive order denying asylum to illegal border crossers"
10. "Saudi journalist recorded his own torture and murder on Apple Watch"
-
Most