In [1]:
import numpy as np
import pandas as pd
import random
from neeka_lib import *
from reddit_helper import *
from sklearn.cluster import KMeans

In [2]:
# Parameters

# Minimum vote counts handed to filter_to_multiple_votes
# (as min_usr_votes / min_post_votes).
MIN_VOTES_PER_USER = 8
MIN_VOTES_PER_POST = 8

# Embedding settings.
VECTOR_LENGTH = 300  # length of user/post embeddings
N_GRAPH_ITERATIONS = 19  # number of iterate_graph / resolve_graph rounds

# Weights forwarded to neeka_score_calculation
# (as div_weight / cent_weight / pol_weight).
DIV_WEIGHT = 1
CENT_WEIGHT = 1
POL_WEIGHT = 0

# Subreddits to analyse, written without the 'r/' prefix.
SUBREDDIT_NAMES = ['politics', 'news', 'worldnews']

In [3]:
# Load every vote record from the tab-separated votes dump.
all_votes_df = pd.read_csv(
    'reddit_votes_data/44_million_reddit_votes/44_million_votes.txt',
    sep='\t',
)

In [4]:
# Load every submission record (tab-separated) and key the table by SUBMISSION_ID.
all_submissions_df = pd.read_csv(
    'reddit_votes_data/submission_info/submission_info.txt',
    sep='\t',
).set_index('SUBMISSION_ID')

In [5]:
# get all submissions in the specified subreddits
submissions_df = all_submissions_df[all_submissions_df['SUBREDDIT'].isin(SUBREDDIT_NAMES)]

# get all votes in the specified subreddits that were cast on one of those submissions
# (the votes table is matched against 'r/<name>', the submissions table against '<name>')
r_subreddit_names = [f'r/{name}' for name in SUBREDDIT_NAMES]
vote_in_subreddit = all_votes_df['SUBREDDIT'].isin(r_subreddit_names)
vote_on_kept_submission = all_votes_df['SUBMISSION_ID'].isin(submissions_df.index)
# Both conditions are applied together; previously the second filter was re-applied
# to all_votes_df, which silently threw away the subreddit filter.
votes_df = all_votes_df[vote_in_subreddit & vote_on_kept_submission]

In [6]:
# Apply the minimum-votes thresholds from the parameters cell
# (the exact filtering rule lives in the imported filter_to_multiple_votes helper).
votes_df = filter_to_multiple_votes(
    votes_df,
    min_usr_votes=MIN_VOTES_PER_USER,
    min_post_votes=MIN_VOTES_PER_POST,
)

In [7]:
# every distinct voter left after filtering (across all selected subreddits)
pol_users = votes_df['USERNAME'].unique()
n_users = len(pol_users)

# per user: one random +1/-1 vector and one all-zero INTERMEDIARY vector
vectors = pd.Series([np.random.choice([1, -1], VECTOR_LENGTH) for _ in range(n_users)])
intermediary_vectors = pd.Series([np.zeros(VECTOR_LENGTH) for _ in range(n_users)])

# assemble the users table and index it by username
users_df = pd.DataFrame(
    {'USERS': pol_users, 'VECTOR': vectors, 'INTERMEDIARY': intermediary_vectors}
).set_index('USERS')

In [8]:
# Pass the freshly built users table to the imported reset_users helper.
# NOTE(review): presumably this (re)initialises the per-user vectors in place,
# since no return value is used — confirm against neeka_lib.
reset_users(users_df)

In [9]:
# Iterate graph to form user embeddings
# Each of the N_GRAPH_ITERATIONS rounds calls iterate_graph on the votes and users
# tables, then resolve_graph on the users table. Neither return value is used, so
# the helpers presumably update users_df in place — TODO confirm against neeka_lib.
for _ in range(N_GRAPH_ITERATIONS):
    iterate_graph(votes_df, users_df)
    resolve_graph(users_df)

          35600 / 35608         

In [10]:
# Cluster the users into two groups with k-means - this is needed for testing
# Stack the per-user vectors into a single 2-D array, one row per user.
user_matrix = np.stack(users_df['VECTOR'].to_numpy())
kmeans = KMeans(n_clusters=2, random_state=0, n_init='auto').fit(user_matrix)
group_labels = kmeans.labels_

# Report how many users landed in each cluster.
group_ids, group_counts = np.unique(group_labels, return_counts=True)
for cluster_id, cluster_size in zip(group_ids, group_counts):
    print(f'{cluster_id}: {cluster_size}')

0: 4587
1: 8069


In [11]:
# Label every user with its k-means cluster - this is needed for testing
# Start from the fallback label, then overwrite it for each cluster id.
users_df['GROUP'] = "NO_GROUP"
for cluster_id in (0, 1):
    users_df.loc[group_labels == cluster_id, 'GROUP'] = f'GROUP_{cluster_id}'

In [12]:
# initialize the posts_df dataframe: one row per distinct voted-on submission
# .unique() keeps first-appearance order, so the row order is the same on every run;
# list(set(...)) on string ids ordered rows by hash, which can differ between kernels.
post_ids = votes_df['SUBMISSION_ID'].unique()
posts_df = pd.DataFrame({'POST_ID': post_ids}).set_index('POST_ID')
reset_post_stats(posts_df)

In [13]:
# calculate some base statistics for each post
# Imported helper; it receives the users, votes and posts tables. Its return value
# is not used, so it presumably writes the statistics into posts_df — TODO confirm.
calculate_post_stats(users_df, votes_df, posts_df)

In [14]:
# calculate the Neeka score for each post, using the weights from the parameters cell
neeka_score_calculation(
    posts_df,
    div_weight=DIV_WEIGHT,
    cent_weight=CENT_WEIGHT,
    pol_weight=POL_WEIGHT,
)

In [15]:
# calculate percentile rank
# Restrict to posts that have a value in both GROUP0_X and GROUP1_X, and work on a
# copy so posts_df itself is not the object handed to calculate_percentile_rank.
x_index = posts_df['GROUP0_X'].notna() & posts_df['GROUP1_X'].notna()
posts_x_df = posts_df.loc[x_index].copy()
calculate_percentile_rank(posts_x_df)

# Results

### Statistics

In [16]:
# Compute the test results on the posts that have both group values, then print
# them (both helpers are imported; the printed report appears below).
results = calculate_test_results(posts_x_df)
print_test_results(results)


    simple_agreement: 46.25 %
    neeka_agreement: 65.33 %
    agreement_change: 19.08 %

    simple_apolar: 33.62 %
    neeka_apolar: 54.21 %
    apolar_change: 20.59 %

    simple_neutrality: 66.2 %
    neeka_neutrality: 93.55 %
    neutrality_change: 27.36 %

    Overall:
    simple_quality: 48.69 %
    neeka_quality: 71.03 %
    quality_change: 22.34 %

    #######################################

    simple_group0_bias: 24.74 %
    simple_group1_bias: 58.55 %
    
    neeka_group0_bias: 44.84 %
    neeka_group1_bias: 51.29 %
    


### Most Promoted/Demoted ALL POSTS 

In [17]:
# Use every post (no sampling) and print the most-changed posts; submissions_df is
# passed alongside, presumably so titles can be looked up — TODO confirm.
sample = posts_df
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Apple and Samsung fined for deliberately slowing down phones"
2. "FBI: Nation-state actors have breached two US municipalities"
3. "A former neighbor of Joe Biden's accuser Tara Reade has come forward to corroborate her sexual assault account, saying Reade discussed the allegations in detail in the mid-1990s"
4. "Spain: prosecutors claim attack not rape as victim ‘did not fight back’ - Six men accused of sexual abuse of 14-year-old girl in abandoned factory in Manresa"
5. "Rapper Pop Smoke killed in home invasion"
6. "Not a conspiracy anymore"
7. "Feb. 11 Is 'The Day We Fight Back' Against NSA Surveillance | More than 5,300 web-based companies and other organizations, including Reddit, Imgur, Tumblr, Mozilla the Electronic Frontier Foundation and the ACLU, have joined forces to protest National Security Agency surveillance on Feb. 11."
8. "Banksy says that most "Banksy" works are fake"
9.

### Most Promoted/Demoted SAMPLE A

In [18]:
# Sample A: up to 1000 random posts. The draw is seeded so the printed list can be
# regenerated, and the size is capped at len(posts_df) because sample() raises when
# asked for more rows than exist (sampling without replacement).
sample = posts_df.sample(n=min(1000, len(posts_df)), random_state=1)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Two years after Ireland introduced some of the strictest laws in the world on lobbying transparency, the reforms are being held up as the gold standard for policymakers looking to shine a light on the often murky world of influence peddling."
2. "Germany cuts fares for long-distance rail travel in response to climate crisis"
3. "Walmart says it will raise age restriction to 21 for gun purchases, remove items resembling assault-style rifles from website"
4. "Sanders Files Bill to Break Up Big Banks"
5. "Comey: ‘If this were a case about somebody other than the president, they'd already have been indicted’"
6. "Trump Taunts Democrats: “We Have All the Material. They Don’t Have the Material.”"
7. "Fast food cashier and manager fired for refusing to serve police officer"
8. "St. Louis will drop minimum wage from $10 to $7.70."
9. "FDA: CBD and hemp are now legal in the U.S"
10. "Motorcade car

### Most Promoted/Demoted SAMPLE B

In [19]:
# Sample B: up to 1000 random posts. The draw is seeded so the printed list can be
# regenerated, and the size is capped at len(posts_df) because sample() raises when
# asked for more rows than exist (sampling without replacement).
sample = posts_df.sample(n=min(1000, len(posts_df)), random_state=2)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "US announces withdrawal from TPP"
2. "Mitch McConnell should drop the lame excuses. Let the Senate vote on bipartisan justice, prison reforms."
3. "FCC chairman warns about China's 'leverage' over NBA: Imagine what Beijing can do with 5G networks"
4. "Republican candidate body-slams Guardian reporter in Montana"
5. "Trump Administration to LGBT Couples: Your ‘Out of Wedlock’ Kids Aren’t Citizens"
6. "West Virginia Law Makers Vote to Let Foster Care Agencies Turn Away LGBTQ Children, Parents"
7. "First bear in Germany in 16 years"
8. "Houston billionaire Tilman Fertitta furloughed 40,000 workers nationwide"
9. "China Tears Down the Tibetan City in the Sky: demolishing homes and evicting thousands from Larung Gar, the world’s largest Tibetan Buddhist institution"
10. "Mass Shooting in Virginia: Witnesses Say Gunman Opened Fire on Members of Congress"
-
Most Promoted by **simple-consensus** 

### Most Promoted/Demoted SAMPLE C

In [20]:
# Sample C: up to 1000 random posts. The draw is seeded so the printed list can be
# regenerated, and the size is capped at len(posts_df) because sample() raises when
# asked for more rows than exist (sampling without replacement).
sample = posts_df.sample(n=min(1000, len(posts_df)), random_state=3)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Doctor who exposed Sars cover-up under house arrest in China, family confirms"
2. "Hong Kong protests: second car rams protesters as teargas deployed"
3. "Wisconsin judge: Strip clubs should be eligible for emergency loans"
4. "Salman Khan and his Khan Academy online platform that offers free classes worldwide have won Spain's Princess of Asturias award for international cooperation."
5. "Newly found video evidence may indicate turkish coup was facade"
6. "'Medicare-for-all' gets unexpected surge of support, even in red states"
7. ""FCC’s claim that one ISP counts as “competition” faces scrutiny in court""
8. "Democrat Warren vows to use 'every tool' to combat white nationalist violence"
9. "The UK, Australia, and US generally have the highest climate denial rates in the world. The big thing they have in common (aside from language) is a major Murdoch media presence"
10. "Opinion: I Asked