In [1]:
import numpy as np
import pandas as pd
import random
from neeka_lib import *
from reddit_helper import *
from sklearn.cluster import KMeans

In [2]:
# Parameters
MIN_VOTES_PER_USER = 8   # passed to filter_to_multiple_votes as min_usr_votes
MIN_VOTES_PER_POST = 8   # passed to filter_to_multiple_votes as min_post_votes
N_GRAPH_ITERATIONS = 99  # number of iterate_graph / resolve_graph rounds
VECTOR_LENGTH = 300 # length of user/post embeddings
SUBREDDIT_NAMES = ['politics', 'news', 'worldnews']
RANDOM_SEED = 0          # seed for the global random number generators

# Seed the global generators once, up front, so a fresh "Restart & Run All"
# reproduces the same numbers. The initial user vectors are drawn with
# np.random.choice further down in this notebook.
# NOTE(review): DataFrame.sample() calls without an explicit random_state are
# expected to follow the NumPy global state as well - confirm for the pandas
# version in use.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [3]:
# Load the complete vote log; the file is tab-separated, which is
# read_table's default delimiter
all_votes_df = pd.read_table('reddit_votes_data/44_million_reddit_votes/44_million_votes.txt')

In [4]:
# Load the metadata of every submission (tab-separated file) and key the
# resulting frame by SUBMISSION_ID
all_submissions_df = pd.read_table(
    'reddit_votes_data/submission_info/submission_info.txt'
).set_index('SUBMISSION_ID')

In [5]:
# get all submissions that belong to one of the specified subreddits
submissions_df = all_submissions_df[all_submissions_df['SUBREDDIT'].isin(SUBREDDIT_NAMES)]

# get all votes cast in the specified subreddits; the vote rows are matched
# against the 'r/'-prefixed form of each subreddit name
r_subreddit_names = [f'r/{NAME}' for NAME in SUBREDDIT_NAMES]
votes_df = all_votes_df[all_votes_df['SUBREDDIT'].isin(r_subreddit_names)]
# ... and keep only votes on submissions we have metadata for. This narrows the
# subreddit-filtered frame from the line above; starting again from
# all_votes_df here would silently throw that filter away.
votes_df = votes_df[votes_df['SUBMISSION_ID'].isin(submissions_df.index)]

In [6]:
# Apply the per-user and per-post minimum-vote thresholds from the parameters cell.
# NOTE(review): filter_to_multiple_votes is not defined in this notebook
# (presumably it comes from one of the star imports) - confirm what it returns.
votes_df = filter_to_multiple_votes(votes_df, min_usr_votes=MIN_VOTES_PER_USER, min_post_votes=MIN_VOTES_PER_POST)

In [7]:
# Show the size of votes_df after filtering
len(votes_df)

1174890

In [8]:
# Run the same threshold filter over the already-filtered votes and show the size again.
# NOTE(review): this repeats an earlier cell with identical arguments; the recorded
# output is the same size as before, so the second pass looks like a stability
# check rather than a required step - confirm before removing.
votes_df = filter_to_multiple_votes(votes_df, min_usr_votes=MIN_VOTES_PER_USER, min_post_votes=MIN_VOTES_PER_POST)
len(votes_df)

1174890

In [9]:
# every distinct username left in the filtered votes
# (all selected subreddits, not only r/politics)
pol_users = votes_df['USERNAME'].unique()

# per user: one random +1/-1 vector of length VECTOR_LENGTH and one all-zero
# vector of the same length
vectors = pd.Series([np.random.choice([1,-1],VECTOR_LENGTH) for _ in pol_users])
intermediary_vectors = pd.Series([np.zeros(VECTOR_LENGTH) for _ in pol_users])

# assemble the user table and index it by username
users_df = pd.DataFrame(
    {'USERS':pol_users,'VECTOR':vectors, 'INTERMEDIARY':intermediary_vectors}
).set_index('USERS')

In [10]:
# NOTE(review): reset_users is defined outside this notebook; its return value is
# not used here, so it presumably (re)initialises users_df in place - confirm.
reset_users(users_df)

In [11]:
# Form the user embeddings: N_GRAPH_ITERATIONS rounds, each an iterate_graph
# pass over the votes followed by a resolve_graph pass over the users
for graph_round in range(N_GRAPH_ITERATIONS):
    iterate_graph(votes_df, users_df)
    resolve_graph(users_df)

          35600 / 35608         

In [12]:
# Cluster the users into two groups with k-means - this is needed for testing
# Stack the per-user vectors into one 2-D array, one row per user
user_matrix = np.stack(users_df['VECTOR'].to_numpy())
kmeans = KMeans(n_clusters=2, random_state=0, n_init='auto').fit(user_matrix)
group_labels = kmeans.labels_

# Report how many users landed in each cluster
group_ids, group_counts = np.unique(group_labels, return_counts=True)
for position in range(len(group_ids)):
    print(f'{group_ids[position]}: {group_counts[position]}')

0: 4613
1: 8043


In [13]:
# Label each user with the name of its cluster - this is needed for testing
# Everyone starts as "NO_GROUP"; rows whose cluster label matches are overwritten
users_df['GROUP'] = "NO_GROUP"
for cluster_label, group_name in {0: 'GROUP_0', 1: 'GROUP_1'}.items():
    users_df.loc[group_labels == cluster_label, 'GROUP'] = group_name

In [14]:
# initialize the posts_df dataframe: one row per distinct submission, indexed by POST_ID
# .unique() returns the ids in order of first appearance, so the row order is the
# same on every run (a set has no defined order), which keeps later sampling and
# ranking reproducible.
posts_df = pd.DataFrame({'POST_ID': votes_df['SUBMISSION_ID'].unique()}).set_index('POST_ID')
# NOTE(review): reset_post_stats is defined outside this notebook; presumably it
# adds the per-post statistic columns in place - confirm.
reset_post_stats(posts_df)

In [15]:
# calculate some base statistics for each post
# NOTE(review): calculate_post_stats is defined outside this notebook; its return
# value is unused, so it presumably writes its results into posts_df - confirm.
calculate_post_stats(users_df, votes_df, posts_df)

In [16]:
# calculate the Neeka score for each post
# NOTE(review): the helper is defined outside this notebook; presumably it updates posts_df in place - confirm.
neeka_score_calculation(posts_df)

In [17]:
# calculate percentile rank
# restrict to posts where both GROUP1_X and GROUP0_X are present (non-NaN),
# and work on a copy so posts_df itself is left untouched
x_index = posts_df['GROUP1_X'].notna() & posts_df['GROUP0_X'].notna()
posts_x_df = posts_df.loc[x_index].copy()
calculate_percentile_rank(posts_x_df)

# Results

### Statistics

In [18]:
# Compute the test statistics on the posts that have both group X values, then print them
results = calculate_test_results(posts_x_df)
print_test_results(results)


    simple_agreement: 46.46 %
    neeka_agreement: 65.75 %
    agreement_change: 19.29 %

    simple_apolar: 33.83 %
    neeka_apolar: 54.69 %
    apolar_change: 20.86 %

    simple_neutrality: 66.34 %
    neeka_neutrality: 94.47 %
    neutrality_change: 28.13 %

    Overall:
    simple_quality: 48.87 %
    neeka_quality: 71.63 %
    quality_change: 22.76 %

    #######################################

    simple_group0_bias: 24.85 %
    simple_group1_bias: 58.52 %
    
    neeka_group0_bias: 45.41 %
    neeka_group1_bias: 50.94 %
    


### Most Promoted/Demoted ALL POSTS 

In [19]:
# Use every post (no sampling) for this listing
sample = posts_df
# submissions_df is passed alongside the posts; the printed output shows submission titles
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Apple and Samsung fined for deliberately slowing down phones"
2. "FBI: Nation-state actors have breached two US municipalities"
3. "Spain: prosecutors claim attack not rape as victim ‘did not fight back’ - Six men accused of sexual abuse of 14-year-old girl in abandoned factory in Manresa"
4. "Rapper Pop Smoke killed in home invasion"
5. "A former neighbor of Joe Biden's accuser Tara Reade has come forward to corroborate her sexual assault account, saying Reade discussed the allegations in detail in the mid-1990s"
6. "Not a conspiracy anymore"
7. "President Bernie Sanders would dismantle NSA spying. "Kids will grow up knowing that every damn thing that they do is going to be recorded somewhere in a file, and I think that will have a very Orwellian and inhibiting impact on our lives.""
8. "Vinnie Paul Dead: Pantera Drummer Dies at 54 | Billboard"
9. "Banksy says that most "Banksy" works ar

### Most Promoted/Demoted SAMPLE A

In [20]:
# Sample A: up to 1000 posts drawn without replacement.
# A fixed random_state makes this sample (and the list printed from it) the same
# on every run; min() avoids the ValueError .sample() raises when asked for more
# rows than posts_df contains.
sample = posts_df.sample(min(1000, len(posts_df)), random_state=1)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "French tourist prevents a bus with 21 passengers from plunging over a 100m (328ft) cliff in the Austrian Alps after the driver, 76, passed out. The passenger leapt from his seat as the vehicle crashed through the wooden roadside guardrail and applied the brake, leaving the bus hanging over the edge."
2. "FCC chairman warns about China's 'leverage' over NBA: Imagine what Beijing can do with 5G networks"
3. "Paul Manafort to be sentenced Thursday; faces 25 years in prison"
4. "Toys R Us yesterday opened its UK stores for a “quiet hour” designed to accommodate the needs of children with autism. Measures included dimmed fluorescent lighting and a ban on music and in-store announcements."
5. "Australia registers hottest day in recorded history. Outback towns become some of warmest places on Earth as report details season of unprecedented fire risk."
6. "Coca-Cola to cut 1,200 jobs as consumers

### Most Promoted/Demoted SAMPLE B

In [21]:
# Sample B: up to 1000 posts drawn without replacement.
# A fixed random_state makes this sample (and the list printed from it) the same
# on every run; min() avoids the ValueError .sample() raises when asked for more
# rows than posts_df contains.
sample = posts_df.sample(min(1000, len(posts_df)), random_state=2)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Trump refuses to rule out opponents being put to death for treason"
2. "Joe Biden’s coronavirus speech was far better than Trump’s"
3. "Ontario introduces 'pay transparency' bill to require all publicly advertised job postings to include a salary rate or range, bar employers from asking about past compensation and prohibit reprisal against employees who do discuss or disclose compensation"
4. "Malaysia government to stop palm oil expansion, keep 50% land as forest"
5. "Taiwan calls on China to share 'correct' virus information"
6. "I think it's pretty clear Reddit won't vote for Romney and Ryan. Instead of beating a dead horse, why don't we start looking into corrupt politics in the congress, which is where the real power is?"
7. "OK Boomers, Wake Up! — Sanders is the person you used to be but forgot about."
8. "Trump's food stamp cuts will strip lifeline from millions of Americans"
9. "U

### Most Promoted/Demoted SAMPLE C

In [22]:
# Sample C: up to 1000 posts drawn without replacement.
# A fixed random_state makes this sample (and the list printed from it) the same
# on every run; min() avoids the ValueError .sample() raises when asked for more
# rows than posts_df contains.
sample = posts_df.sample(min(1000, len(posts_df)), random_state=3)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Anger grows as Puerto Rico misses power restoration deadline"
2. "Oregon governor prepared to use executive powers to pass climate legislation after GOP walkout"
3. "Shep Smith Out at Fox News Amid Trump Tension"
4. "Paul Manafort to be sentenced Thursday; faces 25 years in prison"
5. "Protests rage in India against citizenship law amid restrictions"
6. "60 students without vaccines kept out of school in Fargo, West Fargo"
7. "Russia's top secret $155m Su-57 stealth fighter crashes just days before it is due to enter service"
8. "Activists call for protests Thursday to protect Mueller investigation"
9. "Google latest to withdraw from Saudi conference"
10. "Republicans Outnumbered by Independents for First Time in US History | But Voters Registered as Democrat Still Outnumber Both"
-
Most Promoted by **simple-consensus** over Neeka (most demoted by Neeka):
1. "NPR Poll: Majority Of America