In [1]:
import numpy as np
import pandas as pd
import random
from neeka_lib import *
from reddit_helper import *
from sklearn.cluster import KMeans

In [2]:
# Parameters
# Minimum vote counts passed to filter_to_multiple_votes as min_usr_votes / min_post_votes
MIN_VOTES_PER_USER: int = 16
MIN_VOTES_PER_POST: int = 16
# Number of iterate_graph/resolve_graph rounds run when forming the user embeddings
N_GRAPH_ITERATIONS: int = 19
VECTOR_LENGTH: int = 300 # length of user/post embeddings
# Subreddits whose submissions (and the votes on them) are kept for the analysis
SUBREDDIT_NAMES: list = ['politics', 'news', 'worldnews']

In [3]:
# Load the complete vote table from the tab-separated dump
all_votes_df = pd.read_csv(
    'reddit_votes_data/44_million_reddit_votes/44_million_votes.txt',
    sep='\t',
)

In [4]:
# Load the metadata of every submission and key the table by SUBMISSION_ID
all_submissions_df = pd.read_csv(
    'reddit_votes_data/submission_info/submission_info.txt',
    sep='\t',
).set_index('SUBMISSION_ID')

In [5]:
# Keep only the submissions that were posted in the configured subreddits
submissions_df = all_submissions_df[all_submissions_df['SUBREDDIT'].isin(SUBREDDIT_NAMES)]

# Keep only the votes that were cast in those subreddits AND on one of the kept submissions.
# The vote table is matched against 'r/<name>' labels, exactly as the original filter did.
# Both conditions are combined into one mask: previously the second filter was applied to
# all_votes_df again, which silently threw away the subreddit filter.
r_subreddit_names = [f'r/{name}' for name in SUBREDDIT_NAMES]
in_selected_subreddit = all_votes_df['SUBREDDIT'].isin(r_subreddit_names)
on_selected_submission = all_votes_df['SUBMISSION_ID'].isin(submissions_df.index)
votes_df = all_votes_df[in_selected_subreddit & on_selected_submission]

In [6]:
# Apply the per-user / per-post vote minimums from the parameters cell.
# NOTE(review): filter_to_multiple_votes comes from a star import; presumably it drops
# users and posts below the given minimums - confirm against its definition.
votes_df = filter_to_multiple_votes(
    votes_df,
    min_usr_votes=MIN_VOTES_PER_USER,
    min_post_votes=MIN_VOTES_PER_POST,
)

In [7]:
# Every distinct user that appears in the filtered vote table
pol_users = votes_df['USERNAME'].unique()
n_users = len(pol_users)

# Seed NumPy's global RNG so the random initial embeddings are the same after every
# Restart & Run All (this also fixes later draws that use NumPy's global RNG).
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

# One random +1/-1 start vector per user, plus an all-zero intermediary vector of the same length
vectors = pd.Series([np.random.choice([1, -1], VECTOR_LENGTH) for _ in range(n_users)])
intermediary_vectors = pd.Series([np.zeros(VECTOR_LENGTH) for _ in range(n_users)])

# Assemble the user table, indexed by user name
users_df = pd.DataFrame(
    {'USERS': pol_users, 'VECTOR': vectors, 'INTERMEDIARY': intermediary_vectors}
).set_index('USERS')

In [8]:
# NOTE(review): reset_users is defined outside this notebook (star import). Its return value is
# not used here; presumably it re-initializes the per-user columns of users_df in place - confirm.
reset_users(users_df)

In [9]:
# Form the user embeddings: every round runs one iterate_graph step over the votes
# and then one resolve_graph step on the user table.
for _round in range(N_GRAPH_ITERATIONS):
    iterate_graph(votes_df, users_df)
    resolve_graph(users_df)

          17500 / 17547         

In [10]:
# Group the users with k-means on their embedding vectors - this is needed for testing
user_matrix = np.stack(users_df['VECTOR'].to_numpy())  # one row per user
kmeans = KMeans(n_clusters=2, random_state=0, n_init='auto').fit(user_matrix)
group_labels = kmeans.labels_

# Print the size of each cluster as "<label>: <count>"
group_ids, group_counts = np.unique(group_labels, return_counts=True)
for cluster_id, cluster_size in zip(group_ids, group_counts):
    print(f'{cluster_id}: {cluster_size}')

0: 3247
1: 4711


In [11]:
# Label every user with its k-means cluster - this is needed for testing.
# Users whose label is neither 0 nor 1 keep the "NO_GROUP" placeholder.
users_df['GROUP'] = "NO_GROUP"
for cluster_label, group_name in ((0, 'GROUP_0'), (1, 'GROUP_1')):
    users_df.loc[group_labels == cluster_label, 'GROUP'] = group_name

In [12]:
# Initialize the posts_df dataframe with one row per distinct voted-on submission.
# Series.unique() keeps first-appearance order, so the row order is the same on every run;
# building the list from a set gave an arbitrary order that could change between kernel sessions.
posts_df = pd.DataFrame({'POST_ID': votes_df['SUBMISSION_ID'].unique()}).set_index('POST_ID')
# NOTE(review): reset_post_stats is an external helper; presumably it adds/clears the
# per-post statistic columns in place - confirm.
reset_post_stats(posts_df)

In [13]:
# Calculate some base statistics for each post.
# NOTE(review): calculate_post_stats is an external helper and its return value is not used,
# so presumably it writes its results into posts_df in place - confirm.
calculate_post_stats(users_df, votes_df, posts_df)

In [14]:
# Calculate the Neeka score for each post.
# NOTE(review): the return value is not used; presumably the score is stored on posts_df
# in place by the external helper - confirm.
neeka_score_calculation(posts_df)

In [15]:
# Calculate percentile rank, restricted to posts that have a value in both
# GROUP0_X and GROUP1_X (rows with a missing value in either column are left out).
x_index = posts_df['GROUP1_X'].notna() & posts_df['GROUP0_X'].notna()
posts_x_df = posts_df.loc[x_index].copy()  # independent copy; posts_df itself is untouched
calculate_percentile_rank(posts_x_df)

# Results

### Statistics

In [16]:
# Compute the test metrics on the posts that have values for both groups, then print them.
# (calculate_test_results / print_test_results are external helpers.)
results = calculate_test_results(posts_x_df)
print_test_results(results)


    simple_agreement: 49.79 %
    neeka_agreement: 61.8 %
    agreement_change: 12.01 %

    simple_apolar: 45.86 %
    neeka_apolar: 58.98 %
    apolar_change: 13.11 %

    simple_neutrality: 79.41 %
    neeka_neutrality: 90.19 %
    neutrality_change: 10.78 %

    Overall:
    simple_quality: 58.36 %
    neeka_quality: 70.32 %
    quality_change: 11.97 %

    #######################################

    simple_group0_bias: 28.75 %
    simple_group1_bias: 49.34 %
    
    neeka_group0_bias: 39.09 %
    neeka_group1_bias: 48.9 %
    


### Most Promoted/Demoted ALL POSTS 

In [17]:
# No sampling here: the "sample" is the full posts table.
# submissions_df is passed alongside it (the printed output lists submission titles).
sample = posts_df
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "USA Powerlifting bans all trans women from competing as women"
2. "China will not change its position that Taiwan belongs to it and the world will only ever recognize that there is “one China”, Beijing said on Sunday after President Tsai Ing-wen won re-election and said she would not submit to China’s threats."
3. "Apple CEO becomes chairman of China university board"
4. "Doctor treating Paris coronavirus patients says virus ‘less serious’ than SARS"
5. "Fallout from Russia's mysterious missile disaster suggests a nuclear reactor blew up"
6. "Huge Hong Kong protest against extradition bill"
7. "Verizon throttled 'unlimited' data of Calif. fire department during wildfire"
8. "Prince Andrew Says He Doesn't Regret His 'Very Useful' Relationship With Jeffrey Epstein"
9. "'Lying has become a norm': Hong Kong police falsely accused protesters of blocking ambulances, democrats say."
10. "A scien

### Most Promoted/Demoted SAMPLE A

In [18]:
# SAMPLE A: 1000 randomly chosen posts.
# A fixed random_state makes the draw repeatable; the index is sorted first so the draw
# does not depend on the row order of posts_df.
sample = posts_df.sort_index().sample(1000, random_state=0)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Joe Biden says he would consider a Republican for his running mate"
2. "Phoenix officer to be fired for pulling gun on parents over doll taken by 4-year-old"
3. "Video showing hundreds of shackled, blindfolded prisoners in China is 'genuine'"
4. "Teachers at school built on ‘toxic site’ have the same rare cancer"
5. "BBC News: Suspending Parliament was unlawful, court rules"
6. "Hong Kong Protesters Wearing LeBron James Masks at Recent Protests"
7. "Justice Dept. confirms Trump Jr. and McGahn did not testify to grand jury"
8. "CNN’s Jake Tapper Calls Out Joe Biden For Declining Interview Requests"
9. "Deutsche Bank Executive Who Signed Off On Trump Loans Kills Self At Age 55"
10. "Firms withdraw from China on worsening business conditions"
-
Most Promoted by **simple-consensus** over Neeka (most demoted by Neeka):
1. "'The Door Is Open, Come On In,' Sanders Says to Buttigieg and Klobuchar

### Most Promoted/Demoted SAMPLE B

In [19]:
# SAMPLE B: another 1000 randomly chosen posts.
# A fixed random_state makes the draw repeatable; the index is sorted first so the draw
# does not depend on the row order of posts_df.
sample = posts_df.sort_index().sample(1000, random_state=1)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Huge Hong Kong protest against extradition bill"
2. "Rampant Chinese cheating exposed at the Boston Marathon"
3. "Silicon Valley giants accused of avoiding $100 billion in taxes"
4. "Trump administration blames Iran for oil tanker attacks in Middle East"
5. "Protestors Rally Outside Blizzcon 2019 with 'Winnie the Pooh' Costumes"
6. "Video showing hundreds of shackled, blindfolded prisoners in China is 'genuine'"
7. "Michigan AG: If Trump 'fails to wear a mask, he's going to be asked not to return to any enclosed facility inside our state'"
8. "Three fired over Nazi salute photo with West Virginia corrections employees"
9. "Virus-hit Chinese city to shut public transport"
10. "Mike Bloomberg in 2011: ‘Enormous Cohort’ of Young Black and Latino Men ‘Don’t Know How to Behave in the Workplace’"
-
Most Promoted by **simple-consensus** over Neeka (most demoted by Neeka):
1. "Millennials support

### Most Promoted/Demoted SAMPLE C

In [20]:
# SAMPLE C: a third set of 1000 randomly chosen posts.
# A fixed random_state makes the draw repeatable; the index is sorted first so the draw
# does not depend on the row order of posts_df.
sample = posts_df.sort_index().sample(1000, random_state=2)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Verizon throttled 'unlimited' data of Calif. fire department during wildfire"
2. "The head of Interpol disappears whilst visiting China"
3. "Impeachment shows unelected government employees are heart of democracy"
4. "Trump administration blames Iran for oil tanker attacks in Middle East"
5. "Apple Safari browser sends some user IP addresses to Chinese conglomerate Tencent by default"
6. "Teachers at school built on ‘toxic site’ have the same rare cancer"
7. "McDonald's apologized after a restaurant in Guangzhou, China, refused to service black customers"
8. "This week's Arctic blast will be so cold, forecasters expect it to break more than 200 records across US"
9. "Chinese researcher accused of trying to smuggle vials of ‘biological material’ out of US hidden in a sock"
10. "Russian media mentions of Tulsi Gabbard are 200% more often positive than for any other US Democratic Candidate. 