In [1]:
import numpy as np
import pandas as pd
import random
from neeka_lib import *
from reddit_helper import *
from sklearn.cluster import KMeans

In [2]:
# Parameters
# Thresholds handed to filter_to_multiple_votes as min_usr_votes / min_post_votes.
MIN_VOTES_PER_USER = 16
MIN_VOTES_PER_POST = 16
# Number of iterate_graph / resolve_graph passes run when forming user embeddings.
N_GRAPH_ITERATIONS = 3
VECTOR_LENGTH = 300 # length of user/post embeddings
# Subreddits whose submissions (and the votes on them) are kept for the analysis.
# Listed without the 'r/' prefix; the prefix is added where the votes table is filtered.
SUBREDDIT_NAMES = ['politics', 'news', 'worldnews']

In [3]:
# Load the complete tab-separated vote log (every subreddit, unfiltered).
all_votes_df = pd.read_csv(
    'reddit_votes_data/44_million_reddit_votes/44_million_votes.txt',
    sep='\t',
)

In [4]:
# Load the submission metadata (tab-separated) and key it by SUBMISSION_ID
# so later cells can look submissions up through the index.
all_submissions_df = pd.read_csv(
    'reddit_votes_data/submission_info/submission_info.txt',
    sep='\t',
).set_index('SUBMISSION_ID')

In [5]:
# Submissions that belong to the subreddits under study.
submissions_df = all_submissions_df[all_submissions_df['SUBREDDIT'].isin(SUBREDDIT_NAMES)]

# Votes cast in those subreddits AND on one of the submissions kept above.
# The original code computed the subreddit filter and then overwrote it by
# filtering `all_votes_df` a second time, so the first filter had no effect;
# both conditions are now applied together.
# NOTE(review): assumes the votes table stores subreddit names with an 'r/'
# prefix (as the prefixed list below implies) - confirm against the data.
r_subreddit_names = [f'r/{NAME}' for NAME in SUBREDDIT_NAMES]
in_target_subreddit = all_votes_df['SUBREDDIT'].isin(r_subreddit_names)
on_kept_submission = all_votes_df['SUBMISSION_ID'].isin(submissions_df.index)
votes_df = all_votes_df[in_target_subreddit & on_kept_submission]

In [6]:
# Apply the per-user / per-post vote-count thresholds from the parameters cell.
# filter_to_multiple_votes is a project helper; presumably it drops users and
# posts with fewer votes than the thresholds - verify in its source.
# NOTE(review): this overwrites votes_df with its own filtered version, so
# re-running this cell alone filters already-filtered data.
votes_df = filter_to_multiple_votes(votes_df, min_usr_votes=MIN_VOTES_PER_USER, min_post_votes=MIN_VOTES_PER_POST)

In [7]:
# Seed the global RNGs before drawing the initial user vectors. The original
# draw was unseeded, so the embeddings - and every number derived from them -
# changed on each run.
# NOTE(review): the project helpers may also draw from these global RNGs;
# seeding here would then make those draws repeatable as well - confirm.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# Every distinct user left in the filtered votes (all selected subreddits,
# not only r/politics).
pol_users = votes_df['USERNAME'].unique()
n_users = len(pol_users)

# One random +/-1 vector per user as the starting embedding, plus a zero
# vector of the same length in the INTERMEDIARY column.
vectors = pd.Series([np.random.choice([1, -1], VECTOR_LENGTH) for _ in range(n_users)])
intermediary_vectors = pd.Series([np.zeros(VECTOR_LENGTH) for _ in range(n_users)])

# Users table indexed by username.
users_df = pd.DataFrame(
    {'USERS': pol_users, 'VECTOR': vectors, 'INTERMEDIARY': intermediary_vectors}
).set_index('USERS')

In [8]:
# Project helper applied to the freshly built users table.
# NOTE(review): presumably re-initialises the per-user VECTOR / INTERMEDIARY
# columns in place before the graph iterations - confirm in the helper's source.
reset_users(users_df)

In [9]:
# Iterate graph to form user embeddings
# Each pass calls iterate_graph on the votes and users tables, then
# resolve_graph on the users table; the pair is repeated N_GRAPH_ITERATIONS times.
# NOTE(review): both helpers are assumed to update users_df in place (their
# return values are ignored here) - confirm in their source.
for _ in range(N_GRAPH_ITERATIONS):
    iterate_graph(votes_df, users_df)
    resolve_graph(users_df)

          17500 / 17547         

In [10]:
# Split users into two clusters with k-means on their embedding vectors -
# the cluster labels are what the later test statistics are grouped by.
user_matrix = np.stack(users_df['VECTOR'].to_numpy())  # one row per user
kmeans = KMeans(n_clusters=2, random_state=0, n_init='auto').fit(user_matrix)
group_labels = kmeans.labels_

# Report how many users landed in each cluster.
group_ids, group_counts = np.unique(group_labels, return_counts=True)
for cluster_id, cluster_size in zip(group_ids, group_counts):
    print(f'{cluster_id}: {cluster_size}')

0: 3158
1: 4800


In [11]:
# Translate the k-means labels into a GROUP column (needed for testing):
# label 0 -> 'GROUP_0', label 1 -> 'GROUP_1', anything else -> 'NO_GROUP'.
users_df['GROUP'] = np.select(
    [group_labels == 0, group_labels == 1],
    ['GROUP_0', 'GROUP_1'],
    default="NO_GROUP",
)

In [12]:
# Initialize the posts_df dataframe: one row per distinct submission that
# still has votes, indexed by POST_ID.
# `.unique()` keeps first-appearance order. The previous `list(set(...))`
# produced an order that varies between kernel sessions (string hashing is
# randomised per process), which made the row order - and anything sampled
# from it - unrepeatable.
posts_df = pd.DataFrame({'POST_ID': votes_df['SUBMISSION_ID'].unique()}).set_index('POST_ID')

# Project helper; presumably adds/clears the per-post statistic columns - confirm.
reset_post_stats(posts_df)

In [13]:
# calculate some base statistics for each post
# Project helper taking the users, votes and posts tables.
# NOTE(review): assumed to write its results into posts_df in place (the
# return value is ignored) - confirm in the helper's source.
calculate_post_stats(users_df, votes_df, posts_df)

In [14]:
# calculate the Neeka score for each post
# NOTE(review): project helper; assumed to add its score columns to posts_df
# in place (the return value is ignored) - confirm in the helper's source.
neeka_score_calculation(posts_df)

In [15]:
# Percentile ranks are computed only for posts where both GROUP0_X and
# GROUP1_X are present; rows missing either value are left out.
x_index = posts_df[['GROUP1_X', 'GROUP0_X']].notna().all(axis=1)
posts_x_df = posts_df.loc[x_index].copy()  # copy so the helper does not touch posts_df
calculate_percentile_rank(posts_x_df)

# Results

### Statistics

In [16]:
# Compute the test statistics on the posts that have both group X values
# (posts_x_df), then print them with the project helper.
results = calculate_test_results(posts_x_df)
print_test_results(results)


    simple_agreement: 49.66 %
    neeka_agreement: 60.24 %
    agreement_change: 10.58 %

    simple_apolar: 44.91 %
    neeka_apolar: 56.44 %
    apolar_change: 11.52 %

    simple_neutrality: 77.96 %
    neeka_neutrality: 85.93 %
    neutrality_change: 7.96 %

    Overall:
    simple_quality: 57.51 %
    neeka_quality: 67.54 %
    quality_change: 10.02 %

    #######################################

    simple_group0_bias: 27.72 %
    simple_group1_bias: 49.75 %
    
    neeka_group0_bias: 36.44 %
    neeka_group1_bias: 50.52 %
    


### Most Promoted/Demoted ALL POSTS 

In [17]:
# Use every post (no sampling) for this listing; submissions_df is passed
# alongside, presumably so the helper can look up post titles - confirm.
sample = posts_df
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "China will not change its position that Taiwan belongs to it and the world will only ever recognize that there is “one China”, Beijing said on Sunday after President Tsai Ing-wen won re-election and said she would not submit to China’s threats."
2. "Apple CEO becomes chairman of China university board"
3. "Silicon Valley giants accused of avoiding $100 billion in taxes"
4. "Pakistani professor sentenced to death for blasphemy"
5. "Jail video of Jeffrey Epstein’s first suicide attempt was deleted, prosecutors reveal"
6. "USA Powerlifting bans all trans women from competing as women"
7. "Crystal Geyser accidentally emailed the Chronicle its strategy to put a bottling plant in Randle, WA including an astroturf PR campaign and suing opposed neighbors "get them to the table." The company is now threatening to sue the paper for publishing."
8. "Fallout from Russia's mysterious missile disaster 

### Most Promoted/Demoted SAMPLE A

In [18]:
# SAMPLE A: up to 1000 posts drawn at random. A fixed random_state makes the
# draw repeatable, and the size is capped so a posts table smaller than 1000
# rows does not raise.
sample = posts_df.sample(n=min(1000, len(posts_df)), random_state=1)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "In an angry rant on social media, Brazilian President Jair Bolsonaro denied his connection to the murder of councilwoman and human rights activist Marielle Franco in Rio de Janeiro last year and threatened to cancel the license of the television network that aired the allegations."
2. "Missing Romanian teenager 'begged' police to 'stay on the line':"Please stay with me on the line, I'm really scared," Alexandra Macesanu told a police officer while crying during her third and last call, according to a transcript released on Facebook by her uncle"
3. "#PresidentCuomo trends as governor’s star status rises over coronavirus response"
4. "Martin Shkreli Placed in Solitary Confinement After Allegedly Running Company Behind Bars: Report"
5. "Firms withdraw from China on worsening business conditions"
6. "U.S. makes it harder to sue corporations over franchise wage law violations"
7. "Finland to 

### Most Promoted/Demoted SAMPLE B

In [19]:
# SAMPLE B: up to 1000 posts drawn at random. A fixed random_state (different
# from the other samples) makes the draw repeatable, and the size is capped
# so a posts table smaller than 1000 rows does not raise.
sample = posts_df.sample(n=min(1000, len(posts_df)), random_state=2)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "UN peacekeepers 'fathered hundreds of children in Haiti mission', report says"
2. "Christopher Tolkien son of J. R. R. Tolkien has died at 95"
3. "Women can swim topless in Barcelona as city rules it would be sex discrimination to to cover up"
4. "New Trove of Ukraine Documents Exposes 'Clear Paper Trail From Rudy Giuliani to the Oval Office to Secretary Pompeo': "We can see why Mike Pompeo has refused to release this information to Congress. The evidence is only going to get worse.""
5. "Prince Andrew Says He Doesn't Regret His 'Very Useful' Relationship With Jeffrey Epstein"
6. "Fort Worth police officer who fatally shot Atatiana Jefferson indicted on murder charge"
7. "California firefighters are heading to Australia to battle deadly brushfires on the ground"
8. "Private clinics in the UK are reselling COVID-19 test kits at £125 despite only costing $2 to manufacture"
9. "French Union 

### Most Promoted/Demoted SAMPLE C

In [20]:
# SAMPLE C: up to 1000 posts drawn at random. A fixed random_state (different
# from the other samples) makes the draw repeatable, and the size is capped
# so a posts table smaller than 1000 rows does not raise.
sample = posts_df.sample(n=min(1000, len(posts_df)), random_state=3)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Pakistani professor sentenced to death for blasphemy"
2. "Apple Safari browser sends some user IP addresses to Chinese conglomerate Tencent by default"
3. "The head of Interpol disappears whilst visiting China"
4. "French riot police 'spray tear gas' in faces of seated Extinction Rebellion activists"
5. "China Invents Rice That Can Grow in Salt Water, Can Feed Over 200 Million People"
6. "Berlin to freeze rents and give tenants rights to sue landlords after rising costs force residents out to suburbs - New law tackling rising prices sets cap for 1.5 million properties built before 2014"
7. "Almost 200 North Korean soldiers have reportedly died from coronavirus cases while thousands more are in quarantine. The government won't acknowledge they even exist."
8. "Prague revamp reveals Jewish gravestones used to pave streets: Dozens of paving stones made from Jewish headstones have been found 