In [1]:
import numpy as np
import pandas as pd
import random
from neeka_lib import *
from reddit_helper import *
from sklearn.cluster import KMeans

In [2]:
# Parameters
MIN_VOTES_PER_USER = 8  # passed to filter_to_multiple_votes as min_usr_votes
MIN_VOTES_PER_POST = 8  # passed to filter_to_multiple_votes as min_post_votes
N_GRAPH_ITERATIONS = 3  # number of iterate_graph / resolve_graph rounds
VECTOR_LENGTH = 300 # length of user/post embeddings
SUBREDDIT_NAMES = ['politics', 'news', 'worldnews']  # subreddits kept for the analysis

# Seed the global RNGs once, up front, so that a "Restart & Run All" produces
# the same random initial user vectors and the same random post samples.
RANDOM_SEED = 0
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [3]:
# Load the full, unfiltered vote table (tab-separated text file).
votes_path = 'reddit_votes_data/44_million_reddit_votes/44_million_votes.txt'
all_votes_df = pd.read_csv(votes_path, sep='\t')

In [4]:
# Load every submission record (tab-separated) and key the table by SUBMISSION_ID.
all_submissions_df = pd.read_csv(
    'reddit_votes_data/submission_info/submission_info.txt',
    sep='\t',
).set_index('SUBMISSION_ID')

In [5]:
# Keep only the submissions that belong to one of the configured subreddits.
submissions_df = all_submissions_df[all_submissions_df['SUBREDDIT'].isin(SUBREDDIT_NAMES)]

# Keep only the votes that (a) are tagged with one of those subreddits and
# (b) refer to a submission we actually have metadata for.
# The vote table is matched against the 'r/<name>' form of each subreddit name.
r_subreddit_names = [f'r/{NAME}' for NAME in SUBREDDIT_NAMES]
in_selected_subreddit = all_votes_df['SUBREDDIT'].isin(r_subreddit_names)
has_known_submission = all_votes_df['SUBMISSION_ID'].isin(submissions_df.index)
# Both conditions are combined in one mask; previously the second filter was
# applied to all_votes_df again, silently discarding the subreddit filter.
votes_df = all_votes_df[in_selected_subreddit & has_known_submission]

# Fail loudly if the two filters are incompatible (e.g. the subreddit naming
# in the vote file does not use the 'r/' prefix) instead of carrying an empty
# frame through the rest of the notebook.
assert not votes_df.empty, "no votes left after subreddit/submission filtering"

In [6]:
# Apply the minimum-vote thresholds from the parameters cell.
# NOTE(review): filter_to_multiple_votes comes from a star import; presumably it
# drops users/posts with fewer votes than the given minimums — confirm in the helper.
votes_df = filter_to_multiple_votes(votes_df, min_usr_votes=MIN_VOTES_PER_USER, min_post_votes=MIN_VOTES_PER_POST)

In [7]:
# Number of vote rows remaining after the filter above.
len(votes_df)

1174890

In [8]:
# Re-apply the same filter to the already-filtered votes and show the row count,
# so it can be compared with the count from the previous pass.
# NOTE(review): presumably a convergence check (removing sparse users can push
# posts below the threshold and vice versa) — confirm intent.
votes_df = filter_to_multiple_votes(votes_df, min_usr_votes=MIN_VOTES_PER_USER, min_post_votes=MIN_VOTES_PER_POST)
len(votes_df)

1174890

In [9]:
# Every distinct username present in the filtered votes. These come from all
# subreddits in SUBREDDIT_NAMES, not only politics, despite the variable name.
pol_users = votes_df['USERNAME'].unique()
n_users = len(pol_users)

# One random +1/-1 vector and one all-zero vector per user, each VECTOR_LENGTH long.
vectors = pd.Series([np.random.choice([1, -1], VECTOR_LENGTH) for _ in range(n_users)])
intermediary_vectors = pd.Series([np.zeros(VECTOR_LENGTH) for _ in range(n_users)])

# Assemble the user table and index it by username.
users_df = pd.DataFrame(
    {'USERS': pol_users, 'VECTOR': vectors, 'INTERMEDIARY': intermediary_vectors}
).set_index('USERS')

In [10]:
# NOTE(review): reset_users comes from a star import; presumably it
# (re)initialises the per-user columns of users_df in place (its return value
# is not used here) — confirm in the helper.
reset_users(users_df)

In [11]:
# Iterate graph to form user embeddings
# Runs N_GRAPH_ITERATIONS rounds; each round calls iterate_graph on the votes
# and users, then resolve_graph on the users.
# NOTE(review): both helpers presumably update users_df in place (return values
# are unused), so the call order within a round matters — confirm.
for _ in range(N_GRAPH_ITERATIONS):
    iterate_graph(votes_df, users_df)
    resolve_graph(users_df)

          35600 / 35608         

In [12]:
# Split the users into two clusters with k-means on their embedding vectors -
# the resulting groups are needed for testing.
user_matrix = np.stack(users_df['VECTOR'].to_numpy())  # one row per user
kmeans = KMeans(n_clusters=2, random_state=0, n_init='auto')
kmeans.fit(user_matrix)
group_labels = kmeans.labels_

# Report how many users landed in each cluster.
group_ids, group_counts = np.unique(group_labels, return_counts=True)
for grpid, grpcnt in zip(group_ids, group_counts):
    print(f'{grpid}: {grpcnt}')

0: 5009
1: 7647


In [13]:
# Label every user with its k-means cluster - this is needed for testing.
# Users whose label is neither 0 nor 1 keep the "NO_GROUP" placeholder.
users_df['GROUP'] = np.select(
    [group_labels == 0, group_labels == 1],
    ['GROUP_0', 'GROUP_1'],
    default="NO_GROUP",
)

In [14]:
# initialize the posts_df dataframe: one row per distinct submission that
# still has votes, indexed by POST_ID.
# Series.unique() keeps first-appearance order, so the row order is the same on
# every run. Building the list from a set() made the order depend on Python's
# per-process string hashing, which in turn made later posts_df.sample() calls
# irreproducible even with a fixed seed.
posts_df = pd.DataFrame({'POST_ID': votes_df['SUBMISSION_ID'].unique()}).set_index('POST_ID')
# NOTE(review): reset_post_stats comes from a star import; presumably it adds
# the (empty) statistic columns to posts_df in place — confirm in the helper.
reset_post_stats(posts_df)

In [15]:
# calculate some base statistics for each post
# NOTE(review): the return value is unused, so the helper presumably writes its
# results into posts_df in place — confirm.
calculate_post_stats(users_df, votes_df, posts_df)

In [16]:
# calculate the Neeka score for each post
# NOTE(review): the return value is unused, so the helper presumably stores the
# score on posts_df in place — confirm.
neeka_score_calculation(posts_df)

In [17]:
# calculate percentile rank, using only the posts for which neither GROUP0_X
# nor GROUP1_X is missing
x_index = posts_df['GROUP1_X'].notna() & posts_df['GROUP0_X'].notna()
# Work on an independent copy so the ranking helper does not touch posts_df.
posts_x_df = posts_df.loc[x_index].copy()
calculate_percentile_rank(posts_x_df)

# Results

### Statistics

In [18]:
# Compute the test metrics on the posts that have both group values, then
# print them.
results = calculate_test_results(posts_x_df)
print_test_results(results)


    simple_agreement: 48.84 %
    neeka_agreement: 60.44 %
    agreement_change: 11.6 %

    simple_apolar: 35.02 %
    neeka_apolar: 47.52 %
    apolar_change: 12.5 %

    simple_neutrality: 68.79 %
    neeka_neutrality: 80.62 %
    neutrality_change: 11.83 %

    Overall:
    simple_quality: 50.88 %
    neeka_quality: 62.86 %
    quality_change: 11.97 %

    #######################################

    simple_group0_bias: 26.89 %
    simple_group1_bias: 58.1 %
    
    neeka_group0_bias: 37.97 %
    neeka_group1_bias: 57.35 %
    


### Most Promoted/Demoted ALL POSTS 

In [19]:
# Use every post (no sub-sampling) for the most promoted/demoted listing.
sample = posts_df
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Putin approves law targeting journalists as 'foreign agents'"
2. "The Latest: US plan to allow prescription drugs from Canada"
3. "Canada's Bianca Andreescu defeats Serena Williams to win U.S. Open"
4. "'Fallacy' to say Amazon is the heritage of humankind, Brazil's Bolsonaro tells UN climate forum"
5. "The Pentagon awarded Lockheed Martin $946 million on behalf of the Kingdom of Saudi Arabia for the defense giants’ THAAD missile defense system"
6. "Virginia teacher sues school after being fired for not using transgender student's pronouns"
7. "FBI: Nation-state actors have breached two US municipalities"
8. "Mexico’s El Chapo sentenced to life in prison"
9. "Apple and Samsung fined for deliberately slowing down phones"
10. "Australian police find $200 million of meth hidden inside Sriracha bottles"
-
Most Promoted by **simple-consensus** over Neeka (most demoted by Neeka):
1. "Greta Thunb

### Most Promoted/Demoted SAMPLE A

In [20]:
# SAMPLE A: a random subset of up to 1000 posts.
# random_state pins the draw so re-running this cell shows the same posts;
# min() keeps the cell from raising when fewer than 1000 posts are available.
sample = posts_df.sample(n=min(1000, len(posts_df)), random_state=0)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Mormon Senator goes ahead with medical marijuana bill despite his church's objection."
2. "North Korea Leader Kim-Jong Il has died"
3. "Newly found video evidence may indicate turkish coup was facade"
4. "Police are using unconstitutional "Geofence Warrants" to gather location data of anyone who was near the location of a crime"
5. "Australia's PM abused by community members of NSW town hit by bushfires"
6. "Nine Years After Citizens United, Calls to Overturn 'Horrendous' Decision and Pass Pro-Democracy HR1 - "Until Citizens United is overturned, the corporate oligarchy will maintain the power to block the policies favored by the majority of Americans.""
7. "NSA whistleblower Edward Snowden nominated for Nobel Peace Prize"
8. "FBI admits Patriot Act snooping powers didn't crack any major terrorism cases."
9. "A college football fan's sign asking for beer money raised more than $1 million.

### Most Promoted/Demoted SAMPLE B

In [21]:
# SAMPLE B: a second random subset of up to 1000 posts.
# random_state pins the draw so re-running this cell shows the same posts;
# min() keeps the cell from raising when fewer than 1000 posts are available.
sample = posts_df.sample(n=min(1000, len(posts_df)), random_state=1)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Elderly man attacked, humiliated while collecting cans in San Francisco neighborhood"
2. "Iceland sentences 26 corrupt bankers to 74 years in prison"
3. "Trump tweets altered video of Ilhan Omar belittling 9/11"
4. "A Prison Guard Drove A Truck Through A Group Of Jewish ICE Protesters, Injuring Several"
5. "Queensland government was warned about risks of Chinese company's water extraction. Exclusive: Concerns about groundwater security at Cherrabah overridden by Campbell Newman’s government"
6. "Hong Kong protests: second car rams protesters as teargas deployed"
7. "Coca-Cola will not ditch single-use plastic bottles because consumers still want them, the firm's head of sustainability told the BBC"
8. "LeBron James: Daryl Morey was 'misinformed' about the situation in Hong Kong"
9. "Founder of Bob's Red Mill Natural Foods transfers business to employees"
10. "A rare protest has been held 

### Most Promoted/Demoted SAMPLE C

In [22]:
# SAMPLE C: a third random subset of up to 1000 posts.
# random_state pins the draw so re-running this cell shows the same posts;
# min() keeps the cell from raising when fewer than 1000 posts are available.
sample = posts_df.sample(n=min(1000, len(posts_df)), random_state=2)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Virginia teacher sues school after being fired for not using transgender student's pronouns"
2. "Chinese schools have begun enforcing "smart uniforms" embedded with computer chips to monitor student movements and prevent them from skipping classes. As students enter the school, the time and date is recorded along with a short video that parents can access via a mobile app."
3. "Magic mushroom ingredient psilocybin could be key to treating depression - studies | Immediate reduction in depression and anxiety for up to eight months seen in patients with advanced cancer given a single dose of psilocybin"
4. "Hong Kong protests: 70-year-old man hit by brick during clashes in Sheung Shui dies in hospital"
5. "Air Asia flight loses contact with air traffic control"
6. "Boeing ex-CEO is denied severance, forfeits at least $44 million"
7. "Pelosi Statement on Timing of Appointment of Impeachment M