In [1]:
import numpy as np
import pandas as pd
import random
from neeka_lib import *
from reddit_helper import *
from sklearn.cluster import KMeans

In [2]:
# Parameters
MIN_VOTES_PER_USER = 8   # passed to filter_to_multiple_votes as min_usr_votes
MIN_VOTES_PER_POST = 8   # passed to filter_to_multiple_votes as min_post_votes
N_GRAPH_ITERATIONS = 19  # rounds of iterate_graph + resolve_graph
DIV_WEIGHT = 1           # div_weight argument of neeka_score_calculation
CENT_WEIGHT = 1          # cent_weight argument of neeka_score_calculation
POL_WEIGHT = 0           # pol_weight argument of neeka_score_calculation
VECTOR_LENGTH = 300 # length of user/post embeddings
SUBREDDIT_NAMES = ['politics', 'news', 'worldnews']

# Seed the global random generators once, up front, so that a
# Restart-and-Run-All reproduces the same results. The initial user vectors
# are drawn with np.random.choice and the sample cells call DataFrame.sample
# without a random_state; both read NumPy's global generator. The value
# matches the random_state=0 already given to KMeans.
# The stdlib `random` module is imported by this notebook, so it is seeded as
# well in case the helper libraries draw from it (not visible from here).
RANDOM_SEED = 0
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

In [3]:
# Load the complete vote log (tab-separated text file).
all_votes_df = pd.read_csv(
    'reddit_votes_data/44_million_reddit_votes/44_million_votes.txt',
    sep='\t',
)

In [4]:
# Load the metadata of every submission (tab-separated text file) and key the
# table by submission id.
all_submissions_df = pd.read_csv(
    'reddit_votes_data/submission_info/submission_info.txt',
    sep='\t',
).set_index('SUBMISSION_ID')

In [5]:
# Keep only the submissions posted in the configured subreddits.
submissions_df = all_submissions_df[all_submissions_df['SUBREDDIT'].isin(SUBREDDIT_NAMES)]

# Keep only the votes cast in the configured subreddits. The vote table is
# matched against 'r/<name>' strings.
# NOTE(review): confirm the votes file really spells subreddits as 'r/<name>'.
r_subreddit_names = [f'r/{name}' for name in SUBREDDIT_NAMES]
votes_df = all_votes_df[all_votes_df['SUBREDDIT'].isin(r_subreddit_names)]

# Narrow the subreddit-filtered votes to the selected submissions. This step
# used to start again from all_votes_df, which silently discarded the
# subreddit filter applied on the line above.
votes_df = votes_df[votes_df['SUBMISSION_ID'].isin(submissions_df.index)]

In [6]:
# Apply the per-user and per-post minimum vote thresholds from the parameters
# cell; the filtered table replaces votes_df.
votes_df = filter_to_multiple_votes(
    votes_df,
    min_usr_votes=MIN_VOTES_PER_USER,
    min_post_votes=MIN_VOTES_PER_POST,
)

In [7]:
# Distinct usernames present in the filtered vote table.
pol_users = votes_df['USERNAME'].unique()
n_users = len(pol_users)

# Per-user state: a random +1/-1 start vector and an all-zero scratch vector,
# both of length VECTOR_LENGTH.
vectors = pd.Series([np.random.choice([1, -1], VECTOR_LENGTH) for _ in range(n_users)])
intermediary_vectors = pd.Series([np.zeros(VECTOR_LENGTH) for _ in range(n_users)])

# Assemble the user table and index it by username.
users_df = pd.DataFrame(
    {'USERS': pol_users, 'VECTOR': vectors, 'INTERMEDIARY': intermediary_vectors}
).set_index('USERS')

In [8]:
# reset_users comes from one of the star imports (neeka_lib / reddit_helper);
# what it does to users_df is not visible in this notebook.
# NOTE(review): presumably re-initialises the VECTOR / INTERMEDIARY columns
# created in the previous cell — confirm against the helper's source.
reset_users(users_df)

In [9]:
# Form the user embeddings: run N_GRAPH_ITERATIONS rounds, each consisting of
# an iterate_graph pass over the votes followed by a resolve_graph pass.
for iteration in range(N_GRAPH_ITERATIONS):
    iterate_graph(votes_df, users_df)
    resolve_graph(users_df)

          35600 / 35608         

In [10]:
# Split the users into two clusters with k-means - this is needed for testing
user_matrix = np.stack(users_df['VECTOR'].to_numpy())  # one row per user
kmeans = KMeans(n_clusters=2, random_state=0, n_init='auto')
kmeans.fit(user_matrix)
group_labels = kmeans.labels_

# Print the size of each cluster as "<label>: <count>".
group_ids, group_counts = np.unique(group_labels, return_counts=True)
for cluster_id, cluster_size in zip(group_ids, group_counts):
    print(f'{cluster_id}: {cluster_size}')

0: 4606
1: 8050


In [11]:
# Label every user with its cluster - this is needed for testing.
# Rows start as "NO_GROUP" and are overwritten for cluster labels 0 and 1.
users_df['GROUP'] = "NO_GROUP"
for cluster_label, group_name in ((0, 'GROUP_0'), (1, 'GROUP_1')):
    users_df.loc[group_labels == cluster_label, 'GROUP'] = group_name

In [12]:
# initialize the posts_df dataframe: one row per distinct submission that
# received votes, indexed by POST_ID.
# .unique() keeps first-appearance order, so the row order is identical on
# every run (and matches how the user list is built from USERNAME). The former
# list(set(...)) order can differ between interpreter sessions when the ids
# are strings, because string hashing is randomised per process; that made any
# later positional sampling of posts_df irreproducible.
posts_df = pd.DataFrame({'POST_ID': votes_df['SUBMISSION_ID'].unique()}).set_index('POST_ID')
reset_post_stats(posts_df)

In [13]:
# calculate some base statistics for each post
# The helper comes from the star imports and receives the user table, the vote
# table and the post table.
# NOTE(review): presumably fills per-post columns of posts_df in place (a
# later cell reads posts_df['GROUP0_X'] and posts_df['GROUP1_X']) — confirm.
calculate_post_stats(users_df, votes_df, posts_df)

In [14]:
# Calculate the Neeka score for each post, using the three weights defined in
# the parameters cell.
neeka_score_calculation(
    posts_df,
    div_weight=DIV_WEIGHT,
    cent_weight=CENT_WEIGHT,
    pol_weight=POL_WEIGHT,
)

In [15]:
# Percentile ranks are computed on a copy restricted to the posts whose
# GROUP0_X and GROUP1_X values are both present.
x_index = posts_df['GROUP1_X'].notna() & posts_df['GROUP0_X'].notna()
posts_x_df = posts_df.loc[x_index].copy()
calculate_percentile_rank(posts_x_df)

# Results

### Statistics

In [16]:
# Compute the test results from the percentile-ranked posts (posts_x_df) and
# print them. Both helpers come from the star imports, so the definition of
# each printed metric lives there, not in this notebook.
results = calculate_test_results(posts_x_df)
print_test_results(results)


    simple_agreement: 46.5 %
    neeka_agreement: 51.45 %
    agreement_change: 4.95 %

    simple_apolar: 33.87 %
    neeka_apolar: 38.24 %
    apolar_change: 4.36 %

    simple_neutrality: 66.34 %
    neeka_neutrality: 72.17 %
    neutrality_change: 5.84 %

    Overall:
    simple_quality: 48.9 %
    neeka_quality: 53.95 %
    quality_change: 5.05 %

    #######################################

    simple_group0_bias: 24.86 %
    simple_group1_bias: 58.52 %
    
    neeka_group0_bias: 29.29 %
    neeka_group1_bias: 57.12 %
    


### Most Promoted/Demoted ALL POSTS 

In [17]:
# Show the most promoted / demoted posts across the whole post table;
# submissions_df is passed alongside so the helper can look up submission info.
# NOTE(review): this uses posts_df, not the NaN-filtered posts_x_df that the
# statistics cell is computed from — confirm that is intended.
sample = posts_df
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Exclusive: Grand jury returns 16 felony counts against Jussie Smollett"
2. "Andrew Yang endorses Joe Biden: “The math says” the former VP is the presumptive nominee"
3. "Trump attacks protections for immigrants from ‘shithole’ countries in Oval Office meeting"
4. "Andrew Yang accuses NBC of turning off his mic during debate"
5. "The Great Republican Abdication | A party that no longer believes in American values."
6. "Trump’s lawyers seek to undercut Mueller’s Russia investigation"
7. "Tibetan students and government workers are banned from participating in religious observances, as Chinese authorities continue efforts to separate Tibetans from their cultural heritage and identity."
8. "ISIS leader al-Baghdadi believed to have been killed in a US military raid, sources say"
9. "Google found it was underpaying more men than women for similar jobs."
10. "Justin Trudeau brownface photo surfa

### Most Promoted/Demoted SAMPLE A

In [18]:
# SAMPLE A: a random subset of up to 1000 posts.
# The fixed random_state makes this cell print the same list on every run (the
# draw used to change on each execution); each sample cell uses its own value
# so the samples differ from one another. min() avoids the ValueError that
# DataFrame.sample raises when asked for more rows than the table holds.
sample = posts_df.sample(min(1000, len(posts_df)), random_state=1)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "D.C. Circuit sent complaints about Kavanaugh’s testimony to Chief Justice Roberts"
2. "Ugandan President wants to ban oral sex; says the 'mouth is for eating'"
3. "A college football fan's sign asking for beer money raised more than $1 million. He's giving it to charity"
4. "Journalist in Ghana who helped expose FIFA corruption shot dead"
5. "‘Leave Tanks for Red Square’: Trump’s July 4 Celebration Unsettles Military"
6. "Michael Avenatti arrested for alleged $20 million extortion attempt on Nike"
7. "Virginia teacher sues school after being fired for not using transgender student's pronouns"
8. "Trump sues Deutsche Bank and Capital One to block release of financial records"
9. "Apple and Samsung fined for deliberately slowing down phones"
10. "The Senate is yet again trying to pass CISPA."
-
Most Promoted by **simple-consensus** over Neeka (most demoted by Neeka):
1. "Mr. Impeached Prete

### Most Promoted/Demoted SAMPLE B

In [19]:
# SAMPLE B: a random subset of up to 1000 posts.
# The fixed random_state makes this cell print the same list on every run (the
# draw used to change on each execution); each sample cell uses its own value
# so the samples differ from one another. min() avoids the ValueError that
# DataFrame.sample raises when asked for more rows than the table holds.
sample = posts_df.sample(min(1000, len(posts_df)), random_state=2)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Trump attacks protections for immigrants from ‘shithole’ countries in Oval Office meeting"
2. "IRS, FBI Search Home of Baltimore Mayor Catherine Pugh, Baltimore City Hall"
3. "The UK, Australia, and US generally have the highest climate denial rates in the world. The big thing they have in common (aside from language) is a major Murdoch media presence"
4. "Paul Manafort earned $600,000 a month from pro-Russia party, says Ukrainian report"
5. "Major tech firms urge U.S. to retain net neutrality rules"
6. "Nigeria goes three years without a case of polio"
7. "R. Kelly turns himself in to Chicago police after being indicted on sexual abuse charges"
8. "Trump said on Monday he was worried that any statements under oath he provides Mueller could be used to bring perjury charges against him as part of the probe into Russia’s electoral interference."
9. "Dog waited weeks for owners at home burne

### Most Promoted/Demoted SAMPLE C

In [20]:
# SAMPLE C: a random subset of up to 1000 posts.
# The fixed random_state makes this cell print the same list on every run (the
# draw used to change on each execution); each sample cell uses its own value
# so the samples differ from one another. min() avoids the ValueError that
# DataFrame.sample raises when asked for more rows than the table holds.
sample = posts_df.sample(min(1000, len(posts_df)), random_state=3)
print_most_changed(sample, submissions_df)

Most Promoted by **Neeka Consensus** over simple-consensus (most demoted by simple-consensus):
1. "Andrew Yang endorses Joe Biden: “The math says” the former VP is the presumptive nominee"
2. "Justin Trudeau brownface photo surfaces ahead of Canada election"
3. "MMR vaccine does not cause autism, another study confirms"
4. "Kansas man wrongfully imprisoned for 23 years receives no compensation from state"
5. "Michael Jordan donates $2 million for Florence relief and recovery aid"
6. "Thousands arrive at Richmond rally to protest gun-control legislation"
7. "If I Didn't Know This Was All Rigged, I'd Think Brett Kavanaugh Is in Serious Trouble"
8. "LDS Church leaders didn't report man who confessed to making child porn, docs say"
9. "Journalist in Ghana who helped expose FIFA corruption shot dead"
10. "Houston Chronicle calls on O'Rourke to end White House bid, run for Senate"
-
Most Promoted by **simple-consensus** over Neeka (most demoted by Neeka):
1. "Joe Scarborough: The lies Trump 