In [7]:
import numpy as np
import pandas as pd
import seaborn as sn
import random
import copy

# Simulating amplification metrics under biased historical algorithm

Algorithmic amplification is difficult to define, partly because it is by nature a relative metric -- we need to define amplification with respect to some baseline. Here, I look at how measures of algorithmic amplification can be misleading if the baseline is itself the result of a previously used biased algorithm. Specifically, suppose we have a population of _n_ accounts, all of which are "identical". That is, at each time point, each account draws a value from a N(0,1) distribution and tweets it. Each of the other accounts prefers to see the _K_ accounts whose values are closest to its own, with a slight preference for accounts it already follows. At each time point, an account follows each account it has seen with probability _p_.  

I consider three types of algorithms for determining which content they see: (1) a "biased" algorithm, in which the first _num_preferred_ accounts (the "preferred users") get a boost in their ranking, (2) a "baseline" algorithm, in which every account sees content according to their known preferences as described above, and (3) reverse chron. 

Our goal is to measure how much "amplification" the biased algorithm has, which we define as the number of impressions given to the preferred users under the biased algorithm divided by the number of impressions given to the preferred users under reverse chron. 

What happens if the "biased" algorithm is used at first, and we then measure amplification going forward?
What happens if the "baseline" algorithm is used at first?

How does this change what we conclude about algorithmic amplification?

In [49]:
# Simulation parameters
n = 100               # number of accounts in the population
num_iter = 200        # number of simulated time steps
K = 10                # timeline length: accounts shown to each account per time step
follow_factor = 0.1   # ranking bonus given to accounts the viewer already follows
follow_prob = 0.05    # probability of following an account after seeing it
p = follow_prob       # alias for follow_prob (written _p_ in the text); kept in sync on purpose
num_preferred = 10    # the first num_preferred accounts are the "preferred users"

In [138]:
class simulation:
    """Toy follow-network simulation for studying amplification metrics.

    At every time step each of the ``n`` accounts draws a value and is shown a
    timeline of up to ``K`` other accounts chosen by one of three ranking
    rules ("baseline", "biased" or "reverse_chron").  Every account shown is
    recorded as an impression and is followed with probability ``follow_prob``.

    Attributes:
        n: number of accounts.
        follows: dict mapping account index -> list of followed account indices.
        impressions: dict mapping account index -> flat list of every account
            shown to it, in the order shown (one timeline appended per step).
    """

    _SIM_TYPES = ("baseline", "biased", "reverse_chron")

    def __init__(self, n):
        self.follows = {key: [] for key in range(n)}
        self.impressions = {key: [] for key in range(n)}
        self.n = n
        # Length of the most recent timeline produced by run_simulation; used
        # to translate "last_iters" into a number of impression rows.
        self._timeline_len = None

    # ------------------------------------------------------------------ #
    # ranking helpers
    # ------------------------------------------------------------------ #
    def _preference_scores(self, values, i, follow_factor):
        """Distance of every account's value from account ``i``'s value,
        reduced by ``follow_factor`` for accounts that ``i`` already follows
        (lower score = ranked earlier)."""
        follow_ind = np.zeros(self.n)
        follow_ind[self.follows[i]] = 1
        return np.abs(values[i] - values) - follow_factor * follow_ind

    @staticmethod
    def _ranked_candidates(scores, i):
        """All account indices except ``i``, ordered by ascending score
        (ties keep index order because ``sorted`` is stable)."""
        order = sorted(range(len(scores)), key=lambda sub: scores[sub])
        order.remove(i)
        return order

    def get_timeline_baseline(self, values, i, K, follow_factor):
        """Return the ``K`` accounts closest in value to account ``i``, with a
        bonus of ``follow_factor`` for accounts ``i`` already follows."""
        scores = self._preference_scores(values, i, follow_factor)
        return self._ranked_candidates(scores, i)[:K]

    def get_timeline_reverse_chron(self, values, i, K):
        """Return up to ``K`` accounts for ``i``: followed accounts first, in
        random order, padded with the closest not-yet-included accounts when
        ``i`` follows fewer than ``K`` accounts.

        The stored follow list is not reordered, and an account never appears
        twice in the same timeline.
        """
        # Work on a copy so that shuffling does not mutate self.follows[i].
        timeline = [j for j in self.follows[i] if j != i]
        random.shuffle(timeline)
        if len(timeline) < K:
            diff = np.abs(values[i] - values)
            # Exclude the viewer and everything already in the timeline so
            # that padding cannot introduce duplicate impressions.
            already_shown = set(timeline)
            already_shown.add(i)
            closest_first = sorted(range(len(diff)), key=lambda sub: diff[sub])
            timeline = timeline + [j for j in closest_first if j not in already_shown]
        return timeline[:K]

    def get_timeline_biased(self, values, i, K, follow_factor, num_preferred, preferred_factor):
        """Like :meth:`get_timeline_baseline`, but the first ``num_preferred``
        accounts additionally have ``preferred_factor`` subtracted from their
        score, which moves them up the ranking."""
        scores = self._preference_scores(values, i, follow_factor)
        scores[0:num_preferred] -= preferred_factor
        return self._ranked_candidates(scores, i)[:K]

    # ------------------------------------------------------------------ #
    # simulation loop
    # ------------------------------------------------------------------ #
    def run_simulation(self, num_iter, sim_type="baseline", K=10, follow_factor=0.1,
                       follow_prob=0.05, num_preferred=10, preferred_factor=0.2, mu=0):
        """Advance the simulation by ``num_iter`` time steps.

        Args:
            num_iter: number of time steps to simulate.
            sim_type: one of "baseline", "biased" or "reverse_chron".
            K: timeline length.
            follow_factor: ranking bonus for already-followed accounts.
            follow_prob: probability of following each account that is shown.
            num_preferred: number of leading accounts boosted by "biased".
            preferred_factor: size of that boost.
            mu: mean(s) of the normal draw; passed as ``loc`` to
                ``np.random.normal`` (scalar or length-``n`` array).

        Raises:
            ValueError: if ``sim_type`` is not a recognised algorithm name.
        """
        if sim_type not in self._SIM_TYPES:
            raise ValueError(
                "unknown sim_type %r; expected one of %s" % (sim_type, ", ".join(self._SIM_TYPES))
            )

        for _ in range(num_iter):
            # simulate values
            values = np.random.normal(size=self.n, loc=mu)
            for i in range(self.n):
                if sim_type == "baseline":
                    top_k = self.get_timeline_baseline(values, i, K, follow_factor)
                elif sim_type == "biased":
                    top_k = self.get_timeline_biased(values, i, K, follow_factor,
                                                     num_preferred, preferred_factor)
                else:  # "reverse_chron"
                    top_k = self.get_timeline_reverse_chron(values, i, K)

                self.impressions[i].extend(top_k)
                self._timeline_len = len(top_k)
                # One uniform draw per shown account decides whether it is followed.
                new_follows = [j for j in top_k if np.random.uniform() < follow_prob]
                self.follows[i] = list(set(self.follows[i]).union(new_follows))

    def erase_impression_history(self):
        """Forget all recorded impressions while keeping the follow graph."""
        self.impressions = {key: [] for key in range(self.n)}

    # ------------------------------------------------------------------ #
    # impression summaries
    # ------------------------------------------------------------------ #
    def _recent_impression_counts(self, K, last_iters):
        """Number of impressions each account received over the last
        ``K * last_iters`` impression rows (i.e. the last ``last_iters`` time
        steps when every timeline has length ``K``).  Returns a Series indexed
        by account; accounts with no impressions in the window are absent."""
        df = pd.DataFrame(self.impressions).tail(K * last_iters)
        return df.apply(lambda x: x.value_counts()).sum(axis=1)

    def barplot_impression_counts(self, K=10, last_iters=5):
        """Bar plot of recent impression counts per account."""
        num_impressions = self._recent_impression_counts(K, last_iters)
        sn.barplot(x=num_impressions.index.values, y=num_impressions.values)

    def histogram_impression_counts(self, K=10, last_iters=5):
        """Histogram of recent impression counts across accounts."""
        num_impressions = self._recent_impression_counts(K, last_iters)
        sn.histplot(num_impressions)

    def get_impressions_to_preferred(self, num_preferred=10, last_iters=5, K=None):
        """Total recent impressions received by the first ``num_preferred``
        accounts.

        Args:
            num_preferred: number of leading accounts counted as preferred.
            last_iters: number of most recent time steps to include.
            K: timeline length used to convert ``last_iters`` into rows.  When
                omitted, the length of the most recent timeline produced by
                :meth:`run_simulation` is used (10 if it has never been run),
                so the result no longer depends on a notebook-level global.
        """
        if K is None:
            recorded = getattr(self, "_timeline_len", None)
            K = recorded if recorded is not None else 10
        num_impressions = self._recent_impression_counts(K, last_iters)
        preferred_impressions = num_impressions[num_impressions.index < num_preferred].sum()
        return preferred_impressions
    

        

        

In [159]:
def calculate_amplification(n, init_iter, init_algo, final_iter, final_algo_num,
                            final_algo_denom, K=10, follow_factor=.1, follow_prob=0.05, num_preferred=10,
                            preferred_factor=0.2, mu=0, last_iters=5, return_all=False):
    """Estimate amplification of the preferred accounts.

    A follow network is first grown for ``init_iter`` steps under
    ``init_algo``.  Two simulations then continue from that same follow
    network for ``final_iter`` steps, one under ``final_algo_num`` and one
    under ``final_algo_denom``.  Amplification is the ratio of impressions
    received by the preferred accounts in the first to those in the second.

    Returns the ratio, or ``(numerator_sim, denominator_sim, ratio)`` when
    ``return_all`` is true.
    """
    # Settings shared by every run_simulation call below.
    run_settings = dict(K=K, follow_factor=follow_factor, follow_prob=follow_prob,
                        num_preferred=num_preferred, preferred_factor=preferred_factor, mu=mu)

    # Grow the historical follow network, then discard its impressions so only
    # the measurement phase is counted.
    numerator_sim = simulation(n)
    numerator_sim.run_simulation(init_iter, init_algo, **run_settings)
    numerator_sim.erase_impression_history()

    # Second simulation starts from an independent copy of the same follows.
    denominator_sim = simulation(n)
    denominator_sim.follows = copy.deepcopy(numerator_sim.follows)

    # Measurement phase: run both forward from the shared starting state.
    numerator_sim.run_simulation(final_iter, final_algo_num, **run_settings)
    denominator_sim.run_simulation(final_iter, final_algo_denom, **run_settings)

    preferred_numerator = numerator_sim.get_impressions_to_preferred(num_preferred, last_iters)
    preferred_denominator = denominator_sim.get_impressions_to_preferred(num_preferred, last_iters)
    amplification = preferred_numerator / preferred_denominator

    return (numerator_sim, denominator_sim, amplification) if return_all else amplification

In [156]:
# History generated by the biased algorithm; measure biased vs. reverse chron.
calculate_amplification(
    n,
    init_iter=50,
    init_algo="biased",
    final_iter=10,
    final_algo_num="biased",
    final_algo_denom="reverse_chron",
    preferred_factor=3,
    last_iters=5,
)

1.082995951417004

In [157]:
# History generated by the "baseline" algorithm; measure biased vs. reverse chron.
calculate_amplification(
    n,
    init_iter=50,
    init_algo="baseline",
    final_iter=10,
    final_algo_num="biased",
    final_algo_denom="reverse_chron",
    preferred_factor=3,
    last_iters=5,
)

9.875251509054326

In [158]:
# History generated by reverse chron; measure biased vs. reverse chron.
calculate_amplification(
    n,
    init_iter=50,
    init_algo="reverse_chron",
    final_iter=10,
    final_algo_num="biased",
    final_algo_denom="reverse_chron",
    preferred_factor=3,
    last_iters=5,
)

9.486973947895791

In [154]:
# Now let different accounts have different means (accounts near the middle, or
# around which there is clustering, have different preferences).
# Use 5 clusters of means, cycled over the n accounts.
cluster_centers = [-10, -5, 0, 5, 10]
loc = np.resize(cluster_centers, n)  # repeats the centres to length n, so this works for any n
mu = np.random.normal(size=n, loc=loc)
# Ascending sort: account index now follows the mean, so the first accounts
# (the ones the biased algorithm boosts) are those with the lowest means.
mu.sort()

sim_init, sim_final, amplification = calculate_amplification(
    n, 50, "biased", 1, "biased", "reverse_chron",
    preferred_factor=3, mu=mu, return_all=True,
)
amplification

1.134020618556701

In [155]:
# Same clustered means, but with history generated by the baseline algorithm.
calculate_amplification(
    n,
    init_iter=50,
    init_algo="baseline",
    final_iter=1,
    final_algo_num="biased",
    final_algo_denom="reverse_chron",
    preferred_factor=3,
    mu=mu,
)

2.2613636363636362