# Introduction

This notebook explores how different simulation settings affect the frequency of false results being published in a pseudo-scientific setting. It demonstrates that several hypothesized effects emerge in single-generation simulations.

In [111]:
import seaborn as sns
import pandas as pd
import matplotlib.pylab as plt
import numpy as np
from scipy.stats import beta, binom
import random

# simulation-wide global variables
# number of bins a participant can draw from; bins are indexed 0 .. num_bins - 1
num_bins = 3
# number of draws each participant takes; also sizes the per-draw history table
num_draws = 10
# NOTE(review): not referenced by any code visible in this section — presumably the
# population size for a multi-participant simulation; confirm against later cells
num_participants = 100

###  Reporting Settings
A participant is in one of three settings that determine how they are allowed to report their data:
1. **Rate**: Pick a single bin and report the survival rate of its pill contents.
2. **Data**: Pick a single bin and report the total number of rats that died and rats that stayed alive
3. **Subset**: Pick a single bin and choose a set of data to publish

In [112]:
class ReportingSetting:
    """Names the kind of report a participant is allowed to publish.

    The only accepted names are "rate", "data" and "subset"; anything else is
    rejected with a ValueError when the object is constructed.
    """

    def __init__(self, name):
        """Store ``name`` after checking it is one of the permitted settings.

        Raises:
            ValueError: if ``name`` is not "rate", "data" or "subset".
        """
        permitted = {"rate", "data", "subset"}
        if name in permitted:
            self.name = name
        else:
            raise ValueError("Improper setting name")

### Participants
A participant implements a given strategy for how they gather data and a strategy for how they report data

In [113]:
class Participant:
    """A simulated participant who gathers draws, picks one bin, and reports on it.

    The participant delegates every decision to the strategy objects it is given:
    ``strategy_gather.draw(...)`` produces each sample, ``strategy_bin.choose_bin(...)``
    selects the bin to report, and ``strategy_report.report(...)`` produces the
    published result.
    """

    def __init__(self, strategy_gather, strategy_bin, strategy_report, reporting_setting):
        self.strategy_gather = strategy_gather                               # strategy to collect data
        self.strategy_bin = strategy_bin                                     # strategy to select bin to report
        self.strategy_report = strategy_report                               # strategy to report data in the chosen bin
        self.reporting_setting = reporting_setting                           # type of report a participant can make
        self.bin_sample_order = []                                           # order of bins sampled
        self.values_sampled = []                                             # values received across draws
        self.bin_choice = -1                                                 # the bin chosen to be reported
        # Stored on the instance (not a throwaway local) so the attribute can be
        # read safely before report() has been called.
        self.reported_results = None                                         # the results reported

    def sample(self):
        """Take one draw using the gathering strategy and record its bin and value.

        The strategy receives the number of draws taken so far plus the full
        record of bins and values sampled up to this point.
        """
        bin_number, value = self.strategy_gather.draw(
            len(self.values_sampled), self.bin_sample_order, self.values_sampled
        )
        self.bin_sample_order.append(bin_number)
        self.values_sampled.append(value)

    def choose_bin(self, bin_sample_order, values_sampled):
        """Ask the bin-choosing strategy which bin to report and store it in ``bin_choice``.

        The two parameters are kept for interface compatibility only: the strategy
        is always given this participant's own recorded draws, not the arguments.
        """
        self.bin_choice = self.strategy_bin.choose_bin(self.bin_sample_order, self.values_sampled)

    def report(self, alpha):
        """Produce the reported result for the chosen bin and store it in ``reported_results``.

        The values reported on are those recorded for ``bin_choice`` in the history
        entry for draw index ``num_draws - 1``. ``alpha`` is passed through to the
        reporting strategy unchanged.
        """
        history = get_full_history(self.bin_sample_order, self.values_sampled)
        bin_history = history[num_draws - 1][self.bin_choice]
        self.reported_results = self.strategy_report.report(self.reporting_setting.name, alpha, bin_history)

In [114]:
# returns a data structure that shows, on each draw, the values seen in each bin at that point
def get_full_history(bin_sample_order, values_sampled):
    """Return, for every draw, the values seen so far in each bin at that point.

    The result maps ``draw_number -> {bin_number: [values...]}``. Entry ``d`` holds
    the cumulative contents of every bin after draws ``0..d``. Draw indices beyond
    the last recorded sample keep empty lists for every bin (the state is not
    carried forward).

    The table always covers at least ``num_draws`` draws and grows when more
    samples than that were recorded, instead of failing with a KeyError.

    Args:
        bin_sample_order: bin index chosen on each draw, in order.
        values_sampled: value received on each draw, aligned with ``bin_sample_order``.

    Raises:
        KeyError: if a sampled bin index is not in ``range(num_bins)``.
        IndexError: if ``values_sampled`` is shorter than ``bin_sample_order``.
    """
    total_draws = max(num_draws, len(bin_sample_order))
    history = {
        draw_number: {bin_number: [] for bin_number in range(num_bins)}
        for draw_number in range(total_draws)
    }

    # Running per-bin tally; each draw stores an independent snapshot of it so
    # callers can mutate one draw's lists without affecting another's.
    running = {bin_number: [] for bin_number in range(num_bins)}
    for draw, bin_number in enumerate(bin_sample_order):
        running[bin_number].append(values_sampled[draw])
        history[draw] = {bin_num: seen[:] for bin_num, seen in running.items()}
    return history

# Hypothesized Participant Strategies

### Gathering Strategies
There are three hypothesized strategies that participants will use to gather data:
1. **Random Strategy**: Choose random bins
2. **Adaptive Strategy**: Choose random bins for the first half of draws, then choose the bins with the highest posterior probability of success
3. **Conservative Strategy**: Choose random bins for the first half of draws, then always sample from the most successful bin

In [115]:
class RandomGatheringStrategy():
    """Gathering strategy that picks a bin uniformly at random on every draw."""

    def __init__(self):
        """Nothing to configure; the strategy keeps no state."""

    def draw(self, draw_number, bin_sample_order, values_sampled):
        """Return a ``(bin_number, value)`` pair for one draw.

        The arguments describing previous draws are accepted but not used. The bin
        is a random integer in ``0 .. num_bins - 1`` and the value is a random
        choice between 0 and 1.
        """
        chosen_bin = random.randint(0, num_bins - 1)
        outcome = random.choice([0, 1])
        return chosen_bin, outcome

In [116]:
class AdaptiveGatheringStrategy():
    """Gathering strategy that explores randomly, then follows the best bin so far.

    While ``draw_number < num_draws/2`` the bin is random. Afterwards, on every
    draw, the bin with the highest observed proportion of 1s so far is chosen
    (re-evaluated each draw). Note this is the raw observed proportion, not a
    posterior estimate.
    """

    def __init__(self):
        """Nothing to configure; the strategy keeps no state."""

    def draw(self, draw_number, bin_sample_order, values_sampled):
        """Return a ``(bin_number, value)`` pair for the draw at ``draw_number``.

        Ties keep the lowest-numbered bin, and bin 0 is used when no sampled bin
        has a proportion above zero. The value is a random choice between 0 and 1.
        """
        still_exploring = draw_number < num_draws/2
        if still_exploring:
            chosen_bin = random.randint(0, num_bins - 1)
        else:
            # state of every bin after the most recent draw
            latest = get_full_history(bin_sample_order, values_sampled)[draw_number - 1]

            chosen_bin, top_rate = 0, 0
            for candidate, outcomes in latest.items():
                successes = outcomes.count(1)
                observed = successes + outcomes.count(0)
                if observed == 0:
                    # never sampled: cannot compete
                    continue
                if successes/observed > top_rate:
                    top_rate = successes/observed
                    chosen_bin = candidate

        outcome = random.choice([0, 1])
        return chosen_bin, outcome

In [117]:
class ConservativeGatheringStrategy():
    """Gathering strategy that explores randomly, then commits to one bin.

    While ``draw_number < num_draws/2`` the bin is random. On the first draw after
    that, the bin with the highest observed proportion of 1s is remembered in
    ``best_bin_at_halfway`` and every remaining draw samples from it.
    """

    def __init__(self):
        # -1 means "no bin has been committed to yet"
        self.best_bin_at_halfway = -1

    def _best_bin_so_far(self, draw_number, bin_sample_order, values_sampled):
        """Return the bin with the highest observed proportion of 1s before ``draw_number``.

        Ties keep the lowest-numbered bin; bin 0 is returned when no sampled bin
        has a proportion above zero (e.g. only negative results were seen).
        """
        latest = get_full_history(bin_sample_order, values_sampled)[draw_number - 1]
        best_bin = 0
        best_ratio = 0
        for bin_num, outcomes in latest.items():
            num_ones = outcomes.count(1)
            num_seen = num_ones + outcomes.count(0)
            # only bins that were actually sampled can become the best bin
            if num_seen != 0 and num_ones/num_seen > best_ratio:
                best_ratio = num_ones/num_seen
                best_bin = bin_num
        return best_bin

    def draw(self, draw_number, bin_sample_order, values_sampled):
        """Return a ``(bin_number, value)`` pair for the draw at ``draw_number``.

        The value is a random choice between 0 and 1.
        """
        # first half: random
        if draw_number < num_draws/2:
            # A first-half draw means a run is still exploring, so forget any bin
            # committed to in an earlier run; otherwise a reused strategy object
            # would hand the previous run's bin to the next one.
            self.best_bin_at_halfway = -1
            bin_number = random.randint(0, num_bins - 1)

        # second half: stick with the best bin at the halfway point
        else:
            if self.best_bin_at_halfway == -1:
                self.best_bin_at_halfway = self._best_bin_so_far(
                    draw_number, bin_sample_order, values_sampled
                )
            bin_number = self.best_bin_at_halfway

        value = random.choice([0, 1])
        return (bin_number, value)

### Bin-Choosing Strategies
We hypothesize that participants could use one of the following strategies for how to choose the single bin whose results they will be asked to report.
1. **Maximum Data**: Choose the bin for which you have collected the most data
2. **Maximum Success Rate**: Choose the bin for which you have the highest success rate

In [118]:
class MaximumDataChoosingStrategy():
    """Bin-choosing strategy that reports the bin with the most collected data."""

    def __init__(self):
        pass

    # bin for which you've collected the most data
    def choose_bin(self, bin_sample_order, values_sampled):
        """Return the bin holding the most values at draw index ``num_draws - 1``.

        Args:
            bin_sample_order: bin index chosen on each draw, in order.
            values_sampled: value received on each draw, aligned with ``bin_sample_order``.

        Returns:
            The bin number with the most values (lowest-numbered bin on ties), or
            -1 if every bin is empty.
        """
        # Use the data handed in by the caller rather than a notebook-level
        # `participant` variable, so the result does not depend on global state.
        history = get_full_history(bin_sample_order, values_sampled)
        final_state = history[num_draws - 1]
        bin_with_most_data = -1
        most_draws = 0

        for bin_num, outcomes in final_state.items():
            num_draws_in_bin = len(outcomes)
            if num_draws_in_bin > most_draws:
                most_draws = num_draws_in_bin
                bin_with_most_data = bin_num

        print(f"the most draws: {most_draws}")
        return(bin_with_most_data)

In [119]:
class MaximumSuccessChoosingStrategy():
    """Bin-choosing strategy that reports the bin with the highest success rate."""

    def __init__(self):
        """Nothing to configure; the strategy keeps no state."""

    def choose_bin(self, bin_sample_order, values_sampled):
        """Return the bin with the highest proportion of 1s at draw index ``num_draws - 1``.

        Bins that were never sampled are skipped. Ties keep the lowest-numbered
        bin, and bin 0 is returned when no bin has a proportion above zero.
        """
        final_state = get_full_history(bin_sample_order, values_sampled)[num_draws - 1]

        winner, winning_rate = 0, 0
        for candidate, outcomes in final_state.items():
            successes = outcomes.count(1)
            observed = successes + outcomes.count(0)
            if observed != 0 and successes/observed > winning_rate:
                winning_rate = successes/observed
                winner = candidate

        return winner

### Reporting Strategies
We hypothesize that the participants will report their results with some degree $\alpha$ of exaggeration. When $\alpha = 0$, this reduces to the strategy of reporting honest, unmanipulated results. When $\alpha = 1$, this reduces to the strategy of reporting maximum values.
- Softmax over the utility function?

In [132]:
class ReportingStrategy():
    """Turns the values observed in one bin into a (possibly exaggerated) report."""

    def __init__(self):
        pass

    def report(self, reporting_setting, alpha, bin_history):
        """Build the report for one bin under the given setting and exaggeration level.

        Args:
            reporting_setting: "rate", "data" or "subset".
            alpha: exaggeration level in [0, 1]; 0 reports the observed values as-is.
            bin_history: list of the 0/1 values observed in the bin being reported.

        Returns:
            - "rate": a float, the observed rate of 1s moved a fraction ``alpha`` of
              the way towards 1.
            - "data": ``{"0": n0, "1": n1}`` with the 0-count scaled by ``1 - alpha``
              and the 1-count scaled by ``1 + alpha``, each passed through ``round``.
            - "subset": ``{"0": n0, "1": n1}`` with the 0-count scaled by ``1 - alpha``
              (rounded) and the 1-count unchanged.

        Raises:
            ValueError: if ``alpha`` is outside [0, 1] (including NaN) or
                ``reporting_setting`` is not one of the three known names.
            ZeroDivisionError: in "rate" mode when ``bin_history`` contains no 0s or 1s.
        """
        # Validate first so a bad call fails before emitting any trace output.
        # The chained comparison also rejects NaN, which a pair of </> tests lets through.
        if not (0 <= alpha <= 1):
            raise ValueError("Alpha must be between 0 and 1")
        # An unknown setting used to fall through and silently return None.
        if reporting_setting not in ("rate", "data", "subset"):
            raise ValueError("Improper setting name")

        print("bin history:")
        print(bin_history)
        print(f"reporting setting: {reporting_setting}")
        num_zeros = bin_history.count(0)
        num_ones = bin_history.count(1)

        # overreport by a proportion of alpha of the remaining rate to get to a value of 1
        if reporting_setting == "rate":
            accurate_rate = num_ones / (num_ones + num_zeros)
            return(accurate_rate + alpha * (1 - accurate_rate))

        # overreport the number of '1's and underreport the number of '0's by a rate of alpha
        if reporting_setting == "data":
            num_reported_zeros = round(num_zeros * (1 - alpha))
            num_reported_ones = round(num_ones * (1 + alpha))
            return({"0": num_reported_zeros, "1": num_reported_ones})

        # "subset": remove (100 * alpha)% of the '0' results
        num_reported_zeros = round(num_zeros * (1 - alpha))
        return({"0": num_reported_zeros, "1": num_ones})

# Simulations

In [135]:
# Pick exactly one gathering strategy (alternatives are left commented out).
strat_gather = RandomGatheringStrategy()
# strat_gather = AdaptiveGatheringStrategy()
# strat_gather = ConservativeGatheringStrategy()

# Pick exactly one bin-choosing strategy.
# strat_bin = MaximumDataChoosingStrategy()
strat_bin = MaximumSuccessChoosingStrategy()

strat_report = ReportingStrategy()

# Pick exactly one reporting setting.
# report_set = ReportingSetting("rate")
# report_set = ReportingSetting("data")
report_set = ReportingSetting("subset")

participant = Participant(
    strategy_gather=strat_gather,
    strategy_bin=strat_bin,
    strategy_report=strat_report,
    reporting_setting=report_set,
)

# Gather all of this participant's samples.
for i in range(num_draws):
    participant.sample()

# Each summary prints a heading, the value, and a blank separator line.
print('Participant History', get_full_history(participant.bin_sample_order, participant.values_sampled), '', sep='\n')
print('Bins sampled, in order', participant.bin_sample_order, '', sep='\n')
print('Values Sampled, in order', participant.values_sampled, '', sep='\n')

# The heading goes out before choose_bin runs because some strategies print while choosing.
print('Bin Chosen')
participant.choose_bin(participant.bin_sample_order, participant.values_sampled)
print(participant.bin_choice, '', sep='\n')

# report() prints its own trace between the heading and the final result.
print("Reported results")
participant.report(0.5)
print(participant.reported_results)

Participant History
{0: {0: [1], 1: [], 2: []}, 1: {0: [1], 1: [], 2: [0]}, 2: {0: [1], 1: [], 2: [0, 0]}, 3: {0: [1], 1: [1], 2: [0, 0]}, 4: {0: [1], 1: [1], 2: [0, 0, 1]}, 5: {0: [1], 1: [1], 2: [0, 0, 1, 1]}, 6: {0: [1], 1: [1, 0], 2: [0, 0, 1, 1]}, 7: {0: [1], 1: [1, 0], 2: [0, 0, 1, 1, 1]}, 8: {0: [1], 1: [1, 0], 2: [0, 0, 1, 1, 1, 0]}, 9: {0: [1], 1: [1, 0], 2: [0, 0, 1, 1, 1, 0, 1]}}

Bins sampled, in order
[0, 2, 2, 1, 2, 2, 1, 2, 2, 2]

Values Sampled, in order
[1, 0, 0, 1, 1, 1, 0, 1, 0, 1]

Bin Chosen
0

Reported results
bin history:
[1]
reporting setting: subset
{'0': 0, '1': 1}
