# Introduction
This notebook uses a multi-generational simulation of the scientific process to explore how false publications can emerge in science.

In [22]:
import pandas as pd
import matplotlib.pylab as plt
import numpy as np
import scipy
from scipy.stats import beta, binom, entropy
import random
import json
import copy
import math
import pickle

# my modules
import scientist
import evaluation
import helper
import settings
import publisher

# global variables (simulation sizes; the functions below read these as module-level globals)
num_bins = 3  # number of bins tracked in the scientific record
num_draws = 10  # number of samples each participant draws
num_participants = 10  # number of participants created per generation
num_generations = 5  # number of generations simulated per experiment

In [23]:
# distribution of bins: every bin index in [0, num_bins) is assigned the same probability, 0.5
bins_to_probs = {bin_index: 0.5 for bin_index in range(num_bins)}

## Initialize participants

In [24]:
def make_participants(setting, alpha_value):
    """Create the participants for one generation.

    Args:
        setting: Name of the reporting setting; must be "rate", "data" or
            "subset".
        alpha_value: Forwarded to every ``scientist.Participant`` as ``alpha``.

    Returns:
        A list of ``num_participants`` (module-level global) newly created
        ``scientist.Participant`` objects, each holding its own
        ``settings.ReportingSetting`` instance.

    Raises:
        ValueError: If ``setting`` is not one of the recognised names.
    """
    valid_settings = ("rate", "data", "subset")
    if setting not in valid_settings:
        # An unknown setting used to leave `report_set` unassigned and only
        # failed later with a confusing UnboundLocalError inside the loop.
        raise ValueError(
            f"unknown reporting setting {setting!r}; expected one of {valid_settings}"
        )

    participants = []
    for _ in range(num_participants):
        # one ReportingSetting per participant (not shared), as in the original code
        report_set = settings.ReportingSetting(setting)
        participants.append(
            scientist.Participant(alpha=alpha_value, reporting_setting=report_set)
        )

    return participants

## Run an experiment

Run the multi-generational experiment for a given reporting setting and exaggeration value, under a given publishing policy.

In [25]:
def run_experiment(
    setting,
    alpha_value,
    rel_pl_data_val,
    rel_pl_surprise_val,
    rel_pl_bias_val,
    verbose=True,
):
    """Run one multi-generational experiment and score the final record.

    Reads the module-level globals ``num_bins``, ``num_draws``,
    ``num_generations`` and ``bins_to_probs``.

    Args:
        setting: Reporting setting name, forwarded to ``make_participants``.
        alpha_value: Forwarded to ``make_participants``.
        rel_pl_data_val: Forwarded to ``publisher.peer_review``.
        rel_pl_surprise_val: Forwarded to ``publisher.peer_review``.
        rel_pl_bias_val: Forwarded to ``publisher.peer_review``.
        verbose: When True (the default) print the per-generation record and
            scores, every sample, every bin choice and the final results.
            Pass False to silence all of that, e.g. inside a large sweep.

    Returns:
        A tuple ``(arm_score, entropy_score)``: the values of
        ``evaluation.arm_parameter_score`` and
        ``evaluation.total_entropy_score`` for the final scientific record.
    """
    # each experiment starts with a blank canon (1-1 prior: one 0 and one 1 per bin)
    scientific_record = {bin_num: {0: 1, 1: 1} for bin_num in range(num_bins)}

    for generation in range(num_generations):
        if verbose:
            print(f"\n* Generation {generation}...")
            helper.print_record(scientific_record, num_bins)
            print(f"   Arm score: {evaluation.arm_parameter_score(scientific_record, bins_to_probs)}")
            print(f"   Entropy score: {evaluation.total_entropy_score(scientific_record, bins_to_probs)}")

        # each generation gets an entirely new set of participants
        participants = make_participants(setting, alpha_value)

        # scientists explore and submit reports
        for participant in participants:
            # sample
            for _ in range(num_draws):
                bin_number, value = participant.sample(scientific_record, num_bins, bins_to_probs)
                if verbose:
                    print(f"   sample from bin {bin_number}: {value}")

            # choose the bin (always called, even when the choice is not printed)
            bin_choice = participant.choose_bin(scientific_record, num_bins, num_draws)
            if verbose:
                print(f"   chose bin {bin_choice}")

            # make a report
            participant.report(num_bins, num_draws)

        # the peer review board selects reports for publication and returns the updated scientific record
        scientific_record = publisher.peer_review(
            participants, scientific_record, rel_pl_data_val, rel_pl_surprise_val, rel_pl_bias_val
        )

    if verbose:
        print("\n\n* FINAL RESULTS")
        helper.print_record(scientific_record, num_bins)

    # Each final score is computed once, so the printed value is exactly the returned one.
    # final metric of how well scientists play the multi-armed bandit game
    arm_score = evaluation.arm_parameter_score(scientific_record, bins_to_probs)
    if verbose:
        print(arm_score)

    # final metric of how well scientists reduce the entropy of the scientific record
    entropy_score = evaluation.total_entropy_score(scientific_record, bins_to_probs)
    if verbose:
        print(entropy_score)

    return (arm_score, entropy_score)

In [76]:
# single demonstration run: "data" reporting, alpha 0, all three publishing-policy weights set to 1
run_experiment(
    setting="data",
    alpha_value=0,
    rel_pl_data_val=1,
    rel_pl_surprise_val=1,
    rel_pl_bias_val=1,
)


* Generation 0...
Scientific record
   bin 0: 1 zero(s), 1 one(s)
   bin 1: 1 zero(s), 1 one(s)
   bin 2: 1 zero(s), 1 one(s)

   Arm score: 0.0
   Entropy score: 0.0
   sample from bin 1: 1
   sample from bin 1: 1
   sample from bin 1: 1
   sample from bin 1: 1
   sample from bin 1: 0
   sample from bin 1: 0
   sample from bin 0: 1
   sample from bin 0: 0
   sample from bin 2: 1
   sample from bin 2: 0
   chose bin 1
   sample from bin 0: 0
   sample from bin 0: 0
   sample from bin 0: 1
   sample from bin 1: 1
   sample from bin 1: 0
   sample from bin 2: 1
   sample from bin 2: 1
   sample from bin 2: 1
   sample from bin 2: 1
   sample from bin 2: 0
   chose bin 2
   sample from bin 1: 0
   sample from bin 1: 1
   sample from bin 2: 1
   sample from bin 2: 0
   sample from bin 0: 1
   sample from bin 0: 0
   sample from bin 1: 1
   sample from bin 1: 1
   sample from bin 1: 0
   sample from bin 2: 1
   chose bin 2
   sample from bin 0: 0
   sample from bin 0: 0
   sample from bin 

(0.08433048433048433, 0.017126967327172747)

## Searching over the space of publishing policies

Search across the relative weights given to how much data supports a report, how surprising the report is, and publication bias.

In [26]:
# weight on the amount of supporting data: 21 evenly spaced values from 0 to 10
rel_pl_data = np.linspace(start=0, stop=10, num=21)

# weight on how surprising the data is: same 0-to-10 grid of 21 values
rel_pl_surprise = np.linspace(start=0, stop=10, num=21)

# rate of bump for publication bias (0.01 = 1% publication bias): 21 evenly spaced values from 0 to 1
rel_pl_bias = np.linspace(start=0, stop=1, num=21)

In [49]:
# Grid search over publishing policies: every (data, surprise, bias) weight
# combination is scored by averaging several independent experiments.
# Requires rel_pl_data / rel_pl_surprise / rel_pl_bias from the cell above, and
# run_experiment reads the current global num_bins / bins_to_probs, so the
# 3-bin globals must be the ones in effect for the file name below to be accurate.
publishing_policies_space = {}
exp_no = 0
runs_per_policy = 5  # number of independent runs averaged for each combination

for rel_pl_data_val in rel_pl_data:
    print(f"examining value: {rel_pl_data_val}")
    for rel_pl_surprise_val in rel_pl_surprise:
        for rel_pl_bias_val in rel_pl_bias:
            total_arm_score = 0
            total_entropy_score = 0

            for _ in range(runs_per_policy):
                arm_score, entropy_score = run_experiment("data", 0, rel_pl_data_val, rel_pl_surprise_val, rel_pl_bias_val)
                total_arm_score += arm_score
                total_entropy_score += entropy_score

            # value is [mean arm score, mean entropy score] for this policy
            key = (rel_pl_data_val, rel_pl_surprise_val, rel_pl_bias_val)
            publishing_policies_space[key] = [
                total_arm_score / runs_per_policy,
                total_entropy_score / runs_per_policy,
            ]
            exp_no += 1

# save the results; the context manager closes the file even if pickling fails
# NOTE(review): absolute, user-specific path - consider a configurable output directory
with open("/Users/marinamancoridis/Thesis/Thesis_Simulations/publishing_policies_3_bins.p", "wb") as results_file:
    pickle.dump(publishing_policies_space, results_file)

examining value: 0.0
examining value: 0.5
examining value: 1.0
examining value: 1.5
examining value: 2.0
examining value: 2.5
examining value: 3.0
examining value: 3.5
examining value: 4.0
examining value: 4.5
examining value: 5.0
examining value: 5.5
examining value: 6.0
examining value: 6.5
examining value: 7.0
examining value: 7.5
examining value: 8.0
examining value: 8.5
examining value: 9.0
examining value: 9.5
examining value: 10.0


In [52]:
# global variables for the 30-bin configuration
num_bins = 30  # bins in the scientific record
num_draws = 10  # samples per participant
num_participants = 10  # participants per generation
num_generations = 5  # generations per experiment

# distribution of bins: every bin is assigned the same probability, 0.5
bins_to_probs = dict.fromkeys(range(num_bins), 0.5)

In [21]:
# Grid search over publishing policies for the current bin configuration: every
# (data, surprise, bias) weight combination is scored by averaging several
# independent experiments.
# Requires rel_pl_data / rel_pl_surprise / rel_pl_bias to be defined already; on a
# fresh kernel, run the cell that builds those grids first or this cell raises
# NameError.
publishing_policies_space = {}
exp_no = 0
runs_per_policy = 5  # number of independent runs averaged for each combination

for rel_pl_data_val in rel_pl_data:
    print(f"examining value: {rel_pl_data_val}")
    for rel_pl_surprise_val in rel_pl_surprise:
        for rel_pl_bias_val in rel_pl_bias:
            total_arm_score = 0
            total_entropy_score = 0

            for _ in range(runs_per_policy):
                arm_score, entropy_score = run_experiment("data", 0, rel_pl_data_val, rel_pl_surprise_val, rel_pl_bias_val)
                total_arm_score += arm_score
                total_entropy_score += entropy_score

            # value is [mean arm score, mean entropy score] for this policy
            key = (rel_pl_data_val, rel_pl_surprise_val, rel_pl_bias_val)
            publishing_policies_space[key] = [
                total_arm_score / runs_per_policy,
                total_entropy_score / runs_per_policy,
            ]
            exp_no += 1

# save the results; the context manager closes the file even if pickling fails
# The f-string previously had no placeholder; the bin count in the file name now
# comes from num_bins (identical path, "..._30_bins.p", when num_bins is 30).
# NOTE(review): absolute, user-specific path - consider a configurable output directory
with open(f"/Users/marinamancoridis/Thesis/Thesis_Simulations/publishing_policies_{num_bins}_bins.p", "wb") as results_file:
    pickle.dump(publishing_policies_space, results_file)

NameError: name 'rel_pl_data' is not defined

## Fix a setting and look at how the evaluation metric changes over generations

In [35]:
# global variables for the per-generation analysis: 30 bins, 100 generations
num_bins = 30  # bins in the scientific record
num_draws = 10  # samples per participant
num_participants = 10  # participants per generation
num_generations = 100  # generations per experiment

# distribution of bins: every bin is assigned the same probability, 0.5
bins_to_probs = dict.fromkeys(range(num_bins), 0.5)

In [36]:
def run_experiment_gen_info(setting, alpha_value, rel_pl_data_val, rel_pl_surprise_val, rel_pl_bias_val):
    """Run one silent multi-generational experiment, tracking the entropy score per generation.

    Reads the module-level globals ``num_bins``, ``num_draws``,
    ``num_generations`` and ``bins_to_probs``.

    Args:
        setting: Reporting setting name, forwarded to ``make_participants``.
        alpha_value: Forwarded to ``make_participants``.
        rel_pl_data_val: Forwarded to ``publisher.peer_review``.
        rel_pl_surprise_val: Forwarded to ``publisher.peer_review``.
        rel_pl_bias_val: Forwarded to ``publisher.peer_review``.

    Returns:
        A 3-tuple ``(arm_score, entropy_score, kl_per_gen)``. The first two are
        ``evaluation.arm_parameter_score`` and ``evaluation.total_entropy_score``
        of the final record; ``kl_per_gen`` maps each generation index to
        ``evaluation.total_entropy_score`` of the record right after that
        generation's peer review.
    """
    # every bin starts from a 1-1 prior (one 0 and one 1)
    scientific_record = {bin_num: {0: 1, 1: 1} for bin_num in range(num_bins)}

    kl_per_gen = {}

    for generation in range(num_generations):
        # a brand-new cohort of participants for each generation
        cohort = make_participants(setting, alpha_value)

        # every scientist samples, settles on a bin, then writes a report
        for member in cohort:
            for _draw in range(num_draws):
                _bin_number, _value = member.sample(scientific_record, num_bins, bins_to_probs)

            _bin_choice = member.choose_bin(scientific_record, num_bins, num_draws)

            member.report(num_bins, num_draws)

        # peer review picks the reports to publish and hands back the updated record
        scientific_record = publisher.peer_review(
            cohort, scientific_record, rel_pl_data_val, rel_pl_surprise_val, rel_pl_bias_val
        )

        kl_per_gen[generation] = evaluation.total_entropy_score(scientific_record, bins_to_probs)

    final_arm_score = evaluation.arm_parameter_score(scientific_record, bins_to_probs)
    final_entropy_score = evaluation.total_entropy_score(scientific_record, bins_to_probs)
    return (final_arm_score, final_entropy_score, kl_per_gen)

In [37]:
# Sweep the shared bin probability and record, for each probability, the
# per-generation entropy score averaged over several independent runs.
values = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
num_repeats = 5  # number of independent runs averaged for each probability value

# maps each probability value -> {generation index: averaged entropy score}
final_map = {}

for value in values:
    print(value)
    # distribution of bins: run_experiment_gen_info reads this module-level global,
    # so it is rebound here on purpose (it stays at the last value after the loop)
    bins_to_probs = {bin_index: value for bin_index in range(num_bins)}

    average_KL_per_generation = {gen_no: 0 for gen_no in range(num_generations)}

    for _ in range(num_repeats):
        arm_score, entropy_score, kl_per_gen = run_experiment_gen_info("data", 0, 1, 1, 0)
        for gen_no, gen_score in kl_per_gen.items():
            average_KL_per_generation[gen_no] += gen_score

    # turn the per-generation sums into means
    for gen_no in average_KL_per_generation:
        average_KL_per_generation[gen_no] /= num_repeats

    final_map[value] = average_KL_per_generation

# save the results; the context manager closes the file even if pickling fails
# NOTE(review): absolute, user-specific path - consider a configurable output directory
with open("/Users/marinamancoridis/Thesis/Thesis_Simulations/final_map.p", "wb") as results_file:
    pickle.dump(final_map, results_file)

0
0.1
0.2
0.3
0.4
0.5
0.6
0.7
0.8
0.9
1
