In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [2]:
def _plot_bandit_choices(df, run, bandit, experiment, experiment_dir):
    """Plot, save and show the choices one bandit made during one run, then release the figure."""
    fig = plt.figure()
    plt.plot(df["Run "+str(run)+", Choices "+str(bandit)])
    plt.title(str(experiment)+" Choices for Bandit "+str(bandit))
    plt.ylabel("Choice: 0=stay silent, 1=betray")
    plt.xlabel("Time step")
    # The file name (no space after the run number) is kept unchanged so that
    # re-running overwrites earlier images instead of creating a second set
    plt.savefig(os.path.join(experiment_dir, "Run "+str(run)+"Choices for Bandit "+str(bandit)+".png"))
    plt.show()
    # Close this figure straight away; otherwise 2*num_runs figures per experiment stay open
    plt.close(fig)


def _windowed_means(rewards, window_over, step):
    """Return the mean of every full window of `window_over` rows, with window starts `step` rows apart."""
    # +1 so that a window ending exactly on the last row is included
    last_start = len(rewards) - window_over
    return [rewards.iloc[t:t+window_over].sum()/window_over for t in range(0, last_start+1, step)]


def process_array_data(path, num_runs=20, window_over=5, step=3, plot_results=False):
    """Smooth the reward data of every experiment under `path` and optionally plot the bandits' choices.

    Each sub-folder of `path` that contains an "array data.csv" file is treated as one experiment;
    any other entry (stray files, folders without that CSV) is skipped. The CSV is expected to hold
    the columns "Run <r>, Choices <b>" and "Run <r>, Rewards <b>" for r in range(num_runs), b in {0, 1}.

    For each experiment the windowed reward means of all runs are written once to
    "Windowed reward data, window size=<window_over>, step=<step>.csv" inside the experiment folder,
    with columns "Run <r>, Windowed Rewards <b>".

    Parameters
    ----------
    path : str
        Directory holding one sub-folder per experiment (with or without a trailing slash).
    num_runs : int
        Number of runs to read from each experiment's CSV.
    window_over : int
        Number of consecutive time steps averaged in each window.
    step : int
        Distance, in time steps, between the starts of consecutive windows.
    plot_results : bool
        If True, also plot, save (as PNG in the experiment folder) and show each bandit's choices per run.

    Raises
    ------
    ValueError
        If `window_over` or `step` is smaller than 1.
    """
    if window_over < 1 or step < 1:
        raise ValueError("window_over and step must both be at least 1")

    # Echo the directory being processed
    print(path)

    # Case-insensitive name order makes the processing order independent of the filesystem
    for file in sorted(os.listdir(path), key=lambda name: (name.casefold(), name)):
        experiment_dir = os.path.join(path, file)
        data_file = os.path.join(experiment_dir, "array data.csv")
        if not os.path.isfile(data_file):
            # Not an experiment folder (e.g. a stray file or a checkpoint directory)
            continue

        df = pd.read_csv(data_file, index_col=0)
        print("\n\n____"+str(file)+"____")

        # Collect every column first and build the frame once, instead of inserting columns one by one
        windowed = {}
        for run in range(num_runs):
            if plot_results:
                for bandit in (0, 1):
                    _plot_bandit_choices(df, run, bandit, file, experiment_dir)

            # Average the rewards over windows to smooth out the data and make patterns easier to see
            for bandit in (0, 1):
                rewards = df["Run "+str(run)+", Rewards "+str(bandit)]
                windowed["Run "+str(run)+", Windowed Rewards "+str(bandit)] = _windowed_means(rewards, window_over, step)

        # Save the windowed rewards once per experiment (nothing is written when num_runs is 0)
        if windowed:
            out_name = "Windowed reward data, window size="+str(window_over)+", step="+str(step)+".csv"
            pd.DataFrame(windowed).to_csv(os.path.join(experiment_dir, out_name))

In [3]:
def get_avg_cumulative_rewards(path, num_runs=20):
    """Print each bandit's cumulative reward, averaged over runs, for every experiment under `path`.

    Each sub-folder of `path` that contains an "array data.csv" file is treated as one experiment;
    any other entry is skipped. For every experiment the columns "Run <r>, Rewards 0" and
    "Run <r>, Rewards 1" are summed for r in range(num_runs) and the totals are divided by `num_runs`.

    Parameters
    ----------
    path : str
        Directory holding one sub-folder per experiment (with or without a trailing slash).
    num_runs : int
        Number of runs to read from each experiment's CSV.
    """
    # Case-insensitive name order makes the report order independent of the filesystem
    for file in sorted(os.listdir(path), key=lambda name: (name.casefold(), name)):
        data_file = os.path.join(path, file, "array data.csv")
        if not os.path.isfile(data_file):
            # Not an experiment folder (e.g. a stray file or a checkpoint directory)
            continue

        df = pd.read_csv(data_file, index_col=0)
        print("\n____"+str(file)+"____")

        # Total reward per run, accumulated over runs, then averaged
        averages = []
        for bandit in (0, 1):
            total = sum(df["Run "+str(run)+", Rewards "+str(bandit)].sum() for run in range(num_runs))
            averages.append(total / float(num_runs))

        print("Average cumulative reward for", file)
        print("\tBandit 0:", averages[0])
        print("\tBandit 1:", averages[1])

In [4]:
# Reference totals to compare the averaged cumulative rewards against
print("Expected reward in Nash equilibrium (double betrayal):", 200 * -2)
print("Expected reward if learner always betrays but random is exactly 50:50:", (0 - 2) * 200 / 2)

Expected reward in Nash equilibrium (double betrayal): -400
Expected reward if learner always betrays but random is exactly 50:50: -200.0


This means that, most of the time, both learning bandits chose to betray, with little variation across hyperparameter choices. Given the simplicity of the setup, neither learner should need much exploration to learn the optimal long-term strategy, which is why both learners performed best when their exploration hyperparameters were small.

However, in order to exhibit cooperative behaviour, I expect that the learners will need to be slightly more willing to explore. That is why I have set the value of e to be 0.15 for the eGreedy bandit and the value of c to be 1 for the UCB bandit.

In [5]:
def get_avg_betrayal_proportion(path, num_runs=20):
    """Print, for every experiment under `path`, the fraction of time steps on which each bandit betrayed.

    Each sub-folder of `path` that contains an "array data.csv" file is treated as one experiment;
    any other entry is skipped. For each bandit the columns "Run <r>, Choices <b>" are summed for
    r in range(num_runs) and divided by (rows in the CSV * num_runs). The two proportions are computed
    independently per bandit; this is not the proportion of steps on which both betrayed together.
    The sum equals the number of betrayals only if choices are coded 0 = stay silent, 1 = betray.

    Parameters
    ----------
    path : str
        Directory holding one sub-folder per experiment (with or without a trailing slash).
    num_runs : int
        Number of runs to read from each experiment's CSV.
    """
    # Case-insensitive name order makes the report order independent of the filesystem
    for file in sorted(os.listdir(path), key=lambda name: (name.casefold(), name)):
        data_file = os.path.join(path, file, "array data.csv")
        if not os.path.isfile(data_file):
            # Not an experiment folder (e.g. a stray file or a checkpoint directory)
            continue

        # Read in data
        df = pd.read_csv(data_file, index_col=0)
        print(file)

        # Every column of the frame has the same number of rows: one per time step
        total_steps = len(df) * num_runs

        proportions = []
        for bandit in (0, 1):
            betrayals = sum(df["Run "+str(run)+", Choices "+str(bandit)].sum() for run in range(num_runs))
            proportions.append(betrayals / total_steps)

        print(proportions[0], proportions[1])
        print()

In [None]:
# Window the reward data of every experiment under the Round Robin folder (100 runs each; plotting left at its default, off)
process_array_data("./Round Robin experiments 100 Runs/", num_runs=100)

In [None]:
# Print each bandit's cumulative reward, averaged over 100 runs, for every Round Robin experiment
get_avg_cumulative_rewards("./Round Robin experiments 100 Runs/", num_runs=100)

In [None]:
# Get betrayal proportion data for all matchups in the round robin
# (the path is accepted with or without a trailing slash)
get_avg_betrayal_proportion("./Round Robin experiments 100 Runs", num_runs=100)

In [None]:
# Compare betrayal proportions and average reward for all 7 types of bandit
# NOTE(review): only four hard-coded values are plotted, and the figure has no title, axis labels or
# bandit names; where these numbers come from is not shown in this notebook — confirm before relying on it.
plt.plot([49.79625,62.7725,62.846875,74.30687])

In [6]:
# Window the reward data of every experiment under "./No-regret testing/" (100 runs each; plotting left at its default, off)
process_array_data("./No-regret testing/", num_runs=100)

./No-regret testing/


____No-regret bandit vs eGreedy bandit e=0.15, T=200, 100 episodes____


____No-regret bandit vs No-regret bandit, T=200, 100 episodes____


____No-regret bandit vs Optimal bandit, T=200, 100 episodes____


____No-regret bandit vs Random bandit, T=200, 100 episodes____


____No-regret bandit vs Rational bandit, T=200, 100 episodes____


____No-regret bandit vs UCB bandit c=1, T=200, 100 episodes____


In [7]:
# Print each bandit's cumulative reward, averaged over 100 runs, for every No-regret matchup
get_avg_cumulative_rewards("./No-regret testing/", num_runs=100)


____No-regret bandit vs eGreedy bandit e=0.15, T=200, 100 episodes____
Average cumulative reward for No-regret bandit vs eGreedy bandit e=0.15, T=200, 100 episodes
	Bandit 0: -364.94
	Bandit 1: -407.39

____No-regret bandit vs No-regret bandit, T=200, 100 episodes____
Average cumulative reward for No-regret bandit vs No-regret bandit, T=200, 100 episodes
	Bandit 0: -392.98
	Bandit 1: -393.16

____No-regret bandit vs Optimal bandit, T=200, 100 episodes____
Average cumulative reward for No-regret bandit vs Optimal bandit, T=200, 100 episodes
	Bandit 0: -402.85
	Bandit 1: -388.69

____No-regret bandit vs Random bandit, T=200, 100 episodes____
Average cumulative reward for No-regret bandit vs Random bandit, T=200, 100 episodes
	Bandit 0: -207.14
	Bandit 1: -485.9

____No-regret bandit vs Rational bandit, T=200, 100 episodes____
Average cumulative reward for No-regret bandit vs Rational bandit, T=200, 100 episodes
	Bandit 0: -406.42
	Bandit 1: -387.16

____No-regret bandit vs UCB bandit c=

In [8]:
# Get betrayal proportion data for all matchups in the No-regret testing experiments
get_avg_betrayal_proportion("./No-regret testing", num_runs=100)

No-regret bandit vs eGreedy bandit e=0.15, T=200, 100 episodes
0.9662 0.89545

No-regret bandit vs No-regret bandit, T=200, 100 episodes
0.9655 0.9652

No-regret bandit vs Optimal bandit, T=200, 100 episodes
0.96705 0.99065

No-regret bandit vs Random bandit, T=200, 100 episodes
0.9649 0.5003

No-regret bandit vs Rational bandit, T=200, 100 episodes
0.9679 1.0

No-regret bandit vs UCB bandit c=1, T=200, 100 episodes
0.96625 0.9159



<h3>Previous experimental runs for reference</h3>

In [None]:
bandit_list = ["eGreedy", "UCB"]

# Process the "2, random" hyperparameter-tuning folder of each learning bandit
for bandit_name in bandit_list:
    process_array_data(f"Hyperparameter tuning {bandit_name} 2, random")

In [None]:
bandit_list = ["eGreedy", "UCB"]

# Report average cumulative rewards for the "2, random" hyperparameter-tuning folder of each learning bandit
for bandit_name in bandit_list:
    get_avg_cumulative_rewards(f"Hyperparameter tuning {bandit_name} 2, random")

In [None]:
# Hyperparameter tuning against the optimal agent
bandit_list = ["eGreedy", "UCB"]

# One tuning folder per learning bandit
for bandit_name in bandit_list:
    process_array_data(f"Hyperparameter tuning {bandit_name}")

In [None]:
# Loop through experiments and runs for the round robin
path = "./Round robin experiments/"

window_over = 5
step = 3

# Plot every bandit's choices and save the windowed rewards for each of the 20 runs per experiment.
# This cell used to repeat the body of process_array_data line for line; calling the function keeps
# a single implementation of the plotting and windowing logic in the notebook.
process_array_data(path, num_runs=20, window_over=window_over, step=step, plot_results=True)