<a href="https://colab.research.google.com/github/Kshitij04Poojary/Iterated-Prisoners-Dilemma/blob/main/IPDRPcode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install optuna
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
import optuna

# Global definitions for common parameters
# Length of the state vector fed to the network; states in this file are
# 5-element lists: [action, opponent_action, reward, opponent_reward, t].
input_size = 5
# Number of selectable actions, i.e. Q-values produced by the network.
output_size = 2
# Upper and lower bounds of the epsilon-greedy exploration rate.
epsilon_start = 1.0
epsilon_end = 0.01
# Discount factor applied to the bootstrapped next-state value.
gamma = 0.99
# Number of training episodes per run (each episode plays 100 rounds).
num_episodes = 200

class DQN(nn.Module):
    """Recurrent Q-network: an LSTM followed by ReLU and a linear read-out.

    Args:
        in_dim: number of features per time step.
        hidden_dim: size of the LSTM hidden state.
        out_dim: number of Q-values produced (one per action).
        layer_num: number of stacked LSTM layers.
    """

    def __init__(self, in_dim, hidden_dim, out_dim, layer_num):
        super().__init__()
        # batch_first=True means inputs are laid out as (batch, seq, feature).
        self.lstmLayer = nn.LSTM(
            input_size=in_dim,
            hidden_size=hidden_dim,
            num_layers=layer_num,
            batch_first=True,
        )
        self.relu = nn.ReLU()
        self.fcLayer = nn.Linear(in_features=hidden_dim, out_features=out_dim)

    def forward(self, x):
        """Return Q-values of shape (batch, out_dim) for a (batch, seq, in_dim) input."""
        sequence_out, _hidden = self.lstmLayer(x)
        # Only the final time step feeds the read-out head.
        final_step = sequence_out[:, -1, :]
        return self.fcLayer(self.relu(final_step))

class ReplayBuffer:
    """Fixed-capacity ring buffer of (state, action, reward, next_state) tuples."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state):
        """Store one transition, overwriting the oldest entry once full."""
        transition = (state, action, reward, next_state)
        if len(self.buffer) < self.capacity:
            # While filling up, the write position equals the current length.
            self.buffer.append(transition)
        else:
            self.buffer[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Draw ``batch_size`` distinct transitions and return them column-wise.

        The result is a ``zip`` iterator yielding, in order, the tuples of
        states, actions, rewards and next states.
        """
        chosen = random.sample(self.buffer, batch_size)
        return zip(*chosen)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)

class IteratedPrisonersDilemma:
    """Single-round payoff rule for a two-player, two-action prisoner's dilemma.

    Actions are the integers 0 and 1 (elsewhere in this file the
    "always_cooperate" strategy plays 0 and "always_defect" plays 1).
    """

    def __init__(self):
        self.num_actions = 2
        # payoff_matrix[mine][theirs] is the payoff to the player who chose
        # ``mine`` when the other player chose ``theirs``. Only a 2x2 table is
        # needed: with two actions no other row can ever be indexed.
        self.payoff_matrix = np.array([[3, 0], [5, 1]])

    def step(self, action1, action2):
        """Return ``(reward1, reward2)`` for one simultaneous round.

        Args:
            action1: action of the first player (0 or 1).
            action2: action of the second player (0 or 1).

        Returns:
            Tuple of the first and second player's payoffs.

        Raises:
            IndexError: if an action is >= ``num_actions``.
        """
        reward1 = self.payoff_matrix[action1][action2]
        reward2 = self.payoff_matrix[action2][action1]
        return reward1, reward2

def select_action(state, epsilon, policy_net, output_size):
    """Pick an action epsilon-greedily.

    With probability ``epsilon`` a uniformly random action in
    ``[0, output_size)`` is returned; otherwise the index of the largest
    Q-value that ``policy_net`` predicts for ``state``.
    """
    explore = np.random.rand() < epsilon
    if explore:
        return np.random.randint(output_size)

    # Prepend two singleton axes (batch and sequence) before the forward pass.
    net_input = torch.tensor(state, dtype=torch.float32)[None, None]
    with torch.no_grad():
        return policy_net(net_input).argmax().item()

def update_q_values(replay_buffer, batch_size, policy_net, target_net, optimizer, gamma):
    """Run one optimisation step on a sampled minibatch.

    Does nothing until the buffer holds at least ``batch_size`` transitions.
    The regression target is ``reward + gamma * max_a Q_target(next_state, a)``
    and the loss is the mean squared error against the policy network's
    Q-value for the action that was taken.
    """
    if len(replay_buffer) < batch_size:
        return

    def to_sequence_batch(rows):
        # Each state becomes a length-1 sequence: (batch, 1, features).
        return torch.tensor(rows, dtype=torch.float32).view(batch_size, 1, -1)

    state_rows, action_rows, reward_rows, next_state_rows = replay_buffer.sample(batch_size)
    states = to_sequence_batch(state_rows)
    actions = torch.tensor(action_rows, dtype=torch.long)
    rewards = torch.tensor(reward_rows, dtype=torch.float32)
    next_states = to_sequence_batch(next_state_rows)

    # Q(s, a) for the actions actually taken.
    all_q = policy_net(states)
    taken_q = all_q.gather(1, actions.view(-1, 1)).squeeze(1)

    # Bootstrapped target from the target network; no gradient flows through it.
    best_next_q = target_net(next_states).max(dim=1).values.detach()
    targets = rewards + gamma * best_next_q

    loss = nn.functional.mse_loss(taken_q, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

def train_dqn(trial):
    """Optuna objective: train a fresh DQN and return its mean episode reward.

    The agent plays ``num_episodes`` episodes of 100 rounds each. The opponent
    move in every round is ``state[0]``, i.e. the agent's own action from the
    previous round (0 in the first round of an episode).

    Args:
        trial: the Optuna trial used to draw the hyperparameters
            ``hidden_size``, ``learning_rate``, ``batch_size``,
            ``epsilon_decay`` and ``target_update``.

    Returns:
        The mean of the per-episode total rewards collected during training.
    """
    # Hyperparameters to optimize
    hidden_size = trial.suggest_categorical('hidden_size', [32, 64, 128])
    # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform;
    # parameter names and ranges are unchanged.
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)
    batch_size = trial.suggest_categorical('batch_size', [32, 64, 128])
    epsilon_decay = trial.suggest_float('epsilon_decay', 0.99, 0.999)
    target_update = trial.suggest_int('target_update', 5, 20)

    policy_net = DQN(input_size, hidden_size, output_size, 1)
    target_net = DQN(input_size, hidden_size, output_size, 1)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)
    replay_buffer = ReplayBuffer(capacity=10000)
    ipd_env = IteratedPrisonersDilemma()
    total_rewards = []

    for episode in range(num_episodes):
        # Epsilon depends only on the episode index, so compute it once here
        # rather than on every step.
        # NOTE(review): epsilon_decay is used as the time constant of an
        # exponential (exp(-episode / epsilon_decay)); with values just below
        # 1 the exploration rate is close to epsilon_end after a handful of
        # episodes. Confirm this is intended rather than a multiplicative
        # per-episode decay factor.
        epsilon = epsilon_end + (epsilon_start - epsilon_end) * np.exp(-episode / epsilon_decay)
        state = [0, 0, 0, 0, 0]
        total_reward = 0
        for t in range(100):
            action = select_action(state, epsilon, policy_net, output_size)
            # The opponent repeats the agent's previous action.
            opponent_action = state[0]
            reward, opponent_reward = ipd_env.step(action, opponent_action)
            next_state = [action, opponent_action, reward, opponent_reward, t]
            replay_buffer.push(state, action, reward, next_state)
            state = next_state
            total_reward += reward
            update_q_values(replay_buffer, batch_size, policy_net, target_net, optimizer, gamma)
            # Sync the target network every ``target_update`` steps within the
            # episode (this includes step 0 of every episode).
            if t % target_update == 0:
                target_net.load_state_dict(policy_net.state_dict())
        total_rewards.append(total_reward)

    return float(np.mean(total_rewards))

# Search for the hyperparameters that maximise the value returned by
# train_dqn (the mean per-episode reward), using 50 trials.
study = optuna.create_study(direction='maximize')
study.optimize(train_dqn, n_trials=50)

# Report the hyperparameters of the best-scoring trial.
print(f"Best trial: {study.best_trial.params}")

def play_ipd(policy_net, num_episodes, strategy, epsilon=0.1):
    """Play the agent against a scripted opponent and collect episode rewards.

    Each episode lasts 100 rounds. The agent acts epsilon-greedily on
    ``policy_net``; action 0 is what "always_cooperate" plays and action 1 is
    what "always_defect" plays.

    Args:
        policy_net: network mapping a (1, 1, 5) state tensor to Q-values.
        num_episodes: number of episodes to play.
        strategy: opponent behaviour, one of ``"random"``, ``"tft"`` (repeat
            the agent's previous action, starting with 0), ``"grim_trigger"``
            (play 0 until the agent plays 1, then play 1 for the rest of the
            episode), ``"always_cooperate"`` or ``"always_defect"``.
        epsilon: probability of the agent taking a random action.

    Returns:
        List with the agent's total reward for every episode.

    Raises:
        ValueError: if ``strategy`` is not one of the supported names.
    """
    supported = ("random", "tft", "grim_trigger", "always_cooperate", "always_defect")
    if strategy not in supported:
        # Fail before playing instead of hitting an unbound local mid-episode.
        raise ValueError(
            f"Unknown strategy {strategy!r}; expected one of {', '.join(supported)}"
        )

    ipd_env = IteratedPrisonersDilemma()
    total_rewards = []
    for episode in range(num_episodes):
        state = [0, 0, 0, 0, 0]
        total_reward = 0
        opponent_last_action = 0
        grim_trigger_active = False
        for t in range(100):
            # Same epsilon-greedy rule as used during training.
            action = select_action(state, epsilon, policy_net, output_size)

            if strategy == "random":
                opponent_action = np.random.randint(2)
            elif strategy == "tft":
                opponent_action = opponent_last_action
            elif strategy == "grim_trigger":
                if grim_trigger_active:
                    opponent_action = 1
                else:
                    opponent_action = 0
                    # The trigger fires this round; retaliation starts next round.
                    if action == 1:
                        grim_trigger_active = True
            elif strategy == "always_cooperate":
                opponent_action = 0
            else:  # "always_defect"
                opponent_action = 1

            reward, opponent_reward = ipd_env.step(action, opponent_action)
            total_reward += reward

            # Build the state exactly as the training loop does, including the
            # opponent's reward, so the network sees in-distribution inputs.
            state = [action, opponent_action, reward, opponent_reward, t]
            opponent_last_action = action

        total_rewards.append(total_reward)
        print(f"Episode {episode + 1}, Total Reward: {total_reward}")

    return total_rewards

def test_against_tft(policy_net, num_episodes):
    """Play ``num_episodes`` episodes against tit-for-tat and print the mean reward."""
    episode_totals = play_ipd(
        policy_net,
        num_episodes,
        strategy="tft",
        epsilon=0.1,
    )
    mean_total = np.mean(episode_totals)
    print("Average reward against TFT strategy:", mean_total)

# Pull the winning hyperparameters out of the finished study
best_params = study.best_trial.params
(hidden_size, learning_rate, batch_size,
 epsilon_decay, target_update) = (
    best_params[key]
    for key in ('hidden_size', 'learning_rate', 'batch_size',
                'epsilon_decay', 'target_update')
)

# Build the final model: policy network first, then a target network that
# starts as an exact copy and is kept in eval mode
policy_net = DQN(in_dim=input_size, hidden_dim=hidden_size, out_dim=output_size, layer_num=1)
target_net = DQN(in_dim=input_size, hidden_dim=hidden_size, out_dim=output_size, layer_num=1)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# Optimiser, replay memory and environment for the final training run
optimizer = optim.Adam(policy_net.parameters(), lr=learning_rate)
replay_buffer = ReplayBuffer(10000)
ipd_env = IteratedPrisonersDilemma()

# Train the final model with the tuned hyperparameters. Every episode is 100
# rounds against an opponent that repeats the agent's previous action.
for episode in range(num_episodes):
    # Epsilon depends only on the episode index, so it is computed once per
    # episode instead of on every step.
    epsilon = epsilon_end + (epsilon_start - epsilon_end) * np.exp(-episode / epsilon_decay)
    state = [0, 0, 0, 0, 0]
    total_reward = 0
    for t in range(100):
        action = select_action(state, epsilon, policy_net, output_size)
        # The opponent plays the agent's previous action (state[0]).
        opponent_action = state[0]
        reward, opponent_reward = ipd_env.step(action, opponent_action)
        next_state = [action, opponent_action, reward, opponent_reward, t]
        replay_buffer.push(state, action, reward, next_state)
        state = next_state
        total_reward += reward
        update_q_values(replay_buffer, batch_size, policy_net, target_net, optimizer, gamma)
        # Refresh the target network every ``target_update`` steps of the episode.
        if t % target_update == 0:
            target_net.load_state_dict(policy_net.state_dict())
    print(f"Episode {episode + 1}, Total Reward: {total_reward}")

# Evaluate the trained policy against tit-for-tat.
test_against_tft(policy_net, num_episodes=100)


Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.5 alembic-1.13.1 colorlog-6.8.2 optuna-3.6.1


[I 2024-06-09 11:13:25,066] A new study created in memory with name: no-name-59a044e4-824c-489f-93f6-5694440dbd28
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-2)
  epsilon_decay = trial.suggest_uniform('epsilon_decay', 0.99, 0.999)
[I 2024-06-09 11:14:44,501] Trial 0 finished with value: 298.41 and parameters: {'hidden_size': 32, 'learning_rate': 0.0004426893419700587, 'batch_size': 128, 'epsilon_decay': 0.9922162223423997, 'target_update': 7}. Best is trial 0 with value: 298.41.
  learning_rate = trial.suggest_loguniform('learning_rate', 1e-4, 1e-2)
  epsilon_decay = trial.suggest_uniform('epsilon_decay', 0.99, 0.999)
[I 2024-06-09 11:15:58,311] Trial 1 finished with value: 282.05 and parameters: {'hidden_size': 64, 'learning_rate': 0.00781675509629425, 'batch_size': 32, 'epsilon_decay': 0.9913323461719815, 'target_update': 9}. Best is trial 0 with value: 298.41.
[I 2024-06-09 11:17:08,422] Trial 2 finished with value: 106.68 and parameters: {'hidden_size': 32,

Best trial: {'hidden_size': 32, 'learning_rate': 0.00010015743203805684, 'batch_size': 64, 'epsilon_decay': 0.9966272901999059, 'target_update': 16}
Episode 1, Total Reward: 226
Episode 2, Total Reward: 168
Episode 3, Total Reward: 130
Episode 4, Total Reward: 110
Episode 5, Total Reward: 107
Episode 6, Total Reward: 104
Episode 7, Total Reward: 104
Episode 8, Total Reward: 107
Episode 9, Total Reward: 104
Episode 10, Total Reward: 107
Episode 11, Total Reward: 104
Episode 12, Total Reward: 107
Episode 13, Total Reward: 104
Episode 14, Total Reward: 107
Episode 15, Total Reward: 104
Episode 16, Total Reward: 104
Episode 17, Total Reward: 104
Episode 18, Total Reward: 104
Episode 19, Total Reward: 110
Episode 20, Total Reward: 107
Episode 21, Total Reward: 104
Episode 22, Total Reward: 110
Episode 23, Total Reward: 107
Episode 24, Total Reward: 104
Episode 25, Total Reward: 104
Episode 26, Total Reward: 104
Episode 27, Total Reward: 104
Episode 28, Total Reward: 104
Episode 29, Total Re

In [None]:
# Evaluation against the Grim Trigger opponent
def test_against_grim(policy_net, num_episodes):
    """Play ``num_episodes`` episodes against grim trigger and print the mean reward."""
    episode_totals = play_ipd(policy_net, num_episodes, strategy="grim_trigger")
    mean_total = np.mean(episode_totals)
    print(f"Average reward against Grim Trigger strategy: {mean_total}")


test_against_grim(policy_net, num_episodes=100)

Episode 1, Total Reward: 100
Episode 2, Total Reward: 100
Episode 3, Total Reward: 99
Episode 4, Total Reward: 99
Episode 5, Total Reward: 98
Episode 6, Total Reward: 96
Episode 7, Total Reward: 102
Episode 8, Total Reward: 95
Episode 9, Total Reward: 99
Episode 10, Total Reward: 98
Episode 11, Total Reward: 101
Episode 12, Total Reward: 97
Episode 13, Total Reward: 100
Episode 14, Total Reward: 97
Episode 15, Total Reward: 99
Episode 16, Total Reward: 95
Episode 17, Total Reward: 104
Episode 18, Total Reward: 97
Episode 19, Total Reward: 99
Episode 20, Total Reward: 98
Episode 21, Total Reward: 100
Episode 22, Total Reward: 101
Episode 23, Total Reward: 96
Episode 24, Total Reward: 99
Episode 25, Total Reward: 99
Episode 26, Total Reward: 102
Episode 27, Total Reward: 100
Episode 28, Total Reward: 102
Episode 29, Total Reward: 97
Episode 30, Total Reward: 100
Episode 31, Total Reward: 98
Episode 32, Total Reward: 94
Episode 33, Total Reward: 100
Episode 34, Total Reward: 98
Episode 35

In [None]:
# Evaluation against the Always Cooperate opponent
def test_against_ac(policy_net, num_episodes):
    """Play ``num_episodes`` episodes against always-cooperate and print the mean reward."""
    episode_totals = play_ipd(policy_net, num_episodes, strategy="always_cooperate")
    mean_total = np.mean(episode_totals)
    print(f"Average reward against Always cooperate strategy: {mean_total}")


test_against_ac(policy_net, num_episodes=100)

Episode 1, Total Reward: 488
Episode 2, Total Reward: 480
Episode 3, Total Reward: 490
Episode 4, Total Reward: 492
Episode 5, Total Reward: 496
Episode 6, Total Reward: 480
Episode 7, Total Reward: 486
Episode 8, Total Reward: 492
Episode 9, Total Reward: 494
Episode 10, Total Reward: 486
Episode 11, Total Reward: 496
Episode 12, Total Reward: 492
Episode 13, Total Reward: 494
Episode 14, Total Reward: 492
Episode 15, Total Reward: 492
Episode 16, Total Reward: 492
Episode 17, Total Reward: 494
Episode 18, Total Reward: 494
Episode 19, Total Reward: 490
Episode 20, Total Reward: 492
Episode 21, Total Reward: 492
Episode 22, Total Reward: 492
Episode 23, Total Reward: 494
Episode 24, Total Reward: 490
Episode 25, Total Reward: 496
Episode 26, Total Reward: 486
Episode 27, Total Reward: 494
Episode 28, Total Reward: 488
Episode 29, Total Reward: 484
Episode 30, Total Reward: 496
Episode 31, Total Reward: 484
Episode 32, Total Reward: 486
Episode 33, Total Reward: 488
Episode 34, Total R

In [None]:
# Evaluation against the Always Defect opponent
def test_against_ad(policy_net, num_episodes):
    """Play ``num_episodes`` episodes against always-defect and print the mean reward."""
    episode_totals = play_ipd(policy_net, num_episodes, strategy="always_defect")
    mean_total = np.mean(episode_totals)
    print(f"Average reward against Always defect strategy: {mean_total}")


test_against_ad(policy_net, num_episodes=100)

Episode 1, Total Reward: 96
Episode 2, Total Reward: 95
Episode 3, Total Reward: 92
Episode 4, Total Reward: 96
Episode 5, Total Reward: 94
Episode 6, Total Reward: 96
Episode 7, Total Reward: 98
Episode 8, Total Reward: 91
Episode 9, Total Reward: 94
Episode 10, Total Reward: 94
Episode 11, Total Reward: 94
Episode 12, Total Reward: 100
Episode 13, Total Reward: 99
Episode 14, Total Reward: 95
Episode 15, Total Reward: 99
Episode 16, Total Reward: 92
Episode 17, Total Reward: 95
Episode 18, Total Reward: 91
Episode 19, Total Reward: 95
Episode 20, Total Reward: 99
Episode 21, Total Reward: 95
Episode 22, Total Reward: 89
Episode 23, Total Reward: 96
Episode 24, Total Reward: 96
Episode 25, Total Reward: 96
Episode 26, Total Reward: 98
Episode 27, Total Reward: 96
Episode 28, Total Reward: 97
Episode 29, Total Reward: 96
Episode 30, Total Reward: 91
Episode 31, Total Reward: 96
Episode 32, Total Reward: 96
Episode 33, Total Reward: 94
Episode 34, Total Reward: 96
Episode 35, Total Rewa

In [None]:
def test_against_random(policy_net, num_episodes):
    """Play ``num_episodes`` episodes against a uniformly random opponent and print the mean reward."""
    episode_totals = play_ipd(policy_net, num_episodes, strategy='random')
    mean_total = np.mean(episode_totals)
    print("Average reward against random strategy:", mean_total)


test_against_random(policy_net, num_episodes=100)

Episode 1, Total Reward: 323
Episode 2, Total Reward: 283
Episode 3, Total Reward: 286
Episode 4, Total Reward: 341
Episode 5, Total Reward: 305
Episode 6, Total Reward: 291
Episode 7, Total Reward: 318
Episode 8, Total Reward: 294
Episode 9, Total Reward: 238
Episode 10, Total Reward: 298
Episode 11, Total Reward: 303
Episode 12, Total Reward: 273
Episode 13, Total Reward: 286
Episode 14, Total Reward: 300
Episode 15, Total Reward: 324
Episode 16, Total Reward: 295
Episode 17, Total Reward: 307
Episode 18, Total Reward: 308
Episode 19, Total Reward: 314
Episode 20, Total Reward: 330
Episode 21, Total Reward: 292
Episode 22, Total Reward: 306
Episode 23, Total Reward: 306
Episode 24, Total Reward: 286
Episode 25, Total Reward: 308
Episode 26, Total Reward: 314
Episode 27, Total Reward: 324
Episode 28, Total Reward: 295
Episode 29, Total Reward: 290
Episode 30, Total Reward: 288
Episode 31, Total Reward: 297
Episode 32, Total Reward: 275
Episode 33, Total Reward: 291
Episode 34, Total R