<a href="https://colab.research.google.com/github/Kshitij04Poojary/Iterated-Prisoners-Dilemma/blob/main/Iterated_Prisoner's_Dilemma.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from scipy.stats import pearsonr
import pickle
import numpy as np
import pandas as pd
import torch
from torch import nn, optim
from torch.autograd import Variable
from torch.nn import init
import seaborn as sns

sns.set()

from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.api import VAR
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression

MSE = mean_squared_error
lag = 1

In [12]:
import matplotlib as mpl
mpl.rcParams.update(mpl.rcParamsDefault) #updates the default Matplotlib configuration settings to their default values

In [13]:
SMALL_SIZE = 15
MEDIUM_SIZE = 20
BIGGER_SIZE = 30

plt.rc("font", size=SMALL_SIZE)  # controls default text sizes
plt.rc("axes", titlesize=SMALL_SIZE)  # fontsize of the axes title
plt.rc("axes", labelsize=MEDIUM_SIZE)  # fontsize of the x and y labels
plt.rc("xtick", labelsize=SMALL_SIZE)  # fontsize of the tick labels
plt.rc("ytick", labelsize=SMALL_SIZE)  # fontsize of the tick labels
plt.rc("legend", fontsize=SMALL_SIZE)  # legend fontsize
plt.rc("figure", titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [14]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# # cooperation rate
# def getCR(r):
#     if len(r.shape) == 3:
#         r = r[:, :, 0] #removes the third dimension
#     cr = np.zeros(r.shape)
#     for t in np.arange(r.shape[1]): #0 to r.shape[1]-1
#         for b in np.arange(r.shape[0]):
#             cr[b, t] = r[b, : t + 1].sum() / (t + 1)#It sums up the cooperation decisions made by the player up to the current time step (t) and divides by the total number of time steps so far (t + 1)
#     return cr

In [15]:
## IPD Task
def MSE_by_time(r, p):
    err = []
    for t in np.arange(r.shape[1]):
        if len(p.shape) == 3:
            err.append(MSE(r[:, t, :], p[:, t, :]))
        else:
            err.append(MSE(r[:, t, 0], p[:, t]))
    return np.array(err)


def ipd_regression_data(regressiondata):
    x, y = [], []
    for e in regressiondata:
        for t in np.arange(8):
            x.append(
                [
                    e[2],
                    e[3],
                    e[0],
                    e[4],
                    e[1],
                    e[1] * e[2],
                    e[1] * e[3],
                    e[46],
                    e[47],
                    e[46] * e[1],
                    e[14 - t],
                    e[23 - t],
                    e[4] * e[23 - t],
                    t + 2,
                ]
            )
            y.append(np.abs(e[13 - t]))
    return np.array(x), np.array(y)


def valid_ipd(n):
    if n < 1:
        n = int(8258 * n)
    shuffindex = np.random.permutation(8258)
    data = pd.read_csv("/content/drive/MyDrive/IPD/all_data.csv")
    trajs = np.array(data[data["period"] == 10].iloc[:, 9:27])  # (8258, 18)
    regressiondata = np.array(data[data["period"] == 10].iloc[:, 3:51])  # (8258, 48)
    regressiondata, trajs = regressiondata[shuffindex], trajs[shuffindex]
    train_set_rgx, train_set_rgy = ipd_regression_data(regressiondata[n:])
    test_set_rgx, test_set_rgy = ipd_regression_data(regressiondata[:n])
    trajs = trajs.reshape((trajs.shape[0], 2, 9))  # (8258, 2, 9)
    trajs[trajs == 0] = 2
    trajs = trajs - 1
    train_set, test_set = trajs[n:], trajs[:n]
    return train_set, test_set, train_set_rgx, test_set_rgx, train_set_rgy, test_set_rgy


class lstmModel(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim, layer_num):
        super().__init__()
        self.lstmLayer = nn.LSTM(in_dim, hidden_dim, layer_num)
        self.relu = nn.ReLU()
        self.fcLayer = nn.Linear(hidden_dim, out_dim)
        self.weightInit = np.sqrt(1.0 / hidden_dim)

    def forward(self, x):
        out, _ = self.lstmLayer(x)
        out = self.relu(out)
        out = self.fcLayer(out)
        out = nn.Softmax(dim=-1)(out)
        return out


n_fold = 5
for fold in np.arange(n_fold):
    (
        train_set,
        test_set,
        train_set_rgx,
        test_set_rgx,
        train_set_rgy,
        test_set_rgy,
    ) = valid_ipd(0.2)
    full_data = {
        "train": train_set,
        "test": test_set,
        "train_set_rgx": train_set_rgx,
        "train_set_rgy": train_set_rgy,
        "test_set_rgx": test_set_rgx,
        "test_set_rgy": test_set_rgy,
    }

    with open("/content/drive/MyDrive/IPD/processed_train_test.pkl", "wb") as handle:
        pickle.dump(full_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
    n_nodes, n_layers = 10, 2
    lstm = lstmModel(2, n_nodes, 2, n_layers)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(lstm.parameters(), lr=1e-2)
    n_epochs, window, batch_size = 10, 10, 100
    loss_set = []
    for ep in np.arange(n_epochs):
        for bc in np.arange(train_set.shape[0] / batch_size):
            inputs = Variable(
                torch.from_numpy(
                    train_set[int(bc * batch_size) : int((bc + 1) * batch_size)]
                )
                .transpose(1, 2)
                .float()
            )
            target = Variable(
                torch.from_numpy(
                    train_set[int(bc * batch_size) : int((bc + 1) * batch_size)]
                )
                .transpose(1, 2)
                .float()
            )
            output = lstm(inputs)
            loss = criterion(output.squeeze()[:, :-lag, 0], target[:, lag:, 0])
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print_loss = loss.item()
            loss_set.append(print_loss)
            if bc % window == 0:
                print(fold)
                print(
                    "Epoch[{}/{}], Batch[{}/{}], Loss: {:.5f}".format(
                        ep + 1,
                        n_epochs,
                        bc + 1,
                        train_set.shape[0] / batch_size,
                        print_loss,
                    )
                )
    lstm = lstm.eval()

0
Epoch[1/10], Batch[1.0/66.07], Loss: 0.24667
0
Epoch[1/10], Batch[11.0/66.07], Loss: 0.24499
0
Epoch[1/10], Batch[21.0/66.07], Loss: 0.22961
0
Epoch[1/10], Batch[31.0/66.07], Loss: 0.17950
0
Epoch[1/10], Batch[41.0/66.07], Loss: 0.11435
0
Epoch[1/10], Batch[51.0/66.07], Loss: 0.11284
0
Epoch[1/10], Batch[61.0/66.07], Loss: 0.11038
0
Epoch[2/10], Batch[1.0/66.07], Loss: 0.10384
0
Epoch[2/10], Batch[11.0/66.07], Loss: 0.12849
0
Epoch[2/10], Batch[21.0/66.07], Loss: 0.10882
0
Epoch[2/10], Batch[31.0/66.07], Loss: 0.11904
0
Epoch[2/10], Batch[41.0/66.07], Loss: 0.10033
0
Epoch[2/10], Batch[51.0/66.07], Loss: 0.10911
0
Epoch[2/10], Batch[61.0/66.07], Loss: 0.10796
0
Epoch[3/10], Batch[1.0/66.07], Loss: 0.10254
0
Epoch[3/10], Batch[11.0/66.07], Loss: 0.12846
0
Epoch[3/10], Batch[21.0/66.07], Loss: 0.10813
0
Epoch[3/10], Batch[31.0/66.07], Loss: 0.11892
0
Epoch[3/10], Batch[41.0/66.07], Loss: 0.10011
0
Epoch[3/10], Batch[51.0/66.07], Loss: 0.10931
0
Epoch[3/10], Batch[61.0/66.07], Loss: 0.1

In [16]:
test_input = torch.from_numpy(test_set).float()
with torch.no_grad():
    lstm_output = lstm(test_input)

In [19]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
import pickle

# Define the neural network architecture
class DQN(nn.Module):
    def __init__(self, input_size, lstm_output_size, hidden_size, output_size):
        super(DQN, self).__init__()
        self.fc1 = nn.Linear(input_size + lstm_output_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Define the replay buffer
class ReplayBuffer:
    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def push(self, state, action, reward, next_state):
        if len(self.buffer) < self.capacity:
            self.buffer.append(None)
        self.buffer[self.position] = (state, action, reward, next_state)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return zip(*random.sample(self.buffer, batch_size))

    def __len__(self):
        return len(self.buffer)

# Define constants
input_size = 3  # Original state representation size
lstm_output_size = 2  # Size of the output of the LSTM model
hidden_size = 64  # Hidden layer size
output_size = 2  # Number of actions
batch_size = 64
gamma = 0.99  # Discount factor
epsilon_start = 1.0
epsilon_end = 0.01
epsilon_decay = 0.995
target_update = 10  # Update target network every 10 steps
num_episodes = 200

# Define the IPD environment
class IteratedPrisonersDilemma:
    def __init__(self):
        self.num_actions = 2  # Cooperate or Defect
        self.payoff_matrix = np.array([[3, 0], [5, 1], [1, 5], [0, 0]])  # Payoff matrix

    def step(self, action1, action2):
        reward1 = self.payoff_matrix[action1][action2]
        reward2 = self.payoff_matrix[action2][action1]
        return reward1, reward2

# Initialize DQN, target DQN, optimizer
policy_net = DQN(input_size, lstm_output_size, hidden_size, output_size)
target_net = DQN(input_size, lstm_output_size, hidden_size, output_size)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(policy_net.parameters(), lr=0.001)

# Initialize replay buffer
replay_buffer = ReplayBuffer(capacity=10000)

# Epsilon-greedy action selection
def select_action(state, epsilon):
    if np.random.rand() < epsilon:
        return np.random.randint(output_size)
    else:
        with torch.no_grad():
            q_values = policy_net(torch.tensor(state, dtype=torch.float32))
            return q_values.argmax().item()

# Update Q-values using DQN
def update_q_values():
    if len(replay_buffer) > batch_size:
        states, actions, rewards, next_states = replay_buffer.sample(batch_size)
        states = torch.tensor(states, dtype=torch.float32)
        actions = torch.tensor(actions, dtype=torch.long)
        rewards = torch.tensor(rewards, dtype=torch.float32)
        next_states = torch.tensor(next_states, dtype=torch.float32)

        q_values = policy_net(states)
        q_values_next = target_net(next_states).max(1)[0].detach()
        expected_q_values = rewards + gamma * q_values_next

        loss = nn.functional.mse_loss(q_values.gather(1, actions.unsqueeze(1)), expected_q_values.unsqueeze(1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Training loop
ipd_env = IteratedPrisonersDilemma()
for episode in range(num_episodes):
    state = [0, 0, 0, 0, 0]  # Initial state
    total_reward = 0
    for t in range(100):  # Limiting episode length
        # Select action
        epsilon = epsilon_end + (epsilon_start - epsilon_end) * np.exp(-episode / epsilon_decay)
        action = select_action(state, epsilon)

        # Take action
        opponent_action = np.random.randint(2)  # Random opponent action
        reward, opponent_reward = ipd_env.step(action, opponent_action)

        # Store transition in replay buffer
        next_state = [action, opponent_action, reward, opponent_reward, 0]  # Placeholder for next state
        replay_buffer.push(state, action, reward, next_state)

        # Move to the next state
        state = next_state
        total_reward += reward

        # Perform one step of optimization (on minibatch)
        update_q_values()

        # Update target network
        if t % target_update == 0:
            target_net.load_state_dict(policy_net.state_dict())

    print(f"Episode {episode + 1}, Total Reward: {total_reward}")

# After training, you can use the policy_net to play the game


Episode 1, Total Reward: 256
Episode 2, Total Reward: 239
Episode 3, Total Reward: 183
Episode 4, Total Reward: 149
Episode 5, Total Reward: 145
Episode 6, Total Reward: 139
Episode 7, Total Reward: 188
Episode 8, Total Reward: 209
Episode 9, Total Reward: 192
Episode 10, Total Reward: 205
Episode 11, Total Reward: 194
Episode 12, Total Reward: 196
Episode 13, Total Reward: 216
Episode 14, Total Reward: 220
Episode 15, Total Reward: 228
Episode 16, Total Reward: 235
Episode 17, Total Reward: 230
Episode 18, Total Reward: 228
Episode 19, Total Reward: 221
Episode 20, Total Reward: 222
Episode 21, Total Reward: 241
Episode 22, Total Reward: 238
Episode 23, Total Reward: 196
Episode 24, Total Reward: 189
Episode 25, Total Reward: 254
Episode 26, Total Reward: 228
Episode 27, Total Reward: 248
Episode 28, Total Reward: 223
Episode 29, Total Reward: 250
Episode 30, Total Reward: 269
Episode 31, Total Reward: 216
Episode 32, Total Reward: 234
Episode 33, Total Reward: 219
Episode 34, Total R