# Prisoner's Dilemma Dataset Generation

## Pip Installations

In [None]:
%pip install transformers
%pip install --upgrade transformers
%pip install torch
%pip install accelerate
%pip install pandas

Collecting transformers
  Downloading transformers-4.54.0-py3-none-any.whl.metadata (41 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.34.1-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting tqdm>=4.27 (from transformers)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting hf-xet<2.0.0,>=1.1.3 (from huggingface-hub<1.0,>=0.34.0->transformers)
  Downloading hf_xet-1.1.5-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (879 bytes)
Downloading tra

## Importing Qwen3-1.7B from HuggingFace

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import re

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# Make newly created tensors/modules default to the selected device.
torch.set_default_device(device)

# Fetch (or load from the local cache) the tokenizer and causal-LM weights
# for the Qwen/Qwen3-1.7B checkpoint.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-1.7B")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-1.7B")

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/622M [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.44G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [36]:
# Qwen3-1.7B text generation used by the LLM-backed Prisoner's Dilemma players.
def qwenGen(prompt):
  """Sample one completion for `prompt` from the module-level Qwen model.

  Args:
    prompt: Full prompt text; it is tokenized as-is (no chat template is applied here).

  Returns:
    The first decoded sequence returned by `model.generate`, with special
    tokens stripped.

  NOTE(review): for a causal LM, `generate` normally returns the prompt tokens
  followed by the new tokens, so the returned string likely begins with the
  prompt text itself. Callers that parse the reply should account for that.
  """
  encoded = tokenizer(prompt, return_tensors="pt").to(device)

  # Nucleus sampling with a mild repetition penalty; at most 80 new tokens.
  sampling_options = dict(
    max_new_tokens=80,
    do_sample=True,
    temperature=0.85,
    top_p=0.9,
    repetition_penalty=1.2,
  )
  generated = model.generate(**encoded, **sampling_options)

  decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
  return decoded[0]


## Base Prompt for Prisoner's Dilemma Game

In [5]:
# Prompt template rendered for BOTH seats (A and B) by PrisonersDilemma.
# It is therefore phrased from the reader's own point of view ("you" vs.
# "your opponent"), matching the "You chose ..., Opponent chose ..." lines that
# format_history_with_payoffs() produces. The earlier wording addressed the
# reader as Player B ("against Player A"), which was wrong whenever the prompt
# was shown to seat A.
#
# Placeholders: {formatted_history} (required). Callers also pass num_rounds to
# str.format(); it is currently unused by the template and silently ignored.
#
# NOTE: the payoff numbers below (6 / 10 / 1) are hard-coded text. They match the
# c / pos_d / neg_d values used by the experiment cell; if a game is created
# with different payoffs, this text must be updated to stay truthful.
base_prompt = """
You are playing an iterated Prisoner's Dilemma game.

Each round, choose between:
- 'C' (Cooperate)
- 'D' (Defect)

Objective:
Maximize your long-term reward over multiple rounds against your opponent.

Payoff Matrix (your action, your opponent's action):
- ('C', 'C') → both players receive 6 points
- ('C', 'D') → you get 0, your opponent gets 10
- ('D', 'C') → you get 10, your opponent gets 0
- ('D', 'D') → both players receive 1 point

Strategy Guidance:
- If your opponent has cooperated more often than defected, consider defecting to exploit that behavior.
- If your opponent’s behavior is mixed or unpredictable, analyze the past rounds and select the action you believe offers the best future reward.
- Rely solely on the game rules and history provided — be logical, concise, and focused.

Game History from Your Perspective:

{formatted_history}

Respond clearly and crisply with your next action inside square brackets: [C] or [D]
"""


## Dataset Generation Functions/Utilities

In [24]:
import pandas as pd
# Single source of truth for the dataset schema. "Model Action" is listed
# explicitly so an empty/reset dataset already has the same columns (and CSV
# header) as a populated one.
DATASET_COLUMNS = [
    "Number of Rounds",
    "History",
    "Co-op Payoff",
    "Defect Positive Payoff",
    "Defect Negative Payoff",
    "Model Action",
]
columns = DATASET_COLUMNS  # kept under the original name for existing references
dataset = pd.DataFrame(columns=columns)


# paymat = [num_rounds, (coop, coop), (0, defect_pos_payoff), (defect_neg_payoff, defect_neg_payoff)]
def data_update(paymat, history, model_action):
    """Append one row describing a decision point to the global `dataset`.

    Args:
        paymat: Sequence laid out as shown in the comment above this function;
            index 0 is the number of rounds, indices 1-3 are payoff pairs.
        history: Iterable of round records; the first four fields of each
            record are stringified and concatenated.
        model_action: The action to store in the "Model Action" column.

    The "History" cell is built by concatenating the four fields of every
    round without separators and appending one space after each round, e.g.
    ("D", "C", 10, 0) becomes "DC100 ". This encoding is kept unchanged for
    compatibility with already-generated CSV files, but note that it is
    ambiguous for multi-digit payoffs (e.g. 1 and 11 vs. 11 and 1).
    """
    global dataset
    str_history = "".join(
        "".join(str(round_record[field]) for field in range(4)) + " "
        for round_record in history
    )
    new_row = {
        "Number of Rounds": paymat[0],
        "History": str_history,
        "Co-op Payoff": paymat[1][0],
        "Defect Positive Payoff": paymat[2][1],
        "Defect Negative Payoff": paymat[3][0],
        "Model Action": model_action
    }
    row_frame = pd.DataFrame([new_row])
    if dataset.empty:
        # Concatenating onto an empty frame is deprecated in recent pandas
        # (FutureWarning about empty entries); start from the row itself.
        dataset = row_frame
    else:
        dataset = pd.concat([dataset, row_frame], ignore_index=True)


def data_reset():
    """Replace the global `dataset` with an empty frame that has the full schema."""
    global dataset
    dataset = pd.DataFrame(columns=DATASET_COLUMNS)

## Game Architecture/Helper Functions

In [8]:
class PrisonersDilemma:
    """Two-player iterated Prisoner's Dilemma environment.

    Tracks the round history, computes payoffs from a symmetric payoff matrix,
    keeps a ready-to-use prompt for each seat, and logs every decision point
    through the module-level `data_update` function.

    Args:
        coop: Payoff each player receives when both cooperate.
        pos_d: Payoff the defector receives when the other player cooperates
            (the cooperator receives 0).
        neg_d: Payoff each player receives when both defect.
        num_rounds: Number of rounds the game is meant to last.
    """

    def __init__(self, coop, pos_d, neg_d, num_rounds=1):
        self.num_rounds = num_rounds
        self.history = []  # entries are (action_A, action_B, A_payoff, B_payoff)
        # Keys are (action_A, action_B); values are (payoff_A, payoff_B).
        self.payoff_matrix = {
            ('C', 'C'): (coop, coop),
            ('C', 'D'): (0, pos_d),
            ('D', 'C'): (pos_d, 0),
            ('D', 'D'): (neg_d, neg_d)
        }
        # Compact payoff description handed to data_update():
        # [num_rounds, (coop, coop), (0, pos_d), (neg_d, neg_d)]
        self.paymat = [self.num_rounds, self.payoff_matrix[('C', 'C')], self.payoff_matrix[('C', 'D')], self.payoff_matrix[('D', 'D')]]
        self._refresh_prompts()

    def _render_prompt(self, player):
        """Fill the shared prompt template with the history as seen by `player`."""
        return base_prompt.format(
            num_rounds=self.num_rounds,
            formatted_history=format_history_with_payoffs(self.history, player=player),
        )

    def _refresh_prompts(self):
        """Rebuild both cached prompts from the current history."""
        self.promptA = self._render_prompt("A")
        self.promptB = self._render_prompt("B")

    def step(self, action_A, action_B):
        """Play one round and return `(payoff_A, payoff_B)`.

        Before the round is appended, the history *prior to this round* and
        player A's action are logged via `data_update`, so each dataset row
        pairs "what A had seen" with "what A then chose".

        Raises:
            KeyError: If the action pair is not a key of the payoff matrix.
        """
        payoff_A, payoff_B = self.payoff_matrix[(action_A, action_B)]
        data_update(self.paymat, self.history.copy(), action_A)
        self.history.append((action_A, action_B, payoff_A, payoff_B))
        self._refresh_prompts()
        return payoff_A, payoff_B

    def reset(self):
        """Clear the history and rebuild the prompts.

        The prompts are regenerated as well; otherwise genPrompt() would keep
        returning text that still contains the rounds of the previous game.
        """
        self.history = []
        self._refresh_prompts()

    def genPrompt(self, p_name):
        """Return the cached prompt for seat "A" or "B"; None for any other name."""
        if p_name == "A":
            return self.promptA
        if p_name == "B":
            return self.promptB
        return None

In [38]:
import re
import random
class LLMModel:
    """A Prisoner's Dilemma player backed by an LLM or by a scripted strategy.

    Args:
        name: Player label. For the "titfortat" strategy the exact names
            "TitForTatA" / "TitForTatB" select which history column holds the
            opponent's move.
        strategy_fn: When `use_LLM` is True, a callable `prompt -> text`.
            Otherwise one of the strings "random" or "titfortat", or any other
            value, which is then returned verbatim as the action on every
            turn (e.g. 'C' or 'D').
        use_LLM: Whether `strategy_fn` is a text-generation callable.
    """

    def __init__(self, name, strategy_fn, use_LLM = True):
        self.name = name
        self.strategy_fn = strategy_fn
        self.use_LLM = use_LLM

    def act(self, prompt, history=None):
        """Choose this player's next action.

        Args:
            prompt: Prompt text passed to the LLM (ignored by scripted players).
            history: Sequence of (action_A, action_B, payoff_A, payoff_B)
                records; only consulted by the "titfortat" strategy.

        Returns:
            'C' or 'D' for LLM, random and tit-for-tat players; the stored
            `strategy_fn` value for constant players.

        Raises:
            ValueError: If the LLM reply contains no "[C]" or "[D]".
        """
        if self.use_LLM:
            output = self.strategy_fn(prompt)
            # Generation helpers commonly return prompt + continuation, and the
            # prompt itself ends with "... [C] or [D]". Drop the echoed prompt
            # so the instruction text is never mistaken for the model's answer
            # (which would silently turn every unparseable reply into "D").
            if output.startswith(prompt):
                output = output[len(prompt):]
            # Accept only the two legal actions; any other bracketed capital
            # would later fail as a payoff-matrix lookup.
            actions = re.findall(r"\[([CD])\]", output)
            if not actions:
                raise ValueError(
                    f"{self.name}: no [C] or [D] found in model reply: {output[-200:]!r}"
                )
            return actions[-1]

        if self.strategy_fn == "random":
            return random.choice(['C', 'D'])

        if self.strategy_fn == "titfortat":
            if history:
                # Mirror the opponent's previous move; the opponent's column
                # depends on which seat this player occupies.
                if self.name == "TitForTatA":
                    return history[-1][1]
                if self.name == "TitForTatB":
                    return history[-1][0]
            # First round (no history) or an unrecognized player name: the
            # move is drawn at random rather than defaulting to cooperation.
            return random.choice(['C', 'D'])

        # Constant strategy: the stored value is itself the action.
        return self.strategy_fn
            

In [10]:
def format_history_with_payoffs(history, player):
    """Render the round history as text from one player's point of view.

    Args:
        history: Iterable of (action_A, action_B, payoff_A, payoff_B) tuples.
        player: "A" to narrate from seat A; any other value narrates from
            seat B.

    Returns:
        One entry per round (numbered from 1), joined with newlines; each
        entry is followed by a fixed "New Round Starting..." reminder line.
        An empty history yields the empty string.
    """
    rendered = []
    for i, record in enumerate(history, 1):
        a, b, pa, pb = record
        if player != "A":
            # Seat B sees the same round with the roles mirrored.
            a, b, pa, pb = b, a, pb, pa
        rendered.append(f"Round {i}: You chose {a}, Opponent chose {b} → You got {pa}, Opponent got {pb} \nNew Round Starting... update your responses based on this information to try to get more reward!")
    return "\n".join(rendered)

In [17]:
def run_game(player_A, player_B, env):
    """Play `env.num_rounds` rounds between two players and print the results.

    Each round: both seat prompts are fetched from the environment, each
    player picks an action from its prompt and the shared history, the
    environment scores the pair, and a one-line summary is printed. After the
    last round the cumulative scores are printed. Nothing is returned.

    Args:
        player_A: Object with `act(prompt, history)` playing seat A.
        player_B: Object with `act(prompt, history)` playing seat B.
        env: Environment exposing `num_rounds`, `history`, `genPrompt(seat)`
            and `step(action_A, action_B)`.
    """
    scores = [0, 0]
    for round_num in range(env.num_rounds):
        prompts = {seat: env.genPrompt(seat) for seat in ("A", "B")}
        action_A = player_A.act(prompts["A"], env.history)
        action_B = player_B.act(prompts["B"], env.history)
        reward_A, reward_B = env.step(action_A, action_B)
        scores = [scores[0] + reward_A, scores[1] + reward_B]
        print(f"Round {round_num + 1}: A={action_A}, B={action_B} → A:{reward_A}, B:{reward_B}")
    print(f"Final Scores → A: {scores[0]}, B: {scores[1]}")

## Running Experiment!

In [39]:
# Start from an empty dataset so re-running this cell does not stack rows
# on top of a previous run.
data_reset()

# Co-op: 6; Pos_D: 10; Neg_D: 1.
c = 6
pos_d = 10
neg_d = 1

# One factory per pairing, in the order the games are played. Each factory
# returns a fresh (seat A player, seat B player) tuple.
matchups = [
    # Qwen vs. Tit-for-Tat
    lambda: (LLMModel("QwenA", qwenGen), LLMModel("TitForTatB", "titfortat", use_LLM = False)),
    # Qwen vs. Qwen
    lambda: (LLMModel("QwenA", qwenGen), LLMModel("QwenB", qwenGen)),
    # Qwen vs. Random
    lambda: (LLMModel("QwenA", qwenGen), LLMModel("Random", "random", use_LLM = False)),
    # Qwen vs. Always Cooperate
    lambda: (LLMModel("QwenA", qwenGen), LLMModel("AlwaysCooperate", 'C', use_LLM = False)),
    # Qwen vs. Always Defect
    lambda: (LLMModel("QwenA", qwenGen), LLMModel("AlwaysDefect", 'D', use_LLM = False)),
    # TitForTat vs. TitForTat
    lambda: (LLMModel("TitForTatA", "titfortat", use_LLM = False), LLMModel("TitForTatB", "titfortat", use_LLM = False)),
    # Random vs. Random
    lambda: (LLMModel("RandomA", "random", use_LLM = False), LLMModel("RandomB", "random", use_LLM = False)),
    # TitForTat vs. Random
    lambda: (LLMModel("TitForTatA", "titfortat", use_LLM = False), LLMModel("randB", "random", use_LLM = False)),
]

# Three passes over every pairing; each game gets its own 7-round environment.
for i in range(3):
    for build_players in matchups:
        env = PrisonersDilemma(c, pos_d, neg_d, num_rounds=7)
        seat_a_player, seat_b_player = build_players()
        run_game(seat_a_player, seat_b_player, env)

''' - - - - - Experimental Past This Point - - - - - '''

# #Co-op: 20; Pos_D: 5; Neg_D: 0.
# c = 20
# pos_d = 5
# neg_d = 0
# #random model
# env = PrisonersDilemma(c, pos_d, neg_d, num_rounds=7)
# playerA = LLMModel("QwenA", qwenGen)
# #playerB = LLMModel("QwenB", qwenGen)
# playerRand = LLMModel("Random", random.choice(['C', 'D']), use_LLM = False) 
# #run_game(playerA, playerB, env)
# run_game(playerA, playerRand, env)

# #always Cooperate
# env = PrisonersDilemma(c, pos_d, neg_d, num_rounds=7)
# playerA = LLMModel("QwenA", qwenGen)
# #playerB = LLMModel("QwenB", qwenGen)
# playerRand = LLMModel("Random", 'C', use_LLM = False) 
# #run_game(playerA, playerB, env)
# run_game(playerA, playerRand, env)

# #always Defect
# env = PrisonersDilemma(c, pos_d, neg_d, num_rounds=7)
# playerA = LLMModel("QwenA", qwenGen)
# #playerB = LLMModel("QwenB", qwenGen)
# playerRand = LLMModel("Random", 'D', use_LLM = False) 
# #run_game(playerA, playerB, env)
# run_game(playerA, playerRand, env)

# #Co-op: 10; Pos_D: 11; Neg_D: 2.
# c = 10
# pos_d = 11
# neg_d = 2
# #random model
# env = PrisonersDilemma(c, pos_d, neg_d, num_rounds=7)
# playerA = LLMModel("QwenA", qwenGen)
# #playerB = LLMModel("QwenB", qwenGen)
# playerRand = LLMModel("Random", random.choice(['C', 'D']), use_LLM = False) 
# #run_game(playerA, playerB, env)
# run_game(playerA, playerRand, env)

# #always Cooperate
# env = PrisonersDilemma(c, pos_d, neg_d, num_rounds=7)
# playerA = LLMModel("QwenA", qwenGen)
# #playerB = LLMModel("QwenB", qwenGen)
# playerRand = LLMModel("Random", 'C', use_LLM = False) 
# #run_game(playerA, playerB, env)
# run_game(playerA, playerRand, env)

# #always Defect
# env = PrisonersDilemma(c, pos_d, neg_d, num_rounds=7)
# playerA = LLMModel("QwenA", qwenGen)
# #playerB = LLMModel("QwenB", qwenGen)
# playerRand = LLMModel("Random", 'D', use_LLM = False) 
# #run_game(playerA, playerB, env)
# run_game(playerA, playerRand, env)


Round 1: A=D, B=C → A:10, B:0
Round 2: A=D, B=D → A:1, B:1
Round 3: A=D, B=D → A:1, B:1
Round 4: A=D, B=D → A:1, B:1
Round 5: A=D, B=D → A:1, B:1
Round 6: A=D, B=D → A:1, B:1
Round 7: A=D, B=D → A:1, B:1
Final Scores → A: 16, B: 6
Round 1: A=D, B=C → A:10, B:0
Round 2: A=D, B=D → A:1, B:1
Round 3: A=D, B=D → A:1, B:1
Round 4: A=D, B=D → A:1, B:1
Round 5: A=D, B=D → A:1, B:1
Round 6: A=D, B=D → A:1, B:1
Round 7: A=D, B=D → A:1, B:1
Final Scores → A: 16, B: 6
Round 1: A=C, B=C → A:6, B:6
Round 2: A=D, B=C → A:10, B:0
Round 3: A=D, B=C → A:10, B:0
Round 4: A=D, B=C → A:10, B:0
Round 5: A=D, B=D → A:1, B:1
Round 6: A=D, B=C → A:10, B:0
Round 7: A=D, B=D → A:1, B:1
Final Scores → A: 48, B: 8
Round 1: A=D, B=C → A:10, B:0
Round 2: A=D, B=C → A:10, B:0
Round 3: A=D, B=C → A:10, B:0
Round 4: A=D, B=C → A:10, B:0
Round 5: A=D, B=C → A:10, B:0
Round 6: A=D, B=C → A:10, B:0
Round 7: A=D, B=C → A:10, B:0
Final Scores → A: 70, B: 0
Round 1: A=D, B=D → A:1, B:1
Round 2: A=D, B=D → A:1, B:1
Round 3: 

' - - - - - Experimental Past This Point - - - - - '

In [40]:
# Show every collected row as plain text, without the index column.
print(dataset.to_string(index=False))

Number of Rounds                              History Co-op Payoff Defect Positive Payoff Defect Negative Payoff Model Action
               7                                                 6                     10                      1            D
               7                               DC100             6                     10                      1            D
               7                          DC100 DD11             6                     10                      1            D
               7                     DC100 DD11 DD11             6                     10                      1            D
               7                DC100 DD11 DD11 DD11             6                     10                      1            D
               7           DC100 DD11 DD11 DD11 DD11             6                     10                      1            D
               7      DC100 DD11 DD11 DD11 DD11 DD11             6                     10                      1      

In [41]:
# Write the collected rows to game_data.csv (relative path), omitting the index column.
dataset.to_csv("game_data.csv", index=False)
