In [57]:
# Find the best policy for each opponent type and discount factor.

In [None]:
# Import all necessary libraries
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, clear_output, HTML
from IPython.display import Video
import seaborn as sns

# Import our custom environment
# Use importlib to ensure we get the latest version (clears cache)
import importlib
import prisoners_dilemma_env
importlib.reload(prisoners_dilemma_env)

from prisoners_dilemma_env import IteratedPrisonersDilemma, COOPERATE, DEFECT, ACTION_MAP

# Import utility functions
import utils
importlib.reload(utils)
from utils import print_policy, print_comparison

In [59]:
from operator import ge

# ============================================================================
# EXPERIMENT: Policy Iteration for Different Opponent Strategies
# ============================================================================
# For every opponent strategy listed below this cell:
#   1. builds a fresh environment (memory scheme 1),
#   2. runs policy iteration and prints the resulting policy / value table,
#   3. compares the resulting policy against a random policy over 100 steps.
#
# The four opponents were previously handled by four copy-pasted stanzas that
# differed only in the strategy string; they are now driven by `strategies`.

strategies = ["ALL-C", "ALL-D", "TFT", "IMPERFECT-TFT"]
memory_schemes = [1, 2]  # declared for reference; this cell only runs memory scheme 1
gamma = 0.9  # discount factor, now passed explicitly instead of relying on the default

for strategy in strategies:
    # ------------------------------------------------------------------------
    # Opponent: `strategy` -- solve for the best policy and print it
    # ------------------------------------------------------------------------
    env = IteratedPrisonersDilemma(opponent_strategy=strategy, memory_scheme=1)
    best_policy, value_function = env.policy_iteration(
        gamma=gamma, theta=0.000001, max_iterations=100
    )
    # `gamma` is deliberately not forwarded to print_policy so the printed
    # table keeps the same layout as before.
    print_policy(best_policy, value_function, env, opponent_strategy=strategy)

    # ------------------------------------------------------------------------
    # COMPARISON: Random Policy vs Best Policy for `strategy`
    # ------------------------------------------------------------------------
    # NOTE(review): no random seed is set in this cell; if test_against_random
    # samples actions, the "Random" figure may differ between runs -- confirm.
    comparison = env.test_against_random(best_policy, num_steps=100)
    print_comparison(comparison)

print("\n" + "#" * 80)
print("# END OF EXPERIMENT")
print("#" * 80 + "\n")



OPTIMAL POLICY AND VALUE FUNCTION
Opponent Strategy: ALL-C
Memory Scheme: 1

POLICY (π):
--------------------------------------------------------------------------------
State           State ID   P(C)       P(D)       Best Action     V(s)      
--------------------------------------------------------------------------------
(C, C)          0          1.0000     0.0000     Cooperate (C)   -30.0000  
(C, D)          1          1.0000     0.0000     Cooperate (C)   -30.0000  
(D, C)          2          1.0000     0.0000     Cooperate (C)   -30.0000  
(D, D)          3          1.0000     0.0000     Cooperate (C)   -30.0000  


Random vs Best Policy:
  Random: -390.00
  Best:   -300.00
  Diff:   90.00


OPTIMAL POLICY AND VALUE FUNCTION
Opponent Strategy: ALL-D
Memory Scheme: 1

POLICY (π):
--------------------------------------------------------------------------------
State           State ID   P(C)       P(D)       Best Action     V(s)      
-------------------------------------------

In [60]:
# ============================================================================
# EXPERIMENT: Policy Iteration for Different Discount Factors (Gamma)
# ============================================================================
# Discount factors (gamma) iterated over by the per-opponent sweep cells.
gamma_values = [
    0.1,
    0.5,
    0.9,
    0.99,
]



In [None]:
# ============================================================================
# OPPONENT: ALL-C (Always Cooperate) - Different Discount Factors
# ============================================================================
# Runs policy iteration against the ALL-C opponent once per value in
# `gamma_values` and prints the resulting policy / value table each time.
#
# The opponent/memory-scheme header and the per-gamma banner are printed here
# so this cell emits the same framing as the ALL-D, TFT and IMPERFECT-TFT
# sweep cells (and matches the output previously recorded for this cell).
print("\n" + "#" * 80)
print("# POLICY ITERATION RESULTS - DIFFERENT DISCOUNT FACTORS (γ)")
print("#" * 80)
print("# Opponent Strategy: ALL-C (Always Cooperate)")
print("# Memory Scheme: 1")
print("#" * 80)

for gamma in gamma_values:
    # Banner identifying which discount factor the following table belongs to.
    print("\n" + "-" * 80)
    print(f"DISCOUNT FACTOR (γ): {gamma}")
    print("-" * 80)
    # A fresh environment is built for every discount factor.
    env = IteratedPrisonersDilemma(opponent_strategy="ALL-C", memory_scheme=1)
    best_policy, value_function = env.policy_iteration(gamma=gamma, theta=0.000001, max_iterations=100)
    print_policy(best_policy, value_function, env, opponent_strategy="ALL-C", gamma=gamma)

print("\n" + "#" * 80)
print("# END OF EXPERIMENT - ALL-C")
print("#" * 80 + "\n")



################################################################################
# POLICY ITERATION RESULTS - DIFFERENT DISCOUNT FACTORS (γ)
################################################################################
# Opponent Strategy: ALL-C (Always Cooperate)
# Memory Scheme: 1
################################################################################

--------------------------------------------------------------------------------
DISCOUNT FACTOR (γ): 0.1
--------------------------------------------------------------------------------

OPTIMAL POLICY AND VALUE FUNCTION
Opponent Strategy: ALL-C
Discount Factor (γ): 0.1
Memory Scheme: 1

POLICY (π):
--------------------------------------------------------------------------------
State           State ID   P(C)       P(D)       Best Action     V(s)      
--------------------------------------------------------------------------------
(C, C)          0          1.0000     0.0000     Cooperate (C)   -3.3333   
(C, D)        

In [65]:
# ============================================================================
# Discount-factor sweep -- opponent: ALL-D (Always Defect)
# ============================================================================
# Header banner, emitted with a single newline-separated print.
print(
    "\n" + "#" * 80,
    "# POLICY ITERATION RESULTS - DIFFERENT DISCOUNT FACTORS (γ)",
    "#" * 80,
    "# Opponent Strategy: ALL-D (Always Defect)",
    "# Memory Scheme: 1",
    "#" * 80,
    sep="\n",
)

for gamma in gamma_values:
    # Sub-banner naming the discount factor for the table that follows.
    print("\n" + "-" * 80, f"DISCOUNT FACTOR (γ): {gamma}", "-" * 80, sep="\n")

    # New environment per discount factor, then solve and report.
    env = IteratedPrisonersDilemma(opponent_strategy="ALL-D", memory_scheme=1)
    best_policy, value_function = env.policy_iteration(
        gamma=gamma,
        theta=1e-6,
        max_iterations=100,
    )
    print_policy(
        best_policy,
        value_function,
        env,
        opponent_strategy="ALL-D",
        gamma=gamma,
    )

# Closing banner (the trailing "\n" reproduces the extra blank line).
print("\n" + "#" * 80, "# END OF EXPERIMENT - ALL-D", "#" * 80 + "\n", sep="\n")



################################################################################
# POLICY ITERATION RESULTS - DIFFERENT DISCOUNT FACTORS (γ)
################################################################################
# Opponent Strategy: ALL-D (Always Defect)
# Memory Scheme: 1
################################################################################

--------------------------------------------------------------------------------
DISCOUNT FACTOR (γ): 0.1
--------------------------------------------------------------------------------

OPTIMAL POLICY AND VALUE FUNCTION
Opponent Strategy: ALL-D
Discount Factor (γ): 0.1
Memory Scheme: 1

POLICY (π):
--------------------------------------------------------------------------------
State           State ID   P(C)       P(D)       Best Action     V(s)      
--------------------------------------------------------------------------------
(C, C)          0          1.0000     0.0000     Cooperate (C)   0.0000    
(C, D)          1

In [63]:
# ============================================================================
# Discount-factor sweep -- opponent: TFT (Tit-for-Tat)
# ============================================================================
# Header banner: title, then the opponent and memory scheme being evaluated.
print(f"\n{'#' * 80}")
print("# POLICY ITERATION RESULTS - DIFFERENT DISCOUNT FACTORS (γ)")
print(f"{'#' * 80}")
print("# Opponent Strategy: TFT (Tit-for-Tat)")
print("# Memory Scheme: 1")
print(f"{'#' * 80}")

for gamma in gamma_values:
    # Sub-banner naming the discount factor for the table that follows.
    print(f"\n{'-' * 80}")
    print(f"DISCOUNT FACTOR (γ): {gamma}")
    print(f"{'-' * 80}")

    # The environment is rebuilt for each discount factor before solving.
    env = IteratedPrisonersDilemma(opponent_strategy="TFT", memory_scheme=1)
    best_policy, value_function = env.policy_iteration(
        gamma=gamma, theta=1e-6, max_iterations=100
    )
    print_policy(
        best_policy, value_function, env, opponent_strategy="TFT", gamma=gamma
    )

# Closing banner followed by one extra blank line.
print(f"\n{'#' * 80}")
print("# END OF EXPERIMENT - TFT")
print(f"{'#' * 80}\n")



################################################################################
# POLICY ITERATION RESULTS - DIFFERENT DISCOUNT FACTORS (γ)
################################################################################
# Opponent Strategy: TFT (Tit-for-Tat)
# Memory Scheme: 1
################################################################################

--------------------------------------------------------------------------------
DISCOUNT FACTOR (γ): 0.1
--------------------------------------------------------------------------------

OPTIMAL POLICY AND VALUE FUNCTION
Opponent Strategy: TFT
Discount Factor (γ): 0.1
Memory Scheme: 1

POLICY (π):
--------------------------------------------------------------------------------
State           State ID   P(C)       P(D)       Best Action     V(s)      
--------------------------------------------------------------------------------
(C, C)          0          1.0000     0.0000     Cooperate (C)   -3.3333   
(C, D)          1      

In [64]:
# ============================================================================
# Discount-factor sweep -- opponent: IMPERFECT-TFT (Imperfect Tit-for-Tat)
# ============================================================================
# Header banner assembled as a list of lines and printed in one call.
print("\n".join([
    "\n" + "#" * 80,
    "# POLICY ITERATION RESULTS - DIFFERENT DISCOUNT FACTORS (γ)",
    "#" * 80,
    "# Opponent Strategy: IMPERFECT-TFT (Imperfect Tit-for-Tat)",
    "# Memory Scheme: 1",
    "#" * 80,
]))

for gamma in gamma_values:
    # Sub-banner naming the discount factor for the table that follows.
    print("\n".join(["\n" + "-" * 80, f"DISCOUNT FACTOR (γ): {gamma}", "-" * 80]))

    # Build a fresh environment for this discount factor, solve, and report.
    env = IteratedPrisonersDilemma(
        opponent_strategy="IMPERFECT-TFT",
        memory_scheme=1,
    )
    best_policy, value_function = env.policy_iteration(
        gamma=gamma,
        theta=1e-6,
        max_iterations=100,
    )
    print_policy(
        best_policy,
        value_function,
        env,
        opponent_strategy="IMPERFECT-TFT",
        gamma=gamma,
    )

# Closing banner; the final element carries the extra trailing newline.
print("\n".join(["\n" + "#" * 80, "# END OF EXPERIMENT - IMPERFECT-TFT", "#" * 80 + "\n"]))



################################################################################
# POLICY ITERATION RESULTS - DIFFERENT DISCOUNT FACTORS (γ)
################################################################################
# Opponent Strategy: IMPERFECT-TFT (Imperfect Tit-for-Tat)
# Memory Scheme: 1
################################################################################

--------------------------------------------------------------------------------
DISCOUNT FACTOR (γ): 0.1
--------------------------------------------------------------------------------

OPTIMAL POLICY AND VALUE FUNCTION
Opponent Strategy: IMPERFECT-TFT
Discount Factor (γ): 0.1
Memory Scheme: 1

POLICY (π):
--------------------------------------------------------------------------------
State           State ID   P(C)       P(D)       Best Action     V(s)      
--------------------------------------------------------------------------------
(C, C)          0          1.0000     0.0000     Cooperate (C)   -3.0