In [41]:
def print_policy(policy, value_function, env, opponent_strategy=None, gamma=None):
    """
    Render a policy and its state values as a plain-text report on stdout.

    The report has three parts: a banner with the run settings, one table row
    per state (action probabilities, greedy action, state value), and a short
    summary of the value function (mean, max, min).

    Args:
        policy: Array indexed as ``policy[state, action]``; column 0 is read
            as the probability of cooperating and column 1 as the probability
            of defecting.
        value_function: Array indexed by state id.
        env: Object exposing ``observation_space.n`` (number of rows printed)
            and ``memory_scheme`` (selects the state-labelling layout).
        opponent_strategy: Optional label; printed only when truthy.
        gamma: Optional discount factor; printed only when not ``None``.

    Returns:
        None. Everything is written with ``print``.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80
    scheme = env.memory_scheme
    n_states = env.observation_space.n

    # --- Banner -----------------------------------------------------------
    print("\n" + heavy_rule)
    print("OPTIMAL POLICY AND VALUE FUNCTION")
    print(heavy_rule)
    if opponent_strategy:
        print(f"Opponent Strategy: {opponent_strategy}")
    if gamma is not None:
        print(f"Discount Factor (γ): {gamma}")
    print(f"Memory Scheme: {scheme}")
    print(f"Number of States: {n_states}")
    print()

    # --- Policy table -----------------------------------------------------
    print("POLICY (π):")
    print(light_rule)

    if scheme == 1:
        # Four fixed labels, looked up directly by state id.
        label_width = 15
        memory1_labels = ("(C, C)", "(C, D)", "(D, C)", "(D, D)")

        def describe(state_id):
            return memory1_labels[state_id]
    else:
        # Any other scheme: the four low bits of the state id are read,
        # most significant first, as [A_t-1, O_t-1, A_t-2, O_t-2].
        label_width = 25
        letters = "CD"  # bit 0 -> "C", bit 1 -> "D"

        def describe(state_id):
            a_prev, o_prev, a_prev2, o_prev2 = (
                (state_id >> shift) & 1 for shift in (3, 2, 1, 0)
            )
            return (
                f"({letters[a_prev]},{letters[o_prev]})"
                f"→({letters[a_prev2]},{letters[o_prev2]})"
            )

    print(
        f"{'State':<{label_width}} {'State ID':<10} {'P(C)':<10} {'P(D)':<10} "
        f"{'Best Action':<15} {'V(s)':<10}"
    )
    print(light_rule)

    for state_id in range(n_states):
        label = describe(state_id)
        prob_c = policy[state_id, 0]
        prob_d = policy[state_id, 1]
        # argmax returns the first maximum, so ties resolve to "Cooperate".
        greedy = "Defect (D)" if np.argmax(policy[state_id]) != 0 else "Cooperate (C)"
        state_value = value_function[state_id]
        print(
            f"{label:<{label_width}} {state_id:<10} {prob_c:<10.4f} {prob_d:<10.4f} "
            f"{greedy:<15} {state_value:<10.4f}"
        )

    # --- Summary ----------------------------------------------------------
    print()
    print(heavy_rule)
    print("SUMMARY")
    print(heavy_rule)
    for caption, reducer in (("Average", np.mean), ("Max", np.max), ("Min", np.min)):
        print(f"{caption} State Value: {reducer(value_function):.4f}")
    print()



In [42]:
# Find the best policy for each opponent type and discount factor.

In [43]:
# Import all necessary libraries
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, clear_output, HTML
from IPython.display import Video
import seaborn as sns

# Import our custom environment
# Use importlib to ensure we get the latest version (clears cache)
import importlib
import prisoners_dilemma_env
importlib.reload(prisoners_dilemma_env)

from prisoners_dilemma_env import IteratedPrisonersDilemma, COOPERATE, DEFECT, ACTION_MAP

In [44]:
from operator import ge  # NOTE(review): not used in this cell; kept in case another cell relies on it

# Opponent strategies evaluated below, in report order.
strategies = ["ALL-C", "ALL-D", "TFT", "IMPERFECT-TFT"]
# NOTE(review): only memory scheme 1 is exercised in this cell; the list is
# left in place for any other cell that reads it.
memory_schemes = [1, 2]
# Discount factor for this experiment. It is passed explicitly to
# policy_iteration and print_policy so the reported runs really use it.
gamma = 0.9

# Descriptive names shown in each per-opponent section header.
strategy_labels = {
    "ALL-C": "Always Cooperate",
    "ALL-D": "Always Defect",
    "TFT": "Tit-for-Tat",
    "IMPERFECT-TFT": "Imperfect Tit-for-Tat",
}

# ============================================================================
# EXPERIMENT: Policy Iteration for Different Opponent Strategies
# ============================================================================
print("\n" + "#" * 80)
print("# POLICY ITERATION RESULTS - MEMORY SCHEME 1")
print("#" * 80)

# One section per opponent: build the environment, run policy iteration, and
# print the resulting policy. After the loop, `env`, `best_policy` and
# `value_function` hold the results for the last strategy in `strategies`.
for strategy in strategies:
    print("\n" + "-" * 80)
    print(f"OPPONENT: {strategy} ({strategy_labels[strategy]})")
    print("-" * 80)
    env = IteratedPrisonersDilemma(opponent_strategy=strategy, memory_scheme=1)
    best_policy, value_function = env.policy_iteration(
        gamma=gamma, theta=0.000001, max_iterations=100
    )
    print_policy(best_policy, value_function, env, opponent_strategy=strategy, gamma=gamma)

print("\n" + "#" * 80)
print("# END OF EXPERIMENT")
print("#" * 80 + "\n")



################################################################################
# POLICY ITERATION RESULTS - MEMORY SCHEME 1
################################################################################

--------------------------------------------------------------------------------
OPPONENT: ALL-C (Always Cooperate)
--------------------------------------------------------------------------------

OPTIMAL POLICY AND VALUE FUNCTION
Opponent Strategy: ALL-C
Memory Scheme: 1
Number of States: 4

POLICY (π):
--------------------------------------------------------------------------------
State           State ID   P(C)       P(D)       Best Action     V(s)      
--------------------------------------------------------------------------------
(C, C)          0          1.0000     0.0000     Cooperate (C)   -30.0000  
(C, D)          1          1.0000     0.0000     Cooperate (C)   -30.0000  
(D, C)          2          1.0000     0.0000     Cooperate (C)   -30.0000  
(D, D)          3 

In [45]:
# ----------------------------------------------------------------------------
# Discount factors (γ) swept by the per-opponent policy-iteration cells
# ----------------------------------------------------------------------------
gamma_values = [
    0.1,
    0.5,
    0.9,
    0.99,
]



In [46]:
# ----------------------------------------------------------------------------
# Discount-factor sweep: opponent ALL-C (Always Cooperate), memory scheme 1
# ----------------------------------------------------------------------------
print(
    "\n" + "#" * 80,
    "# POLICY ITERATION RESULTS - DIFFERENT DISCOUNT FACTORS (γ)",
    "#" * 80,
    "# Opponent Strategy: ALL-C (Always Cooperate)",
    "# Memory Scheme: 1",
    "#" * 80,
    sep="\n",
)

# Run policy iteration once per discount factor and print each result.
# `gamma` is a module-level loop variable, so it keeps the last swept value
# after this cell finishes.
for gamma in gamma_values:
    print("\n" + "-" * 80, f"DISCOUNT FACTOR (γ): {gamma}", "-" * 80, sep="\n")
    env = IteratedPrisonersDilemma(memory_scheme=1, opponent_strategy="ALL-C")
    best_policy, value_function = env.policy_iteration(
        gamma=gamma, theta=1e-6, max_iterations=100
    )
    print_policy(best_policy, value_function, env, gamma=gamma, opponent_strategy="ALL-C")

print("\n" + "#" * 80, "# END OF EXPERIMENT - ALL-C", "#" * 80 + "\n", sep="\n")



################################################################################
# POLICY ITERATION RESULTS - DIFFERENT DISCOUNT FACTORS (γ)
################################################################################
# Opponent Strategy: ALL-C (Always Cooperate)
# Memory Scheme: 1
################################################################################

--------------------------------------------------------------------------------
DISCOUNT FACTOR (γ): 0.1
--------------------------------------------------------------------------------

OPTIMAL POLICY AND VALUE FUNCTION
Opponent Strategy: ALL-C
Discount Factor (γ): 0.1
Memory Scheme: 1
Number of States: 4

POLICY (π):
--------------------------------------------------------------------------------
State           State ID   P(C)       P(D)       Best Action     V(s)      
--------------------------------------------------------------------------------
(C, C)          0          1.0000     0.0000     Cooperate (C)   -3.33

In [47]:
# ----------------------------------------------------------------------------
# Discount-factor sweep: opponent ALL-D (Always Defect), memory scheme 1
# ----------------------------------------------------------------------------
print(
    "\n" + "#" * 80,
    "# POLICY ITERATION RESULTS - DIFFERENT DISCOUNT FACTORS (γ)",
    "#" * 80,
    "# Opponent Strategy: ALL-D (Always Defect)",
    "# Memory Scheme: 1",
    "#" * 80,
    sep="\n",
)

# Run policy iteration once per discount factor and print each result.
# `gamma` is a module-level loop variable, so it keeps the last swept value
# after this cell finishes.
for gamma in gamma_values:
    print("\n" + "-" * 80, f"DISCOUNT FACTOR (γ): {gamma}", "-" * 80, sep="\n")
    env = IteratedPrisonersDilemma(memory_scheme=1, opponent_strategy="ALL-D")
    best_policy, value_function = env.policy_iteration(
        gamma=gamma, theta=1e-6, max_iterations=100
    )
    print_policy(best_policy, value_function, env, gamma=gamma, opponent_strategy="ALL-D")

print("\n" + "#" * 80, "# END OF EXPERIMENT - ALL-D", "#" * 80 + "\n", sep="\n")



################################################################################
# POLICY ITERATION RESULTS - DIFFERENT DISCOUNT FACTORS (γ)
################################################################################
# Opponent Strategy: ALL-D (Always Defect)
# Memory Scheme: 1
################################################################################

--------------------------------------------------------------------------------
DISCOUNT FACTOR (γ): 0.1
--------------------------------------------------------------------------------

OPTIMAL POLICY AND VALUE FUNCTION
Opponent Strategy: ALL-D
Discount Factor (γ): 0.1
Memory Scheme: 1
Number of States: 4

POLICY (π):
--------------------------------------------------------------------------------
State           State ID   P(C)       P(D)       Best Action     V(s)      
--------------------------------------------------------------------------------
(C, C)          0          1.0000     0.0000     Cooperate (C)   0.0000  

In [48]:
# ----------------------------------------------------------------------------
# Discount-factor sweep: opponent TFT (Tit-for-Tat), memory scheme 1
# ----------------------------------------------------------------------------
print(
    "\n" + "#" * 80,
    "# POLICY ITERATION RESULTS - DIFFERENT DISCOUNT FACTORS (γ)",
    "#" * 80,
    "# Opponent Strategy: TFT (Tit-for-Tat)",
    "# Memory Scheme: 1",
    "#" * 80,
    sep="\n",
)

# Run policy iteration once per discount factor and print each result.
# `gamma` is a module-level loop variable, so it keeps the last swept value
# after this cell finishes.
for gamma in gamma_values:
    print("\n" + "-" * 80, f"DISCOUNT FACTOR (γ): {gamma}", "-" * 80, sep="\n")
    env = IteratedPrisonersDilemma(memory_scheme=1, opponent_strategy="TFT")
    best_policy, value_function = env.policy_iteration(
        gamma=gamma, theta=1e-6, max_iterations=100
    )
    print_policy(best_policy, value_function, env, gamma=gamma, opponent_strategy="TFT")

print("\n" + "#" * 80, "# END OF EXPERIMENT - TFT", "#" * 80 + "\n", sep="\n")



################################################################################
# POLICY ITERATION RESULTS - DIFFERENT DISCOUNT FACTORS (γ)
################################################################################
# Opponent Strategy: TFT (Tit-for-Tat)
# Memory Scheme: 1
################################################################################

--------------------------------------------------------------------------------
DISCOUNT FACTOR (γ): 0.1
--------------------------------------------------------------------------------

OPTIMAL POLICY AND VALUE FUNCTION
Opponent Strategy: TFT
Discount Factor (γ): 0.1
Memory Scheme: 1
Number of States: 4

POLICY (π):
--------------------------------------------------------------------------------
State           State ID   P(C)       P(D)       Best Action     V(s)      
--------------------------------------------------------------------------------
(C, C)          0          1.0000     0.0000     Cooperate (C)   -3.3333   
(C,

In [49]:
# ----------------------------------------------------------------------------
# Discount-factor sweep: opponent IMPERFECT-TFT (Imperfect Tit-for-Tat),
# memory scheme 1
# ----------------------------------------------------------------------------
print(
    "\n" + "#" * 80,
    "# POLICY ITERATION RESULTS - DIFFERENT DISCOUNT FACTORS (γ)",
    "#" * 80,
    "# Opponent Strategy: IMPERFECT-TFT (Imperfect Tit-for-Tat)",
    "# Memory Scheme: 1",
    "#" * 80,
    sep="\n",
)

# Run policy iteration once per discount factor and print each result.
# `gamma` is a module-level loop variable, so it keeps the last swept value
# after this cell finishes.
for gamma in gamma_values:
    print("\n" + "-" * 80, f"DISCOUNT FACTOR (γ): {gamma}", "-" * 80, sep="\n")
    env = IteratedPrisonersDilemma(memory_scheme=1, opponent_strategy="IMPERFECT-TFT")
    best_policy, value_function = env.policy_iteration(
        gamma=gamma, theta=1e-6, max_iterations=100
    )
    print_policy(
        best_policy, value_function, env, gamma=gamma, opponent_strategy="IMPERFECT-TFT"
    )

print("\n" + "#" * 80, "# END OF EXPERIMENT - IMPERFECT-TFT", "#" * 80 + "\n", sep="\n")



################################################################################
# POLICY ITERATION RESULTS - DIFFERENT DISCOUNT FACTORS (γ)
################################################################################
# Opponent Strategy: IMPERFECT-TFT (Imperfect Tit-for-Tat)
# Memory Scheme: 1
################################################################################

--------------------------------------------------------------------------------
DISCOUNT FACTOR (γ): 0.1
--------------------------------------------------------------------------------

OPTIMAL POLICY AND VALUE FUNCTION
Opponent Strategy: IMPERFECT-TFT
Discount Factor (γ): 0.1
Memory Scheme: 1
Number of States: 4

POLICY (π):
--------------------------------------------------------------------------------
State           State ID   P(C)       P(D)       Best Action     V(s)      
--------------------------------------------------------------------------------
(C, C)          0          1.0000     0.0000     


################################################################################
# TESTING play_policy METHOD
################################################################################

Opponent Strategy: TFT
Memory Scheme: 1
Policy shape: (4, 2)

CUMULATIVE RESULTS
Cumulative Reward: -54.00
Number of Steps: 50
Average Reward per Step: -1.08

First 10 Steps:
Step   Agent    Opponent   Reward    
----------------------------------------
1      D        C          -5.00     
2      D        D          -1.00     
3      D        D          -1.00     
4      D        D          -1.00     
5      D        D          -1.00     
6      D        D          -1.00     
7      D        D          -1.00     
8      D        D          -1.00     
9      D        D          -1.00     
10     D        D          -1.00     

...

Last 10 Steps:
Step   Agent    Opponent   Reward    
----------------------------------------
41     D        D          -1.00     
42     D        D          -1.00   