In [131]:
# ============================================================================
# GOOGLE COLAB SETUP
# ============================================================================
# Run this cell first when using Google Colab.
# It installs the notebook's dependencies and, on Colab, prompts you to upload
# the two project modules that later cells import.

# Install required packages
import os
import subprocess
import sys

# Invoke pip through the running interpreter so the packages are installed
# into the same environment this kernel uses.
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q",
                       "gymnasium", "numpy", "matplotlib", "seaborn",
                       "tqdm", "pandas", "imageio", "imageio-ffmpeg",
                       "pillow", "moviepy"])

# Project modules that the following cells import.
REQUIRED_FILES = ("prisoners_dilemma_env.py", "utils.py")

# Upload required files (only in Colab)
# Only the import itself is guarded: an ImportError raised later (e.g. during
# the upload) should surface instead of being reported as "local environment".
try:
    from google.colab import files  # type: ignore
except ImportError:
    print("=" * 80)
    print("LOCAL JUPYTER ENVIRONMENT DETECTED")
    print("=" * 80)
    print("Make sure prisoners_dilemma_env.py and utils.py are in the same directory.")
else:
    print("=" * 80)
    print("GOOGLE COLAB DETECTED")
    print("=" * 80)
    print("\nPlease upload the following files:")
    print("1. prisoners_dilemma_env.py")
    print("2. utils.py")
    uploaded = files.upload()

    # Echo what was received
    for filename in uploaded.keys():
        print(f"✓ Uploaded: {filename}")

# Verify the required files are actually present. This is a warning only:
# the modules might still be importable from elsewhere on sys.path.
missing_files = [name for name in REQUIRED_FILES if not os.path.exists(name)]
if missing_files:
    print(f"⚠ Not found in the current working directory: {', '.join(missing_files)}")


LOCAL JUPYTER ENVIRONMENT DETECTED
Make sure prisoners_dilemma_env.py and utils.py are in the same directory.


You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.


In [132]:
# Find the best policy for each opponent type and discount factor.

In [133]:
# Import all necessary libraries
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, clear_output, HTML
from IPython.display import Video
import seaborn as sns

# Import our custom environment
# Use importlib to ensure we get the latest version (clears cache)
# The reload has to run before the `from ... import` below: those names are
# bound at import time, so importing them after the reload picks up the
# re-executed module's objects.
import importlib
import prisoners_dilemma_env
importlib.reload(prisoners_dilemma_env)

from prisoners_dilemma_env import IteratedPrisonersDilemma, COOPERATE, DEFECT, ACTION_MAP

# Import utility functions
# Same pattern as above: reload first, then bind the helper names.
import utils
importlib.reload(utils)
from utils import print_policy, print_comparison

In [134]:
# NOTE(review): `ge` is not used in any visible cell; it looks like a stray
# auto-import. Kept because later cells are not visible here — confirm and remove.
from operator import ge

# NOTE(review): `strategies` and `memory_schemes` are not read in this cell;
# kept in case other cells rely on them.
strategies = ["ALL-C", "ALL-D", "TFT", "IMPERFECT-TFT"]
memory_schemes = [1, 2]
# Discount factor for every run in this cell. It is passed explicitly to
# policy_iteration below so the experiment no longer depends on the solver's
# default value.
gamma = 0.9

# ============================================================================
# EXPERIMENT: Policy Iteration for Different Opponent Strategies
# ============================================================================
# For each (opponent strategy, memory scheme) pair:
#   1. build the environment,
#   2. solve for the best policy with policy iteration,
#   3. print the policy and value function,
#   4. compare the policy with a random policy over 100 steps.

# Runs in the order they are reported. Memory scheme 2 is only exercised for
# the TFT-style opponents.
experiments = [
    ("ALL-C", 1),          # Always Cooperate
    ("ALL-D", 1),          # Always Defect
    ("TFT", 1),            # Tit-for-Tat
    ("TFT", 2),            # Tit-for-Tat, memory 2
    ("IMPERFECT-TFT", 1),  # Imperfect Tit-for-Tat
    ("IMPERFECT-TFT", 2),  # Imperfect Tit-for-Tat, memory 2
]

# Best policy per experiment, keyed by (opponent_strategy, memory_scheme).
best_policies = {}

for opponent_strategy, memory_scheme in experiments:
    experiment_key = (opponent_strategy, memory_scheme)

    env = IteratedPrisonersDilemma(opponent_strategy=opponent_strategy, memory_scheme=memory_scheme)
    best_policies[experiment_key], value_function = env.policy_iteration(
        gamma=gamma, theta=0.000001, max_iterations=100
    )
    print_policy(best_policies[experiment_key], value_function, env, opponent_strategy=opponent_strategy)

    # ------------------------------------------------------------------------
    # COMPARISON: Random Policy vs Best Policy for this opponent
    # ------------------------------------------------------------------------
    comparison = env.test_against_random(best_policies[experiment_key], num_steps=100)
    print_comparison(comparison)

# Keep the individual result names available for any cell that refers to them.
best_policy_all_c = best_policies[("ALL-C", 1)]
best_policy_all_d = best_policies[("ALL-D", 1)]
best_policy_tft = best_policies[("TFT", 1)]
best_policy_tft_m2 = best_policies[("TFT", 2)]
best_policy_imperfect_tft = best_policies[("IMPERFECT-TFT", 1)]
best_policy_imperfect_tft_m2 = best_policies[("IMPERFECT-TFT", 2)]


print("\n" + "#" * 80)
print("# END OF EXPERIMENT")
print("#" * 80 + "\n")



OPTIMAL POLICY AND VALUE FUNCTION
Opponent Strategy: ALL-C
Memory Scheme: 1

POLICY (π):
--------------------------------------------------------------------------------
State           State ID   P(C)       P(D)       Best Action     V(s)      
--------------------------------------------------------------------------------
(C, C)          0          0.0000     1.0000     Defect (D)      50.0000   
(C, D)          1          0.0000     1.0000     Defect (D)      50.0000   
(D, C)          2          0.0000     1.0000     Defect (D)      50.0000   
(D, D)          3          0.0000     1.0000     Defect (D)      50.0000   


Random vs Best Policy:
  Random: 378.00
  Best:   500.00
  Diff:   122.00


OPTIMAL POLICY AND VALUE FUNCTION
Opponent Strategy: ALL-D
Memory Scheme: 1

POLICY (π):
--------------------------------------------------------------------------------
State           State ID   P(C)       P(D)       Best Action     V(s)      
--------------------------------------------

In [135]:
# ============================================================================
# EXPERIMENT: Policy Iteration for Different Discount Factors (Gamma)
# ============================================================================
# Discount factors (γ) that the sweep cells below iterate over.
gamma_values = [
    0.1,
    0.5,
    0.9,
    0.99,
]



In [136]:
# ============================================================================
# OPPONENT: TFT (Tit-for-Tat) - Different Discount Factors
# ============================================================================
# For every γ in gamma_values: solve the TFT environment with policy
# iteration, print the resulting policy, then compare it with a random policy.
print(f"\n{'#' * 80}")
print("# POLICY ITERATION RESULTS - DIFFERENT DISCOUNT FACTORS (γ)")
print(f"{'#' * 80}")

for gamma in gamma_values:
    # A new memory-1 TFT environment is built for each discount factor.
    env = IteratedPrisonersDilemma(
        memory_scheme=1,
        opponent_strategy="TFT",
    )
    best_policy, value_function = env.policy_iteration(
        max_iterations=100,
        theta=1e-6,
        gamma=gamma,
    )
    print_policy(
        best_policy,
        value_function,
        env,
        gamma=gamma,
        opponent_strategy="TFT",
    )

    # Random policy vs. the policy just computed, over 100 steps.
    comparison = env.test_against_random(best_policy, num_steps=100)
    print_comparison(comparison)

print(f"\n{'#' * 80}")
print("# END OF EXPERIMENT - TFT")
print(f"{'#' * 80}\n")



################################################################################
# POLICY ITERATION RESULTS - DIFFERENT DISCOUNT FACTORS (γ)
################################################################################

OPTIMAL POLICY AND VALUE FUNCTION
Opponent Strategy: TFT
Discount Factor (γ): 0.1
Memory Scheme: 1

POLICY (π):
--------------------------------------------------------------------------------
State           State ID   P(C)       P(D)       Best Action     V(s)      
--------------------------------------------------------------------------------
(C, C)          0          0.0000     1.0000     Defect (D)      5.1111    
(C, D)          1          0.0000     1.0000     Defect (D)      5.1111    
(D, C)          2          0.0000     1.0000     Defect (D)      1.1111    
(D, D)          3          0.0000     1.0000     Defect (D)      1.1111    


Random vs Best Policy:
  Random: 232.00
  Best:   104.00
  Diff:   -128.00


OPTIMAL POLICY AND VALUE FUNCTION
Opponent 

In [137]:
# ============================================================================
# OPPONENT: TFT (Tit-for-Tat) - Different Discount Factors
# ============================================================================
# NOTE(review): this cell repeats the TFT discount-factor sweep from the
# preceding cell (same calls, same arguments) — consider keeping only one.
print(f"\n{'#' * 80}")
print("# POLICY ITERATION RESULTS - DIFFERENT DISCOUNT FACTORS (γ)")
print(f"{'#' * 80}")

for gamma in gamma_values:
    # Solve a new memory-1 TFT environment for this discount factor.
    env = IteratedPrisonersDilemma(
        memory_scheme=1,
        opponent_strategy="TFT",
    )
    best_policy, value_function = env.policy_iteration(
        max_iterations=100,
        theta=1e-6,
        gamma=gamma,
    )
    print_policy(
        best_policy,
        value_function,
        env,
        gamma=gamma,
        opponent_strategy="TFT",
    )

    # Random policy vs. the policy just computed, over 100 steps.
    comparison = env.test_against_random(best_policy, num_steps=100)
    print_comparison(comparison)

print(f"\n{'#' * 80}")
print("# END OF EXPERIMENT - TFT")
print(f"{'#' * 80}\n")



################################################################################
# POLICY ITERATION RESULTS - DIFFERENT DISCOUNT FACTORS (γ)
################################################################################

OPTIMAL POLICY AND VALUE FUNCTION
Opponent Strategy: TFT
Discount Factor (γ): 0.1
Memory Scheme: 1

POLICY (π):
--------------------------------------------------------------------------------
State           State ID   P(C)       P(D)       Best Action     V(s)      
--------------------------------------------------------------------------------
(C, C)          0          0.0000     1.0000     Defect (D)      5.1111    
(C, D)          1          0.0000     1.0000     Defect (D)      5.1111    
(D, C)          2          0.0000     1.0000     Defect (D)      1.1111    
(D, D)          3          0.0000     1.0000     Defect (D)      1.1111    


Random vs Best Policy:
  Random: 228.00
  Best:   104.00
  Diff:   -124.00


OPTIMAL POLICY AND VALUE FUNCTION
Opponent 

In [138]:
# ============================================================================
# OPPONENT: IMPERFECT-TFT (Imperfect Tit-for-Tat) - Different Discount Factors
# ============================================================================
# For every γ in gamma_values: print a per-γ header, solve the IMPERFECT-TFT
# environment with policy iteration, print the resulting policy, then compare
# it with a random policy.
print(f"\n{'#' * 80}")
print("# POLICY ITERATION RESULTS - DIFFERENT DISCOUNT FACTORS (γ)")
print(f"{'#' * 80}")

for gamma in gamma_values:
    # Section header for this discount factor.
    print(f"\n{'-' * 80}")
    print(f"DISCOUNT FACTOR (γ): {gamma}")
    print(f"{'-' * 80}")

    # A new memory-1 IMPERFECT-TFT environment is built for each γ.
    env = IteratedPrisonersDilemma(
        memory_scheme=1,
        opponent_strategy="IMPERFECT-TFT",
    )
    best_policy, value_function = env.policy_iteration(
        max_iterations=100,
        theta=1e-6,
        gamma=gamma,
    )
    print_policy(
        best_policy,
        value_function,
        env,
        gamma=gamma,
        opponent_strategy="IMPERFECT-TFT",
    )

    # Random policy vs. the policy just computed, over 100 steps.
    comparison = env.test_against_random(best_policy, num_steps=100)
    print_comparison(comparison)

print(f"\n{'#' * 80}")
print("# END OF EXPERIMENT - IMPERFECT-TFT")
print(f"{'#' * 80}\n")



################################################################################
# POLICY ITERATION RESULTS - DIFFERENT DISCOUNT FACTORS (γ)
################################################################################

--------------------------------------------------------------------------------
DISCOUNT FACTOR (γ): 0.1
--------------------------------------------------------------------------------

OPTIMAL POLICY AND VALUE FUNCTION
Opponent Strategy: IMPERFECT-TFT
Discount Factor (γ): 0.1
Memory Scheme: 1

POLICY (π):
--------------------------------------------------------------------------------
State           State ID   P(C)       P(D)       Best Action     V(s)      
--------------------------------------------------------------------------------
(C, C)          0          0.0000     1.0000     Defect (D)      4.7556    
(C, D)          1          0.0000     1.0000     Defect (D)      4.7556    
(D, C)          2          0.0000     1.0000     Defect (D)      1.5556    


In [139]:
# ============================================================================
# OPPONENT: IMPERFECT-TFT (Imperfect Tit-for-Tat) - Different Discount Factors
# ============================================================================
# NOTE(review): this cell repeats the IMPERFECT-TFT discount-factor sweep from
# the preceding cell (same calls, same arguments) — consider keeping only one.
print(f"\n{'#' * 80}")
print("# POLICY ITERATION RESULTS - DIFFERENT DISCOUNT FACTORS (γ)")
print(f"{'#' * 80}")

for gamma in gamma_values:
    # Section header for this discount factor.
    print(f"\n{'-' * 80}")
    print(f"DISCOUNT FACTOR (γ): {gamma}")
    print(f"{'-' * 80}")

    # Solve a new memory-1 IMPERFECT-TFT environment for this γ.
    env = IteratedPrisonersDilemma(
        memory_scheme=1,
        opponent_strategy="IMPERFECT-TFT",
    )
    best_policy, value_function = env.policy_iteration(
        max_iterations=100,
        theta=1e-6,
        gamma=gamma,
    )
    print_policy(
        best_policy,
        value_function,
        env,
        gamma=gamma,
        opponent_strategy="IMPERFECT-TFT",
    )

    # Random policy vs. the policy just computed, over 100 steps.
    comparison = env.test_against_random(best_policy, num_steps=100)
    print_comparison(comparison)

print(f"\n{'#' * 80}")
print("# END OF EXPERIMENT - IMPERFECT-TFT")
print(f"{'#' * 80}\n")



################################################################################
# POLICY ITERATION RESULTS - DIFFERENT DISCOUNT FACTORS (γ)
################################################################################

--------------------------------------------------------------------------------
DISCOUNT FACTOR (γ): 0.1
--------------------------------------------------------------------------------

OPTIMAL POLICY AND VALUE FUNCTION
Opponent Strategy: IMPERFECT-TFT
Discount Factor (γ): 0.1
Memory Scheme: 1

POLICY (π):
--------------------------------------------------------------------------------
State           State ID   P(C)       P(D)       Best Action     V(s)      
--------------------------------------------------------------------------------
(C, C)          0          0.0000     1.0000     Defect (D)      4.7556    
(C, D)          1          0.0000     1.0000     Defect (D)      4.7556    
(D, C)          2          0.0000     1.0000     Defect (D)      1.5556    


In [140]:
# ============================================================================
# OPPONENT: IMPERFECT-TFT (Imperfect Tit-for-Tat) - Different Discount Factors
# ============================================================================
# NOTE(review): this cell repeats the IMPERFECT-TFT discount-factor sweep from
# the two preceding cells (same calls, same arguments) — consider keeping only one.
print(f"\n{'#' * 80}")
print("# POLICY ITERATION RESULTS - DIFFERENT DISCOUNT FACTORS (γ)")
print(f"{'#' * 80}")

for gamma in gamma_values:
    # Section header for this discount factor.
    print(f"\n{'-' * 80}")
    print(f"DISCOUNT FACTOR (γ): {gamma}")
    print(f"{'-' * 80}")

    # Solve a new memory-1 IMPERFECT-TFT environment for this γ.
    env = IteratedPrisonersDilemma(
        memory_scheme=1,
        opponent_strategy="IMPERFECT-TFT",
    )
    best_policy, value_function = env.policy_iteration(
        max_iterations=100,
        theta=1e-6,
        gamma=gamma,
    )
    print_policy(
        best_policy,
        value_function,
        env,
        gamma=gamma,
        opponent_strategy="IMPERFECT-TFT",
    )

    # Random policy vs. the policy just computed, over 100 steps.
    comparison = env.test_against_random(best_policy, num_steps=100)
    print_comparison(comparison)

print(f"\n{'#' * 80}")
print("# END OF EXPERIMENT - IMPERFECT-TFT")
print(f"{'#' * 80}\n")



################################################################################
# POLICY ITERATION RESULTS - DIFFERENT DISCOUNT FACTORS (γ)
################################################################################

--------------------------------------------------------------------------------
DISCOUNT FACTOR (γ): 0.1
--------------------------------------------------------------------------------

OPTIMAL POLICY AND VALUE FUNCTION
Opponent Strategy: IMPERFECT-TFT
Discount Factor (γ): 0.1
Memory Scheme: 1

POLICY (π):
--------------------------------------------------------------------------------
State           State ID   P(C)       P(D)       Best Action     V(s)      
--------------------------------------------------------------------------------
(C, C)          0          0.0000     1.0000     Defect (D)      4.7556    
(C, D)          1          0.0000     1.0000     Defect (D)      4.7556    
(D, C)          2          0.0000     1.0000     Defect (D)      1.5556    
