In [None]:
from typing import List, Tuple, Callable
import random
import numpy as np


def score(player_a: bool, player_b: bool) -> Tuple[int, int]:
    """
    Return the payoffs of a single prisoner's dilemma round.

    True means the player cooperates, False means the player defects.
    The result is ``(payoff_player_a, payoff_player_b)``.
    """
    # keys are (player_a cooperates, player_b cooperates)
    payoffs = {
        (True, True): (3, 3),    # both cooperate
        (True, False): (0, 5),   # only player a cooperates
        (False, True): (5, 0),   # only player b cooperates
        (False, False): (1, 1),  # both defect
    }
    return payoffs[(bool(player_a), bool(player_b))]
    

def repeated_prisoners_dilemma(n_repetitions: int, strategy_player_a: Callable, strategy_player_b: Callable) -> Tuple[Tuple[int, int], Tuple[List[bool], List[bool]]]:
    """
    Play ``n_repetitions`` rounds of the prisoner's dilemma between two strategies.

    Each strategy is called with the keyword arguments ``history_this_player``
    and ``history_other_player`` (the moves played so far, from its own point
    of view) and returns its move for the current round.

    Returns ``((total_a, total_b), (moves_a, moves_b))``: the accumulated
    payoffs and the full move histories of both players.
    """
    moves_a: List[bool] = []
    moves_b: List[bool] = []
    total_a = total_b = 0

    for _ in range(n_repetitions):
        # both players decide before either move is recorded, so neither sees the other's current move
        move_a = strategy_player_a(history_this_player=moves_a, history_other_player=moves_b)
        move_b = strategy_player_b(history_this_player=moves_b, history_other_player=moves_a)

        gain_a, gain_b = score(player_a=move_a, player_b=move_b)
        total_a += gain_a
        total_b += gain_b

        moves_a.append(move_a)
        moves_b.append(move_b)

    return ((total_a, total_b), (moves_a, moves_b))



class QModel:
    """
    Tabular learner for the repeated prisoner's dilemma.

    True: player cooperates
    False: player defects

    ``self.q`` maps a state key to ``{action: [own_score_sum, other_score_sum]}``.
    Both sums start at 1. The action with the larger own/other ratio is
    preferred (ties favour cooperation).

    A state key is ``"init"`` before the first move. Afterwards it is the string
    form of an array holding the last ``max_history`` own moves (most recent
    first) followed by the last ``max_history`` opponent moves (oldest first).

    Parameters:
        epsilon: probability of flipping the greedy action while learning.
        max_history: number of past turns per player that make up a state.
        verbose: print debugging information about states and table updates.
    """
    def __init__(self, epsilon: float = 0.1, max_history: int = 5, verbose: bool = False):
        self.verbose = verbose
        self.epsilon = epsilon
        self.max_history = max_history
        self.q = {"init": self._new_entry()}

    @staticmethod
    def _new_entry():
        """Return a fresh table entry: [own_score_sum, other_score_sum] per action, both starting at 1."""
        return {False: [1, 1], True: [1, 1]}

    def _encode_state(self, history_this_player: List[bool], history_other_player: List[bool]) -> str:
        """Build the state key for the given histories ("init" when no move has been played yet)."""
        if not history_this_player:
            return "init"

        def last_turns(history):
            # explicit start index: history[-0:] would return the whole history for max_history == 0
            return history[max(len(history) - self.max_history, 0):]

        own_moves = [int(i) for i in last_turns(history_this_player)[::-1]]
        other_moves = [int(i) for i in last_turns(history_other_player)]
        # np.concatenate is available in every NumPy version (np.concat only since 2.0)
        return str(np.concatenate([own_moves, other_moves]))

    def select_action(self, state: str, explore: bool) -> bool:
        """
        Choose the action for a known ``state``.

        The greedy choice is cooperation when its own/other score ratio is at
        least as large as the ratio for defection. With ``explore=True`` the
        greedy choice is flipped with probability ``self.epsilon``.
        Raises KeyError if ``state`` is not in ``self.q``.
        """
        own_false, other_false = self.q[state][False]
        own_true, other_true = self.q[state][True]
        # denominators start at 1 and only grow as long as the added scores are non-negative
        score_ratio_false = own_false / other_false
        score_ratio_true = own_true / other_true
        selected_action = score_ratio_true >= score_ratio_false
        if self.verbose:
            print("select action")
            print(f"q[state][False]: {self.q[state][False]}")
            print(f"q[state][True]: {self.q[state][True]}")
            print(f"so: {score_ratio_true >= score_ratio_false}\n")
        # explore by deviating randomly; strict "<" guarantees epsilon == 0 never deviates
        if explore and random.random() < self.epsilon:
            return not selected_action
        return selected_action

    def update_q(self, state: str, turn: bool, scores: Tuple[int, int]):
        """
        Add ``scores`` (own score, opponent score) to the entry of action
        ``turn`` in ``state``; an unseen state is created first.
        """
        is_new_state = state not in self.q
        if is_new_state:
            self.q[state] = self._new_entry()
        own_sum, other_sum = self.q[state][turn]
        if self.verbose:
            label = "new state:" if is_new_state else "update state:"
            print(label, f"self.q[{state}][{turn}] = [{own_sum} + {scores[0]}, {other_sum} + {scores[1]}]")
        self.q[state][turn] = [own_sum + scores[0], other_sum + scores[1]]

    def play_and_update(self, history_this_player: List[bool], history_other_player: List[bool]) -> bool:
        """Strategy entry point for training: learn from the last turn, then pick a move with exploration."""
        return self._play_and_update(history_this_player=history_this_player, history_other_player=history_other_player, update_q=True)

    def play(self, history_this_player: List[bool], history_other_player: List[bool]) -> bool:
        """Strategy entry point for evaluation: pick the greedy move without changing the table."""
        return self._play_and_update(history_this_player=history_this_player, history_other_player=history_other_player, update_q=False)

    def _play_and_update(self, history_this_player: List[bool], history_other_player: List[bool], update_q: bool) -> bool:
        """
        Determine the current state, optionally credit the previous turn's
        payoff to the previous state, and return the next move.

        For a state that is not in the table yet, the move is random and
        cooperates with probability 0.8.
        """
        state = self._encode_state(history_this_player, history_other_player)
        if not history_this_player:
            if self.verbose: print("first step so state = init")
        else:
            if self.verbose: print(f"state = {state}")
            if update_q:
                scores_last_turn = score(player_a=history_this_player[-1], player_b=history_other_player[-1])
                # the previous state is what this player saw before making its last move
                last_state = self._encode_state(history_this_player[:-1], history_other_player[:-1])
                if self.verbose: print(f"last state = {last_state}")
                self.update_q(state=last_state, turn=history_this_player[-1], scores=scores_last_turn)
        if state in self.q:
            return self.select_action(state=state, explore=update_q)
        return random.random() > 0.2


class StatelessStrategies:
    """
    Collection of strategies that keep no internal state between calls.

    True: player cooperates
    False: player defects

    Every strategy receives the moves played so far by itself
    (``history_this_player``) and by its opponent (``history_other_player``)
    and returns its next move.
    """
    @staticmethod
    def strategy_always_cooperate(history_this_player: List[bool], history_other_player: List[bool]):
        """
        cooperates unconditionally
        """
        return True

    @staticmethod
    def strategy_always_defect(history_this_player: List[bool], history_other_player: List[bool]):
        """
        defects unconditionally
        """
        return False

    @staticmethod
    def strategy_tit_for_tat(history_this_player: List[bool], history_other_player: List[bool]):
        """
        opens with cooperation, afterwards repeats the opponent's previous move
        """
        if not history_this_player:
            return True
        return history_other_player[-1]

    @staticmethod
    def strategy_tit_for_tat_generous(history_this_player: List[bool], history_other_player: List[bool]):
        """
        tit for tat, but a defection of the opponent in the last turn is forgiven with 10% chance
        """
        if not history_this_player:
            return True
        opponent_last_move = history_other_player[-1]
        if opponent_last_move:
            return opponent_last_move
        # opponent defected: answer with cooperation in roughly one out of ten cases
        return random.random() > 0.9

    @staticmethod
    def strategy_tit_for_tat_suspicious(history_this_player: List[bool], history_other_player: List[bool]):
        """
        tit for tat that opens with defection instead of cooperation
        """
        if not history_this_player:
            return False
        return history_other_player[-1]

    @staticmethod
    def strategy_tit_for_tat_noisy(history_this_player: List[bool], history_other_player: List[bool]):
        """
        plays the regular tit for tat move, but flips it with 10% chance
        """
        tit_for_tat_move = history_other_player[-1] if history_this_player else True
        flip = random.random() > 0.9
        return (not tit_for_tat_move) if flip else tit_for_tat_move

    @staticmethod
    def strategy_tit_for_tat_exponential_decay(history_this_player: List[bool], history_other_player: List[bool]):
        """
        opens with cooperation; afterwards cooperates with a probability equal to the
        weighted share of the opponent's cooperative moves, where the weights grow
        along the history so that recent moves count more
        """
        if not history_this_player:
            return True
        weights = np.logspace(start=0.1, stop=1, base=10, num=len(history_other_player))
        # the opponent's history acts as a boolean mask selecting the weights of cooperative turns
        cooperation_share = weights[history_other_player].sum() / weights.sum()
        return bool(random.random() < cooperation_share)

    @staticmethod
    def strategy_grim_trigger(history_this_player: List[bool], history_other_player: List[bool]):
        """
        cooperates as long as the opponent has never defected, defects forever afterwards
        """
        return all(history_other_player)

    @staticmethod
    def strategy_random(history_this_player: List[bool], history_other_player: List[bool]):
        """
        cooperates or defects at random
        """
        return random.random() > 0.5

    @staticmethod
    def strategy_naive_probability(history_this_player: List[bool], history_other_player: List[bool]):
        """
        opens with cooperation, then cooperates with the opponent's observed cooperation frequency
        """
        if not history_other_player:
            return True
        cooperation_rate = sum(history_other_player) / len(history_other_player)
        return random.random() <= cooperation_rate




# TODO:
# Models with a large max_history cannot explore all possible states within a few training iterations.
# -> Also track partial states so the model can fall back to knowledge about fewer than max_history turns.
# Tournament: 100 independent trials, each with two freshly created Q-models
# that differ only in how many past turns they remember.
wins_a, wins_b, ties = 0, 0, 0

for _ in range(100):
    model_a = QModel(epsilon=0.1, max_history=3)
    model_b = QModel(epsilon=0.1, max_history=4)

    # train: 10 games of 20 rounds in which both models learn while playing each other
    for _ in range(10):
        repeated_prisoners_dilemma(
            n_repetitions=20,
            strategy_player_a=model_a.play_and_update,
            strategy_player_b=model_b.play_and_update,
        )

    # test: one game of 10 rounds using the non-learning `play` entry point
    model_a.epsilon = 0
    model_b.epsilon = 0
    scores, _ = repeated_prisoners_dilemma(
        n_repetitions=10,
        strategy_player_a=model_a.play,
        strategy_player_b=model_b.play,
    )
    if scores[0] == scores[1]:
        ties += 1
    elif scores[0] > scores[1]:
        wins_a += 1
    else:
        wins_b += 1
print(f"wins a: {wins_a}\nwins b: {wins_b}\nties: {ties}")

wins a: 41
wins b: 4
ties: 55
