In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
from abc import ABC, abstractmethod
from dataclasses import dataclass

## Environment Setting

In [2]:
class Env(ABC):
    @abstractmethod
    def step(self, action):
        raise NotImplementedError

    @abstractmethod
    def reset(self):
        raise NotImplementedError


class DiscreteEnv(Env):

    def __init__(self, state_dim, action_dim, transition_P, initial_dis):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.transition_P = transition_P  # transition probs, P[s][a] == [(prob, s', r, done), ...]
        # 转移概率矩阵，是个字典，每个key为state，value为state对应action组成的字典，action字典key为action，value为（prob,s',r,done）元组
        self.initial_state_dis = initial_dis  # 应该是一个列表，每个元素是初始state分布的概率
        self.s = None  # current state

    def step(self, action):
        transition = self.transition_P[self.s][action]  # [(prob, s', r, done), ...]
        i = np.random.choice(len(transition), p=[t[0] for t in transition])
        p, next_s, r, d = transition[i]
        self.s = next_s
        return next_s, r, d, {'prob': p}

    def reset(self):
        self.s = np.random.choice(self.state_dim, p=self.initial_state_dis)
        return self.s


In [3]:
class AssetAllocation(DiscreteEnv):
    aversion_rate: float  = 0.01
    riskless_return: float = 0.05
    risky_return: dict = {0.4:0.5, 0.6:-0.3}  # 风险资产的Bernoulli分布参数 [p: corresponding_return]

    # action_space = [0.2, 0.5, 0.8]  # Allocable weight in risky assets
    action_space = [0.2, 0.8]

    def __init__(self, init_wealth=10, T=10):
        self.t = None
        self.s = None
        self.initial_wealth = init_wealth
        transition_p = {}
        state_space = {}
        # Generate state space to store state between time step
        for i in range(T):
            if i == 0: state_space[i] = [init_wealth]
            else: state_space[i] = []

        # Generate state space and transition prob

        for i in range(1,T):
            transition_p[i-1] = {}
            for w in state_space[i-1]:  # 提取出前一时刻的wealth
                transition_p[i-1][w] = {act:[] for act in self.action_space}  #

                for act in self.action_space:  # 每种wealth有不同action
                    for p,r in self.risky_return.items():  # 每个action有两种可能
                        w_ = w * act * (1 + r) + w * (1-act) * (1 + self.riskless_return)
                        w_= np.round(w_, 3)  # 保留3位小数
                        state_space[i] += [w_]  # next state

                        # if i == T-1:
                        #     reward = self._reward_compute(w_, self.aversion_rate)
                        #     transition_p[i-1][w][act] += [(p, w_, reward, True)]  # (prob, s', imm_reward, done)
                        # else:
                        #     reward = 0
                        #     transition_p[i-1][w][act] += [(p, w_, reward, False)]
                        reward = 0
                        transition_p[i-1][w][act] += [(p, w_, reward, False)]

        # Final state transfer to itself
        transition_p[9] = {}
        for w_ in state_space[9]:
            transition_p[9][w_] = {act:[] for act in self.action_space}
            for act in self.action_space:
                reward = self._reward_compute(w_, self.aversion_rate)
                transition_p[9][w_][act] += [(1, w_, reward, True)]

        state_dim = sum([len(state_space[x]) for x in state_space])  # number of state
        action_dim = len(self.action_space)
        init_dis = [1] + [0]*(state_dim - 1)  # Initial state distribution

        self.state_space = state_space
        super(AssetAllocation, self).__init__(state_dim, action_dim, transition_p, init_dis)

    @staticmethod
    def _reward_compute(wealth, aversion_rate):
         return (-np.exp(- aversion_rate * wealth)) / aversion_rate  # CARA Function

    def reset(self):
        self.s = self.initial_wealth
        self.t = 0
        return f'state wealth: {self.s}, t: {self.t}'

    def step(self, action):
        # self.t += 1
        # return super(AssetAllocation, self).step(action)
        transition = self.transition_P[self.t][self.s][action]  # [(prob, s', r, done), ...]
        i = np.random.choice(len(transition), p=[t_[0] for t_ in transition])
        p, next_s, r, d = transition[i]
        self.s = next_s
        self.t += 1
        return next_s, r, d, {'prob': p}



In [4]:
ample = AssetAllocation()
ample.reset()

'state wealth: 10, t: 0'

In [5]:
ample.step(np.random.choice(ample.action_space))

(11.4, 0, False, {'prob': 0.4})

In [6]:
ample.state_space[1]

[11.4, 9.8, 14.1, 7.7]

In [7]:
ample.transition_P[ample.t][ample.s][0.2]

[(0.4, 12.996, 0, False), (0.6, 11.172, 0, False)]

In [5]:
ample.transition_P

{0: {10: {0.2: [(0.4, 11.4, 0, False), (0.6, 9.8, 0, False)],
   0.8: [(0.4, 14.1, 0, False), (0.6, 7.7, 0, False)]}},
 1: {11.4: {0.2: [(0.4, 12.996, 0, False), (0.6, 11.172, 0, False)],
   0.8: [(0.4, 16.074, 0, False), (0.6, 8.778, 0, False)]},
  9.8: {0.2: [(0.4, 11.172, 0, False), (0.6, 9.604, 0, False)],
   0.8: [(0.4, 13.818, 0, False), (0.6, 7.546, 0, False)]},
  14.1: {0.2: [(0.4, 16.074, 0, False), (0.6, 13.818, 0, False)],
   0.8: [(0.4, 19.881, 0, False), (0.6, 10.857, 0, False)]},
  7.7: {0.2: [(0.4, 8.778, 0, False), (0.6, 7.546, 0, False)],
   0.8: [(0.4, 10.857, 0, False), (0.6, 5.929, 0, False)]}},
 2: {12.996: {0.2: [(0.4, 14.815, 0, False), (0.6, 12.736, 0, False)],
   0.8: [(0.4, 18.324, 0, False), (0.6, 10.007, 0, False)]},
  11.172: {0.2: [(0.4, 12.736, 0, False), (0.6, 10.949, 0, False)],
   0.8: [(0.4, 15.753, 0, False), (0.6, 8.602, 0, False)]},
  16.074: {0.2: [(0.4, 18.324, 0, False), (0.6, 15.753, 0, False)],
   0.8: [(0.4, 22.664, 0, False), (0.6, 12.377, 0

## Model Based Method---Dynamic Programming
### Valution Iteration
****


In [5]:
class AgentValueIteration:
    DISCOUNT_FACTOR = 0.7
    def __init__(self, env):
        self.env = env
        # Initialize state value & random policy
        self.value_dict = {}
        self.policy = {}

        for t_, s_list in env.state_space.items():
            self.value_dict[t_] = {ss_:0 for ss_ in s_list}
            self.policy[t_] = {ss_:np.random.choice(self.env.action_space) for ss_ in s_list}


    def _action_computation(self, t_, s_, mode=1):
        action_values = {act: 0 for act in self.env.action_space}
        for a_ in self.env.action_space:
            for prob, next_state, reward, done in self.env.transition_P[t_][s_][a_]:
                if done:
                    action_values[a_] += prob * (reward + self.DISCOUNT_FACTOR * self.value_dict[t_][next_state])
                    # break
                else:
                    action_values[a_] += prob * (reward + self.DISCOUNT_FACTOR * self.value_dict[t_+1][next_state])
        best_action = max(action_values, key= lambda x: action_values[x])
        best_action_value = action_values.get(best_action)
        if mode == 1:
            return best_action, best_action_value
        else:
            return action_values

    def value_iteration(self):
        rep_times = 0

        while True:
            delta = 0
            for t, s_list in self.env.state_space.items():
                for s in s_list:
                    best_a, best_a_v = self._action_computation(t, s)
                    delta = max(delta, np.abs(best_a_v - self.value_dict[t][s]))
                    self.value_dict[t][s] = best_a_v
                    self.policy[t][s] = best_a
            if delta < 0.001:
                print(f'Value Iteration Done!  Total round: {rep_times}')
                break
            print(f'Iteration round:{rep_times}, delta:{delta}')
            rep_times += 1

        return self.policy

    def state_action_value(self):
        s_a_table = {}
        for t, s_list in self.env.state_space.items():
            s_a_table[t] = pd.DataFrame(columns=['state']+ ample.action_space)
            for s in s_list:
                act_vs = self._action_computation(t, s,mode=0)
                a_l = [s] + list(act_vs.values())
                s_a_table[t].loc[len(s_a_table[t]), :] = a_l
        s_a_table = pd.concat(s_a_table.values(), keys=s_a_table.keys(), axis=0)
        s_a_table = s_a_table.reset_index().rename(columns={'level_0':'t'}).drop(columns='level_1')
        s_a_table = s_a_table.set_index(['t', 'state'])
        return s_a_table

    def policy_table(self):
        table = {}
        for t, s_a in self.policy.items():
            table[t] = pd.DataFrame(columns=['state']+ ample.action_space)
            for s, a in s_a.items():
                a_l = [s] + [1 if i == a else 0 for i in ample.action_space]
                table[t].loc[len(table[t]), :] = a_l
        table = pd.concat(table.values(), keys=table.keys(), axis=0)
        table = table.reset_index().rename(columns={'level_0':'t'}).drop(columns='level_1')
        table = table.set_index(['t','state'])
        return table

# ample_vi = AgentValueIteration(ample)
# ample_vi.value_iteration()
# ample_vi.state_action_value()

In [8]:
ample_vi = AgentValueIteration(ample)
ample_vi.value_iteration()

Iteration round:0, delta:99.05251717413593
Iteration round:1, delta:228.50755534690512
Iteration round:2, delta:158.77022520576787
Iteration round:3, delta:110.02591230498408
Iteration round:4, delta:76.330033783413
Iteration round:5, delta:52.85140224731263
Iteration round:6, delta:35.740501441344904
Iteration round:7, delta:24.58152406574308
Iteration round:8, delta:16.701450739508925
Iteration round:9, delta:11.14833456965423
Iteration round:10, delta:2.7979884452840906
Iteration round:11, delta:1.958591911698818
Iteration round:12, delta:1.371014338189184
Iteration round:13, delta:0.9597100367324174
Iteration round:14, delta:0.6717970257126922
Iteration round:15, delta:0.47025791799887884
Iteration round:16, delta:0.32918054259926066
Iteration round:17, delta:0.23042637981944836
Iteration round:18, delta:0.16129846587364227
Iteration round:19, delta:0.11290892611151548
Iteration round:20, delta:0.07903624827810063
Iteration round:21, delta:0.05532537379463065
Iteration round:22, de

{0: {10: 0.2},
 1: {11.4: 0.2, 9.8: 0.2, 14.1: 0.2, 7.7: 0.2},
 2: {12.996: 0.2,
  11.172: 0.2,
  16.074: 0.2,
  8.778: 0.2,
  9.604: 0.2,
  13.818: 0.2,
  7.546: 0.2,
  19.881: 0.2,
  10.857: 0.2,
  5.929: 0.2},
 3: {14.815: 0.2,
  12.736: 0.2,
  18.324: 0.2,
  10.007: 0.2,
  10.949: 0.2,
  15.753: 0.2,
  8.602: 0.2,
  22.664: 0.2,
  12.377: 0.2,
  6.759: 0.2,
  9.412: 0.2,
  13.542: 0.2,
  7.395: 0.2,
  19.483: 0.2,
  10.64: 0.2,
  5.81: 0.2,
  28.032: 0.2,
  15.308: 0.2,
  8.36: 0.2,
  4.565: 0.2},
 4: {16.889: 0.2,
  14.519: 0.2,
  20.889: 0.2,
  11.408: 0.2,
  12.481: 0.2,
  17.958: 0.2,
  9.807: 0.2,
  25.837: 0.2,
  14.109: 0.2,
  14.11: 0.2,
  7.705: 0.2,
  12.482: 0.2,
  10.73: 0.2,
  15.438: 0.2,
  8.431: 0.2,
  22.212: 0.2,
  12.13: 0.2,
  9.806: 0.2,
  8.43: 0.2,
  12.129: 0.2,
  6.624: 0.2,
  22.211: 0.2,
  31.956: 0.2,
  17.451: 0.2,
  17.452: 0.2,
  9.53: 0.2,
  5.204: 0.2,
  9.224: 0.2,
  13.271: 0.2,
  7.247: 0.2,
  19.094: 0.2,
  10.427: 0.2,
  5.694: 0.2,
  19.093: 0

In [7]:
ample_vi.policy

{0: {10: 0.2},
 1: {11.4: 0.2, 9.8: 0.8, 14.1: 0.8, 7.7: 0.8},
 2: {12.996: 0.2,
  11.172: 0.8,
  16.074: 0.8,
  8.778: 0.8,
  9.604: 0.2,
  13.818: 0.8,
  7.546: 0.2,
  19.881: 0.8,
  10.857: 0.8,
  5.929: 0.8},
 3: {14.815: 0.2,
  12.736: 0.8,
  18.324: 0.2,
  10.007: 0.8,
  10.949: 0.8,
  15.753: 0.8,
  8.602: 0.8,
  22.664: 0.8,
  12.377: 0.2,
  6.759: 0.2,
  9.412: 0.2,
  13.542: 0.2,
  7.395: 0.8,
  19.483: 0.8,
  10.64: 0.2,
  5.81: 0.8,
  28.032: 0.8,
  15.308: 0.2,
  8.36: 0.8,
  4.565: 0.8},
 4: {16.889: 0.2,
  14.519: 0.2,
  20.889: 0.2,
  11.408: 0.8,
  12.481: 0.2,
  17.958: 0.2,
  9.807: 0.8,
  25.837: 0.8,
  14.109: 0.8,
  14.11: 0.8,
  7.705: 0.8,
  12.482: 0.8,
  10.73: 0.8,
  15.438: 0.8,
  8.431: 0.2,
  22.212: 0.2,
  12.13: 0.2,
  9.806: 0.2,
  8.43: 0.2,
  12.129: 0.2,
  6.624: 0.8,
  22.211: 0.8,
  31.956: 0.2,
  17.451: 0.8,
  17.452: 0.8,
  9.53: 0.2,
  5.204: 0.2,
  9.224: 0.8,
  13.271: 0.2,
  7.247: 0.2,
  19.094: 0.8,
  10.427: 0.2,
  5.694: 0.2,
  19.093: 0

### Policy Iteration
****

In [9]:
class AgentPolicyIteration:
    DISCOUNT_FACTOR = 0.7
    def __init__(self, env):
        self.env = env
        # Initialize state value & random policy
        self.value_dict = {}
        self.policy = {}

        for t_, s_list in env.state_space.items():
            self.value_dict[t_] = {ss_:0 for ss_ in s_list}
            self.policy[t_] = {ss_:[(1, np.random.choice(self.env.action_space))] for ss_ in s_list}
            # 每个t时刻的下policy是该时刻下state作为key，（prob，act）作为value的字典的字典

    def _action_computation(self, t_, s_, mode=1):
        action_values = {act: 0 for act in self.env.action_space}
        for a_ in self.env.action_space:
            for prob, next_state, reward, done in self.env.transition_P[t_][s_][a_]:
                if done:
                    action_values[a_] += prob * (reward + self.DISCOUNT_FACTOR * self.value_dict[t_][next_state])
                    # break
                else:
                    action_values[a_] += prob * (reward + self.DISCOUNT_FACTOR * self.value_dict[t_+1][next_state])
        best_action = max(action_values, key= lambda x: action_values[x])
        best_action_value = action_values.get(best_action)
        if mode == 1:
            return best_action, best_action_value
        else:
            return action_values

    def _policy_evaluation(self, truncated_k):
        """
        This function is used to evaluate the self.policy and update the corresponding state value, which is the self.value_dict.
        Attention:
            - Since I use truncated policy iteration method, the state value don't need to iter to convergence but only k steps.
            - Updating self.value_dict directly means that the initial state value of each policy only needs to inherit the final policy value of the previous iteration.
        :param truncated_k: ...
        :return: Difference of the policy state value between each iteration
        """
        rep_times = 0
        delta = 0
        while rep_times < truncated_k:
            delta = 0
            for t, s_list in self.env.state_space.items():
                for s in s_list:
                    s_policy_value = 0
                    for a_prob, act in self.policy[t][s]:
                        for t_prob, next_s, reward, done in self.env.transition_P[t][s][act]:
                            if done:
                                s_policy_value += a_prob * t_prob * (reward + self.DISCOUNT_FACTOR * self.value_dict[t][next_s])
                            else:
                                s_policy_value += a_prob * t_prob * (reward + self.DISCOUNT_FACTOR * self.value_dict[t+1][next_s])

                    delta = max(delta, np.abs(s_policy_value - self.value_dict[t][s]))
                    self.value_dict[t][s] = s_policy_value  # Update the policy state value
            rep_times += 1
        return delta

    def _policy_improvement(self):
        for t, s_list in self.env.state_space.items():
            for s in s_list:
                best_a, best_a_v = self._action_computation(t, s)
                self.policy[t][s] = [(1, best_a)]  # greedy policy


    def policy_iteration(self, truncated_k = 15):
        rep_times = 0
        while True:
            # Policy evaluation
            delta = self._policy_evaluation(truncated_k=truncated_k)
            # Policy improvement
            self._policy_improvement()
            if delta < 0.001:
                print(f'Policy Iteration Done! delta:{delta} Total round: {rep_times}')
                break
            print(f'Iteration round:{rep_times}, delta:{delta}')
            rep_times += 1
        return self.policy

    def policy_table(self):
        ...

In [10]:
ample_pi = AgentPolicyIteration(ample)
ample_pi.policy_iteration()

Iteration round:0, delta:0.6717970257126922
Iteration round:1, delta:0.0031893977017602992
Policy Iteration Done! delta:1.5141861751999386e-05 Total round: 2


{0: {10: [(1, 0.2)]},
 1: {11.4: [(1, 0.2)], 9.8: [(1, 0.2)], 14.1: [(1, 0.2)], 7.7: [(1, 0.2)]},
 2: {12.996: [(1, 0.2)],
  11.172: [(1, 0.2)],
  16.074: [(1, 0.2)],
  8.778: [(1, 0.2)],
  9.604: [(1, 0.2)],
  13.818: [(1, 0.2)],
  7.546: [(1, 0.2)],
  19.881: [(1, 0.2)],
  10.857: [(1, 0.2)],
  5.929: [(1, 0.2)]},
 3: {14.815: [(1, 0.2)],
  12.736: [(1, 0.2)],
  18.324: [(1, 0.2)],
  10.007: [(1, 0.2)],
  10.949: [(1, 0.2)],
  15.753: [(1, 0.2)],
  8.602: [(1, 0.2)],
  22.664: [(1, 0.2)],
  12.377: [(1, 0.2)],
  6.759: [(1, 0.2)],
  9.412: [(1, 0.2)],
  13.542: [(1, 0.2)],
  7.395: [(1, 0.2)],
  19.483: [(1, 0.2)],
  10.64: [(1, 0.2)],
  5.81: [(1, 0.2)],
  28.032: [(1, 0.2)],
  15.308: [(1, 0.2)],
  8.36: [(1, 0.2)],
  4.565: [(1, 0.2)]},
 4: {16.889: [(1, 0.2)],
  14.519: [(1, 0.2)],
  20.889: [(1, 0.2)],
  11.408: [(1, 0.2)],
  12.481: [(1, 0.2)],
  17.958: [(1, 0.2)],
  9.807: [(1, 0.2)],
  25.837: [(1, 0.2)],
  14.109: [(1, 0.2)],
  14.11: [(1, 0.2)],
  7.705: [(1, 0.2)],
  12.4

In [11]:
ample_pi.policy

{0: {10: [(1, 0.2)]},
 1: {11.4: [(1, 0.2)], 9.8: [(1, 0.2)], 14.1: [(1, 0.2)], 7.7: [(1, 0.2)]},
 2: {12.996: [(1, 0.2)],
  11.172: [(1, 0.2)],
  16.074: [(1, 0.2)],
  8.778: [(1, 0.2)],
  9.604: [(1, 0.2)],
  13.818: [(1, 0.2)],
  7.546: [(1, 0.2)],
  19.881: [(1, 0.2)],
  10.857: [(1, 0.2)],
  5.929: [(1, 0.2)]},
 3: {14.815: [(1, 0.2)],
  12.736: [(1, 0.2)],
  18.324: [(1, 0.2)],
  10.007: [(1, 0.2)],
  10.949: [(1, 0.2)],
  15.753: [(1, 0.2)],
  8.602: [(1, 0.2)],
  22.664: [(1, 0.2)],
  12.377: [(1, 0.2)],
  6.759: [(1, 0.2)],
  9.412: [(1, 0.2)],
  13.542: [(1, 0.2)],
  7.395: [(1, 0.2)],
  19.483: [(1, 0.2)],
  10.64: [(1, 0.2)],
  5.81: [(1, 0.2)],
  28.032: [(1, 0.2)],
  15.308: [(1, 0.2)],
  8.36: [(1, 0.2)],
  4.565: [(1, 0.2)]},
 4: {16.889: [(1, 0.2)],
  14.519: [(1, 0.2)],
  20.889: [(1, 0.2)],
  11.408: [(1, 0.2)],
  12.481: [(1, 0.2)],
  17.958: [(1, 0.2)],
  9.807: [(1, 0.2)],
  25.837: [(1, 0.2)],
  14.109: [(1, 0.2)],
  14.11: [(1, 0.2)],
  7.705: [(1, 0.2)],
  12.4

## Model Free
### Mento Carlo Exploring Start with soft policy
****


In [12]:
class AgentMC:
    DISCOUNT_FACTOR = 0.7
    EPSILON = 0.05
    def __init__(self, env):
        self.env = env
        # Initialize state vale & policy
        self.policy = {} # self.policy[t] = {state:[(act1_prob, act1), (act2_prob, act2), ...]}
        self.Q_table = {} # self.Q_table[(t, state)] = {act1: [0], act2: [0], ...}
        for t_, s_list in env.state_space.items():
            # Initial Policy is deterministic
            self.policy[t_] = {ss_: [(1, np.random.choice(self.env.action_space))] for ss_ in s_list}
            for ss_ in s_list:
                self.Q_table[(t_, ss_)] = {act: [0] for act in self.env.action_space}

    def episode(self, policy):
        """
        Generate trajectory under particular policy
        :param policy:
        :return: trajectory [(t, state, action, reward), ...]
        """
        self.env.reset()
        s, t = self.env.s, self.env.t
        trajectory = []
        while True:
            a = []
            p = []
            for p_a in policy[t][s]: # [(prob1, act1), (prob2, act2)]
                p.append(p_a[0])
                a.append(p_a[1])
            act = np.random.choice(a, p=p)

            next_s, r, done, _ = self.env.step(act)
            trajectory.append((t, s, act, r))

            if done:
                break
            s = next_s
            t += 1
        return trajectory

    def _best_action_computation(self, t, s):
        action_values = {}
        for act in self.env.action_space:
            action_values[act] = np.mean(self.Q_table[(t, s)][act])  # 对列表求mean
        best_a = max(action_values, key=lambda x: action_values[x])
        best_a_v = action_values.get(best_a)
        return best_a, best_a_v

    def _policy_improvement(self, t, s, best_a_):
        self.policy[t][s] = []
        act_d = self.env.action_dim
        exploring_p = self.EPSILON / act_d  # Epsilon-greedy Updating
        exploiting_p = 1 - (((act_d - 1) * self.EPSILON) / act_d)
        for act in self.env.action_space:
            if act == best_a_:
                self.policy[t][s] += [(exploiting_p, act)]
            else:
                self.policy[t][s] += [(exploring_p, act)]

    def mc_iteration(self, episode_num = 10000):
        for ith in range(1, episode_num+1):
            if ith % 1000 == 0:
                print(f'Episode Round: {ith} / {episode_num}')

            g = 0
            trajectory = self.episode(self.policy)

            for step in trajectory[::-1]:  # Backward Computation
                t, s, act, reward = step

                g = self.DISCOUNT_FACTOR * g + reward  # g(s,a)
                self.Q_table[(t, s)][act] += [g]

                ## action value Evaluation on this state
                best_a, _ = self._best_action_computation(t, s)
                ## epsilon-policy improvement
                self._policy_improvement(t, s, best_a)
        return self.policy

    def policy_table(self):
        ...


In [13]:
ample_mc = AgentMC(ample)

In [46]:
ample_mc.mc_iteration()

Episode Round: 1000 / 10000
Episode Round: 2000 / 10000
Episode Round: 3000 / 10000
Episode Round: 4000 / 10000
Episode Round: 5000 / 10000
Episode Round: 6000 / 10000
Episode Round: 7000 / 10000
Episode Round: 8000 / 10000
Episode Round: 9000 / 10000
Episode Round: 10000 / 10000


In [51]:
ample_mc.policy

{32.517: [(1, 0.8)],
 27.954: [(1, 0.8)],
 40.219: [(1, 0.2)],
 21.963: [(0.975, 0.2), (0.025, 0.8)],
 24.031: [(0.975, 0.2), (0.025, 0.8)],
 34.575: [(0.975, 0.2), (0.025, 0.8)],
 18.881: [(0.975, 0.2), (0.025, 0.8)],
 34.574: [(1, 0.2)],
 49.745: [(0.025, 0.2), (0.975, 0.8)],
 27.166: [(0.025, 0.2), (0.975, 0.8)],
 27.165: [(0.975, 0.2), (0.025, 0.8)],
 14.835: [(0.025, 0.2), (0.975, 0.8)],
 27.953: [(1, 0.8)],
 24.03: [(0.975, 0.2), (0.025, 0.8)],
 34.573: [(1, 0.8)],
 18.88: [(0.975, 0.2), (0.025, 0.8)],
 20.657: [(0.025, 0.2), (0.975, 0.8)],
 29.721: [(1, 0.2)],
 16.231: [(0.975, 0.2), (0.025, 0.8)],
 42.762: [(1, 0.2)],
 23.353: [(0.975, 0.2), (0.025, 0.8)],
 23.352: [(0.975, 0.2), (0.025, 0.8)],
 12.753: [(0.975, 0.2), (0.025, 0.8)],
 49.744: [(1, 0.2)],
 61.525: [(1, 0.8)],
 33.599: [(0.975, 0.2), (0.025, 0.8)],
 18.348: [(0.975, 0.2), (0.025, 0.8)],
 10.02: [(0.025, 0.2), (0.975, 0.8)],
 20.658: [(0.975, 0.2), (0.025, 0.8)],
 29.723: [(0.975, 0.2), (0.025, 0.8)],
 16.232: [(0.

In [37]:
ample_mc.episode(ample_mc.policy)

[(0, 10, 0.8, 0),
 (1, 7.7, 0.2, 0),
 (2, 7.546, 0.2, 0),
 (3, 8.602, 0.2, 0),
 (4, 8.43, 0.8, 0),
 (5, 11.886, 0.2, 0),
 (6, 11.648, 0.2, 0),
 (7, 11.415, 0.8, 0),
 (8, 8.79, 0.2, 0),
 (9, 8.614, 0.2, -91.7465777019784)]

In [26]:
ample_mc.Q_table

{(0, 10): {0.2: [0], 0.8: [0]},
 (1, 11.4): {0.2: [0], 0.8: [0]},
 (1, 9.8): {0.2: [0], 0.8: [0]},
 (1, 14.1): {0.2: [0], 0.8: [0]},
 (1, 7.7): {0.2: [0], 0.8: [0]},
 (2, 12.996): {0.2: [0], 0.8: [0]},
 (2, 11.172): {0.2: [0], 0.8: [0]},
 (2, 16.074): {0.2: [0], 0.8: [0]},
 (2, 8.778): {0.2: [0], 0.8: [0]},
 (2, 9.604): {0.2: [0], 0.8: [0]},
 (2, 13.818): {0.2: [0], 0.8: [0]},
 (2, 7.546): {0.2: [0], 0.8: [0]},
 (2, 19.881): {0.2: [0], 0.8: [0]},
 (2, 10.857): {0.2: [0], 0.8: [0]},
 (2, 5.929): {0.2: [0], 0.8: [0]},
 (3, 14.815): {0.2: [0], 0.8: [0]},
 (3, 12.736): {0.2: [0], 0.8: [0]},
 (3, 18.324): {0.2: [0], 0.8: [0]},
 (3, 10.007): {0.2: [0], 0.8: [0]},
 (3, 10.949): {0.2: [0], 0.8: [0]},
 (3, 15.753): {0.2: [0], 0.8: [0]},
 (3, 8.602): {0.2: [0], 0.8: [0]},
 (3, 22.664): {0.2: [0], 0.8: [0]},
 (3, 12.377): {0.2: [0], 0.8: [0]},
 (3, 6.759): {0.2: [0], 0.8: [0]},
 (3, 9.412): {0.2: [0], 0.8: [0]},
 (3, 13.542): {0.2: [0], 0.8: [0]},
 (3, 7.395): {0.2: [0], 0.8: [0]},
 (3, 19.483): 

In [29]:
ample_mc.policy[1][9.8]

[(1, 0.8)]

### TD SARSA
****

In [90]:
class AgentSARSA:
    DISCOUNT_FACTOR = 0.7
    EPSILON = 0.05

    def __init__(self, env):
        self.env = env
        self.policy = {}  # self.policy[t][state] = [(prob1, act1),(...)]
        self.Q_table = {} # self.Q_table[(t, state)] = {act1: [0], act2: [0], ...}

        self.exploring_p = self.EPSILON / env.action_dim
        self.exploiting_p = 1 - (((env.action_dim - 1) * self.EPSILON) / env.action_dim)

        for t_, s_list in env.state_space.items():
            self.policy[t_] = {}
            for ss_ in s_list:
                self.Q_table[(t_, ss_)] = {act: 0 for act in env.action_space}
                # Equal initial action Prob
                self.policy[t_][ss_] = [(1 / env.action_dim, act) for act in env.action_space]
                # self.policy[t_][ss_] = [(1, self.env.action_space[0])]

    def _best_action_computation(self, t, s):
        action_values = self.Q_table[(t, s)]
        best_a = max(action_values, key=lambda x: action_values[x])
        best_a_v = action_values.get(best_a)
        return best_a, best_a_v

    def _policy_action_choose(self, t, s):  # Choose action under self.policy
        a = []
        p = []
        for p_a in self.policy[t][s]:
            a.append(p_a[1])
            p.append(p_a[0])
        return np.random.choice(a, p=p)

    def _policy_improvement(self, best_a):
        p_a = []
        for act in self.env.action_space:
            if act == best_a:
                p_a.append((self.exploiting_p, act))
            else:
                p_a.append((self.exploring_p, act))
        return p_a

    def sarsa_iteration(self, episode_num = 10000, alpha = 0.1):

        for ith in range(1, episode_num+1):
            if ith % 1000 == 0:
                print(f'Episode Round: {ith} / {episode_num}')

            self.env.reset()
            while True:
                # Generate experience (s, a, r, s_t+1, a_t+1)
                t, s = self.env.t, self.env.s
                act = self._policy_action_choose(t, s)
                next_s, r, done, _ = self.env.step(act)
                if done:
                    break
                next_act = self._policy_action_choose(t+1, next_s)

                td_target = r + self.DISCOUNT_FACTOR * self.Q_table[(t+1, next_s)][next_act]
                self.Q_table[(t, s)][act] += alpha * (self.Q_table[(t, s)][act] - td_target)

                best_a, _ = self._best_action_computation(t, s)
                self.policy[t][s] = self._policy_improvement(best_a)




In [102]:
ample_sarsa = AgentSARSA(ample)

In [103]:
ample_sarsa.sarsa_iteration()

Episode Round: 1000 / 10000
Episode Round: 2000 / 10000
Episode Round: 3000 / 10000
Episode Round: 4000 / 10000
Episode Round: 5000 / 10000
Episode Round: 6000 / 10000
Episode Round: 7000 / 10000
Episode Round: 8000 / 10000
Episode Round: 9000 / 10000
Episode Round: 10000 / 10000


In [104]:
ample_sarsa.policy

{0: {10: [(0.975, 0.2), (0.025, 0.8)]},
 1: {11.4: [(0.975, 0.2), (0.025, 0.8)],
  9.8: [(0.975, 0.2), (0.025, 0.8)],
  14.1: [(0.975, 0.2), (0.025, 0.8)],
  7.7: [(0.975, 0.2), (0.025, 0.8)]},
 2: {12.996: [(0.975, 0.2), (0.025, 0.8)],
  11.172: [(0.975, 0.2), (0.025, 0.8)],
  16.074: [(0.975, 0.2), (0.025, 0.8)],
  8.778: [(0.975, 0.2), (0.025, 0.8)],
  9.604: [(0.975, 0.2), (0.025, 0.8)],
  13.818: [(0.975, 0.2), (0.025, 0.8)],
  7.546: [(0.975, 0.2), (0.025, 0.8)],
  19.881: [(0.5, 0.2), (0.5, 0.8)],
  10.857: [(0.975, 0.2), (0.025, 0.8)],
  5.929: [(0.975, 0.2), (0.025, 0.8)]},
 3: {14.815: [(0.975, 0.2), (0.025, 0.8)],
  12.736: [(0.975, 0.2), (0.025, 0.8)],
  18.324: [(0.975, 0.2), (0.025, 0.8)],
  10.007: [(0.975, 0.2), (0.025, 0.8)],
  10.949: [(0.975, 0.2), (0.025, 0.8)],
  15.753: [(0.975, 0.2), (0.025, 0.8)],
  8.602: [(0.975, 0.2), (0.025, 0.8)],
  22.664: [(0.975, 0.2), (0.025, 0.8)],
  12.377: [(0.975, 0.2), (0.025, 0.8)],
  6.759: [(0.975, 0.2), (0.025, 0.8)],
  9.412: 

### TD Q-Learning On-Policy & Off-Policy Version
****

In [113]:
import copy
class AgentQLearning:
    DISCOUNT_FACTOR = 0.7
    EPSILON = 0.05

    def __init__(self, env):
        self.env = env
        self.policy = {}
        self.Q_table = {}

        self.exploring_p = self.EPSILON / env.action_dim
        self.exploiting_p = 1 - (((env.action_dim - 1) * self.EPSILON) / env.action_dim)

        for t_, s_list in env.state_space.items():
            self.policy[t_] = {}
            for ss_ in s_list:
                self.Q_table[(t_, ss_)] = {act: 0 for act in env.action_space}
                # Equal initial action Prob
                self.policy[t_][ss_] = [(1 / env.action_dim, act) for act in env.action_space]

    def reset(self):
        self.policy = {}
        self.Q_table = {}
        for t_, s_list in self.env.state_space.items():
            self.policy[t_] = {}
            for ss_ in s_list:
                self.Q_table[(t_, ss_)] = {act: 0 for act in self.env.action_space}
                # Equal initial action Prob
                self.policy[t_][ss_] = [(1 / self.env.action_dim, act) for act in self.env.action_space]

    def _policy_action_choose(self, t, s, policy=None):  # Choose action under self.policy
        a = []
        p = []
        if policy:
            for p_a in policy[t][s]:
                a.append(p_a[1])
                p.append(p_a[0])
            return np.random.choice(a, p=p)
        else:
            for p_a in self.policy[t][s]:
                a.append(p_a[1])
                p.append(p_a[0])
            return np.random.choice(a, p=p)

    def _best_action_computation(self, t, s):
        action_values = self.Q_table[t, s]
        best_a = max(action_values, key=lambda x:action_values[x])
        best_a_v = action_values.get(best_a)
        return best_a, best_a_v

    def _policy_improvement(self, best_a):
        p_a = []
        for act in self.env.action_space:
            if act == best_a:
                p_a.append((self.exploiting_p, act))
            else:
                p_a.append((self.exploring_p, act))
        return p_a

    def qlearning_iteration(self, episode_num = 10000, alpha = 0.1):
        for ith in range(1, episode_num+1):
            if ith % 1000 == 0:
                print(f'Episode Round: {ith} / {episode_num}')

            self.env.reset()
            while True:
                t, s = self.env.t, self.env.s
                act = self._policy_action_choose(t,s)
                next_s, r, done, _ = self.env.step(act)
                if done:
                    break
                _, best_next_a_v = self._best_action_computation(t+1, next_s)

                td_target = r + self.EPSILON * best_next_a_v
                self.Q_table[(t, s)][act] += alpha * (self.Q_table[(t, s)][act] - td_target)

                best_a, _ = self._best_action_computation(t, s)
                self.policy[t][s] = self._policy_improvement(best_a)

    def qlearning_iteration_Off_policy(self, episode_num = 10000, alpha = 0.1):
        # Sampling Policy just as same as the initial policy
        sample_policy = copy.deepcopy(self.policy)

        for ith in range(1, episode_num+1):
            if ith % 1000 == 0:
                print(f'Episode Round: {ith} / {episode_num}')

            self.env.reset()
            while True:
                t, s = self.env.t, self.env.s
                act = self._policy_action_choose(t,s, sample_policy)
                next_s, r, done, _ = self.env.step(act)
                if done:
                    break
                _, best_next_a_v = self._best_action_computation(t+1, next_s)

                td_target = r + self.EPSILON * best_next_a_v
                self.Q_table[(t, s)][act] += alpha * (self.Q_table[(t, s)][act] - td_target)

                best_a, _ = self._best_action_computation(t, s)
                self.policy[t][s] = self._policy_improvement(best_a)





In [114]:
ample_qlearning = AgentQLearning(ample)

In [108]:
ample_qlearning.qlearning_iteration()

Episode Round: 1000 / 10000
Episode Round: 2000 / 10000
Episode Round: 3000 / 10000
Episode Round: 4000 / 10000
Episode Round: 5000 / 10000
Episode Round: 6000 / 10000
Episode Round: 7000 / 10000
Episode Round: 8000 / 10000
Episode Round: 9000 / 10000
Episode Round: 10000 / 10000


In [109]:
ample_qlearning.policy

{0: {10: [(0.975, 0.2), (0.025, 0.8)]},
 1: {11.4: [(0.975, 0.2), (0.025, 0.8)],
  9.8: [(0.975, 0.2), (0.025, 0.8)],
  14.1: [(0.975, 0.2), (0.025, 0.8)],
  7.7: [(0.975, 0.2), (0.025, 0.8)]},
 2: {12.996: [(0.975, 0.2), (0.025, 0.8)],
  11.172: [(0.975, 0.2), (0.025, 0.8)],
  16.074: [(0.975, 0.2), (0.025, 0.8)],
  8.778: [(0.975, 0.2), (0.025, 0.8)],
  9.604: [(0.975, 0.2), (0.025, 0.8)],
  13.818: [(0.975, 0.2), (0.025, 0.8)],
  7.546: [(0.975, 0.2), (0.025, 0.8)],
  19.881: [(0.975, 0.2), (0.025, 0.8)],
  10.857: [(0.975, 0.2), (0.025, 0.8)],
  5.929: [(0.975, 0.2), (0.025, 0.8)]},
 3: {14.815: [(0.975, 0.2), (0.025, 0.8)],
  12.736: [(0.975, 0.2), (0.025, 0.8)],
  18.324: [(0.975, 0.2), (0.025, 0.8)],
  10.007: [(0.975, 0.2), (0.025, 0.8)],
  10.949: [(0.975, 0.2), (0.025, 0.8)],
  15.753: [(0.975, 0.2), (0.025, 0.8)],
  8.602: [(0.975, 0.2), (0.025, 0.8)],
  22.664: [(0.975, 0.2), (0.025, 0.8)],
  12.377: [(0.975, 0.2), (0.025, 0.8)],
  6.759: [(0.975, 0.2), (0.025, 0.8)],
  9.4

In [115]:
ample_qlearning.reset()

In [116]:
ample_qlearning.qlearning_iteration_Off_policy()

Episode Round: 1000 / 10000
Episode Round: 2000 / 10000
Episode Round: 3000 / 10000
Episode Round: 4000 / 10000
Episode Round: 5000 / 10000
Episode Round: 6000 / 10000
Episode Round: 7000 / 10000
Episode Round: 8000 / 10000
Episode Round: 9000 / 10000
Episode Round: 10000 / 10000


In [117]:
ample_qlearning.policy

{0: {10: [(0.975, 0.2), (0.025, 0.8)]},
 1: {11.4: [(0.975, 0.2), (0.025, 0.8)],
  9.8: [(0.975, 0.2), (0.025, 0.8)],
  14.1: [(0.975, 0.2), (0.025, 0.8)],
  7.7: [(0.975, 0.2), (0.025, 0.8)]},
 2: {12.996: [(0.975, 0.2), (0.025, 0.8)],
  11.172: [(0.975, 0.2), (0.025, 0.8)],
  16.074: [(0.975, 0.2), (0.025, 0.8)],
  8.778: [(0.975, 0.2), (0.025, 0.8)],
  9.604: [(0.975, 0.2), (0.025, 0.8)],
  13.818: [(0.975, 0.2), (0.025, 0.8)],
  7.546: [(0.975, 0.2), (0.025, 0.8)],
  19.881: [(0.975, 0.2), (0.025, 0.8)],
  10.857: [(0.975, 0.2), (0.025, 0.8)],
  5.929: [(0.975, 0.2), (0.025, 0.8)]},
 3: {14.815: [(0.975, 0.2), (0.025, 0.8)],
  12.736: [(0.975, 0.2), (0.025, 0.8)],
  18.324: [(0.975, 0.2), (0.025, 0.8)],
  10.007: [(0.975, 0.2), (0.025, 0.8)],
  10.949: [(0.975, 0.2), (0.025, 0.8)],
  15.753: [(0.975, 0.2), (0.025, 0.8)],
  8.602: [(0.975, 0.2), (0.025, 0.8)],
  22.664: [(0.975, 0.2), (0.025, 0.8)],
  12.377: [(0.975, 0.2), (0.025, 0.8)],
  6.759: [(0.975, 0.2), (0.025, 0.8)],
  9.4