In [None]:
%run Imports.ipynb
%run Discrete_Env.ipynb

# Gambling Environment

### State Space $S$
A state $s \in S$ in this MDP indicates the gambler's capital or balance.  
The state space $S = \{1,2 ... 99\}$  
$|S| = 99$ states  
This is an episodic finite MDP: an episode ends when the gambler's capital reaches $0$ (ruin) or $100$ (goal).

### Action Space $A$
The action $a \in A$ performed by the gambler is to propose a stake.
The action space for any state $s \in S$ is:  
$A(s) = \{0,1,2 ... \min(s,100-s)\}$  
 
### Flow from $(s(t), a(t))$ to $s(t+1)$
$s(t) = s$, the gambler's capital at time t    
$a(t) = a$, the stakes proposed by the gambler   
$a \leq \min(s,100-s)$  
 
Internally:   
The coin is biased to show heads on a coin toss with probability $p, 0 \leq p \leq 1$  
So, with probability $p$, the gambler wins $a$ stakes, and with probability $1-p$ he loses $a$ stakes.

### Transition Probability Function $p(s'|s,a)$
The transition probability function for taking action $a(t) = a$ from state $s(t) = s$ to reach state $s(t+1)$ is described as follows:  
Let $s(t) = s$  
$a(t) = a$    
$a \leq \min(s,100-s)$      
Then $p(s+a \ | \ s, a) = p$   
And $p(s-a \ | \ s, a) = 1-p$  
For all other states $s'$, $p(s' \ | \ s,a) = 0$  
  
### Reward Function $R(s,a,s')$
The reward is zero on all transitions except those on which the gambler reaches his goal, when it is $+1$.  
$R(s,a,100) = +1$  
For all other $s' \neq 100$, $R(s,a,s') = 0$.
  
### Discount Rate $\gamma$  
The discount rate for this MDP is $\gamma = 1.0$ (in general, $0 \leq \gamma \leq 1$).  

## Bellman Updates for Value Iteration

$\forall \ s \in S$:  
$V(s) = \max_{a} \sum_{s'} p(s'|s,a)[r(s,a,s') + \gamma \cdot V(s')]$  
(Bellman Optimality Equation)    

Specialising the Bellman Optimality Equation to this MDP ($\gamma = 1.0$, and only two reachable successor states $s+a$ and $s-a$) gives the improved state values:    
$V(s) = \max_{a} \sum_{s'}  p(s'|s,a) [R(s,a,s') + 1.0 V(s')]$    
$ = \max_{a = 0}^{\min(s,100-s)} \{ p(s+a|s,a) [R(s,a,s+a) + V(s+a)] + p(s-a|s,a) [R(s,a,s-a) + V(s-a)]\}$  
$ = \max_{a = 0}^{\min(s,100-s)} \{ p [R(s,a,s+a) + V(s+a)] + (1-p) [R(s,a,s-a) + V(s-a)]\}$ 



In [None]:
class GamblingEnv(DiscreteEnvironment):
    """Gambler's problem modelled as an episodic, finite MDP.

    The state is the gambler's capital (1..99). In state ``s`` the gambler may
    stake any ``a`` in ``0..min(s, 100 - s)``. A biased coin is tossed: with
    probability ``bias`` the capital becomes ``s + a``, otherwise ``s - a``.
    The episode ends when the capital reaches 0 or 100; the only non-zero
    reward is +1 on a transition into 100.

    Attributes set by ``__init__``:
        states: list of non-terminal capitals, ``[1, ..., 99]``.
        actions: dict mapping each state to its list of allowed stakes.
        gamma: discount factor (1.0).
        bias: probability that the gambler wins a toss.
        transitions: dict ``(s, a, s') -> p(s' | s, a)``; only reachable
            successors are stored.
        rewards: dict ``(s, a, s') -> r(s, a, s')``.
        agent_state: current capital of the agent.
        is_terminated: whether the current episode has ended.
        step_count: number of steps taken in the current episode.
    """

    # Capital at which the gambler wins; a capital of 0 means ruin.
    GOAL = 100

    def __init__(self, bias):
        """Build the state/action sets and the transition and reward tables.

        Args:
            bias: probability ``p`` that the gambler wins a coin toss.

        Raises:
            ValueError: if ``bias`` is not within ``[0, 1]``.
        """
        if not 0.0 <= bias <= 1.0:
            raise ValueError("bias must be a probability in [0, 1], got {!r}".format(bias))

        goal = self.GOAL

        # States set S
        self.states = list(range(1, goal))

        # Action sets for every state A(s); the upper bound min(s, goal - s)
        # is inclusive, otherwise neither 0 nor the goal could ever be reached.
        self.actions = {
            s: list(range(0, min(s, goal - s) + 1)) for s in self.states
        }

        # Discount factor Gamma
        self.gamma = 1.0

        # Bias of coin used
        self.bias = bias
        self.rewards = {}
        self.transitions = {}

        for s in self.states:
            for a in self.actions[s]:
                # Dense zero rewards for every non-terminal destination.
                for dest in self.states:
                    self.rewards[(s, a, dest)] = 0.0

                win_key = (s, a, s + a)
                loss_key = (s, a, s - a)

                # Transition function p(s'|s,a) -> [s,a,s']
                if a == 0:
                    # Staking nothing: win and loss lead to the same state,
                    # so the capital stays put with certainty.
                    self.transitions[win_key] = 1.0
                else:
                    self.transitions[win_key] = bias
                    self.transitions[loss_key] = 1 - bias

                # Rewards r(s,a,s'): +1 only when the goal is reached.
                # A loss can never land on the goal, so it always pays 0.
                self.rewards[loss_key] = 0.0
                self.rewards[win_key] = 1.0 if s + a == goal else 0.0

        # Current state of agent
        self.agent_state = self.initial_state()

        # Has the game terminated?
        self.is_terminated = False

        self.step_count = 0

    def step(self, a):
        """Stake ``a`` from the current state and toss the coin.

        Args:
            a: stake to propose; must be one of ``self.actions[agent_state]``.

        Returns:
            A list ``[next_state, reward, is_terminated]``.

        Raises:
            ValueError: if ``a`` is not allowed in the current state (this
                includes any call made after the episode has terminated).
        """
        s = self.agent_state
        if a not in self.actions.get(s, ()):
            raise ValueError(
                "action {!r} is not allowed in state {!r}".format(a, s)
            )

        self.step_count += 1

        # Sample the successor: s + a with probability bias, else s - a.
        # The probabilities must be passed as `p`; the second positional
        # argument of np.random.choice is the output size.
        dest = int(np.random.choice([s + a, s - a], p=[self.bias, 1 - self.bias]))

        self.agent_state = dest
        reward = self.rewards[(s, a, dest)]
        if self.is_final_state(self.agent_state):
            self.is_terminated = True
        return [self.agent_state, reward, self.is_terminated]

    def reset(self):
        """Start a new episode from a random state and return that state."""
        self.agent_state = self.initial_state()
        self.is_terminated = False
        self.step_count = 0
        return self.agent_state

    def initial_state(self):
        """Return a starting capital drawn uniformly from the non-terminal states."""
        return random.choice(self.states)

    def is_final_state(self, s):
        """Return True if capital ``s`` is terminal (ruin at 0 or the goal)."""
        return s == 0 or s == self.GOAL