# Coding Test 

You will be assessed overall on:

1) How far you get in the allotted time.
2) Code optimisations.
3) Code reusability.
4) Code readability.

Some hints:

1) Take regular breaks (at least 5 minutes every hour) or change activity.
2) Avoid awkward, static postures by regularly changing position.
3) Get up and move around or do stretching exercises.
4) Avoid eye fatigue by changing focus or blinking from time to time.

In [38]:
import gym
import torch
import numpy 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

# Part 1: PPO

- Implement a vanilla PPO learning agent and train it on 'Acrobot-v1'.

In [44]:
# Optimisation and return-estimation settings read by the PPO class.
learning_rate = 5e-5  # step size handed to the Adam optimizer
gamma = 0.98          # discount factor applied in the TD target
lmbda = 0.95          # decay used (together with gamma) when accumulating advantages

# PPO-specific settings
eps_clip = 0.1        # probability ratio is clamped to [1 - eps_clip, 1 + eps_clip]
K_epoch = 3           # number of gradient updates run on each collected batch

class PPO(nn.Module):
    """Actor-critic network with a clipped-surrogate PPO update.

    One hidden layer (6 -> 256, ReLU) is shared by a policy head with 3
    outputs and a value head with 1 output. Transitions are buffered in
    ``prep_data`` and consumed by :meth:`train_net`.

    Reads the module-level hyperparameters ``learning_rate``, ``gamma``,
    ``lmbda``, ``eps_clip`` and ``K_epoch``.
    """

    def __init__(self):
        super().__init__()
        # Rollout buffer of (s, a, r, s_prime, prob_a, done) tuples.
        self.prep_data = []
        self.function1 = nn.Linear(6, 256)
        self.function_pi = nn.Linear(256, 3)
        self.function_v = nn.Linear(256, 1)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def pi(self, x, softmax_dim=0):
        """Return action probabilities for the state(s) ``x``.

        ``softmax_dim`` is the dimension the softmax normalises over: the
        default 0 fits a single unbatched state, 1 fits a batch of states.
        """
        hidden = F.relu(self.function1(x))
        return F.softmax(self.function_pi(hidden), dim=softmax_dim)

    def v(self, x):
        """Return the value-head output for the state(s) ``x``."""
        hidden = F.relu(self.function1(x))
        return self.function_v(hidden)

    def put_data(self, transition):
        """Append one ``(s, a, r, s_prime, prob_a, done)`` tuple to the buffer."""
        # The buffer attribute is ``prep_data``; ``self.data`` never existed.
        self.prep_data.append(transition)

    def make_batch(self):
        """Convert the buffered transitions to tensors and empty the buffer.

        Returns:
            Tuple ``(s, a, r, s_prime, done_mask, prob_a)`` with one row per
            buffered transition. ``done_mask`` is 0.0 where the transition
            ended the episode and 1.0 otherwise.
        """
        s_lst, a_lst, r_lst, s_prime_lst, prob_a_lst, done_lst = [], [], [], [], [], []
        for s, a, r, s_prime, prob_a, done in self.prep_data:
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            prob_a_lst.append([prob_a])
            done_lst.append([0 if done else 1])

        # Explicit dtypes keep the batch consistent even if the caller stored
        # numpy scalars instead of plain Python numbers.
        s = torch.tensor(s_lst, dtype=torch.float)
        a = torch.tensor(a_lst, dtype=torch.long)  # gather() needs int64 indices
        r = torch.tensor(r_lst, dtype=torch.float)
        s_prime = torch.tensor(s_prime_lst, dtype=torch.float)
        done_mask = torch.tensor(done_lst, dtype=torch.float)
        prob_a = torch.tensor(prob_a_lst, dtype=torch.float)

        self.prep_data = []
        return s, a, r, s_prime, done_mask, prob_a

    def train_net(self):
        """Run ``K_epoch`` clipped-surrogate updates on the buffered transitions.

        The buffer is consumed (emptied) by this call. Does nothing when the
        buffer is empty.
        """
        if not self.prep_data:
            return

        s, a, r, s_prime, done_mask, prob_a = self.make_batch()

        for _ in range(K_epoch):
            v_s = self.v(s)
            # done_mask zeroes the bootstrap term on terminal transitions.
            td_target = r + gamma * self.v(s_prime) * done_mask
            delta = (td_target - v_s).detach().numpy()

            # Accumulate TD errors backwards through the batch, decaying by
            # gamma * lmbda at every step.
            advantage_lst = []
            advantage = 0.0
            for delta_t in delta[::-1]:
                advantage = gamma * lmbda * advantage + delta_t[0]
                advantage_lst.append([advantage])
            advantage_lst.reverse()
            advantage = torch.tensor(advantage_lst, dtype=torch.float)

            pi = self.pi(s, softmax_dim=1)
            pi_a = pi.gather(1, a)
            # a/b == exp(log(a) - log(b)); prob_a is the probability recorded
            # when the action was sampled.
            ratio = torch.exp(torch.log(pi_a) - torch.log(prob_a))

            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1 - eps_clip, 1 + eps_clip) * advantage
            loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(v_s, td_target.detach())

            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()


In [46]:
def main(n_episodes=10000, t_horizon=20, print_interval=200):
    """Train a PPO agent on Gym's 'Acrobot-v1' and print average scores.

    Relies on the classic Gym API: ``env.reset()`` returns only the
    observation and ``env.step()`` returns ``(obs, reward, done, info)``.

    Args:
        n_episodes: Number of episodes to run.
        t_horizon: Maximum number of environment steps collected before each
            call to ``model.train_net()``. Must be at least 1.
        print_interval: The average score is printed every this many
            episodes.

    Raises:
        ValueError: If ``t_horizon`` is smaller than 1 (the rollout loop
            would never advance the episode).
    """
    if t_horizon < 1:
        raise ValueError("t_horizon must be >= 1")

    env = gym.make('Acrobot-v1')
    try:
        model = PPO()
        score = 0.0

        for n_epi in range(n_episodes):
            s = env.reset()
            done = False

            while not done:
                # Collect up to t_horizon steps, then update on that chunk.
                for _ in range(t_horizon):
                    prob = model.pi(torch.from_numpy(s).float())
                    a = Categorical(prob).sample().item()
                    s_prime, r, done, _ = env.step(a)

                    # The stored reward is scaled by 1/100; the unscaled
                    # reward is what gets reported in the score.
                    model.prep_data.append((s, a, r/100.0, s_prime, prob[a].item(), done))
                    s = s_prime

                    score += r
                    if done:
                        break

                model.train_net()

            # NOTE: episode 0 is never reported, so the first printed average
            # sums print_interval + 1 episodes but divides by print_interval.
            if n_epi % print_interval == 0 and n_epi != 0:
                print("# of episode :{}, avg score : {:.1f}".format(n_epi, score/print_interval))
                score = 0.0
    finally:
        # Release the environment even if training raises.
        env.close()

# Run training only when this file is executed directly, not when it is imported.
if __name__ == '__main__':
    main()

```
# of episode :200, avg score : -199.3
# of episode :400, avg score : -116.1
# of episode :600, avg score : -105.3
# of episode :800, avg score : -97.8
# of episode :1000, avg score : -94.1
# of episode :1200, avg score : -87.1
# of episode :1400, avg score : -86.5
# of episode :1600, avg score : -92.0
# of episode :1800, avg score : -84.6
# of episode :2000, avg score : -90.3
# of episode :2200, avg score : -86.3
# of episode :2400, avg score : -86.3
# of episode :2600, avg score : -84.7
# of episode :2800, avg score : -85.3
# of episode :3000, avg score : -82.8
# of episode :3200, avg score : -87.0
# of episode :3400, avg score : -83.5
# of episode :3600, avg score : -83.4
# of episode :3800, avg score : -84.6
# of episode :4000, avg score : -86.2
# of episode :4200, avg score : -82.5
# of episode :4400, avg score : -86.1
# of episode :4600, avg score : -88.2
# of episode :4800, avg score : -84.3
# of episode :5000, avg score : -87.2
# of episode :5200, avg score : -86.0
# of episode :5400, avg score : -83.9
# of episode :5600, avg score : -89.1
# of episode :5800, avg score : -87.8
# of episode :6000, avg score : -85.3
# of episode :6200, avg score : -86.5
# of episode :6400, avg score : -92.4
# of episode :6600, avg score : -87.3
# of episode :6800, avg score : -86.4
# of episode :7000, avg score : -88.6
# of episode :7200, avg score : -86.8
# of episode :7400, avg score : -86.2
# of episode :7600, avg score : -84.5
# of episode :7800, avg score : -85.4
# of episode :8000, avg score : -88.0
# of episode :8200, avg score : -87.8
# of episode :8400, avg score : -81.6
# of episode :8600, avg score : -88.1
# of episode :8800, avg score : -85.2
# of episode :9000, avg score : -86.4
# of episode :9200, avg score : -83.7
# of episode :9400, avg score : -83.5
# of episode :9600, avg score : -87.6
# of episode :9800, avg score : -85.3
```

# Part 2: Transition Model: LSTM with Attention

- In this section you will (i) make a dataset of randomly collected rollouts from Acrobot-v1 and then (ii) fit an LSTM with attention to it. The model's input will be [state, action] and its output will be [next_state].

# Part 3: Model-Based PPO: Reinforcement Learning

- In this section you will turn your transition model into a gym environment and then attempt to train PPO inside this model. Also compare PPO trained on the model with PPO trained on the real Acrobot.
