# Same as before but this time with an Agent!


We start with the same import statements as before. We also set up the LunarLander environment.

In [None]:
import gym
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import numpy as np

from gymclass import Notebook
%matplotlib inline

In [None]:
# Create the LunarLander environment and keep its `.unwrapped` form,
# then seed it so that runs are repeatable.
env = gym.make("LunarLander-v2").unwrapped
env.seed(1)

## Now we set up the Agent

![Agent](images/Agent.png)

This agent is a special kind of Reinforcement Learning agent called a **Policy Gradient**.

The objective of this agent is to maximize the "expected" reward. When controlling something complicated with many steps, like a lunar lander, the policy gradient agent must figure out what sequence of actions will lead to the highest rewards.

Just as athletes must practice to build muscle memory, our Policy Gradient agent must practice and train to learn the best actions to take.

## How the code works 

![Agent](images/LearningExplain.png)



If you want to use your own AI in the next testing steps, change **MYNAME** in the save string below to your name.


In [None]:
%%time
from collections import deque
import os
import time

from dqn_agent import DQNAgent

# Train the agent on the LunarLander environment, then save it and plot the
# reward earned in every episode.

# eps stands for Epsilon, which is the amount the Agent will randomly take
# actions. It starts out being quite random, which causes the agent to
# 'explore' its options and potentially try things that might not work as
# well. As time passes in the training we allow fewer random actions.
eps_start = 1.0
eps_end = 0.001
eps_decay = 0.995
eps = eps_start  # initialize epsilon

episode_rewards = []  # total reward of every episode, in order
episode_rewards_window = deque(maxlen=100)  # last 100 scores

# Early-stop settings.
crash_reward_limit = -500     # crashed or flew away: always give up below this
strict_reward_limit = -250    # tighter limit used once initial training is over
strict_after_episode = 500    # episode index after which the tighter limit applies
solved_avg_reward = 350       # stop training once the rolling average beats this
max_episode_seconds = None    # set to e.g. 60 to cut off episodes that take too long

# Put your own name in place of the name in the file name below
# (the instructions above call this MYNAME).
save_path = "output/LunarLander-ROBIN.ckpt"

# Make sure the output folder exists *before* training, so that a missing
# folder cannot make the save fail after a long training run.
save_dir = os.path.dirname(save_path)
if save_dir:
    os.makedirs(save_dir, exist_ok=True)

agent = DQNAgent(state_size=8, action_size=4, seed=0, hidden_layer1=64, hidden_layer2=108)

episodes = 2000
for t in Notebook.log_progress(range(episodes)):
    observation = env.reset()
    episode_reward = 0
    tic = time.perf_counter()
    while True:
        # 1. Choose an action based on observation
        action = agent.act(observation, eps)

        # 2. Take action in the environment
        observation_next, reward, done, info = env.step(action)

        # 3. Now tell the agent about the action and reward so it can learn.
        #    The agent receives the environment's own `done` flag; the forced
        #    cut-offs below only end this episode loop.
        agent.step(observation, action, reward, observation_next, done)

        # Count this step's reward *before* checking the limits, so the
        # checks see the up-to-date total instead of lagging one step behind.
        observation = observation_next
        episode_reward += reward

        # Taking too long (only when a time limit has been set)
        if max_episode_seconds is not None and time.perf_counter() - tic > max_episode_seconds:
            done = True

        # Oops! Crashed or flew away, stop early
        if episode_reward < crash_reward_limit:
            done = True

        # After initial training quit early when things go wrong,
        # to try to amplify good experience and remove random ones
        if t > strict_after_episode and episode_reward < strict_reward_limit:
            done = True

        if done:
            break

    # Save scores and update epsilon, which sets the amount of random exploration
    episode_rewards_window.append(episode_reward)
    episode_rewards.append(episode_reward)
    eps = max(eps_end, eps_decay * eps)
    raw = np.mean(episode_rewards_window)
    print("\r Episodes ", t, " Current Rolling Avg Reward ", raw, end="")
    if raw > solved_avg_reward:
        break

agent.save(save_path)
agent.save_bin(save_path + '.bin')

final_avg_reward = np.mean(episode_rewards_window)
print("")
print("Done! Average Reward =", final_avg_reward)
print("Average Fitness Score =", agent.fitness(final_avg_reward))

# One point per episode: x is the episode number, y is that episode's reward.
plt.plot(np.arange(len(episode_rewards)), episode_rewards)
plt.ylabel('Reward')
plt.xlabel('Episode')
plt.savefig("reward-episodes-" + str(episodes) + ".png")
plt.show()