# Deep Q-Learning Banana Collector

---
The work is about training an agent using deep reinforcement learning to navigate and collect bananas in a large, square world. A reward of +1 is provided for collecting a yellow banana, and a reward of -1 is provided for collecting a blue banana. Thus, the goal of your agent is to collect as many yellow bananas as possible while avoiding blue bananas.

The following code is adapted from Udacity's Deep Reinforcement Learning programme.
Please see readme for instructions on how to run the code.

## Set up environment and deep Q-learning agent
Import libraries and set up the environment and the DQL agent.

In [None]:
# Import libraries
from unityagents import UnityEnvironment
import numpy as np
import random
import torch
import pandas as pd
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline

# Load the Unity banana environment from the local simulation binary.
# The resulting `env`, `brain_name` and `agent` names are module-level and are
# read by the training loop defined further down in this notebook.
env = UnityEnvironment(file_name="./unity_simulation/Banana.x86_64")
brain_name = env.brain_names[0]  # Use the first brain exposed by the environment
brain = env.brains[brain_name]   # Brain object for that name (not used again in the visible code)

# Make agent to learn the task. This agent implements DQL.
# NOTE(review): state_size=37 and action_size=4 are hard-coded here and are
# presumably the observation/action dimensions of the Banana environment - confirm.
from banana_agent import Agent
agent = Agent(state_size=37, action_size=4, seed=0)

## Train agent
Training loop that uses the agent to take actions and update the deep Q-network. The loop makes use of the banana environment to collect rewards, observations, and restart the simulation when necessary. Finally, the scores and learned weights are collected and saved to files that can be used for documentation and testing.

In [None]:
def dqn(n_episodes=1000, max_t=1000, eps_start=1.0, eps_end=0.015, eps_decay=0.0027,
        solved_score=20.0, save_path='./saved_models/dense_model.pth'):
    """Train the module-level ``agent`` on the module-level ``env`` with deep Q-learning.

    Relies on the notebook globals ``env``, ``brain_name`` and ``agent``.

    Args:
        n_episodes: Maximum number of training episodes.
        max_t: Maximum number of time steps per episode.
        eps_start: Initial epsilon passed to ``agent.act``.
        eps_end: Lower bound for epsilon.
        eps_decay: Amount subtracted from epsilon after every episode (linear decay).
        solved_score: Training stops early once the mean of the most recent
            (up to 100) episode scores reaches this value.
        save_path: File the local Q-network's ``state_dict`` is written to.

    Returns:
        List with the score of every episode that was run. It is shorter than
        ``n_episodes`` when training stops early.
    """
    import os  # Local import: only needed to prepare the checkpoint directory

    scores = []                        # Score of each episode
    scores_window = deque(maxlen=100)  # Last 100 scores
    eps = eps_start                    # Initialize epsilon

    # Run episodes
    for i_episode in range(1, n_episodes + 1):
        env_info = env.reset(train_mode=True)[brain_name]  # Restart the env
        state = env_info.vector_observations[0]            # Get current state
        score = 0                                          # Initialize score to 0
        for _ in range(max_t):
            action = agent.act(state, eps)                 # Get action from agent
            env_info = env.step(action)[brain_name]        # Send action to the environment
            next_state = env_info.vector_observations[0]   # Get the next state
            reward = env_info.rewards[0]                   # Get the reward
            done = env_info.local_done[0]                  # See if episode has finished

            agent.step(state, action, reward, next_state, done)  # Step agent (i.e., train)
            state = next_state
            score += reward
            if done:
                break

        scores_window.append(score)          # Save most recent score
        scores.append(score)                 # Save most recent score
        eps = max(eps_end, eps - eps_decay)  # Linear epsilon decay

        # Compute the running average once and reuse it for all output below
        average_score = np.mean(scores_window)
        progress = '\rEpisode {}\tAverage Score: {:.2f}\tEpsilon: {:.3f}'.format(i_episode, average_score, eps)
        print(progress, end="")
        if i_episode % 100 == 0:
            print(progress)  # Keep a permanent line every 100 episodes
        if average_score >= solved_score:
            # Clamp at zero: the window may hold fewer than 100 scores, in which
            # case ``i_episode - 100`` would be reported as a negative count.
            print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.3f}'.format(max(i_episode - 100, 0), average_score))
            break

    # Save trained model. Create the target directory first so that a missing
    # folder does not throw away the result of the whole training run.
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)
    torch.save(agent.qnetwork_local.state_dict(), save_path)
    return scores

episodes_to_run = 1000                       # Maximum number of episodes per trial
index_collector = range(0, episodes_to_run)  # Full episode index (kept for any later cell that reads it)
trials = 1                                   # Number of training trials (used for plotting)

# Collect one (episode, score) table per trial and stack them at the end.
trial_scores = []
for i in range(trials):
    print("\n -- Trial {} --".format(i+1))
    scores_current = dqn(episodes_to_run)  # Run training algorithm
    # dqn() returns fewer than `episodes_to_run` scores when it stops early,
    # so the episode index is built from the number of scores actually returned.
    episode_index = np.arange(len(scores_current))
    trial_scores.append(np.column_stack((episode_index, scores_current)))

# Two columns: episode index, score. Rows of all trials are appended in order.
scores = np.concatenate(trial_scores) if trial_scores else np.empty((0, 2))

# df = pd.DataFrame(scores) 
# df.to_csv('./data/scores_sparse.csv', index=False) # Save training results (scores) to file

# Score Plotter
Plotter function that can plot the scores file generated from using the above training loop. 

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.ticker import FormatStrFormatter
%matplotlib inline
import seaborn as sns

# Read and postprocess CSV file generated while training
def load_csv(csv_name, window=15):
    """Load a two-column score file and compute a smoothed per-episode score.

    Args:
        csv_name: Path of the CSV file; its two columns are renamed to
            ``episode`` and ``score``.
        window: Number of episodes used for the rolling mean (default 15).

    Returns:
        Tuple ``(df_scores, df_scores_smooth)``. ``df_scores`` holds the raw
        rows. ``df_scores_smooth`` has one row per episode with the mean
        ``score`` over all rows of that episode and a ``smooth_score`` column
        holding its rolling mean; the first ``window - 1`` entries are NaN.
    """
    df_scores = pd.read_csv(csv_name)         # Read CSV file generated while training
    df_scores.columns = ["episode", "score"]  # Rename columns

    # Average repeated episodes (several trials) before smoothing
    df_scores_smooth = df_scores.groupby(["episode"], as_index=False).mean()
    df_scores_smooth['smooth_score'] = df_scores_smooth['score'].rolling(window).mean()
    return df_scores, df_scores_smooth

# Set up seaborn for plotting
sns.set(context="talk", rc={"lines.linewidth": 3, 'figure.figsize': (14, 7)})
sns.set_palette("muted")
sns.set_style("whitegrid")

# Get and arrange data
df_scores_sparse, df_scores_smooth_sparse = load_csv("./data/scores_sparse.csv")
df_scores_dense, df_scores_smooth_dense = load_csv("./data/scores_dense.csv")

# Plot data: first the raw score as a thin, transparent line, then the rolling
# mean on top with a thicker line of the same color.
# x/y are passed by keyword because newer seaborn releases no longer accept
# them positionally. Only the smoothed lines carry a label, so the legend
# entries do not depend on the order in which artists were added to the axes.
sns_plot = sns.lineplot(x='episode', y='score', data=df_scores_dense, linewidth=1, color="navy", alpha=0.5, label='_nolegend_')
sns_plot = sns.lineplot(x='episode', y='smooth_score', data=df_scores_smooth_dense, linewidth=3, color="navy", label="Dense DQN")
sns_plot = sns.lineplot(x='episode', y='score', data=df_scores_sparse, linewidth=1, color="orange", alpha=0.5, label='_nolegend_')
sns_plot = sns.lineplot(x='episode', y='smooth_score', data=df_scores_smooth_sparse, linewidth=3, color="orange", label="Sparse DQN")

sns_plot.axhline(13, ls='--', color="green", linewidth=5, alpha=0.75)  # Insert line at 13 which is the goal
sns_plot.set(ylabel='Score', xlabel='Episode #')
sns_plot.legend(fontsize='medium', loc='upper left', framealpha=0)

figure = sns_plot.get_figure()
figure.tight_layout()
figure.savefig('./data/score_DQN.png', dpi=200)  # Save figure to ./data/score_DQN.png
plt.show()

## Agent Viewer
Run the trained agent in the banana collector environment using the saved weights from training.

In [None]:
from unityagents import UnityEnvironment
import torch
from datetime import datetime
import time
from banana_agent import Agent

# Make agent. The wall-clock time gives a different seed on every run; it is
# truncated to an int because seeding routines commonly reject floats.
# NOTE(review): how Agent uses `seed` is not visible here - confirm.
agent = Agent(state_size=37, action_size=4, seed=int(time.time()))

# Load weights from a previous training session. map_location='cpu' lets a
# checkpoint written on a GPU machine be read on a CPU-only one;
# load_state_dict then copies the values into the existing network parameters.
agent.qnetwork_local.load_state_dict(torch.load('./saved_models/sparse_model.pth', map_location='cpu'))

# Load banana environment. !Remember to change network in banana_agent.py
env = UnityEnvironment(file_name="./unity_simulation/Banana.x86_64")
try:
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]

    env_info = env.reset(train_mode=False)[brain_name]  # reset the environment
    state = env_info.vector_observations[0]             # get the current state
    score = 0                                           # initialize the score
    while True:
        action = agent.act(state, 0)                    # select an action (epsilon argument set to 0)
        env_info = env.step(action)[brain_name]         # send the action to the environment
        next_state = env_info.vector_observations[0]    # get the next state
        reward = env_info.rewards[0]                    # get the reward
        done = env_info.local_done[0]                   # see if episode has finished
        score += reward                                 # update the score
        state = next_state                              # roll over the state to next time step
        if done:                                        # exit loop if episode finished
            break
finally:
    # Always shut the simulator down, even if the episode loop raises, so the
    # Unity process does not stay open and block the next launch.
    env.close()

print("Score: {}".format(score))