## Training the model 

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math
import gym
import random
import numpy as np
import matplotlib.pyplot as plt

env = gym.make('CartPole-v1')
observation_space = env.observation_space.shape[0]
action_space = env.action_space.n

EPISODES = 1000
LEARNING_RATE = 0.0001
MEM_SIZE = 10000
BATCH_SIZE = 64
GAMMA = 0.95
EXPLORATION_MAX = 1.0
EXPLORATION_DECAY = 0.999
EXPLORATION_MIN = 0.001

FC1_DIMS = 1024
FC2_DIMS = 512
DEVICE = torch.device("cpu")

best_reward = 0
average_reward = 0
episode_number = []
average_reward_number = []

In [2]:
class Network(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.input_shape = env.observation_space.shape
        self.action_space = action_space

        self.fc1 = nn.Linear(*self.input_shape, FC1_DIMS)
        self.fc2 = nn.Linear(FC1_DIMS, FC2_DIMS)
        self.fc3 = nn.Linear(FC2_DIMS, self.action_space)

        self.optimizer = optim.Adam(self.parameters(), lr=LEARNING_RATE)
        self.loss = nn.MSELoss()
        self.to(DEVICE)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

class ReplayBuffer:
    def __init__(self):
        self.mem_count = 0

        self.states = np.zeros((MEM_SIZE, *env.observation_space.shape),dtype=np.float32)
        self.actions = np.zeros(MEM_SIZE, dtype=np.int64)
        self.rewards = np.zeros(MEM_SIZE, dtype=np.float32)
        self.states_ = np.zeros((MEM_SIZE, *env.observation_space.shape),dtype=np.float32)
        self.dones = np.zeros(MEM_SIZE, dtype=np.bool)

    def add(self, state, action, reward, state_, done):
        mem_index = self.mem_count % MEM_SIZE

        self.states[mem_index]  = state.clone().detach().numpy()
        self.actions[mem_index] = action
        self.rewards[mem_index] = reward
        self.states_[mem_index] = state_.clone().detach().numpy()
        self.dones[mem_index] =  1 - done

        self.mem_count += 1

    def sample(self):
        MEM_MAX = min(self.mem_count, MEM_SIZE)
        batch_indices = np.random.choice(MEM_MAX, BATCH_SIZE, replace=True)

        states  = self.states[batch_indices]
        actions = self.actions[batch_indices]
        rewards = self.rewards[batch_indices]
        states_ = self.states_[batch_indices]
        dones   = self.dones[batch_indices]

        return states, actions, rewards, states_, dones

class DQN_Solver:
    def __init__(self):
        self.memory = ReplayBuffer()
        self.exploration_rate = EXPLORATION_MAX
        self.network = Network()

    def choose_action(self, observation):
        if random.random() < self.exploration_rate:
            return env.action_space.sample()

        state = observation
        state = state.to(DEVICE)
        state = state.unsqueeze(0)

        q_values = self.network(state.float())
        return torch.argmax(q_values).item(), q_values

    def learn(self):
        if self.memory.mem_count < BATCH_SIZE:
            return

        states, actions, rewards, states_, dones = self.memory.sample()
        states = torch.tensor(states , dtype=torch.float32).to(DEVICE)
        actions = torch.tensor(actions, dtype=torch.long).to(DEVICE)
        rewards = torch.tensor(rewards, dtype=torch.float32).to(DEVICE)
        states_ = torch.tensor(states_, dtype=torch.float32).to(DEVICE)
        dones = torch.tensor(dones, dtype=torch.bool).to(DEVICE)
        batch_indices = np.arange(BATCH_SIZE, dtype=np.int64)
        states.requires_grad = True
        
        q_values = self.network(states)
        next_q_values = self.network(states_)

        predicted_value_of_now = q_values[batch_indices, actions]
        predicted_value_of_future = torch.max(next_q_values, dim=1)[0]

        q_target = rewards + GAMMA * predicted_value_of_future * dones

        loss = self.network.loss(q_target, predicted_value_of_now)        
        self.network.optimizer.zero_grad()
        loss.backward()
        self.network.optimizer.step()

        self.exploration_rate *= EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)
        return states.grad.data.sign()

    def returning_epsilon(self):
        return self.exploration_rate

## Training the network without defense

In [3]:
agent = DQN_Solver()
agent.network.train()
for i in range(1, 200):
    state = env.reset()
    score = 0

    while True:
        env.render()
        if attack and gradient_sign is not None:
            state = np.add(state,gradient_sign[0].detach().sign())
        action = agent.choose_action(state)
        try:
            q_values = action[1]
            action = action[0]
        except:
            action = action
        state_, reward, done, info = env.step(action)
        agent.memory.add(state, action, reward, state_, done)
        gradient_sign = agent.learn()
        state = state_
        score += reward

        if done:
            if score > best_reward:
                best_reward = score
            average_reward += score
            print("Episode {} Average Reward {} Best Reward {} Last Reward {} Epsilon {}".format(i, average_reward/i, best_reward, score, agent.returning_epsilon()))
            break

        episode_number.append(i)
        average_reward_number.append(average_reward/i)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.dones = np.zeros(MEM_SIZE, dtype=np.bool)


Episode 1 Average Reward 29.0 Best Reward 29.0 Last Reward 29.0 Epsilon 1.0
Episode 2 Average Reward 21.5 Best Reward 29.0 Last Reward 14.0 Epsilon 1.0
Episode 3 Average Reward 21.666666666666668 Best Reward 29.0 Last Reward 22.0 Epsilon 0.998001
Episode 4 Average Reward 20.5 Best Reward 29.0 Last Reward 17.0 Epsilon 0.9811700348643991
Episode 5 Average Reward 20.2 Best Reward 29.0 Last Reward 19.0 Epsilon 0.9626946373158061
Episode 6 Average Reward 22.5 Best Reward 34.0 Last Reward 34.0 Epsilon 0.9304973749532338
Episode 7 Average Reward 21.0 Best Reward 34.0 Last Reward 12.0 Epsilon 0.9193926150309796
Episode 8 Average Reward 20.125 Best Reward 34.0 Last Reward 14.0 Epsilon 0.9066044494080763
Episode 9 Average Reward 26.333333333333332 Best Reward 76.0 Last Reward 76.0 Epsilon 0.8402237462387894
Episode 10 Average Reward 27.2 Best Reward 76.0 Last Reward 35.0 Epsilon 0.8113103927077344
Episode 11 Average Reward 27.181818181818183 Best Reward 76.0 Last Reward 27.0 Epsilon 0.7896874231

Episode 81 Average Reward 91.07407407407408 Best Reward 309.0 Last Reward 178.0 Epsilon 0.001
Episode 82 Average Reward 92.15853658536585 Best Reward 309.0 Last Reward 180.0 Epsilon 0.001
Episode 83 Average Reward 94.46987951807229 Best Reward 309.0 Last Reward 284.0 Epsilon 0.001
Episode 84 Average Reward 95.20238095238095 Best Reward 309.0 Last Reward 156.0 Epsilon 0.001
Episode 85 Average Reward 97.04705882352941 Best Reward 309.0 Last Reward 252.0 Epsilon 0.001
Episode 86 Average Reward 98.31395348837209 Best Reward 309.0 Last Reward 206.0 Epsilon 0.001
Episode 87 Average Reward 99.34482758620689 Best Reward 309.0 Last Reward 188.0 Epsilon 0.001
Episode 88 Average Reward 101.0 Best Reward 309.0 Last Reward 245.0 Epsilon 0.001
Episode 89 Average Reward 101.53932584269663 Best Reward 309.0 Last Reward 149.0 Epsilon 0.001
Episode 90 Average Reward 102.36666666666666 Best Reward 309.0 Last Reward 176.0 Epsilon 0.001
Episode 91 Average Reward 105.43956043956044 Best Reward 382.0 Last Re

KeyboardInterrupt: 

## Testing the network with FSGM

In [29]:
# Taking the gradient of the reward with respect to the temporal difference error
agent.network.eval()

attack = True
gradient_sign = None
average_reward = 0
epsilon_pert = 0.10

for i in range(1,1000):
    state = env.reset()
    score = 0
    while True:
        found_q_values = False
        env.render()
        action = agent.choose_action(state)
        try:
            q_values = action[1]
            action = action[0]
            found_q_values = True
        except Exception as e:
            action = action
        state_, reward, done, info = env.step(action)
        
        if(found_q_values == True):
            next_q_values = agent.network(state_.float())
            loss = agent.network.loss(q_values, next_q_values)
            
            gradient_sign = torch.autograd.grad(loss,state,retain_graph=True)[0].sign()
            if(gradient_sign != None):
                state_ = state_ + gradient_sign * epsilon_pert
            else:
                print("Not perturbated")
                
            
        state = state_
        score += reward
        
        if done:
            if score > best_reward:
                best_reward = score
            average_reward += score
            print("Episode {} Average Reward {} Best Reward {} Last Reward {} Epsilon {}".format(i, average_reward/i, best_reward, score, agent.returning_epsilon()))
            break

        episode_number.append(i)
        average_reward_number.append(average_reward/i)



Episode 1 Average Reward 25.0 Best Reward 500.0 Last Reward 25.0 Epsilon 0.001
Episode 2 Average Reward 25.5 Best Reward 500.0 Last Reward 26.0 Epsilon 0.001
Episode 3 Average Reward 30.333333333333332 Best Reward 500.0 Last Reward 40.0 Epsilon 0.001
Episode 4 Average Reward 33.25 Best Reward 500.0 Last Reward 42.0 Epsilon 0.001
Episode 5 Average Reward 32.0 Best Reward 500.0 Last Reward 27.0 Epsilon 0.001
Episode 6 Average Reward 31.0 Best Reward 500.0 Last Reward 26.0 Epsilon 0.001
Episode 7 Average Reward 32.285714285714285 Best Reward 500.0 Last Reward 40.0 Epsilon 0.001
Episode 8 Average Reward 33.5 Best Reward 500.0 Last Reward 42.0 Epsilon 0.001
Episode 9 Average Reward 37.55555555555556 Best Reward 500.0 Last Reward 70.0 Epsilon 0.001
Episode 10 Average Reward 38.4 Best Reward 500.0 Last Reward 46.0 Epsilon 0.001
Episode 11 Average Reward 42.81818181818182 Best Reward 500.0 Last Reward 87.0 Epsilon 0.001
Episode 12 Average Reward 49.833333333333336 Best Reward 500.0 Last Reward

Episode 92 Average Reward 56.33695652173913 Best Reward 500.0 Last Reward 26.0 Epsilon 0.001
Episode 93 Average Reward 56.043010752688176 Best Reward 500.0 Last Reward 29.0 Epsilon 0.001
Episode 94 Average Reward 56.670212765957444 Best Reward 500.0 Last Reward 115.0 Epsilon 0.001
Episode 95 Average Reward 56.78947368421053 Best Reward 500.0 Last Reward 68.0 Epsilon 0.001
Episode 96 Average Reward 56.46875 Best Reward 500.0 Last Reward 26.0 Epsilon 0.001
Episode 97 Average Reward 56.34020618556701 Best Reward 500.0 Last Reward 44.0 Epsilon 0.001
Episode 98 Average Reward 56.183673469387756 Best Reward 500.0 Last Reward 41.0 Epsilon 0.001
Episode 99 Average Reward 56.303030303030305 Best Reward 500.0 Last Reward 68.0 Epsilon 0.001
Episode 100 Average Reward 56.63 Best Reward 500.0 Last Reward 89.0 Epsilon 0.001
Episode 101 Average Reward 57.32673267326733 Best Reward 500.0 Last Reward 127.0 Epsilon 0.001
Episode 102 Average Reward 57.03921568627451 Best Reward 500.0 Last Reward 28.0 Eps

Episode 180 Average Reward 58.33888888888889 Best Reward 500.0 Last Reward 102.0 Epsilon 0.001
Episode 181 Average Reward 58.392265193370164 Best Reward 500.0 Last Reward 68.0 Epsilon 0.001
Episode 182 Average Reward 58.214285714285715 Best Reward 500.0 Last Reward 26.0 Epsilon 0.001
Episode 183 Average Reward 58.049180327868854 Best Reward 500.0 Last Reward 28.0 Epsilon 0.001
Episode 184 Average Reward 57.97826086956522 Best Reward 500.0 Last Reward 45.0 Epsilon 0.001
Episode 185 Average Reward 57.810810810810814 Best Reward 500.0 Last Reward 27.0 Epsilon 0.001
Episode 186 Average Reward 57.70967741935484 Best Reward 500.0 Last Reward 39.0 Epsilon 0.001
Episode 187 Average Reward 57.63636363636363 Best Reward 500.0 Last Reward 44.0 Epsilon 0.001
Episode 188 Average Reward 57.47872340425532 Best Reward 500.0 Last Reward 28.0 Epsilon 0.001
Episode 189 Average Reward 57.64021164021164 Best Reward 500.0 Last Reward 88.0 Epsilon 0.001
Episode 190 Average Reward 57.48421052631579 Best Rewar

Episode 268 Average Reward 57.201492537313435 Best Reward 500.0 Last Reward 72.0 Epsilon 0.001
Episode 269 Average Reward 57.312267657992564 Best Reward 500.0 Last Reward 87.0 Epsilon 0.001
Episode 270 Average Reward 57.2 Best Reward 500.0 Last Reward 27.0 Epsilon 0.001
Episode 271 Average Reward 57.08487084870849 Best Reward 500.0 Last Reward 26.0 Epsilon 0.001
Episode 272 Average Reward 56.974264705882355 Best Reward 500.0 Last Reward 27.0 Epsilon 0.001
Episode 273 Average Reward 57.02564102564103 Best Reward 500.0 Last Reward 71.0 Epsilon 0.001
Episode 274 Average Reward 57.215328467153284 Best Reward 500.0 Last Reward 109.0 Epsilon 0.001
Episode 275 Average Reward 57.10909090909091 Best Reward 500.0 Last Reward 28.0 Epsilon 0.001
Episode 276 Average Reward 57.0 Best Reward 500.0 Last Reward 27.0 Epsilon 0.001
Episode 277 Average Reward 56.95667870036101 Best Reward 500.0 Last Reward 45.0 Epsilon 0.001
Episode 278 Average Reward 57.010791366906474 Best Reward 500.0 Last Reward 72.0 

Episode 356 Average Reward 57.258426966292134 Best Reward 500.0 Last Reward 69.0 Epsilon 0.001
Episode 357 Average Reward 57.21848739495798 Best Reward 500.0 Last Reward 43.0 Epsilon 0.001
Episode 358 Average Reward 57.365921787709496 Best Reward 500.0 Last Reward 110.0 Epsilon 0.001
Episode 359 Average Reward 57.5041782729805 Best Reward 500.0 Last Reward 107.0 Epsilon 0.001
Episode 360 Average Reward 57.47222222222222 Best Reward 500.0 Last Reward 46.0 Epsilon 0.001
Episode 361 Average Reward 57.43490304709141 Best Reward 500.0 Last Reward 44.0 Epsilon 0.001
Episode 362 Average Reward 57.72375690607735 Best Reward 500.0 Last Reward 162.0 Epsilon 0.001
Episode 363 Average Reward 57.93939393939394 Best Reward 500.0 Last Reward 136.0 Epsilon 0.001
Episode 364 Average Reward 57.887362637362635 Best Reward 500.0 Last Reward 39.0 Epsilon 0.001
Episode 365 Average Reward 57.8054794520548 Best Reward 500.0 Last Reward 28.0 Epsilon 0.001
Episode 366 Average Reward 57.83606557377049 Best Rewar

Episode 446 Average Reward 57.57174887892376 Best Reward 500.0 Last Reward 24.0 Epsilon 0.001
Episode 447 Average Reward 57.60178970917226 Best Reward 500.0 Last Reward 71.0 Epsilon 0.001
Episode 448 Average Reward 57.580357142857146 Best Reward 500.0 Last Reward 48.0 Epsilon 0.001
Episode 449 Average Reward 57.65033407572383 Best Reward 500.0 Last Reward 89.0 Epsilon 0.001
Episode 450 Average Reward 57.58 Best Reward 500.0 Last Reward 26.0 Epsilon 0.001
Episode 451 Average Reward 57.60532150776053 Best Reward 500.0 Last Reward 69.0 Epsilon 0.001
Episode 452 Average Reward 57.6283185840708 Best Reward 500.0 Last Reward 68.0 Epsilon 0.001
Episode 453 Average Reward 57.558498896247244 Best Reward 500.0 Last Reward 26.0 Epsilon 0.001
Episode 454 Average Reward 57.53303964757709 Best Reward 500.0 Last Reward 46.0 Epsilon 0.001
Episode 455 Average Reward 57.496703296703295 Best Reward 500.0 Last Reward 41.0 Epsilon 0.001
Episode 456 Average Reward 57.425438596491226 Best Reward 500.0 Last R

Episode 534 Average Reward 57.72659176029963 Best Reward 500.0 Last Reward 29.0 Epsilon 0.001
Episode 535 Average Reward 57.70654205607477 Best Reward 500.0 Last Reward 47.0 Epsilon 0.001
Episode 536 Average Reward 57.649253731343286 Best Reward 500.0 Last Reward 27.0 Epsilon 0.001
Episode 537 Average Reward 57.59217877094972 Best Reward 500.0 Last Reward 27.0 Epsilon 0.001
Episode 538 Average Reward 57.611524163568774 Best Reward 500.0 Last Reward 68.0 Epsilon 0.001
Episode 539 Average Reward 57.65677179962894 Best Reward 500.0 Last Reward 82.0 Epsilon 0.001
Episode 540 Average Reward 57.63333333333333 Best Reward 500.0 Last Reward 45.0 Epsilon 0.001
Episode 541 Average Reward 57.68207024029575 Best Reward 500.0 Last Reward 84.0 Epsilon 0.001
Episode 542 Average Reward 57.66420664206642 Best Reward 500.0 Last Reward 48.0 Epsilon 0.001
Episode 543 Average Reward 57.69060773480663 Best Reward 500.0 Last Reward 72.0 Epsilon 0.001
Episode 544 Average Reward 57.63419117647059 Best Reward 5

Episode 621 Average Reward 57.73752012882448 Best Reward 500.0 Last Reward 87.0 Epsilon 0.001
Episode 622 Average Reward 57.68810289389067 Best Reward 500.0 Last Reward 27.0 Epsilon 0.001
Episode 623 Average Reward 57.651685393258425 Best Reward 500.0 Last Reward 35.0 Epsilon 0.001
Episode 624 Average Reward 57.69871794871795 Best Reward 500.0 Last Reward 87.0 Epsilon 0.001
Episode 625 Average Reward 57.656 Best Reward 500.0 Last Reward 31.0 Epsilon 0.001
Episode 626 Average Reward 57.76517571884984 Best Reward 500.0 Last Reward 126.0 Epsilon 0.001
Episode 627 Average Reward 57.84848484848485 Best Reward 500.0 Last Reward 110.0 Epsilon 0.001
Episode 628 Average Reward 57.79299363057325 Best Reward 500.0 Last Reward 23.0 Epsilon 0.001
Episode 629 Average Reward 57.76152623211447 Best Reward 500.0 Last Reward 38.0 Epsilon 0.001
Episode 630 Average Reward 57.731746031746034 Best Reward 500.0 Last Reward 39.0 Epsilon 0.001
Episode 631 Average Reward 57.782884310618066 Best Reward 500.0 Las

Episode 709 Average Reward 57.66431593794076 Best Reward 500.0 Last Reward 73.0 Epsilon 0.001
Episode 710 Average Reward 57.61971830985915 Best Reward 500.0 Last Reward 26.0 Epsilon 0.001
Episode 711 Average Reward 57.71026722925457 Best Reward 500.0 Last Reward 122.0 Epsilon 0.001
Episode 712 Average Reward 57.75561797752809 Best Reward 500.0 Last Reward 90.0 Epsilon 0.001
Episode 713 Average Reward 57.73492286115007 Best Reward 500.0 Last Reward 43.0 Epsilon 0.001
Episode 714 Average Reward 57.76890756302521 Best Reward 500.0 Last Reward 82.0 Epsilon 0.001
Episode 715 Average Reward 57.72447552447552 Best Reward 500.0 Last Reward 26.0 Epsilon 0.001
Episode 716 Average Reward 57.81843575418994 Best Reward 500.0 Last Reward 125.0 Epsilon 0.001
Episode 717 Average Reward 57.860529986052995 Best Reward 500.0 Last Reward 88.0 Epsilon 0.001
Episode 718 Average Reward 57.83983286908078 Best Reward 500.0 Last Reward 43.0 Epsilon 0.001
Episode 719 Average Reward 57.813630041724615 Best Reward

Episode 796 Average Reward 58.3178391959799 Best Reward 500.0 Last Reward 157.0 Epsilon 0.001
Episode 797 Average Reward 58.27854454203262 Best Reward 500.0 Last Reward 27.0 Epsilon 0.001
Episode 798 Average Reward 58.23809523809524 Best Reward 500.0 Last Reward 26.0 Epsilon 0.001
Episode 799 Average Reward 58.196495619524406 Best Reward 500.0 Last Reward 25.0 Epsilon 0.001
Episode 800 Average Reward 58.16 Best Reward 500.0 Last Reward 29.0 Epsilon 0.001
Episode 801 Average Reward 58.2334581772784 Best Reward 500.0 Last Reward 117.0 Epsilon 0.001
Episode 802 Average Reward 58.19950124688279 Best Reward 500.0 Last Reward 31.0 Epsilon 0.001
Episode 803 Average Reward 58.16438356164384 Best Reward 500.0 Last Reward 30.0 Epsilon 0.001
Episode 804 Average Reward 58.201492537313435 Best Reward 500.0 Last Reward 88.0 Epsilon 0.001
Episode 805 Average Reward 58.1639751552795 Best Reward 500.0 Last Reward 28.0 Epsilon 0.001
Episode 806 Average Reward 58.19602977667494 Best Reward 500.0 Last Rew

Episode 885 Average Reward 58.02372881355932 Best Reward 500.0 Last Reward 75.0 Epsilon 0.001
Episode 886 Average Reward 58.01805869074492 Best Reward 500.0 Last Reward 53.0 Epsilon 0.001
Episode 887 Average Reward 58.0304396843292 Best Reward 500.0 Last Reward 69.0 Epsilon 0.001
Episode 888 Average Reward 58.01013513513514 Best Reward 500.0 Last Reward 40.0 Epsilon 0.001
Episode 889 Average Reward 57.993250843644546 Best Reward 500.0 Last Reward 43.0 Epsilon 0.001
Episode 890 Average Reward 57.98876404494382 Best Reward 500.0 Last Reward 54.0 Epsilon 0.001
Episode 891 Average Reward 58.00224466891134 Best Reward 500.0 Last Reward 70.0 Epsilon 0.001
Episode 892 Average Reward 57.96860986547085 Best Reward 500.0 Last Reward 28.0 Epsilon 0.001
Episode 893 Average Reward 57.9484882418813 Best Reward 500.0 Last Reward 40.0 Epsilon 0.001
Episode 894 Average Reward 57.94407158836689 Best Reward 500.0 Last Reward 54.0 Epsilon 0.001
Episode 895 Average Reward 57.927374301675975 Best Reward 500

Episode 973 Average Reward 57.8879753340185 Best Reward 500.0 Last Reward 86.0 Epsilon 0.001
Episode 974 Average Reward 57.87166324435318 Best Reward 500.0 Last Reward 42.0 Epsilon 0.001
Episode 975 Average Reward 57.88410256410256 Best Reward 500.0 Last Reward 70.0 Epsilon 0.001
Episode 976 Average Reward 57.853483606557376 Best Reward 500.0 Last Reward 28.0 Epsilon 0.001
Episode 977 Average Reward 57.86386898669396 Best Reward 500.0 Last Reward 68.0 Epsilon 0.001
Episode 978 Average Reward 57.91922290388548 Best Reward 500.0 Last Reward 112.0 Epsilon 0.001
Episode 979 Average Reward 57.888661899897855 Best Reward 500.0 Last Reward 28.0 Epsilon 0.001
Episode 980 Average Reward 57.85510204081633 Best Reward 500.0 Last Reward 25.0 Epsilon 0.001
Episode 981 Average Reward 57.82364933741081 Best Reward 500.0 Last Reward 27.0 Epsilon 0.001
Episode 982 Average Reward 57.81160896130346 Best Reward 500.0 Last Reward 46.0 Epsilon 0.001
Episode 983 Average Reward 57.7792472024415 Best Reward 50

## Training the network with adversarial training

In [None]:
agent = DQN_Solver()
agent.network.train()
for i in range(1, 200):
    state = env.reset()
    score = 0

    while True:
        env.render()
        if attack and gradient_sign is not None:
            state = np.add(state,gradient_sign[0].detach().sign())
        action = agent.choose_action(state)
        try:
            q_values = action[1]
            action = action[0]
        except:
            action = action
        state_, reward, done, info = env.step(action)
        agent.memory.add(state, action, reward, state_, done)
        gradient_sign = agent.learn()
        state = state_
        score += reward

        if done:
            if score > best_reward:
                best_reward = score
            average_reward += score
            print("Episode {} Average Reward {} Best Reward {} Last Reward {} Epsilon {}".format(i, average_reward/i, best_reward, score, agent.returning_epsilon()))
            break

        episode_number.append(i)
        average_reward_number.append(average_reward/i)