## Training the model 

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import math
import gym
import random
import numpy as np
import matplotlib.pyplot as plt

env = gym.make('CartPole-v1')
observation_space = env.observation_space.shape[0]
action_space = env.action_space.n

EPISODES = 75
LEARNING_RATE = 0.0001
MEM_SIZE = 10000
BATCH_SIZE = 64
GAMMA = 0.95
EXPLORATION_MAX = 1.0
EXPLORATION_DECAY = 0.999
EXPLORATION_MIN = 0.001
EPSILON_PERT = 0.2

FC1_DIMS = 1024
FC2_DIMS = 512
DEVICE = torch.device("cpu")



In [2]:
class Network(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.input_shape = env.observation_space.shape
        self.action_space = action_space

        self.fc1 = nn.Linear(*self.input_shape, FC1_DIMS)
        self.fc2 = nn.Linear(FC1_DIMS, FC2_DIMS)
        self.fc3 = nn.Linear(FC2_DIMS, self.action_space)

        self.optimizer = optim.Adam(self.parameters(), lr=LEARNING_RATE)
        self.loss = nn.MSELoss()
        self.to(DEVICE)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

class ReplayBuffer:
    def __init__(self):
        self.mem_count = 0

        self.states = np.zeros((MEM_SIZE, *env.observation_space.shape),dtype=np.float32)
        self.actions = np.zeros(MEM_SIZE, dtype=np.int64)
        self.rewards = np.zeros(MEM_SIZE, dtype=np.float32)
        self.states_ = np.zeros((MEM_SIZE, *env.observation_space.shape),dtype=np.float32)
        self.dones = np.zeros(MEM_SIZE, dtype=np.bool)

    def add(self, state, action, reward, state_, done):
        mem_index = self.mem_count % MEM_SIZE

        self.states[mem_index]  = state.clone().detach().numpy()
        self.actions[mem_index] = action
        self.rewards[mem_index] = reward
        self.states_[mem_index] = state_.clone().detach().numpy()
        self.dones[mem_index] =  1 - done

        self.mem_count += 1

    def sample(self):
        MEM_MAX = min(self.mem_count, MEM_SIZE)
        batch_indices = np.random.choice(MEM_MAX, BATCH_SIZE, replace=True)

        states  = self.states[batch_indices]
        actions = self.actions[batch_indices]
        rewards = self.rewards[batch_indices]
        states_ = self.states_[batch_indices]
        dones   = self.dones[batch_indices]

        return states, actions, rewards, states_, dones

class DQN_Solver:
    def __init__(self):
        self.memory = ReplayBuffer()
        self.exploration_rate = EXPLORATION_MAX
        self.network = Network()

    def choose_action(self, observation):
        if random.random() < self.exploration_rate:
            return env.action_space.sample(), 0.0

        state = observation
        state = state.to(DEVICE)
        state = state.unsqueeze(0)

        q_values = self.network(state.float())
        return torch.argmax(q_values).item(), q_values

    def learn(self):
        if self.memory.mem_count < BATCH_SIZE:
            return

        states, actions, rewards, states_, dones = self.memory.sample()
        states = torch.tensor(states , dtype=torch.float32).to(DEVICE)
        actions = torch.tensor(actions, dtype=torch.long).to(DEVICE)
        rewards = torch.tensor(rewards, dtype=torch.float32).to(DEVICE)
        states_ = torch.tensor(states_, dtype=torch.float32).to(DEVICE)
        dones = torch.tensor(dones, dtype=torch.bool).to(DEVICE)

        batch_indices = np.arange(BATCH_SIZE, dtype=np.int64)
        states.requires_grad = True
        
        q_values = self.network(states)
        next_q_values = self.network(states_)

        predicted_value_of_now = q_values[batch_indices, actions]
        predicted_value_of_future = torch.max(next_q_values, dim=1)[0].detach()
        
        q_target = rewards + GAMMA * predicted_value_of_future * dones

        loss = self.network.loss(q_target, predicted_value_of_now)        
        self.network.optimizer.zero_grad()
        loss.backward()
        self.network.optimizer.step()

        self.exploration_rate *= EXPLORATION_DECAY
        self.exploration_rate = max(EXPLORATION_MIN, self.exploration_rate)

    def returning_epsilon(self):
        return self.exploration_rate

## Training the network without defense

In [3]:
agent = DQN_Solver()
agent.network.train()

best_reward = 0
average_reward = 0
episode_number = []
average_reward_number = []

for i in range(1, EPISODES):
    state = env.reset()
    score = 0

    while True:
        action, q_values = agent.choose_action(state)
        
        state_, reward, done, info = env.step(action)
        agent.memory.add(state, action, reward, state_, done)
        agent.learn()
        state = state_
        score += reward

        if done:
            if score > best_reward:
                best_reward = score
            average_reward += score
            print("Episode {} Average Reward {} Best Reward {} Last Reward {} Epsilon {}".format(i, average_reward/i, best_reward, score, agent.returning_epsilon()))
            break

        episode_number.append(i)
        average_reward_number.append(average_reward/i)


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.dones = np.zeros(MEM_SIZE, dtype=np.bool)


Episode 1 Average Reward 16.0 Best Reward 16.0 Last Reward 16.0 Epsilon 1.0
Episode 2 Average Reward 19.5 Best Reward 23.0 Last Reward 23.0 Epsilon 1.0
Episode 3 Average Reward 18.666666666666668 Best Reward 23.0 Last Reward 17.0 Epsilon 1.0
Episode 4 Average Reward 20.0 Best Reward 24.0 Last Reward 24.0 Epsilon 0.9831353223738244
Episode 5 Average Reward 20.6 Best Reward 24.0 Last Reward 23.0 Epsilon 0.9607702107358118
Episode 6 Average Reward 20.666666666666668 Best Reward 24.0 Last Reward 21.0 Epsilon 0.9407945259609451
Episode 7 Average Reward 20.714285714285715 Best Reward 24.0 Last Reward 21.0 Epsilon 0.9212341621210596
Episode 8 Average Reward 19.75 Best Reward 24.0 Last Reward 13.0 Epsilon 0.9093297114626595
Episode 9 Average Reward 20.11111111111111 Best Reward 24.0 Last Reward 23.0 Epsilon 0.8886435861147077
Episode 10 Average Reward 19.9 Best Reward 24.0 Last Reward 18.0 Epsilon 0.8727832416118043
Episode 11 Average Reward 22.09090909090909 Best Reward 44.0 Last Reward 44.0 

## Testing the network with FGSM

In [None]:
# Taking the gradient of the reward with respect to the temporal difference error
agent.network.eval()

best_reward = 0

episode_number = []
average_reward_number_attacked = []
epsilons_attacks = [0,0.05, .1, 0.15, .2, 0.25, .3, 0.35, .4]


for epsilon in epsilons_attacks:
    average_reward_attacked = 0
    for i in range(1,50):
        state = env.reset()
        score = 0
        did_pertubate = 0
        while True:
            action, q_values = agent.choose_action(state)
            state_, reward, done, info = env.step(action)

            if score % 2 == 0 and not(isinstance(q_values,float)):
                next_q_values = agent.network(state_.float()).detach()
                loss = agent.network.loss(q_values, next_q_values)
                gradient_sign = torch.autograd.grad(loss,state,retain_graph=True)[0].sign()
                state = state + gradient_sign * epsilon
                action, q_values = agent.choose_action(state)
                state_, reward, done, info = env.step(action)
                did_pertubate += 1

            state = state_
            score += reward

            if done:            
                if score > best_reward:
                    best_reward = score
                average_reward_attacked += score
                print("Episode {} Average Reward {} Best Reward {} Last Reward {} Epsilon {}".format(i, average_reward_attacked/i, best_reward, score, agent.returning_epsilon()))
                break

            episode_number.append(i)
    average_reward_number_attacked.append(average_reward_attacked / i)
    
plt.ioff()
plt.figure(1)
plt.plot(epsilons_attacks,average_reward_number_attacked)


  return F.mse_loss(input, target, reduction=self.reduction)


Episode 1 Average Reward 230.0 Best Reward 230.0 Last Reward 230.0 Epsilon 0.001
Episode 2 Average Reward 198.0 Best Reward 230.0 Last Reward 166.0 Epsilon 0.001
Episode 3 Average Reward 222.0 Best Reward 270.0 Last Reward 270.0 Epsilon 0.001


  logger.warn(


Episode 4 Average Reward 206.0 Best Reward 270.0 Last Reward 158.0 Epsilon 0.001
Episode 5 Average Reward 202.2 Best Reward 270.0 Last Reward 187.0 Epsilon 0.001
Episode 6 Average Reward 201.16666666666666 Best Reward 270.0 Last Reward 196.0 Epsilon 0.001
Episode 7 Average Reward 199.28571428571428 Best Reward 270.0 Last Reward 188.0 Epsilon 0.001
Episode 8 Average Reward 195.625 Best Reward 270.0 Last Reward 170.0 Epsilon 0.001
Episode 9 Average Reward 198.66666666666666 Best Reward 270.0 Last Reward 223.0 Epsilon 0.001
Episode 10 Average Reward 198.0 Best Reward 270.0 Last Reward 192.0 Epsilon 0.001
Episode 11 Average Reward 198.0 Best Reward 270.0 Last Reward 198.0 Epsilon 0.001
Episode 12 Average Reward 197.0 Best Reward 270.0 Last Reward 186.0 Epsilon 0.001
Episode 13 Average Reward 203.53846153846155 Best Reward 282.0 Last Reward 282.0 Epsilon 0.001
Episode 14 Average Reward 203.92857142857142 Best Reward 282.0 Last Reward 209.0 Epsilon 0.001
Episode 15 Average Reward 207.2 Best 

Episode 46 Average Reward 208.5 Best Reward 332.0 Last Reward 232.0 Epsilon 0.001
Episode 47 Average Reward 208.29787234042553 Best Reward 332.0 Last Reward 199.0 Epsilon 0.001
Episode 48 Average Reward 209.125 Best Reward 332.0 Last Reward 248.0 Epsilon 0.001
Episode 49 Average Reward 208.9795918367347 Best Reward 332.0 Last Reward 202.0 Epsilon 0.001
Episode 1 Average Reward 209.0 Best Reward 332.0 Last Reward 209.0 Epsilon 0.001
Episode 2 Average Reward 232.5 Best Reward 332.0 Last Reward 256.0 Epsilon 0.001
Episode 3 Average Reward 227.33333333333334 Best Reward 332.0 Last Reward 217.0 Epsilon 0.001
Episode 4 Average Reward 229.75 Best Reward 332.0 Last Reward 237.0 Epsilon 0.001
Episode 5 Average Reward 229.8 Best Reward 332.0 Last Reward 230.0 Epsilon 0.001
Episode 6 Average Reward 221.16666666666666 Best Reward 332.0 Last Reward 178.0 Epsilon 0.001
Episode 7 Average Reward 217.28571428571428 Best Reward 332.0 Last Reward 194.0 Epsilon 0.001
Episode 8 Average Reward 220.125 Best 

Episode 39 Average Reward 214.64102564102564 Best Reward 333.0 Last Reward 204.0 Epsilon 0.001
Episode 40 Average Reward 214.725 Best Reward 333.0 Last Reward 218.0 Epsilon 0.001
Episode 41 Average Reward 214.390243902439 Best Reward 333.0 Last Reward 201.0 Epsilon 0.001
Episode 42 Average Reward 213.85714285714286 Best Reward 333.0 Last Reward 192.0 Epsilon 0.001
Episode 43 Average Reward 214.13953488372093 Best Reward 333.0 Last Reward 226.0 Epsilon 0.001
Episode 44 Average Reward 212.72727272727272 Best Reward 333.0 Last Reward 152.0 Epsilon 0.001
Episode 45 Average Reward 212.8 Best Reward 333.0 Last Reward 216.0 Epsilon 0.001
Episode 46 Average Reward 212.17391304347825 Best Reward 333.0 Last Reward 184.0 Epsilon 0.001
Episode 47 Average Reward 211.3404255319149 Best Reward 333.0 Last Reward 173.0 Epsilon 0.001
Episode 48 Average Reward 211.83333333333334 Best Reward 333.0 Last Reward 235.0 Epsilon 0.001
Episode 49 Average Reward 212.0204081632653 Best Reward 333.0 Last Reward 221

Episode 32 Average Reward 226.5625 Best Reward 333.0 Last Reward 180.0 Epsilon 0.001
Episode 33 Average Reward 229.12121212121212 Best Reward 333.0 Last Reward 311.0 Epsilon 0.001
Episode 34 Average Reward 228.2941176470588 Best Reward 333.0 Last Reward 201.0 Epsilon 0.001
Episode 35 Average Reward 227.88571428571427 Best Reward 333.0 Last Reward 214.0 Epsilon 0.001
Episode 36 Average Reward 227.63888888888889 Best Reward 333.0 Last Reward 219.0 Epsilon 0.001
Episode 37 Average Reward 226.45945945945945 Best Reward 333.0 Last Reward 184.0 Epsilon 0.001
Episode 38 Average Reward 226.5 Best Reward 333.0 Last Reward 228.0 Epsilon 0.001
Episode 39 Average Reward 226.25641025641025 Best Reward 333.0 Last Reward 217.0 Epsilon 0.001
Episode 40 Average Reward 225.125 Best Reward 333.0 Last Reward 181.0 Epsilon 0.001
Episode 41 Average Reward 224.02439024390245 Best Reward 333.0 Last Reward 180.0 Epsilon 0.001
Episode 42 Average Reward 225.11904761904762 Best Reward 333.0 Last Reward 270.0 Epsi

Episode 25 Average Reward 210.16 Best Reward 333.0 Last Reward 165.0 Epsilon 0.001
Episode 26 Average Reward 211.1153846153846 Best Reward 333.0 Last Reward 235.0 Epsilon 0.001
Episode 27 Average Reward 210.66666666666666 Best Reward 333.0 Last Reward 199.0 Epsilon 0.001
Episode 28 Average Reward 212.64285714285714 Best Reward 333.0 Last Reward 266.0 Epsilon 0.001
Episode 29 Average Reward 212.3793103448276 Best Reward 333.0 Last Reward 205.0 Epsilon 0.001
Episode 30 Average Reward 212.26666666666668 Best Reward 333.0 Last Reward 209.0 Epsilon 0.001
Episode 31 Average Reward 212.0 Best Reward 333.0 Last Reward 204.0 Epsilon 0.001
Episode 32 Average Reward 211.84375 Best Reward 333.0 Last Reward 207.0 Epsilon 0.001
Episode 33 Average Reward 211.87878787878788 Best Reward 333.0 Last Reward 213.0 Epsilon 0.001
Episode 34 Average Reward 210.88235294117646 Best Reward 333.0 Last Reward 178.0 Epsilon 0.001
Episode 35 Average Reward 212.31428571428572 Best Reward 333.0 Last Reward 261.0 Epsil

## Training the network with adversarial training

In [None]:
agent_robust = DQN_Solver()
agent_robust.network.train()

best_reward = 0
average_reward = 0
episode_number = []
average_reward_number = []

for i in range(1, EPISODES):
    state = env.reset()
    score = 0
    while True:
        action, q_values = agent_robust.choose_action(state)
        state_, reward, done, info = env.step(action)
        
        if score % 2 == 0 and not(isinstance(q_values,float)):
            next_q_values = agent_robust.network(state_.float()).detach()
            loss = agent_robust.network.loss(q_values, next_q_values)
            gradient_sign = torch.autograd.grad(loss,state,retain_graph=True)[0].sign()
            state = state + gradient_sign * EPSILON_PERT
            action, q_values = agent_robust.choose_action(state)
            state_, reward, done, info = env.step(action)
         
                
        agent_robust.memory.add(state, action, reward, state_, done)
        agent_robust.learn()
        state = state_
        score += reward

        if done:
            if score > best_reward:
                best_reward = score
            average_reward += score
            print("Episode {} Average Reward {} Best Reward {} Last Reward {} Epsilon {}".format(i, average_reward/i, best_reward, score, agent.returning_epsilon()))
            break

        episode_number.append(i)
        average_reward_number.append(average_reward/i)

## Testing the network with FGSM

In [None]:
# Taking the gradient of the reward with respect to the temporal difference error
agent_robust.network.eval()

best_reward = 0

episode_number = []
average_reward_number_robust = []
epsilons_attacks = [0,0.05, .1, 0.15, .2, 0.25, .3, 0.35, .4]


for epsilon in epsilons_attacks:
    average_reward_robust = 0
    for i in range(1,50):
        state = env.reset()
        score = 0
        while True:
            action, q_values = agent_robust.choose_action(state)
            state_, reward, done, info = env.step(action)

            if score % 2 == 0 and not(isinstance(q_values,float)):
                next_q_values = agent_robust.network(state_.float()).detach()
                loss = agent_robust.network.loss(q_values, next_q_values)

                gradient_sign = torch.autograd.grad(loss,state,retain_graph=True)[0].sign()
                state = state + gradient_sign * epsilon
                action, q_values = agent_robust.choose_action(state)
                state_, reward, done, info = env.step(action)


            state = state_
            score += reward

            if done:
                if score > best_reward:
                    best_reward = score
                average_reward_robust += score
                print("Episode {} Average Reward {} Best Reward {} Last Reward {} Epsilon {}".format(i, average_reward_robust/i, best_reward, score, agent.returning_epsilon()))
                break

            episode_number.append(i)
    average_reward_number_robust.append(average_reward_robust / i)

plt.figure(1)
plt.plot(epsilons_attacks,average_reward_number_robust)
plt.show()