# Deep Q-Learning ( Not Working )
---

## Setup
### Environment and Installations

```
pip install numpy
pip install gymnasium
pip install "gymnasium[atari,accept-rom-license]"

NOTE: I am on Windows 10 with Nvidia 1050GTX GPU / you may need something different for a new system 
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
```

First we create a new envrionment with venv for this project.\
We do this using VSCode by selecting the evnironment (might say kernal or Python), near the top right corner of the IDE.\
We will click "Select Another Kernal.../Python Environments.../+ Create Python Environment".\
We will need to name the environment.\
We use VSCode, so we can easily conncet our environment to our notebook and run the code blocks right here in our notebook.\
\
Once we have the Envrionment we install the packages we need.\
Gymnasium is a library that provides us with learning playground to build A.I. in.\
Gymnasium[atari,accept-rom-license] will give us access to a list of Atari games we can train our agents to play. ( Later in the course )\
Then we install Py-Torch, you will want to visit their site and fine the proper command for your system.\

### Imports

In [9]:
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.autograd as autograd
from torch.autograd import Variable
from collections import deque, namedtuple

---

## Building the A.I.
### Create The Neural Network

In [10]:
class Network(nn.Module):

    def __init__(self, state_size, action_size, seed = 42):
        super(Network, self).__init__()
        self.seed = torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, 28)
        self.fc2 = nn.Linear(28, 28)
        self.fc3 = nn.Linear(28, 28)
        self.fc4 = nn.Linear(28, 28)
        self.fc5 = nn.Linear(28, action_size)

    def forward(self, state):
        x = self.fc1(state)
        x = F.relu(x)
        x = self.fc2(x)
        x = F.relu(x)
        x = self.fc3(x)
        x = F.relu(x)
        x = self.fc4(x)
        x = F.relu(x)
        return self.fc5(x)

## Training The A.I.
### Setup The Environment

In [11]:
import gymnasium as gym

envs = gym.make("MountainCar-v0")

state_shape = envs.observation_space.shape
state_size = envs.observation_space.shape[0]
num_of_actions = envs.action_space.n
print(f'''
State Shape: {state_shape}
State Size: {state_size}
Number of Actions: {num_of_actions}
''')


State Shape: (2,)
State Size: 2
Number of Actions: 3



### Initialize Hyperparametes

In [12]:
learning_rate = .9 # alpha, 0.0005
mini_batch_size = 100 # number of observations used per step to update the model.
discount_factor = 0.9 # gamma - closer to one the more conseration there is of future rewards
replay_buffer_size = int(1e5) # For experience Replay, number of observations in AI memory, 100,000
interpolation_parameter = 1e-3 # 0.001

### Implement Experience Replay

In [13]:
class ReplayMemory(object):

    def __init__(self, capacity):
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.capacity = capacity
        self.memory = []

    def push(self, event):
        self.memory.append(event)
        if len(self.memory) > self.capacity:
            del self.memory[0]

    def sample(self, batch_size):
        experiences = random.sample(self.memory, k = batch_size)
        states = torch.from_numpy(np.vstack([e[0] for e in experiences if e is not None])).float().to(self.device)
        actions = torch.from_numpy(np.vstack([e[1] for e in experiences if e is not None])).long().to(self.device)
        rewards = torch.from_numpy(np.vstack([e[2] for e in experiences if e is not None])).float().to(self.device)
        next_states = torch.from_numpy(np.vstack([e[3] for e in experiences if e is not None])).float().to(self.device)
        dones = torch.from_numpy(np.vstack([e[4] for e in experiences if e is not None]).astype(np.uint8)).float().to(self.device)
        return states, next_states, actions, rewards, dones

### Initialize The DQN

In [14]:
class Agent():

    def __init__(self, state_size, action_size):
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.state_size = state_size
        self.action_size = action_size
        self.local_qnetwork = Network(state_size, action_size).to(self.device)
        self.target_qnetwork = Network(state_size, action_size).to(self.device)
        self.optimizer = optim.Adam(self.local_qnetwork.parameters(), lr = learning_rate)
        self.memory = ReplayMemory(replay_buffer_size)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        self.memory.push((state, action, reward, next_state, done))
        self.t_step = (self.t_step + 1) % 4
        if self.t_step == 0:
            if len(self.memory.memory) > mini_batch_size:
                experiences = self.memory.sample(100)
                self.learn(experiences, discount_factor)

    def act(self, state, epsilon = 0.):
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        print(state)
        self.local_qnetwork.eval()
        with torch.no_grad():
            action_values = self.local_qnetwork(state)
        self.local_qnetwork.train()
        if random.random() > epsilon:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, discount_factor):
        states, next_states, actions, rewards, dones = experiences
        next_q_targets = self.target_qnetwork(next_states).detach().max(1)[0].unsqueeze(1)
        q_targets = rewards + discount_factor * next_q_targets * (1 - dones)
        q_expected = self.local_qnetwork(states).gather(1, actions)
        loss = F.mse_loss(q_expected, q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.soft_update(self.local_qnetwork, self.target_qnetwork, interpolation_parameter)

    def soft_update(self, local_model, target_model, interpolation_parameter):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(interpolation_parameter * local_param.data + (1.0 - interpolation_parameter) * target_param.data)

### Initialize The DQN

In [15]:
agent = Agent(state_size, num_of_actions)

### Train The DQN

In [16]:
number_episodes = 1000
maximum_number_timesteps_per_episode = 100
epsilon_starting_value  = 1.0
epsilon_ending_value  = 0.01
epsilon_decay_value  = 0.02
epsilon = epsilon_starting_value
scores_on_100_episodes = deque(maxlen = 100)

for episode in range(1, number_episodes + 1):
    state, _ = envs.reset()
    score = 0
    for t in range(maximum_number_timesteps_per_episode):
        action = agent.act(state, epsilon)
        next_state, reward, done, _, _ = envs.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        score += reward
        if done:
            break
    scores_on_100_episodes.append(score)
    epsilon = max(epsilon_ending_value, epsilon_decay_value * epsilon)
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, np.mean(scores_on_100_episodes)), end = "")
    if episode % 100 == 0:
        print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, np.mean(scores_on_100_episodes)))
    if np.mean(scores_on_100_episodes) >= 200.0:
        print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(episode - 100, np.mean(scores_on_100_episodes)))
        torch.save(agent.local_qnetwork.state_dict(), 'checkpoint.pth')
        break

tensor([[-0.4002,  0.0000]], device='cuda:0')
tensor([[-0.4011, -0.0009]], device='cuda:0')
tensor([[-0.4029, -0.0018]], device='cuda:0')
tensor([[-0.4046, -0.0017]], device='cuda:0')
tensor([[-0.4081, -0.0036]], device='cuda:0')
tensor([[-0.4125, -0.0044]], device='cuda:0')
tensor([[-0.4178, -0.0052]], device='cuda:0')
tensor([[-0.4228, -0.0050]], device='cuda:0')
tensor([[-0.4285, -0.0058]], device='cuda:0')
tensor([[-0.4360, -0.0075]], device='cuda:0')
tensor([[-0.4441, -0.0081]], device='cuda:0')
tensor([[-0.4538, -0.0097]], device='cuda:0')
tensor([[-0.4640, -0.0102]], device='cuda:0')
tensor([[-0.4737, -0.0097]], device='cuda:0')
tensor([[-0.4827, -0.0090]], device='cuda:0')
tensor([[-0.4910, -0.0083]], device='cuda:0')
tensor([[-0.4996, -0.0086]], device='cuda:0')
tensor([[-0.5074, -0.0078]], device='cuda:0')
tensor([[-0.5143, -0.0069]], device='cuda:0')
tensor([[-0.5212, -0.0070]], device='cuda:0')
tensor([[-0.5292, -0.0080]], device='cuda:0')
tensor([[-0.5361, -0.0069]], devic

tensor([[-0.5223,  0.0051]], device='cuda:0')
tensor([[-0.5172,  0.0051]], device='cuda:0')
tensor([[-0.5132,  0.0040]], device='cuda:0')
tensor([[-0.5083,  0.0049]], device='cuda:0')
tensor([[-0.5035,  0.0048]], device='cuda:0')
tensor([[-0.4978,  0.0057]], device='cuda:0')
tensor([[-0.4933,  0.0045]], device='cuda:0')
tensor([[-0.4881,  0.0053]], device='cuda:0')
tensor([[-0.4831,  0.0050]], device='cuda:0')
tensor([[-0.4784,  0.0047]], device='cuda:0')
tensor([[-0.4731,  0.0053]], device='cuda:0')
tensor([[-0.4681,  0.0050]], device='cuda:0')
tensor([[-0.4625,  0.0056]], device='cuda:0')
tensor([[-0.4574,  0.0051]], device='cuda:0')
tensor([[-0.4528,  0.0046]], device='cuda:0')
tensor([[-0.4478,  0.0051]], device='cuda:0')
tensor([[-0.4422,  0.0055]], device='cuda:0')
tensor([[-0.4373,  0.0049]], device='cuda:0')
tensor([[-0.4321,  0.0053]], device='cuda:0')
tensor([[-0.4275,  0.0046]], device='cuda:0')
tensor([[-0.4236,  0.0039]], device='cuda:0')
tensor([[-0.4205,  0.0031]], devic

KeyboardInterrupt: 

---

## Visualize The Results

In [None]:
import glob
import io
import base64
import imageio
from IPython.display import HTML, display
from gym.wrappers.monitoring.video_recorder import VideoRecorder

def show_video_of_model(agent, env_name):
    env = gym.make(env_name, render_mode='rgb_array')
    state, _ = env.reset()
    done = False
    frames = []
    while not done:
        frame = env.render()
        frames.append(frame)
        action = agent.act(state)
        state, reward, done, _, _ = env.step(action.item())
    env.close()
    imageio.mimsave('video.mp4', frames, fps=30)

show_video_of_model(agent, 'MountainCar-v0')

def show_video():
    mp4list = glob.glob('*.mp4')
    if len(mp4list) > 0:
        mp4 = mp4list[0]
        video = io.open(mp4, 'r+b').read()
        encoded = base64.b64encode(video)
        display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
                </video>'''.format(encoded.decode('ascii'))))
    else:
        print("Could not find video")

show_video()