# Mid Semester Project - Deep Reinforcement Learning
### Part 1: A2C Algorithm

Student: Jonathan Mendelson 308564293


##### Imports

In [1]:
# general utils
import os
import time

# torch
import torch
from torchsummary import summary

# my imports
from src.display_utils import embed_mp4, launch_tb
from src.utils import record_agent_video, build_envs
from src.cnn_policy import CNNPolicyNetwork
from src.a2c import A2CTrainer

# notebook setup
# autoreload mode 2 reloads imported modules before each cell runs, so edits to the
# local `src` package are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

# compute device: use the GPU when CUDA is available, otherwise fall back to the CPU
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

#### Set experiment name:

In [None]:
# set experiment name
experiment_name = 'A2C_04'

# Run identifier. It is currently pinned to an existing run directory; re-enable the
# strftime line below to start a fresh, time-stamped run.
# timestamp = time.strftime("%b-%d_%H-%M-%S")
timestamp = 'Mar-08_22-39-37' # TODO temp

# Root directory of this run. Built with os.path.join (instead of a hard-coded ".\\"
# string) so the same notebook works on Windows and POSIX systems.
run_dir = os.path.join('experiments', experiment_name, timestamp)

# TensorBoard log dir (only the path is built here; this cell does not create it)
log_path = os.path.join(run_dir, 'logs')
# create content dir (makedirs also creates the parent experiment/run directories)
content_path = os.path.join(run_dir, 'content')
os.makedirs(content_path, exist_ok=True)
# create models dir
model_path = os.path.join(run_dir, 'models')
os.makedirs(model_path, exist_ok=True)

#### Environment Initialization

In [3]:
# build_envs() returns three environments, unpacked here as the two-, four- and six-room envs
(env_2_rooms, env_4_rooms, env_6_rooms) = build_envs()

#### Agent Initialization
We use a single CNN model with two heads: an actor head with one output per action and a critic head with a single value output. The critic head is enabled by passing `critic = True`.

In [4]:
# Build the CNN policy with the critic enabled, sized to the action space of the
# two-room env, then print a layer summary for a (3, 56, 56) input.
n_actions = env_2_rooms.action_space.n
policy_network = CNNPolicyNetwork(device, n_actions, critic=True)
summary(policy_network, (3, 56, 56))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 55, 55]             208
              ReLU-2           [-1, 16, 55, 55]               0
         MaxPool2d-3           [-1, 16, 27, 27]               0
            Conv2d-4           [-1, 32, 12, 12]           8,224
              ReLU-5           [-1, 32, 12, 12]               0
            Conv2d-6             [-1, 64, 9, 9]          32,832
              ReLU-7             [-1, 64, 9, 9]               0
           Flatten-8                 [-1, 5184]               0
            Linear-9                  [-1, 128]         663,680
           Linear-10                    [-1, 7]             903
           Linear-11                  [-1, 128]         663,680
           Linear-12                    [-1, 1]             129
Total params: 1,369,656
Trainable params: 1,369,656
Non-trainable params: 0
---------------------------

#### Initialize Trainer

In [5]:
# TensorBoard: launch_tb returns the process handle and the writer for this run's log dir
tb_process, tb_writer = launch_tb(log_path)

# A2C trainer wrapping the policy network; metrics go to the TensorBoard writer
trainer = A2CTrainer(
    device,
    policy_network=policy_network,
    lr=3e-4,
    gamma=0.99,
    writer=tb_writer,
)

TensorBoard logs are saved in: experiments\A2C_04\Mar-08_22-39-37\logs


#### Training Schedule
We will train the agent according to a training schedule with a few parts:
* Two rooms with max steps of 1000, for 10K episodes.
* Two rooms with max steps of 20, for 10K episodes.
* Four rooms with max steps of 2000, for 10K episodes.
* Six rooms



TODO finish



##### Two Rooms, 10 Envs:

In [6]:
# trainer.policy_network.load_state_dict(torch.load('experiments\RF_02\RF_2Room_100_2.pt'))

In [7]:
# phase label passed to the trainer and reused in the video file name
experiment_phase = '2Room_1k_1'
trainer.update_phase(experiment_phase)
# Call the setter. Assigning `trainer.set_lr = 3e-4` would replace the method with a
# float, leaving the learning rate unchanged and breaking every later
# `trainer.set_lr(new_lr = ...)` call with "'float' object is not callable".
trainer.set_lr(new_lr = 3e-4)

# cap the episode length of the two-room env at max_steps steps
max_steps = 100
env_2_rooms.unwrapped.max_steps = max_steps

In [17]:
# train on the two-room env
trainer.train(
    env=env_2_rooms,
    num_episodes=2500,
    entropy_weight=1e-4,
    n_rollout=10,
)

KeyboardInterrupt: 

In [13]:
# Record the agent on the two-room env into the run's content dir and embed the mp4.
# os.path.join replaces the hard-coded "\\" separator so the path is valid on any OS.
video_path = os.path.join(content_path, f"{experiment_phase}_1.mp4")
vid = record_agent_video(policy_network, env_2_rooms, video_path)
embed_mp4(video_path)

In [18]:
# save the checkpoint inside the run's models dir (model_path) instead of the current
# working directory, so checkpoints stay with the experiment they belong to
torch.save(policy_network.state_dict(), os.path.join(model_path, 'A2C_2Room_100_6000.pt'))

##### Four rooms, 2000 steps, 10k episodes: 

In [None]:
# Load pretrained weights into the trainer's network.
# - os.path.join keeps the path portable (the literal used Windows-only separators)
# - map_location=device lets a checkpoint saved on GPU load on a CPU-only machine
pretrained_path = os.path.join('experiments', 'RF_02', 'RF_4Room_2000_5000_2.pt')
trainer.policy_network.load_state_dict(torch.load(pretrained_path, map_location=device));

<All keys matched successfully>

In [None]:
# phase label for the four-room stage
experiment_phase = '4Room'

# switch the trainer to the new phase and set its learning rate
trainer.update_phase(experiment_phase)
trainer.set_lr(new_lr=1e-5)

# cap the episode length of the four-room env at max_steps steps
max_steps = 2000
env_4_rooms.unwrapped.max_steps = max_steps

In [None]:
# train on the four-room env using the step cap set in the phase-setup cell
trainer.train(
    env=env_4_rooms,
    num_episodes=10000,
    max_steps=max_steps,
    entropy_weight=1e-6,
)

In [None]:
# Record the agent on the four-room env into the run's content dir and embed the mp4.
# os.path.join replaces the hard-coded "\\" separator so the path is valid on any OS.
video_path = os.path.join(content_path, f"{experiment_phase}_4.mp4")
# NOTE(review): this call passes `device` first, while the two-room video cell calls
# record_agent_video(policy_network, env, video_path) — confirm the current signature.
vid = record_agent_video(device, trainer.policy_network, env_4_rooms, video_path)
embed_mp4(video_path)

In [None]:
# save the checkpoint inside the run's models dir (model_path) instead of the current
# working directory, so checkpoints stay with the experiment they belong to
torch.save(trainer.policy_network.state_dict(), os.path.join(model_path, 'RF_4Room_2000_5000_4.pt'))

##### Six rooms, 15k episodes, 500 steps max:

In [None]:
# Load the four-room checkpoint as the starting point for the six-room phase.
# - os.path.join avoids the invalid "\R", "\M", "\m" escapes of the non-raw literal
#   and keeps the path portable
# - map_location=device lets a checkpoint saved on GPU load on a CPU-only machine
checkpoint_path = os.path.join('experiments', 'RF_02', 'Mar-06_22-19-08', 'models', 'RF_4Room_2000_5000_4.pt')
trainer.policy_network.load_state_dict(torch.load(checkpoint_path, map_location=device))

<All keys matched successfully>

In [None]:
# Load the six-room checkpoint; this replaces whatever weights the network currently holds.
# - os.path.join avoids the invalid "\R", "\M", "\m" escapes of the non-raw literal
#   and keeps the path portable
# - map_location=device lets a checkpoint saved on GPU load on a CPU-only machine
checkpoint_path = os.path.join('experiments', 'RF_02', 'Mar-06_22-19-08', 'models', 'RF_6Room_200_5000_4.pt')
trainer.policy_network.load_state_dict(torch.load(checkpoint_path, map_location=device))

<All keys matched successfully>

In [None]:
# phase label for the six-room stage
experiment_phase = '6Room_4'

# switch the trainer to the new phase, set its learning rate and attach the TensorBoard writer
trainer.update_phase(experiment_phase)
trainer.set_lr(new_lr=5e-5)
trainer.writer = tb_writer

# cap the episode length of the six-room env at max_steps steps
max_steps = 500
env_6_rooms.unwrapped.max_steps = max_steps

In [None]:
# Restart TensorBoard: stop the running process, launch a new one on the same log dir,
# hand the new writer to the trainer and reset the trainer's episode counter.
tb_process.kill()
tb_process, tb_writer = launch_tb(log_path)
trainer.writer = tb_writer
trainer.zero_episodes()

TensorBoard logs are saved in: experiments\RF_02\Mar-06_22-19-08\logs


In [None]:
# train on the six-room env using the step cap set in the phase-setup cell
trainer.train(
    env=env_6_rooms,
    num_episodes=15000,
    max_steps=max_steps,
    entropy_weight=1e-8,
)

Value(False)


In [None]:
# Record the agent on the six-room env into the run's content dir and embed the mp4.
# os.path.join replaces the hard-coded "\\" separator so the path is valid on any OS.
video_path = os.path.join(content_path, f"{experiment_phase}_3.mp4")
# NOTE(review): this call passes `device` first, while the two-room video cell calls
# record_agent_video(policy_network, env, video_path) — confirm the current signature.
vid = record_agent_video(device, trainer.policy_network, env_6_rooms, video_path)
# embed by file path, as the other video cells do (this cell used to pass `vid`)
embed_mp4(video_path)

In [None]:
# Save the six-room checkpoint. The target location is unchanged; the path is built with
# os.path.join to avoid the invalid "\R", "\M", "\m" escapes of the non-raw literal
# and to keep it portable.
checkpoint_path = os.path.join('experiments', 'RF_02', 'Mar-06_22-19-08', 'models', 'RF_6Room_200_5000_4.pt')
torch.save(trainer.policy_network.state_dict(), checkpoint_path)

In [None]:
# stop the TensorBoard process returned by launch_tb
# (presumably a subprocess handle — confirm in src.display_utils)
tb_process.kill()