## Import the necessary software libraries:

In [None]:
# TODOs:
#   - Save the best model (the one with the highest reward).
#   - Save the reward of every episode to a file.
#   - Convert this notebook into a Python script.
#   - Use only 2 layers in the actor and in the critic.
#   - Read up on how to access the GPU.

In [1]:
import os
import torch
import gym
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from torch import nn as nn
from torch.optim import AdamW
import torch.nn.functional as F

import torchvision.transforms as transforms

from utils import test_policy_network, seed_everything, plot_stats
from parallel_env import ParallelEnv, ParallelWrapper

### Import the ROMs from the extracted files into atari_py

In [None]:
# for gym 0.20+
# from ale_py import ALEInterface

In [None]:
# for gym 0.20+
# ale = ALEInterface()

In [None]:
# for gym 0.20+
# ! ale-import-roms /opt/anaconda3/envs/pongA2C/lib/python3.8/site-packages/ale_py/roms/

In [None]:
# run only once when creating conda environment
! python -m atari_py.import_roms /Users/meng/Downloads/Roms

In [None]:
# for gym 0.20+
# from ale_py.roms import Pong

In [None]:
# for gym 0.20+
# ale.loadROM(Pong)

In [None]:
# no need to run?
import atari_py

In [None]:
pip install pyglet==1.5.27 # added

In [None]:
pip install torchvision # added

In [None]:
# Id of the Gym environment to create.
env_name = 'Pong-v4'
#env_name = 'Acrobot-v1'

# Single (non-vectorised) environment; the cells below use it to inspect the
# observation/action spaces and to render a frame.
env = gym.make(env_name)

# Shape tuple of one observation.
input_space = env.observation_space.shape

# Number of discrete actions.
actions = env.action_space.n

In [None]:
# Show the observation shape and action count, then the raw value returned by
# a fresh reset of the environment.
print(f"State dimensions: {input_space}. Actions: {actions}")
print(f"Sample state: {env.reset()}")

In [None]:
# Print the third entry (index 2) of the observation shape tuple.
# Presumably this is the channel count of an HWC image observation - confirm.
print(f"input_space[2] = {input_space[2]}")

In [None]:
# Render the current frame as an RGB array and display it with matplotlib.
# np.real() returns the real part of its argument before plotting.
plt.imshow(np.real(env.render(mode='rgb_array')))

In [None]:
class Environment:
    """Holder for a single environment created with ``gym.make``.

    Elsewhere in this notebook the class is only used as
    ``Environment(env_name).env``, i.e. to obtain the wrapped environment
    after it has been passed to ``seed_everything``.
    """

    def __init__(self, env_name):
        """Create the environment named ``env_name`` and hand it to ``seed_everything``."""
        self.env = gym.make(env_name)
        seed_everything(self.env)

    def reset(self):
        """Reset the wrapped environment and return whatever it returns."""
        obs = self.env.reset()
        return obs

    def step_async(self, actions):
        """Squeeze ``actions``, convert to NumPy and forward to ``self.env.step_async``."""
        # NOTE(review): ``self.env`` comes from ``gym.make``; a plain gym env
        # presumably has no ``step_async`` - confirm whether this is ever called.
        actions = actions.squeeze().numpy()
        self.env.step_async(actions)

    def step_wait(self):
        """Collect a step result and convert reward/done to column tensors.

        Returns:
            ``(obs, reward, done, info)`` where ``reward`` is a float tensor and
            ``done`` a tensor, each with a trailing dimension of size 1 added.
        """
        obs, reward, done, info = self.env.step_wait()
        # NOTE(review): this class defines no ``preprocess_observation`` method,
        # so this line raises AttributeError unless a subclass provides one.
        obs = self.preprocess_observation(obs)
        reward = torch.tensor(reward).unsqueeze(1).float()
        done = torch.tensor(done).unsqueeze(1)
        return obs, reward, done, info

In [None]:
class PreprocessEnv(ParallelWrapper):
    """Wrapper that speaks torch tensors on top of a vectorised environment.

    Observations and rewards are returned as float tensors, ``done`` flags as
    a tensor; rewards and flags get a trailing dimension of size 1. Actions are
    accepted as tensors and handed to the wrapped env as NumPy arrays.
    """

    def __init__(self, env):
        """Wrap ``env``; all setup is delegated to ``ParallelWrapper``."""
        super().__init__(env)

    def reset(self):
        """Reset the wrapped envs and return the observations as a float tensor."""
        initial_obs = self.venv.reset()
        return torch.from_numpy(initial_obs).float()

    def step_async(self, actions):
        """Send ``actions`` (a tensor) to the wrapped envs as a squeezed NumPy array."""
        self.venv.step_async(actions.squeeze().numpy())

    def step_wait(self):
        """Collect the pending step and convert its results to tensors.

        Returns:
            ``(next_state, reward, done, info)``; ``info`` is passed through
            untouched.
        """
        obs_np, reward_np, done_np, info = self.venv.step_wait()
        obs_t = torch.from_numpy(obs_np).float()
        reward_t = torch.tensor(reward_np).unsqueeze(1).float()
        done_t = torch.tensor(done_np).unsqueeze(1)
        return obs_t, reward_t, done_t, info

In [None]:
class FeatureExtractor(nn.Module):
    """Convolutional encoder turning 3-channel 64x64 images into 512-d features.

    Four ``conv(3x3, padding=1) -> ReLU -> 2x2 max-pool`` stages halve the
    spatial size each time (64 -> 32 -> 16 -> 8 -> 4) while widening the
    channels (3 -> 8 -> 16 -> 32 -> 64). The result is flattened and passed
    through one ReLU-activated linear layer.

    Args:
        input_shape: Accepted for interface compatibility; it is not used, the
            layer sizes are fixed for 3x64x64 inputs.
    """

    # Values left after the four conv/pool stages on a 64x64 input:
    # 64 channels x 4 x 4 spatial positions.
    _FLAT_FEATURES = 64 * 4 * 4

    def __init__(self, input_shape):
        super().__init__()

        self.conv1 = nn.Conv2d(in_channels=3, out_channels=8, kernel_size=3, stride=1, padding=1)
        # A single parameter-free pooling module is shared by all four stages.
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(in_channels=8, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.conv4 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(self._FLAT_FEATURES, 512)

    def forward(self, x):
        """Encode ``x`` and return a tensor of shape ``(-1, 512)``."""
        for conv in (self.conv1, self.conv2, self.conv3, self.conv4):
            x = self.pool(F.relu(conv(x)))

        # Flatten everything but the (inferred) batch dimension.
        x = x.view(-1, self._FLAT_FEATURES)

        return F.relu(self.fc1(x))
        

In [None]:
class Actor(nn.Module):
    """Policy head: 512-d features -> probability distribution over actions.

    Args:
        n_actions: Number of discrete actions (size of the output distribution).
    """

    def __init__(self, n_actions):
        super().__init__()

        hidden_units = 64
        self.fc1 = nn.Linear(512, hidden_units)
        self.fc2 = nn.Linear(hidden_units, n_actions)

    def forward(self, x):
        """Return action probabilities; softmax is taken over dimension 1."""
        hidden = F.relu(self.fc1(x))
        logits = self.fc2(hidden)
        return F.softmax(logits, dim=1)

In [None]:
class Critic(nn.Module):
    """Value head: 512-d features -> one scalar state-value per input row."""

    def __init__(self):
        super().__init__()

        hidden_units = 64
        self.fc1 = nn.Linear(512, hidden_units)
        self.fc2 = nn.Linear(hidden_units, 1)

    def forward(self, x):
        """Return the unbounded value estimate (no output activation)."""
        hidden = F.relu(self.fc1(x))
        return self.fc2(hidden)

In [None]:
class ActorCritic():
    """One-step advantage actor-critic trainer with a shared feature extractor.

    Args:
        actor: Module mapping features to action probabilities.
        critic: Module mapping features to state values.
        feature: Module mapping preprocessed observations to features.
        alpha: Stored on the instance but not used; the optimizers below are
            created with fixed learning rates.
        gamma: Discount factor.
        obs_shape: Stored on the instance; not used by the methods below.

    Attributes:
        stats: Dict with the keys 'Actor Loss', 'Critic Loss' and 'Returns';
            ``train`` appends one value to each list per episode.
    """
    def __init__(self, actor, critic, feature, alpha=1e-4, gamma=0.99, obs_shape=(1, 64, 64)):
        self.actor = actor
        self.critic = critic
        self.feature = feature
        self.obs_shape = obs_shape
        self.alpha = alpha
        self.gamma = gamma
        self.actor_optim = AdamW(self.actor.parameters(), lr=1e-3)
        self.critic_optim = AdamW(self.critic.parameters(), lr=1e-4)
        self.feature_optim = AdamW(self.feature.parameters(), lr=1e-4)
        self.stats = {'Actor Loss': [], 'Critic Loss': [], 'Returns': []}

    def preprocess_observation(self, obs_batch):
        """Crop, resize and rescale a batch of frames.

        Args:
            obs_batch: 4-D tensor indexed as ``[batch, row, col, channel]``
                (assumed to hold raw pixel values in 0..255 - TODO confirm).

        Returns:
            Tensor of shape ``(batch, channel, 64, 64)`` divided by 255.
        """
        # Keep rows 35..194 only (drops the score and border region).
        obs_batch = obs_batch[:, 35:195, :, :]

        # Resize every frame to 64x64; Resize works on channel-first tensors,
        # hence the per-frame permute.
        transform = transforms.Resize((64, 64))
        obs_batch = torch.stack([transform(obs.permute(2, 0, 1)) for obs in obs_batch])

        # Rescale to the range [0, 1].
        obs_batch = obs_batch / 255.0

        return obs_batch

    def train(self, env, episodes):
        """Run ``episodes`` training episodes on the vectorised ``env``.

        Every environment step performs one gradient update of the feature
        extractor, the actor and the critic. An episode ends once every
        sub-environment has reported ``done`` at least once.

        Args:
            env: Vectorised environment exposing ``num_envs``, ``reset()`` and
                ``step(action)`` and returning tensors (see ``PreprocessEnv``).
            episodes: Number of episodes to run.
        """
        for episode in tqdm(range(1, episodes + 1)):
            state = self.preprocess_observation(env.reset())

            done_b = torch.zeros((env.num_envs, 1), dtype=torch.bool)
            ep_return = torch.zeros((env.num_envs, 1))
            I = 1.

            while not done_b.all():

                # Clear the gradients of ALL three optimizers. The feature
                # extractor must be included, otherwise its gradients keep
                # accumulating from one step to the next.
                self.feature_optim.zero_grad()
                self.critic_optim.zero_grad()
                self.actor_optim.zero_grad()

                # `state` always stays the preprocessed image batch; features
                # go into a separate variable because the extractor needs image
                # input again on the next iteration (overwriting `state` with
                # the 512-d features breaks the second iteration).
                state_f = self.feature(state)
                probs = self.actor(state_f)

                action = torch.multinomial(probs, 1).squeeze().detach()

                next_state, reward, done, _ = env.step(action)
                next_state = self.preprocess_observation(next_state)

                value = self.critic(state_f)

                # The bootstrap value is a constant of the loss, so compute it
                # without building an autograd graph.
                with torch.no_grad():
                    next_value = self.critic(self.feature(next_state))

                # One-step TD target; `~done` zeroes the bootstrap term for
                # sub-environments that just terminated.
                target = reward + ~done * self.gamma * next_value
                critic_loss = nn.functional.mse_loss(value, target)

                advantage = (target - value).detach()
                # Small epsilon keeps log() finite for zero probabilities.
                log_probs = torch.log(probs + 1e-6)

                action = action.view(-1, 1)

                action_log_prob = log_probs.gather(1, action)
                entropy = - torch.sum(probs * log_probs, dim=-1, keepdim=True)
                # Policy-gradient term scaled by the running discount `I`,
                # minus an entropy bonus with weight 0.01.
                actor_loss = - I * action_log_prob * advantage - 0.01 * entropy
                actor_loss = actor_loss.mean()

                total_loss = actor_loss + critic_loss

                total_loss.backward()
                self.feature_optim.step()
                self.critic_optim.step()
                self.actor_optim.step()

                # NOTE(review): rewards keep being added for sub-environments
                # that already reported `done`; if the vectorised env
                # auto-resets, 'Returns' may span more than one episode - confirm.
                ep_return += reward
                done_b |= done
                state = next_state
                I = I * self.gamma

            # Losses recorded are those of the last step of the episode.
            self.stats['Actor Loss'].append(actor_loss.item())
            self.stats['Critic Loss'].append(critic_loss.item())
            self.stats['Returns'].append(ep_return.mean().item())

## Main

In [None]:
# Id of the Gym environment to train on.
env_name = 'Pong-v4'
# One sub-environment per CPU. os.cpu_count() returns None when the count
# cannot be determined, so fall back to a single environment in that case.
num_envs = os.cpu_count() or 1
# Number of training episodes.
episodes = 100

In [None]:
# One factory per worker. Each factory returns the `.env` attribute of a fresh
# Environment, so only the wrapped gym environment reaches ParallelEnv.
envs = ParallelEnv([lambda: Environment(env_name).env for _ in range(num_envs)])
# Wrap so that reset/step exchange torch tensors (see PreprocessEnv).
envs = PreprocessEnv(envs)

In [None]:
# Shared encoder. FeatureExtractor accepts the shape argument but does not use it.
feature = FeatureExtractor(envs.observation_space.shape)
# Policy head sized to the number of discrete actions.
actor = Actor(envs.action_space.n)
# Value head.
critic = Critic()
# Trainer that owns the optimizers for the three modules.
agent = ActorCritic(actor, critic, feature)

In [None]:
# Train for `episodes` episodes on the vectorised environments.
agent.train(envs, episodes)

In [None]:
# Plot the per-episode values collected in agent.stats during training.
plot_stats(agent.stats)

In [None]:
# Fresh single environment for the visual rollout.
env = gym.make(env_name)
# ActorCritic stores the extractor as `feature` (it has no `feature_extractor`
# attribute). test_policy_network_internal must already be defined when this
# cell runs.
test_policy_network_internal(env, agent.actor, agent.feature, episodes=2)

In [None]:
def test_policy_network_internal(env, policy, feature_extractor, episodes=1):
    """Roll out ``policy`` in ``env`` and render every frame inline.

    Args:
        env: Single (non-vectorised) environment with ``reset``, ``step`` and
            ``render(mode='rgb_array')``.
        policy: Module mapping features to action probabilities.
        feature_extractor: Module mapping a preprocessed frame to features.
        episodes: Number of episodes to play.
    """
    from IPython import display

    def encode(obs):
        # Every raw frame - the initial one and each later one - goes through
        # the same preprocessing and encoding before the policy sees it.
        return feature_extractor(preprocess(obs))

    plt.figure(figsize=(6, 6))
    # Pure evaluation: no gradients are needed.
    with torch.no_grad():
        for episode in range(episodes):
            state = encode(env.reset())
            done = False
            img = plt.imshow(env.render(mode='rgb_array'))
            while not done:
                # Sample one action from the policy's distribution.
                action = policy(state).multinomial(1).item()
                next_state, _, done, _ = env.step(action)
                img.set_data(env.render(mode='rgb_array'))
                plt.axis('off')
                display.display(plt.gcf())
                display.clear_output(wait=True)
                state = encode(next_state)

In [None]:
def preprocess(obs):
    """Crop, resize and rescale one raw frame the same way training does.

    Mirrors ``ActorCritic.preprocess_observation`` for a single frame: convert
    to float first, crop, resize to 64x64, then scale to [0, 1].

    Args:
        obs: NumPy array indexed as ``[row, col, channel]`` (assumed to hold
            pixel values in 0..255 - TODO confirm).

    Returns:
        Float tensor of shape ``(channel, 64, 64)``.
    """
    # Convert before resizing so interpolation happens in floating point.
    obs = torch.from_numpy(obs).float()

    # Keep rows 35..194 only (drops the score and border region).
    obs = obs[35:195, :, :]

    # Resize expects channel-first input.
    transform = transforms.Resize((64, 64))
    obs = transform(obs.permute(2, 0, 1))

    # Rescale to the range [0, 1].
    return obs / 255.0

## for testing

In [None]:
# Scratch check: push one raw frame through preprocessing and the encoder.
state = env.reset()
state = preprocess(state)
# NOTE(review): no `feature_extractor` name is defined in the visible notebook;
# the extractor instance is bound to `feature` (and `agent.feature`) - confirm.
state = feature_extractor(state)

In [None]:
# Inspect the shape of the value currently bound to `state`.
print(state.shape)

In [None]:
# NOTE(review): this rebinds `feature`, which earlier holds the
# FeatureExtractor module, and `feature_extractor` is not defined in the
# visible notebook - confirm the intended names.
feature = feature_extractor(state)

In [None]:
# Feed the value bound to `feature` to the actor to get action probabilities.
probs = actor(feature)

In [None]:
# Inspect the shape of the actor output.
print(probs.shape)

In [None]:
def preprocess_observation(obs):
    """Turn one RGB frame into a normalised grayscale tensor.

    Reads the module-level ``obs_shape`` for the target size and relies on
    ``cv2`` being available in the notebook namespace.

    Args:
        obs: Frame indexed as ``[row, col, channel]``.

    Returns:
        Float32 tensor with a leading dimension of size 1, values divided by 255.
    """
    # Rows 35..194 only (a 160-row window).
    playfield = obs[35:195, :, :]
    gray = cv2.cvtColor(playfield, cv2.COLOR_RGB2GRAY)
    target_size = (obs_shape[1], obs_shape[2])
    small = cv2.resize(gray, target_size, interpolation=cv2.INTER_AREA)
    scaled = small.astype(np.float32) / 255.0
    return torch.from_numpy(scaled).unsqueeze(0)

In [None]:
def preprocess_observation(obs_batch):
    """Crop a batch of RGB frames, convert to grayscale and resize with cv2.

    The batch axis is moved to the last position before resizing, so the
    frames of the batch end up on the trailing axis of the result. The output
    is a NumPy array and is NOT normalised or converted to a tensor.

    Args:
        obs_batch: Array indexed as ``[batch, row, col, channel]``.
    """
    # Rows 35..194 only (a 160-row window).
    cropped = obs_batch[:, 35:195, :, :]
    # Weighted sum over the channel axis -> one grayscale plane per frame.
    luma = np.dot(cropped, [0.2989, 0.5870, 0.1140])
    # (batch, row, col) -> (row, col, batch) before resizing.
    frames_last = np.transpose(luma, (1, 2, 0))
    return cv2.resize(frames_last, (64, 64), interpolation=cv2.INTER_AREA)

In [None]:
def preprocess_observation(obs_batch):
    """Crop a batch of frames and resize each one to 64x64 with cv2.

    ``cv2.resize`` operates on a single image, so the batch is resized frame
    by frame and stacked again. The result is a NumPy array that is not
    normalised and keeps the channel axis last.

    Args:
        obs_batch: Array indexed as ``[batch, row, col, channel]``.

    Returns:
        Array of shape ``(batch, 64, 64, channel)``.
    """
    # Rows 35..194 only (drops the score and border region).
    cropped = obs_batch[:, 35:195, :, :]
    return np.stack([
        cv2.resize(frame, (64, 64), interpolation=cv2.INTER_AREA)
        for frame in cropped
    ])

In [None]:
def resize_observation(obs_batch):
    """Crop, resize and normalise a batch of frames into channel-first float32.

    Args:
        obs_batch: Array indexed as ``[batch, row, col, channel]`` with three
            channels (assumed RGB with values in 0..255 - TODO confirm).

    Returns:
        float32 array of shape ``(batch, 3, 64, 64)`` with values divided by 255.
    """
    # Output buffer: one channel-first 3x64x64 image per input frame.
    resized_obs = np.empty((obs_batch.shape[0], 3, 64, 64), dtype=np.float32)

    for i, obs in enumerate(obs_batch):
        # Rows 35..194 only (a 160-row window).
        obs = obs[35:195, :, :]
        obs = cv2.resize(obs, (64, 64), interpolation=cv2.INTER_AREA)
        obs = obs.astype(np.float32) / 255.0
        # (64, 64, 3) -> (3, 64, 64); this matches one slot of the buffer, so
        # no extra leading axis is added.
        resized_obs[i] = np.transpose(obs, (2, 0, 1))

    return resized_obs

In [None]:
def preprocess_observation(obs_batch):
    """Crop, resize and normalise a batch of frames into a channel-first tensor.

    Each frame is resized while it is still channel-last, because
    ``cv2.resize`` treats the last axis of a 3-D array as channels; the
    channel axis is moved to the front only afterwards.

    Args:
        obs_batch: Array indexed as ``[batch, row, col, channel]``
            (assumed to hold pixel values in 0..255 - TODO confirm).

    Returns:
        Float32 tensor of shape ``(batch, channel, 64, 64)`` divided by 255.
    """
    # Rows 35..194 only (a 160-row window).
    cropped = obs_batch[:, 35:195, :, :]
    resized = np.stack([
        cv2.resize(img, (64, 64), interpolation=cv2.INTER_AREA)
        for img in cropped
    ])
    scaled = resized.astype(np.float32) / 255.0
    # (batch, 64, 64, channel) -> (batch, channel, 64, 64)
    channel_first = np.ascontiguousarray(np.transpose(scaled, (0, 3, 1, 2)))
    return torch.from_numpy(channel_first)

In [None]:
def preprocess_observation(obs_batch):
    """Crop a batch of frames and resize each to 64x64 with torchvision.

    The values are NOT rescaled to [0, 1] here.

    Args:
        obs_batch: 4-D tensor indexed as ``[batch, row, col, channel]``.

    Returns:
        Tensor of shape ``(batch, channel, 64, 64)``.
    """
    # Rows 35..194 only (drops the score and border region).
    playfield = obs_batch[:, 35:195, :, :]

    # Resize works on channel-first tensors, hence the per-frame permute.
    resize = transforms.Resize((64, 64))
    resized_frames = [resize(frame.permute(2, 0, 1)) for frame in playfield]

    return torch.stack(resized_frames)


In [None]:
# Target observation shape; the cv2-based single-frame preprocess_observation
# reads entries 1 and 2 as the resize size.
obs_shape = (1, 64, 64)

In [None]:
# Scratch check: shape of the value returned by reset() on a raw gym env.
env = gym.make(env_name)
print(env.reset().shape)

In [None]:
# Scratch check: same inspection through the Environment holder, whose reset()
# returns the wrapped env's reset() result unchanged.
env = Environment(env_name)
obs = env.reset()
print(obs.shape)

In [None]:
# Scratch check: shape of the observation returned by `envs.reset()`.
obs = envs.reset()
print(obs.shape)

In [None]:
# Scratch check: `obs` comes from the single `env`, not from `envs`.
obs = env.reset()
# NOTE(review): this rebinds `envs` to a bare ParallelEnv without the
# PreprocessEnv wrapper - confirm that is intended before reusing `envs`.
envs = ParallelEnv([lambda: Environment(env_name).env for _ in range(num_envs)])
print(obs.shape)

In [None]:
# Runs whichever preprocess_observation definition was executed most recently;
# several cells define that name with different input expectations.
state = preprocess_observation(obs)

In [None]:
# resize_observation iterates over its argument and reads `.shape[0]` as the
# batch size, so `obs` is presumably expected to be a batch of frames - confirm.
state = resize_observation(obs)

In [None]:
# Inspect the shape produced by the preprocessing call above.
print(state.shape)

## Just in case

In [None]:

    #def plot_stats(self):
    #    plt.plot(self.stats['Actor Loss'], label='Actor Loss')
    #    plt.plot(self.stats['Critic Loss'], label='Critic Loss')
    #    plt.plot(self.stats['Returns'], label='Returns')
    #    plt.legend()
    #    plt.show()

    #def test_policy_network(self, env, episodes=2):
    #    with torch.no_grad():
    #        for episode in range(episodes):
    #            state = env.reset()
    #            done = False
    #            total_reward = 0

    #            while not done:
    #                action = self.actor(state).argmax(dim=1).detach()
    #                next_state, reward, done, _ = env.step(action)
    #                total_reward += reward
    #                state = next_state

    #            print(f'Episode {episode+1}: Total reward = {total_reward.item()}')
