### Import Packages

In [None]:
from mlagents_envs.environment import UnityEnvironment
from mlagents_envs.envs.unity_parallel_env import UnityParallelEnv
from mlagents_envs.envs.custom_side_channel import CustomDataChannel, StringSideChannel
from uuid import UUID

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
import torch.optim as optim

import cv2
import random

from collections import deque
import sys

### Curriculum Learning Module

In [None]:
class CurriculumLearning:
    """Opponent curriculum for two-player training.

    Exactly one side (``train_player``: ``"LEFT"`` or ``"RIGHT"``) is trained
    at a time; the other side is driven by the opponent of the active course:

    * ``"A"`` - always emits ``[0, 0, 0]``.
    * ``"B"`` - three independent uniform draws from ``{0, 1, 2}``.
    * ``"C"`` - a fixed action that depends on which side is being trained.
    * ``"D"`` - a frozen copy of the other side's PPO agent.

    Attributes:
        train_player: Side currently being trained.
        course: Identifier of the active course.
        frozen_left_agent / frozen_right_agent: Deep copies taken by
            ``switch_course`` when moving from course C to course D.
        play_times: Counter initialised to 0 (not modified by this class).
        course_score: Points credited to the trained side in this course.
        course_grade: Score needed to pass each course.
    """

    def __init__(self):
        self.train_player = "LEFT"
        self.course = "A"

        self.frozen_left_agent = None
        self.frozen_right_agent = None

        self.play_times = 0

        self.course_score = 0
        self.course_grade = {
            "A": 21,
            "B": 21,
            "C": 21,
            "D": 19,
        }

    def reset_score(self):
        """Set the running course score back to zero."""
        self.course_score = 0

    def courseA_agent(self):
        """Return the constant course-A action ``[0, 0, 0]``."""
        return [0, 0, 0]

    def courseB_agent(self):
        """Return three independent random branch choices in ``{0, 1, 2}``."""
        return [np.random.randint(0, 3) for _ in range(3)]

    def courseC_agent(self):
        """Return the fixed course-C action for the side that is NOT trained.

        Raises:
            ValueError: If ``train_player`` is neither ``"LEFT"`` nor ``"RIGHT"``.
        """
        if self.train_player == "LEFT":
            return [0, 2, 0]
        if self.train_player == "RIGHT":
            return [0, 1, 0]
        # Previously fell through and returned None, which only failed later
        # when the action reached the environment.
        raise ValueError(f"Unknown train_player: {self.train_player!r}")

    def courseD_agent(self, obs):
        """Return the frozen opponent's action for ``obs``.

        Args:
            obs: Observation forwarded unchanged to the frozen agent's
                ``select_action``.

        Raises:
            ValueError: If ``train_player`` is neither ``"LEFT"`` nor ``"RIGHT"``.
            RuntimeError: If the required frozen agent has not been created
                yet (``switch_course`` creates it when leaving course C).
        """
        if self.train_player == "LEFT":
            opponent = self.frozen_right_agent
        elif self.train_player == "RIGHT":
            opponent = self.frozen_left_agent
        else:
            raise ValueError(f"Unknown train_player: {self.train_player!r}")

        if opponent is None:
            raise RuntimeError(
                "Course D needs a frozen opponent; call switch_course() from "
                "course C before requesting course-D actions."
            )

        with torch.no_grad():
            actions, _, _ = opponent.select_action(obs)
        return [actions[0], actions[1], actions[2]]

    def current_course(self, obs=None):
        """Return the opponent action for the active course.

        Args:
            obs: Observation for the frozen opponent; only used by course D.

        Raises:
            ValueError: If ``course`` is not one of ``"A"``-``"D"`` (the
                original raised an accidental ``UnboundLocalError`` here).
        """
        if self.course == "A":
            return self.courseA_agent()
        if self.course == "B":
            return self.courseB_agent()
        if self.course == "C":
            return self.courseC_agent()
        if self.course == "D":
            return self.courseD_agent(obs)
        raise ValueError(f"Unknown course: {self.course!r}")

    def update_score(self, reward, done):
        """Credit one point when a finished rally (``done``) ended with reward 1."""
        if done and reward == 1:
            self.course_score += 1

    def course_grad(self):
        """Grade the current course, swap the trained side on a pass, reset the score.

        A pass means ``course_score`` reached the threshold stored in
        ``course_grade`` for the active course.
        """
        threshold = self.course_grade[self.course]

        if self.course_score >= threshold:
            print(f"The Course {self.course} passed by {self.train_player} player.")
            print(f"The grade of this course is {self.course_score}/21 ")
            if self.train_player == "LEFT":
                self.train_player = "RIGHT"
            elif self.train_player == "RIGHT":
                self.train_player = "LEFT"
        else:
            # Only reachable when the score is strictly below the threshold.
            print(f"The {self.train_player} player not to pass this course.")
            print(f"The grade of this course is {self.course_score}/21")
        self.reset_score()

    def switch_course(self, player1_PPO, player2_PPO):
        """Advance to the next course (A -> B -> C -> D).

        When leaving course C, both agents are deep-copied so course D can
        play against frozen snapshots. Course D is the last course: calling
        this method there leaves ``course`` unchanged.

        Args:
            player1_PPO: Left-side agent, snapshotted as ``frozen_left_agent``.
            player2_PPO: Right-side agent, snapshotted as ``frozen_right_agent``.
        """
        if self.course == "A":
            self.course = "B"
        elif self.course == "B":
            self.course = "C"
        elif self.course == "C":
            from copy import deepcopy

            self.frozen_left_agent = deepcopy(player1_PPO)
            self.frozen_right_agent = deepcopy(player2_PPO)
            self.course = "D"

        print(f"Curriculum Learning switch to next level {self.course}!")

In [None]:
class FrameProcessor:
    """Bounded FIFO of frames that can be merged along their last axis.

    Attributes:
        stack_frame: Maximum number of frames retained.
        stack: ``deque`` holding the frames; once full, appending a new frame
            discards the oldest one.
    """

    def __init__(self, stack_frame=4):
        self.stack = deque(maxlen=stack_frame)
        self.stack_frame = stack_frame

    def concat(self):
        """Return the buffered frames joined along the final axis, oldest first."""
        frames = tuple(self.stack)
        return np.concatenate(frames, axis=-1)

### Actor Critic Network

In [None]:
class ResidualBlock(nn.Module):
    """Two shape-preserving 3x3 conv -> BatchNorm -> SiLU stages with a skip sum.

    Args:
        channels: Number of input (and output) feature channels.
    """

    def __init__(self, channels):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(channels)
        self.silu = nn.SiLU()

        # Fix: ``out_channels`` was missing here, so building the block raised TypeError.
        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(channels)

    def forward(self, x):
        """Return ``stage1(x) + stage2(stage1(x))``; the output shape equals ``x``'s."""
        x1 = self.silu(self.bn1(self.conv1(x)))
        x2 = self.silu(self.bn2(self.conv2(x1)))
        # NOTE(review): the skip path starts after the first stage (x1), not at
        # the block input x. Kept as originally written - confirm it is intended.
        return x1 + x2


class ActorCritic(nn.Module):
    """Shared-trunk actor-critic with three categorical action heads.

    The trunk is three strided convolutions followed by two residual blocks,
    then two fully connected layers. The three policy heads and the scalar
    value head all read the same 256-d feature vector.

    Args:
        action_head: Number of choices for each of the three action branches.
        input_shape: ``(height, width)`` of the stacked input frames. The
            size of the first fully connected layer is derived from it
            (18560 features for the default 60x138 input; the previously
            hard-coded 19200 did not match the conv stack's output).
    """

    _IN_CHANNELS = 4
    # (kernel, stride, padding) of the three trunk convolutions, in order.
    _CONV_SPECS = ((4, 2, 1), (4, 2, 1), (6, 1, 0))
    _TRUNK_CHANNELS = 64

    def __init__(self, action_head=(3, 3, 3), input_shape=(60, 138)):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(4, 16, kernel_size=4, stride=2, padding=1),   # 60x138 -> 30x69
            nn.BatchNorm2d(16),
            nn.SiLU(),
            nn.Conv2d(16, 32, kernel_size=4, stride=2, padding=1),  # -> 15x34
            nn.BatchNorm2d(32),
            nn.SiLU(),
            nn.Conv2d(32, 64, kernel_size=6, stride=1, padding=0),  # -> 10x29
            nn.BatchNorm2d(64),
            nn.SiLU(),
            ResidualBlock(64),
            ResidualBlock(64),
            nn.Flatten(),
        )

        self.silu = nn.SiLU()

        self.fc1 = nn.Linear(self._flat_features(input_shape), 512)
        # Fully connected activations are (batch, features): BatchNorm1d, not 2d.
        self.bn1 = nn.BatchNorm1d(512)
        self.fc2 = nn.Linear(512, 256)
        self.bn2 = nn.BatchNorm1d(256)
        self.action_head1 = nn.Linear(256, action_head[0])
        self.action_head2 = nn.Linear(256, action_head[1])
        self.action_head3 = nn.Linear(256, action_head[2])

        self.value_head = nn.Linear(256, 1)

    @staticmethod
    def _flat_features(input_shape):
        """Return the flattened trunk output size for an ``(H, W)`` input.

        Raises:
            ValueError: If the input is too small for the convolution stack.
        """
        height, width = input_shape
        for kernel, stride, padding in ActorCritic._CONV_SPECS:
            height = (height + 2 * padding - kernel) // stride + 1
            width = (width + 2 * padding - kernel) // stride + 1
            if height <= 0 or width <= 0:
                raise ValueError(f"input_shape {tuple(input_shape)} is too small for the conv stack")
        return ActorCritic._TRUNK_CHANNELS * height * width

    def forward(self, x):
        """Compute policy logits and the state value.

        Args:
            x: Batch of stacked frames, either channels-last ``(N, H, W, 4)``
                or channels-first ``(N, 4, H, W)``. Values are divided by 255
                before entering the trunk.

        Returns:
            ``(policy, value)`` where ``policy`` is a 3-tuple of logits
            tensors (one per action branch) and ``value`` has shape ``(N, 1)``.
        """
        # Conv2d expects NCHW; move a trailing channel axis to the front.
        if x.dim() == 4 and x.shape[1] != self._IN_CHANNELS and x.shape[-1] == self._IN_CHANNELS:
            x = x.permute(0, 3, 1, 2)

        x = self.conv(x.float() / 255.0)
        x = self.silu(self.bn1(self.fc1(x)))
        shared_feat = self.silu(self.bn2(self.fc2(x)))

        policy = (
            self.action_head1(shared_feat),
            self.action_head2(shared_feat),
            self.action_head3(shared_feat),
        )

        value = self.value_head(shared_feat)

        return policy, value



### PPO Algorithm

In [None]:
class RolloutBuffer:
    """Plain container of per-step rollout data, one Python list per field."""

    # Field names in the order the lists are created.
    _FIELDS = ("obs", "rewards", "dones", "actions", "logprobs", "values")

    def __init__(self):
        for field_name in self._FIELDS:
            setattr(self, field_name, [])

    def clear(self):
        """Drop all stored data by rebinding every field to a fresh empty list."""
        self.__init__()

In [None]:
class PPO:
    """Clipped-surrogate PPO for a network with several categorical action heads.

    Args:
        net: Module whose forward pass returns ``(logits_tuple, value)``.
        optimizer: Optimizer over ``net``'s parameters.
        device: Device on which input tensors are created.
        gamma: Discount factor.
        lambd: GAE lambda.
        clip_eps: Clipping range of the probability ratio.
        entropy_coef: Weight of the entropy bonus.
        value_loss_coef: Weight of the value loss.
    """

    def __init__(
        self,
        net: "ActorCritic",
        optimizer,
        device,

        gamma = 0.99,
        lambd = 0.95,
        clip_eps = 0.2,
        entropy_coef = 0.01,
        value_loss_coef = 0.5):

        self.net = net
        self.optimizer = optimizer
        self.device = device
        self.gamma = gamma
        self.lambd = lambd
        self.clip_eps = clip_eps
        self.entropy_coef = entropy_coef
        self.value_loss_coef = value_loss_coef

    def select_action(self, stack_obs):
        """Sample one action per head for a single observation.

        Args:
            stack_obs: One un-batched observation; a batch axis is added here.

        Returns:
            ``(actions, logprob, value)``: an integer array with one entry per
            head, the summed log-probability of the sampled actions (float),
            and the value estimate (float).
        """
        stack_obs = torch.tensor(stack_obs, dtype=torch.float32, device=self.device).unsqueeze(0)

        # Run the single-sample forward pass in eval mode: batch-norm layers
        # cannot compute batch statistics from one sample and would otherwise
        # also update their running statistics during acting.
        was_training = self.net.training
        self.net.eval()
        try:
            with torch.no_grad():
                logits, value = self.net(stack_obs)  # ((a,b,c), (a,b,c), (a,b,c)), value
        finally:
            self.net.train(was_training)

        actions = []
        logprobs = []
        for logit in logits:
            dist = Categorical(logits=logit)
            a = dist.sample()
            actions.append(a.item())
            logprobs.append(dist.log_prob(a))

        total_logprob = torch.sum(torch.stack(logprobs))
        return np.array(actions), total_logprob.item(), value.item()

    def compute_gae(self, buffer: "RolloutBuffer", last_value=0):
        """Compute GAE advantages and the matching return targets.

        Args:
            buffer: Object exposing ``rewards``, ``values`` and ``dones`` lists
                of equal length.
            last_value: Value estimate of the state that follows the last
                stored step (bootstrap); ignored when that step is done.

        Returns:
            ``(advantages, returns)`` as two lists aligned with the buffer.
        """
        rewards = buffer.rewards
        values = list(buffer.values) + [last_value]
        dones = buffer.dones

        advantages = []
        gae = 0.0  # Fix: was initialised to a list, which broke the recursion.

        for t in reversed(range(len(rewards))):
            not_done = 1 - dones[t]
            delta = rewards[t] + self.gamma * values[t + 1] * not_done - values[t]
            gae = delta + self.gamma * self.lambd * not_done * gae
            advantages.append(gae)
        # Built back-to-front; one reverse avoids repeated front insertion.
        advantages.reverse()

        returns = [adv + val for adv, val in zip(advantages, values[:-1])]
        return advantages, returns

    def update(self, buffer: "RolloutBuffer", last_value=0):
        """Run one PPO gradient step on the buffered rollout, then clear it.

        Args:
            buffer: Rollout data; ``buffer.obs`` entries must be valid inputs
                for ``net`` once stacked into a batch.
            last_value: Bootstrap value forwarded to ``compute_gae``.
        """
        if not buffer.rewards:
            return  # Nothing collected yet; there is nothing to optimise.

        advantages, returns = self.compute_gae(buffer, last_value)

        # Go through NumPy first: building a tensor from a list of arrays is slow.
        obs = torch.as_tensor(np.asarray(buffer.obs), dtype=torch.float32, device=self.device)
        actions = torch.as_tensor(np.asarray(buffer.actions), dtype=torch.int64, device=self.device)
        old_logprobs = torch.as_tensor(buffer.logprobs, dtype=torch.float32, device=self.device)
        returns = torch.as_tensor(returns, dtype=torch.float32, device=self.device)
        advantages = torch.as_tensor(advantages, dtype=torch.float32, device=self.device)

        logits, values = self.net(obs)
        # (N, 1) -> (N,) so the value loss is element-wise instead of a
        # silently broadcast (N, N) comparison against ``returns``.
        values = values.squeeze(-1)

        dists = [Categorical(logits=head_logits) for head_logits in logits]
        logp = sum(dist.log_prob(actions[:, i]) for i, dist in enumerate(dists))

        ratio = torch.exp(logp - old_logprobs)
        surr1 = ratio * advantages
        surr2 = torch.clamp(ratio, 1 - self.clip_eps, 1 + self.clip_eps) * advantages
        policy_loss = -torch.min(surr1, surr2).mean()
        value_loss = F.mse_loss(values, returns)
        entropy = sum(dist.entropy().mean() for dist in dists)

        loss = policy_loss + self.value_loss_coef * value_loss - self.entropy_coef * entropy
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        buffer.clear()

In [60]:
def preprocess(obs):
    """Collapse the first three planes of ``obs`` into one weighted plane.

    The planes ``obs[0]``, ``obs[1]`` and ``obs[2]`` are combined with the
    weights 0.299, 0.587 and 0.114, and a trailing axis of length 1 is
    appended to the result.
    """
    red, green, blue = obs[0], obs[1], obs[2]
    luminance = 0.299 * red + 0.587 * green + 0.114 * blue
    return luminance[..., None]

### Main Algorithm

In [None]:
# Step size handed to both players' AdamW optimizers.
# NOTE(review): 0.015 is far larger than step sizes commonly used with
# Adam-family optimizers for PPO - confirm this value is intentional.
learning_rate = 0.015

# Run on the GPU whenever this PyTorch build can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Value of the training loop's ``update_freq`` counter that triggers a PPO update.
update_threshold = 21

In [None]:
# --- Environment ------------------------------------------------------------
string_channel = StringSideChannel()
channel = CustomDataChannel()

reward_cum = [0,0]

channel.send_data(serve=212, p1=reward_cum[0], p2=reward_cum[1])

unity_env = UnityEnvironment(r"C:/Users/junmi\Documents\dPickleball BuildFiless (1)\dPickleball BuildFiles/Training\Windows\dp.exe",side_channels=[string_channel, channel])
env = UnityParallelEnv(unity_env)


def _extract_frame(agent_observation):
    """Crop one agent's raw (3, 84, 168) image to (3, 60, 138) and grayscale it.

    Every frame that enters the frame stack goes through this helper, so all
    stacked frames share one shape (the initial frame used to skip the crop).
    """
    frame = agent_observation["observation"][0]  # (3,84,168)
    frame = frame[:, 20:80, 15:153]  # (3,60,138)
    return preprocess(frame)


# Everything after the Unity process is launched runs under try/finally so
# the environment is closed even when setup or training fails.
try:
    current_reward = 0
    previous_reward = 0
    episode_reward = [0,0]

    update_freq = 0

    curriculum_learning = CurriculumLearning()

    action_head = [3,3,3]
    player1_net = ActorCritic(action_head).to(device)
    player2_net = ActorCritic(action_head).to(device)

    optim_player1 = optim.AdamW(player1_net.parameters(), lr = learning_rate, eps = 1e-5)
    optim_player2 = optim.AdamW(player2_net.parameters(), lr = learning_rate, eps = 1e-5)

    player1_PPO = PPO(player1_net,optim_player1,device)
    player2_PPO = PPO(player2_net,optim_player2,device)

    # Fill the whole frame stack with the first frame so the very first
    # network input already has the full number of channels.
    first_frame = _extract_frame(env.reset()['PAgent1?team=0?agent_id=0'])
    frame_processor = FrameProcessor(4)
    for _ in range(frame_processor.stack_frame):
        frame_processor.stack.append(first_frame)
    frame_stack = frame_processor.concat()

    step = 0
    player1_buffer = RolloutBuffer()
    player2_buffer = RolloutBuffer()

    # NOTE(review): update_score(), course_grad() and switch_course() are never
    # called in this loop, so the trained side and the course never change
    # during a run - confirm where the curriculum is meant to advance.
    while env.agents:
        # Resolve the agent ids before stepping; they are reused after the step.
        left_id, right_id = env.agents[0], env.agents[1]
        training_left = curriculum_learning.train_player == "LEFT"

        if training_left:
            learner_PPO, learner_buffer, learner_id = player1_PPO, player1_buffer, left_id
        else:
            learner_PPO, learner_buffer, learner_id = player2_PPO, player2_buffer, right_id

        # The trained side samples from its policy; the other side is driven by
        # the curriculum opponent (course D needs the observation as well).
        learner_actions, logprob, value = learner_PPO.select_action(frame_stack)
        opponent_actions = curriculum_learning.current_course(frame_stack)

        learner_action = [learner_actions[0],learner_actions[1],learner_actions[2]]
        opponent_action = [opponent_actions[0],opponent_actions[1],opponent_actions[2]]

        if training_left:
            action_left, action_right = learner_action, opponent_action
        else:
            action_left, action_right = opponent_action, learner_action

        actions = {left_id:action_left, right_id:action_right}

        # The state stored in the buffer is the stack the action was sampled from.
        state = np.asarray(frame_stack, dtype=np.float32)

        observation, reward, done, info = env.step(actions)

        obs = _extract_frame(observation[left_id])

        dones = 0
        reward_cum[0] += reward[left_id]
        reward_cum[1] += reward[right_id]

        current_reward = reward_cum[0]

        # A change in the left player's cumulative reward marks the end of a
        # segment and counts towards the next PPO update.
        if current_reward != previous_reward:
            update_freq += 1
            dones = 1

        previous_reward = current_reward

        learner_buffer.obs.append(state)
        learner_buffer.rewards.append(reward[learner_id])
        learner_buffer.dones.append(dones)
        learner_buffer.actions.append(learner_action)
        learner_buffer.logprobs.append(logprob)
        learner_buffer.values.append(value)

        # Push a new frame into the stack on every 4th step.
        if step % 4 == 0:
            frame_processor.stack.append(obs)
        step += 1  # Fix: the counter was never advanced.
        frame_stack = frame_processor.concat()

        if done[left_id] or done[right_id]:
            break  # The ``finally`` clause below closes the environment.

        if update_freq == update_threshold:
            # Bootstrap from the value of the state after the last stored step.
            # select_action evaluates the network without tracking gradients.
            _, _, last_value = learner_PPO.select_action(frame_stack)
            learner_PPO.update(learner_buffer, last_value)
            update_freq = 0

except KeyboardInterrupt:
    print("Training interrupted")
finally:
    env.close()

(3, 84, 168)
Training interrupted
