Imports

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import random
from collections import namedtuple, deque
import math
import random
from tqdm import tqdm
import os

import matplotlib
import matplotlib.pyplot as plt

from proxy import Proxy

In [None]:
# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# Fixed seeds so Python-level and PyTorch-level randomness repeat between runs.
random.seed(6)
torch.manual_seed(6)

Q-Network & Replay Memory

In [3]:
class DQN(nn.Module):
    """Fully connected Q-network: state vector in, one Q-value per action out.

    Args:
        state_len: number of features in a state vector.
        action_len: number of discrete actions (size of the output layer).
    """

    def __init__(self,state_len,action_len):
        super().__init__()
        # Attribute names are kept as layer1..layer4 so previously saved
        # state_dicts still load.
        self.layer1 = nn.Linear(state_len, 64)
        self.layer2 = nn.Linear(64, 64)
        self.layer3 = nn.Linear(64,64)
        self.layer4 = nn.Linear(64, action_len)

    def forward(self,x):
        """Return raw Q-values; the last dimension of the result is `action_len`."""
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = F.relu(self.layer3(x))
        # No activation on the output layer: Q-values are unbounded regression
        # targets. A ReLU here would clamp every estimate to >= 0, so negative
        # returns could never be represented and the gradient would vanish
        # whenever the pre-activation is negative.
        return self.layer4(x)

In [4]:
# One step of experience, stored as a named record.
# Fields are positional, in this order: state, action, next_state, reward.
#   t = Transition(1, 2, 3, 4)
#   t.state  -> 1
#   t.reward -> 4
Transition = namedtuple('Transition', ['state', 'action', 'next_state', 'reward'])

In [5]:
class ReplayMemory(object):
    """Fixed-capacity FIFO buffer of transitions with uniform random sampling.

    Args:
        capacity: maximum number of stored items; once full, appending a new
            item discards the oldest one (deque `maxlen` behaviour).
    """

    def __init__(self,capacity):
        self.memory = deque([],maxlen=capacity)

    def push(self,*args):
        """Wrap (state, action, next_state, reward) in a Transition and store it."""
        self.memory.append(Transition(*args))

    def sample(self,batch_size):
        """Return `batch_size` distinct stored items chosen uniformly at random.

        Raises:
            ValueError: if `batch_size` exceeds the number of stored items.
        """
        return random.sample(self.memory,batch_size)

    # The sampler used to be defined only under this dunder-style name, while
    # callers use `memory.sample(...)`. Keep the old spelling as an alias.
    __sample__ = sample

    def __len__(self):
        """Number of items currently stored (enables `len(memory)`)."""
        return len(self.memory)

Epsilon Greedy Policy Algorithm

In [6]:
def choose_action(policy_net,state,steps,eps_start=None,eps_end=None,eps_decay=None,n_actions=2):
    """Pick an action with an exponentially decaying epsilon-greedy policy.

    Epsilon starts at `eps_start` when `steps == 0` and decays towards
    `eps_end` as `steps` grows:

        eps = eps_end + (eps_start - eps_end) * exp(-steps / eps_decay)

    Args:
        policy_net: callable mapping `state` to a tensor of Q-values.
        state: input passed unchanged to `policy_net`.
        steps: step counter that drives the decay.
        eps_start, eps_end, eps_decay: schedule parameters; each falls back to
            the module-level EPS_START / EPS_END / EPS_DECAY when left as None.
        n_actions: size of the action space used for random exploration
            (default 2: JUMP or FALL).

    Returns:
        int: index of the chosen action.
    """
    eps_start = EPS_START if eps_start is None else eps_start
    eps_end   = EPS_END if eps_end is None else eps_end
    eps_decay = EPS_DECAY if eps_decay is None else eps_decay

    # The two endpoints were previously swapped, which made epsilon start at
    # eps_end and *grow* towards eps_start (more random actions over time).
    eps = eps_end + (eps_start - eps_end)*math.exp(-steps/eps_decay)

    if random.random() < eps: # Exploration
        return random.randrange(n_actions)

    # Exploitation: greedy action; no gradient is needed for action selection.
    with torch.no_grad():
        return torch.argmax(policy_net(state)).item()

Training Step

In [7]:
def optimize_model(policy_net,target_net,memory,optimizer,batch_size=None,gamma=None):
    """Run one DQN gradient step on a batch sampled from the replay memory.

    Args:
        policy_net: network being trained; maps (B, state_len) -> (B, n_actions).
        target_net: slowly updated copy used to bootstrap next-state values.
        memory: replay buffer supporting `len()` and `sample(batch_size)`
            (the older `__sample__` spelling is also accepted). Each sampled
            item unpacks to (state, action, next_state, reward), where
            `next_state` is None for terminal transitions.
        optimizer: optimizer over `policy_net`'s parameters.
        batch_size: transitions per update; defaults to the global BATCH_SIZE.
        gamma: discount factor; defaults to the global GAMMA.

    Returns:
        None. Does nothing until the memory holds at least `batch_size` items.
    """
    batch_size = BATCH_SIZE if batch_size is None else batch_size
    gamma = GAMMA if gamma is None else gamma

    if len(memory) < batch_size:
        return

    draw = getattr(memory, "sample", None) or memory.__sample__
    transitions = draw(batch_size)

    # List of transitions -> one tuple per field:
    # (s1,s2,..), (a1,a2,..), (s1',s2',..), (r1,r2,..)
    states, actions, next_states, rewards = zip(*transitions)
    n = len(transitions)

    # Build a (B, state_len) batch. Each state is flattened and *stacked*:
    # torch.cat on 1-D states would glue them into one long vector instead of
    # one row per transition. Stored (1, state_len) states work as well.
    state_batch = torch.stack([s.reshape(-1) for s in states])
    dev = state_batch.device

    # Actions may be stored as Python ints or as single-element tensors;
    # gather() needs a (B, 1) int64 index tensor.
    action_batch = torch.tensor([int(a) for a in actions],device=dev,dtype=torch.long).unsqueeze(1)
    reward_batch = torch.cat([torch.as_tensor(r,dtype=torch.float32,device=dev).reshape(-1) for r in rewards])

    # policy_net gives Q(s, .) for every action; gather keeps only the Q-value
    # of the action actually taken in each transition -> shape (B, 1).
    state_action_values = policy_net(state_batch).gather(1,action_batch)

    # Bootstrapped value of the next state from the target network.
    # Terminal transitions (next_state is None) contribute 0.
    non_final_mask = torch.tensor([s is not None for s in next_states],device=dev,dtype=torch.bool)
    next_state_values = torch.zeros(n,device=dev)
    if non_final_mask.any(): # torch.stack([]) would raise on an all-terminal batch
        non_final_states = torch.stack([s.reshape(-1) for s in next_states if s is not None])
        with torch.no_grad():
            # .max(1).values -> best Q-value per next state
            next_state_values[non_final_mask] = target_net(non_final_states).max(1).values

    # TD target: r + gamma * max_a' Q_target(s', a')
    expected_state_action_values = reward_batch + (gamma*next_state_values)

    # Standard backpropagation with the Huber (smooth L1) loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

Hyperparameters

In [8]:
# --- Epsilon-greedy exploration schedule ---
# EPS_START : epsilon at the beginning (favours exploration)
# EPS_END   : epsilon at the end (favours exploitation)
# EPS_DECAY : decay constant; controls how fast exploration ends
EPS_START, EPS_END = 0.99, 0.01
EPS_DECAY = 100

# --- Optimisation ---
BATCH_SIZE = 128  # transitions sampled from replay memory per update
GAMMA      = 0.99 # discount factor; higher = long-term reward maximisation
LR         = 1e-3 # learning rate for the optimizer
TAU        = 0.005 # soft-update factor moving target_net towards policy_net

# --- Curriculum ---
NUM_EPS = 50 # number of episodes (levels); the game gets tougher each one

Training

In [9]:
# state_len  : features per state vector (pipe coords - player coords, space
#              between pipes, player vertical speed, player horizontal speed)
# action_len : number of discrete actions (JUMP or FALL)
state_len, action_len = 5, 2

In [10]:
# Online network (trained directly) and target network (used for bootstrapped
# targets). Both are built with the same architecture and moved to `device`.
policy_net = DQN(state_len=state_len, action_len=action_len).to(device)
target_net = DQN(state_len=state_len, action_len=action_len).to(device)

# The target network starts as an exact copy of the policy network and then
# trails a few updates behind it.
target_net.load_state_dict(policy_net.state_dict())

# Experience replay buffer holding up to 10000 transitions.
memory = ReplayMemory(capacity=10000)

# Only the policy network's parameters are optimised.
optimizer = optim.AdamW(params=policy_net.parameters(), lr=LR, amsgrad=True)

In [11]:
# Initial (easiest) simulation parameters; the training loop moves them
# towards the values in env_config_final one step per episode.
#   xGap   - spacing between pipes
#   yGap   - spacing between holes
#   hGap   - hole space
#   xSpeed - player x movement speed
#   ySpeed - player y movement speed
env_config = dict(
    xGap=200,
    yGap=30,
    hGap=250,
    xSpeed=10,
    ySpeed=4,
)

# Handle to the game simulation, started at the initial level.
env = Proxy(level=env_config)

# Index -> command string sent to the environment (action 0 = JUMP, 1 = FALL).
action_map = ["JUMP", "FALL"]

In [12]:
# Simulation parameters at the hardest level.
env_config_final = dict(
    xGap=50,
    yGap=60,
    hGap=100,
    xSpeed=10,
    ySpeed=4,
)

# Per-episode increment for every parameter: (final - initial) floor-divided
# by the number of episodes.
# NOTE(review): `//` is integer floor division, so any parameter whose total
# change is smaller than NUM_EPS gets a step of 0 and never moves (with
# yGap 30 -> 60 and NUM_EPS = 50 the step is 0) - confirm this is intended.
env_config_step_size = dict(
    (name, (env_config_final[name] - env_config[name]) // NUM_EPS)
    for name in env_config
)

In [None]:
TARGET_SCORE = 10 # Pillars to cross in a row (no crash) before moving to the next level

for i in range(NUM_EPS):
    state_info = env.update("FALL") # Dummy action to extract the initial state info
    const_info = [env_config["ySpeed"],env_config["xSpeed"],env_config["hGap"]] # Only changes across episodes
    # States are kept as (1, state_len) rows so they can be concatenated into a batch.
    state = torch.tensor(state_info["pos"] + const_info,device=device,dtype=torch.float32).unsqueeze(0)

    survial_score = 0 # Score since the last crash; the episode ends at TARGET_SCORE
    survival_time = 0 # Steps taken since the last crash (display only)
    actions_done  = 0 # Actions taken this episode; drives the epsilon decay

    with tqdm(total=TARGET_SCORE, desc=f"Episode-{i} Survival Progress", leave=False) as pbar: # Loading bar
        max_survival_score = 0
        while survial_score < TARGET_SCORE :
            # Loop counters
            actions_done  += 1
            survival_time += 1

            # Perform action & observe.
            # Epsilon decays with the total number of actions taken, not with
            # survival_time, which is reset to 0 on every crash.
            action = choose_action(policy_net,state,actions_done)
            state_info = env.update(action_map[action])

            done  = state_info['over']
            score = state_info['score']

            survial_score += score

            if done :
                survival_time = 0
                survial_score = 0

            # Next state
            next_state = torch.tensor(state_info["pos"] + const_info,device=device,dtype=torch.float32).unsqueeze(0)
            # Reward: +10 per point scored this step, -200 when the game is over
            reward = torch.tensor([score*10 - (200 if done else 0)],device=device,dtype=torch.float32)

            # Store in memory. The action is stored as a (1, 1) int64 tensor so a
            # batch of actions can be used directly as a gather() index.
            action_tensor = torch.tensor([[action]],device=device,dtype=torch.long)
            memory.push(state,action_tensor,None if done else next_state,reward)

            # Move to next state
            state = next_state

            # One gradient step on the policy network (no-op until the replay
            # memory holds BATCH_SIZE transitions). Without this call the
            # networks are never trained.
            optimize_model(policy_net,target_net,memory,optimizer)

            # Soft update
            # Target_Net_Wts = (1-TAU)*Target_Net_Wts + TAU*Policy_Net_Wts
            target_dict = target_net.state_dict()
            policy_dict = policy_net.state_dict()

            for key in policy_dict:
                target_dict[key] = policy_dict[key]*TAU + target_dict[key]*(1-TAU)
            # Load once, after every entry has been blended (not once per key).
            target_net.load_state_dict(target_dict)

            # Printing progress
            pbar.update(score)
            max_survival_score = max(max_survival_score,survial_score)
            pbar.set_postfix_str(f"Steps: {survival_time}, Max Score: {max_survival_score}")

            if done :
                pbar.reset()

    # Moving on to the next level
    # Updating params based on step size
    env_config = {key : env_config[key] + env_config_step_size[key] for key in env_config.keys()}
    env.levelUp(env_config)

In [14]:
# Stop the training environment now that the training loop has finished.
# `env` is the Proxy instance created earlier in the notebook.
env.exit()

Saving Models

In [16]:
def save_models(target_net, policy_net, models_dir='models'):
    """Save both networks' state_dicts into a new numbered run folder.

    Run folders live directly under `models_dir` and are named "0", "1", ...
    The new folder gets the next number after the largest existing one.

    Args:
        target_net: module whose state_dict is written to `target_net.pth`.
        policy_net: module whose state_dict is written to `policy_net.pth`.
        models_dir: parent directory for run folders; created if missing.
    """
    # Ensure the models directory exists
    os.makedirs(models_dir, exist_ok=True)

    # Collect the numeric run folders only. Other directories (for example
    # Jupyter's ".ipynb_checkpoints") are ignored instead of crashing int().
    run_ids = [
        int(name)
        for name in os.listdir(models_dir)
        if name.isdecimal() and os.path.isdir(os.path.join(models_dir, name))
    ]
    new_folder = str(max(run_ids) + 1) if run_ids else '0'

    # Create the new folder
    new_folder_path = os.path.join(models_dir, new_folder)
    os.makedirs(new_folder_path)

    # Save the target_net and policy_net in the new folder
    torch.save(target_net.state_dict(), os.path.join(new_folder_path, 'target_net.pth'))
    torch.save(policy_net.state_dict(), os.path.join(new_folder_path, 'policy_net.pth'))

    print(f"Models saved in folder: {new_folder_path}")

In [None]:
# Write the current weights of both networks to a new numbered folder under
# the default 'models' directory.
save_models(target_net, policy_net)

Testing 

In [1]:
def load_models(target_net, policy_net, folder_name, models_dir='models', map_location='cpu'):
    """Load saved weights from `models_dir/folder_name` into both networks in place.

    Args:
        target_net: module that receives the weights from `target_net.pth`.
        policy_net: module that receives the weights from `policy_net.pth`.
        folder_name: name of the run folder (e.g. '0'); an int is accepted too.
        models_dir: parent directory that contains the run folders.
        map_location: forwarded to `torch.load`. The default 'cpu' lets a
            checkpoint saved on a GPU be opened on a machine without CUDA;
            `load_state_dict` then copies the values onto whatever device each
            network's parameters already live on.

    Raises:
        FileNotFoundError: if the run folder does not exist.
    """
    # Construct the path to the specific folder inside models
    folder_path = os.path.join(models_dir, str(folder_name))

    # Check if the folder exists
    if not os.path.isdir(folder_path):
        raise FileNotFoundError(f"Folder {folder_name} not found in {models_dir}")

    checkpoints = (
        ("Target_Net", target_net, 'target_net.pth'),
        ("Policy_Net", policy_net, 'policy_net.pth'),
    )
    for label, net, filename in checkpoints:
        # SECURITY: torch.load unpickles the file - only load checkpoints you
        # trust (consider weights_only=True where the installed torch has it).
        state = torch.load(os.path.join(folder_path, filename), map_location=map_location)
        net.load_state_dict(state)
        print(f"{label} loaded from {folder_path}")

In [None]:
# Name of the run folder (under the default 'models' directory) to restore.
folder_name = '0' 
# Overwrites the weights of the existing target_net / policy_net objects in place.
load_models(target_net, policy_net, folder_name)
