In [16]:
import gym #Environment

#Policy 
import torch
import torch.nn as nn
import torch.functional as F
import torch.optim as optim

import random
import math
import numpy

from collections import deque #For replay buffer
torch.set_default_device('cpu')#'cuda' if torch.cuda.is_available() else 'cpu')
torch.get_default_device(), numpy.__version__

(device(type='cpu'), '1.24.4')

In [17]:
import os
print(os.environ['CONDA_DEFAULT_ENV']) # checking current environment

rl_env


In [18]:
GAMMA = 0.99       # Discount Factor
LR = 0.001         # Learning Rate
BATCH_SIZE = 64    # Batch Size for Training
EPSILON = 1.0      # Exploration Rate
EPSILON_MIN = 0.01 # Minimum Exploration Rate
EPSILON_DECAY = 0.995 # Exploration Decay
MEMORY_SIZE = 10000  # Replay Buffer Size
TARGET_UPDATE = 10  # Update Target Network Every X Episodes

In [19]:
env = gym.make("CartPole-v1", render_mode='human')

In [20]:
STATE_SIZE = env.observation_space.shape[0] #ordered set (cart position, cart velocity, pole angle, and pole angular velocity)
ACTION_SIZE = env.action_space.n #LEFT or RIGHT
STATE_SIZE, ACTION_SIZE

(4, 2)

In [21]:
env.observation_space

Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)

In [22]:
class DQN(nn.Module):
    def __init__(self, state_space_size, action_space_size, hidden_units):
        super(DQN, self).__init__()
        self.state_space_size = state_space_size
        self.action_space_size = action_space_size
        self.block = nn.Sequential(
            nn.Linear(state_space_size, hidden_units),
            nn.Linear(hidden_units, hidden_units),
            nn.Linear(hidden_units, action_space_size)
        )

    def forward(self,state):
        x = self.block(state)
        return x
dqn = DQN(STATE_SIZE,ACTION_SIZE, 24) 

In [23]:
class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)
    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))
    def sample(self, batch_size): #Random sampling
        return random.sample(self.buffer, batch_size) 
    def __len__(self):
        return len(self.buffer)

memory = ReplayBuffer(MEMORY_SIZE)
len(memory)

0

In [24]:
state = torch.FloatTensor(env.reset()[0]).unsqueeze(dim=0)
dqn(state)

tensor([[ 0.0750, -0.1241]], grad_fn=<AddmmBackward0>)

In [27]:
def train_model():
    if len(memory) < BATCH_SIZE: #To chekc if theres enough experience
        return

    batch = memory.sample(BATCH_SIZE)
    states, actions, rewards, next_states, dones = zip(*batch)
    # print(states)
    
    states = torch.FloatTensor(states)
    actions = torch.LongTensor(actions).unsqueeze(1)
    rewards = torch.FloatTensor(rewards)
    next_states = torch.FloatTensor(next_states)
    dones = torch.tensor(dones, dtype=torch.float32)



    current_q_values = policy_net(states).gather(1, actions).squeeze()

    next_q_values = target_net(next_states).max(1)[0].detach()
    expected_q_values = rewards + GAMMA * next_q_values * (1 - dones)

    loss = torch.nn.functional.mse_loss(current_q_values, expected_q_values)

    optimizer.zero_grad()  # Reset gradients
    loss.backward()        # Compute gradients
    optimizer.step()       # Update weights

    

In [28]:
policy_net = DQN(STATE_SIZE, ACTION_SIZE, 24)
target_net = DQN(STATE_SIZE, ACTION_SIZE, 24)

target_net.load_state_dict(policy_net.state_dict())  # Copy weights
target_net.eval()

# Optimizer
optimizer = optim.Adam(policy_net.parameters(), lr=LR)

# Training Loop
num_episodes = 1500
epsilon = EPSILON

optimizer = optim.Adam(policy_net.parameters(), lr=LR)


for episode in range(num_episodes):
    state = env.reset()
    state = state[0]  # Get state from tuple
    total_reward = 0

    for t in range(200):
        # Epsilon-greedy action selection
        if random.random() < epsilon:
            action = env.action_space.sample()  # Explore
        else:
            state_tensor = torch.FloatTensor(numpy.array(state)).unsqueeze(0)
            action = policy_net(state_tensor).argmax().item()  # Exploit

        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated
        memory.push(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward
        
        # Train the model
        train_model()
        
        if done:
            break

    # Decay epsilon
    epsilon = max(EPSILON_MIN, epsilon * EPSILON_DECAY)

    # Update Target Network
    if episode % TARGET_UPDATE == 0:
        # Get weights before update
        before_update = policy_net.block[0].weight.clone().detach()
        
        # Perform one training step
        train_model()
        
        # Get weights after update
        after_update = policy_net.block[0].weight.clone().detach()
        
        # Check if they changed
        print("Weight Change:", torch.norm(after_update - before_update).item())
        target_net.load_state_dict(policy_net.state_dict())
        
    # print(f"Episode {episode}, ")
    print(f"Episode {episode}: Total Reward = {total_reward} Epsilon: {epsilon:.4f}")

Weight Change: 0.008997206576168537
Episode 0: Total Reward = 47.0 Epsilon: 0.9950
Episode 1: Total Reward = 56.0 Epsilon: 0.9900
Episode 2: Total Reward = 13.0 Epsilon: 0.9851
Episode 3: Total Reward = 17.0 Epsilon: 0.9801
Episode 4: Total Reward = 26.0 Epsilon: 0.9752
Episode 5: Total Reward = 40.0 Epsilon: 0.9704
Episode 6: Total Reward = 13.0 Epsilon: 0.9655
Episode 7: Total Reward = 19.0 Epsilon: 0.9607
Episode 8: Total Reward = 16.0 Epsilon: 0.9559
Episode 9: Total Reward = 34.0 Epsilon: 0.9511
Weight Change: 0.002804040675982833
Episode 10: Total Reward = 24.0 Epsilon: 0.9464
Episode 11: Total Reward = 35.0 Epsilon: 0.9416
Episode 12: Total Reward = 18.0 Epsilon: 0.9369
Episode 13: Total Reward = 16.0 Epsilon: 0.9322
Episode 14: Total Reward = 11.0 Epsilon: 0.9276
Episode 15: Total Reward = 15.0 Epsilon: 0.9229
Episode 16: Total Reward = 16.0 Epsilon: 0.9183
Episode 17: Total Reward = 31.0 Epsilon: 0.9137
Episode 18: Total Reward = 49.0 Epsilon: 0.9092
Episode 19: Total Reward =

In [29]:
def test_agent():
    state = env.reset()
    state = state[0]
    done = False
    total_reward = 0

    while not done:
        env.render()
        state_tensor = torch.FloatTensor(numpy.array(state)).unsqueeze(0)
        action = policy_net(state_tensor).argmax().item()
        state, reward, done, _, _ = env.step(action)
        total_reward += reward

    print(f"Test Episode Reward: {total_reward}")

# Run the trained agent
test_agent()
env.close()


Test Episode Reward: 10.0


In [1]:
!pip install --upgrade pip setuptools wheel


Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Collecting setuptools
  Downloading setuptools-78.1.0-py3-none-any.whl.metadata (6.6 kB)
Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ---------------------------------------- 1.8/1.8 MB 11.3 MB/s eta 0:00:00
Downloading setuptools-78.1.0-py3-none-any.whl (1.3 MB)
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
   ---------------------------------------- 1.3/1.3 MB 10.6 MB/s eta 0:00:00


ERROR: To modify pip, please run the following command:
C:\Users\91748\.conda\envs\rl_env\python.exe -m pip install --upgrade pip setuptools wheel


In [3]:
!C:\Users\91748\.conda\envs\rl_env\python.exe -m pip install --upgrade pip setuptools wheel



In [4]:
!pip install pygame --pre --no-cache-dir

Collecting pygame
  Downloading pygame-2.6.1-cp311-cp311-win_amd64.whl.metadata (13 kB)
Downloading pygame-2.6.1-cp311-cp311-win_amd64.whl (10.6 MB)
   ---------------------------------------- 0.0/10.6 MB ? eta -:--:--
   ------- -------------------------------- 2.1/10.6 MB 13.0 MB/s eta 0:00:01
   ----------------- ---------------------- 4.7/10.6 MB 11.9 MB/s eta 0:00:01
   -------------------------- ------------- 7.1/10.6 MB 12.1 MB/s eta 0:00:01
   ----------------------------------- ---- 9.4/10.6 MB 12.0 MB/s eta 0:00:01
   ---------------------------------------- 10.6/10.6 MB 12.0 MB/s eta 0:00:00
Installing collected packages: pygame
Successfully installed pygame-2.6.1


In [5]:
!pip install setuptools





In [36]:
!pip install --upgrade numpy==1.24.4





In [19]:
!pip install numba==0.58.0

Collecting numba==0.58.0
  Downloading numba-0.58.0-cp311-cp311-win_amd64.whl.metadata (2.8 kB)
Collecting llvmlite<0.42,>=0.41.0dev0 (from numba==0.58.0)
  Downloading llvmlite-0.41.1-cp311-cp311-win_amd64.whl.metadata (4.9 kB)
Downloading numba-0.58.0-cp311-cp311-win_amd64.whl (2.6 MB)
   ---------------------------------------- 0.0/2.6 MB ? eta -:--:--
   ---------------- ----------------------- 1.0/2.6 MB 12.7 MB/s eta 0:00:01
   -------------------------------- ------- 2.1/2.6 MB 5.3 MB/s eta 0:00:01
   ---------------------------------------- 2.6/2.6 MB 4.3 MB/s eta 0:00:00
Downloading llvmlite-0.41.1-cp311-cp311-win_amd64.whl (28.1 MB)
   ---------------------------------------- 0.0/28.1 MB ? eta -:--:--
   --- ------------------------------------ 2.4/28.1 MB 12.2 MB/s eta 0:00:03
   ------ --------------------------------- 4.7/28.1 MB 11.9 MB/s eta 0:00:02
   ---------- ----------------------------- 7.3/28.1 MB 11.9 MB/s eta 0:00:02
   ------------- -------------------------- 9

In [21]:
!pip install numpy==1.23.5


Collecting numpy==1.23.5
  Using cached numpy-1.23.5-cp311-cp311-win_amd64.whl.metadata (2.3 kB)
Using cached numpy-1.23.5-cp311-cp311-win_amd64.whl (14.6 MB)
Installing collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.24.4
    Uninstalling numpy-1.24.4:
      Successfully uninstalled numpy-1.24.4
Successfully installed numpy-1.23.5
