In [1]:
import gymnasium as gym
import random
import numpy as np
from collections import deque

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

In [8]:
# -----------------------------
# Hyperparameters
# -----------------------------
ENV_NAME = "CartPole-v1"
GAMMA = 0.99
LR = 1e-3
BATCH_SIZE = 64
MEMORY_SIZE = 10_000
EPS_START = 1.0
EPS_END = 0.05
EPS_DECAY = 500
NUM_EPISODES = 200
TARGET_UPDATE = 10

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# -----------------------------
# Q-Network
# -----------------------------
class QNetwork(nn.Module):
    def __init__(self, state_dim, action_dim):
        super().__init__()
        self.fc1 = nn.Linear(state_dim, 128)
        self.fc2 = nn.Linear(128, 128)
        self.out = nn.Linear(128, action_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.out(x)

In [5]:
# -----------------------------
# Replay Buffer
# -----------------------------
class ReplayBuffer:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.buffer)

In [6]:
# -----------------------------
# Environment
# -----------------------------
env = gym.make(ENV_NAME)

state_dim = env.observation_space.shape[0]
action_dim = env.action_space.n

policy_net = QNetwork(state_dim, action_dim).to(device)
target_net = QNetwork(state_dim, action_dim).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(policy_net.parameters(), lr=LR)
memory = ReplayBuffer(MEMORY_SIZE)

steps_done = 0

def epsilon(step):
    return EPS_END + (EPS_START - EPS_END) * np.exp(-step / EPS_DECAY)

In [7]:
# -----------------------------
# Training loop
# -----------------------------
for episode in range(NUM_EPISODES):
    state, info = env.reset()
    total_reward = 0

    while True:
        steps_done += 1

        # ε-greedy action selection
        if random.random() < epsilon(steps_done):
            action = env.action_space.sample()
        else:
            with torch.no_grad():
                s = torch.FloatTensor(state).unsqueeze(0).to(device)
                action = policy_net(s).argmax().item()

        next_state, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        memory.push(state, action, reward, next_state, done)
        state = next_state
        total_reward += reward

        # Learning step
        if len(memory) >= BATCH_SIZE:
            states, actions, rewards, next_states, dones = memory.sample(BATCH_SIZE)

            states = torch.FloatTensor(states).to(device)
            actions = torch.LongTensor(actions).unsqueeze(1).to(device)
            rewards = torch.FloatTensor(rewards).unsqueeze(1).to(device)
            next_states = torch.FloatTensor(next_states).to(device)
            dones = torch.FloatTensor(dones).unsqueeze(1).to(device)

            q_values = policy_net(states).gather(1, actions)
            next_q_values = target_net(next_states).max(1, keepdim=True)[0]
            target = rewards + GAMMA * next_q_values * (1 - dones)

            loss = F.mse_loss(q_values, target)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if done:
            break

    # Target network update
    if episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())

    print(f"Episode {episode:3d} | Reward: {total_reward}")

env.close()

Episode   0 | Reward: 23.0
Episode   1 | Reward: 96.0
Episode   2 | Reward: 14.0
Episode   3 | Reward: 25.0
Episode   4 | Reward: 14.0
Episode   5 | Reward: 10.0
Episode   6 | Reward: 11.0
Episode   7 | Reward: 9.0
Episode   8 | Reward: 18.0
Episode   9 | Reward: 10.0
Episode  10 | Reward: 11.0
Episode  11 | Reward: 14.0
Episode  12 | Reward: 26.0
Episode  13 | Reward: 9.0
Episode  14 | Reward: 17.0
Episode  15 | Reward: 11.0
Episode  16 | Reward: 11.0
Episode  17 | Reward: 15.0
Episode  18 | Reward: 12.0
Episode  19 | Reward: 12.0
Episode  20 | Reward: 17.0
Episode  21 | Reward: 9.0
Episode  22 | Reward: 10.0
Episode  23 | Reward: 17.0
Episode  24 | Reward: 12.0
Episode  25 | Reward: 11.0
Episode  26 | Reward: 11.0
Episode  27 | Reward: 10.0
Episode  28 | Reward: 12.0
Episode  29 | Reward: 10.0
Episode  30 | Reward: 15.0
Episode  31 | Reward: 10.0
Episode  32 | Reward: 48.0
Episode  33 | Reward: 21.0
Episode  34 | Reward: 12.0
Episode  35 | Reward: 11.0
Episode  36 | Reward: 21.0
Epis

In [9]:
torch.save(policy_net.state_dict(), "cartpole_dqn.pt")


## Inference

In [10]:
policy_net = QNetwork(state_dim, action_dim).to(device)
policy_net.load_state_dict(torch.load("cartpole_dqn.pt", map_location=device))
policy_net.eval()   # important!


QNetwork(
  (fc1): Linear(in_features=4, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=128, bias=True)
  (out): Linear(in_features=128, out_features=2, bias=True)
)

In [12]:
# Minimal inference loop (Gymnasium)
# This is pure exploitation: always pick the best action according to the network.
# state → neural net → [Q(left), Q(right)] → argmax

# 0 → push cart LEFT
# 1 → push cart RIGHT

import gymnasium as gym
import torch

env = gym.make("CartPole-v1", render_mode="human")

state, _ = env.reset()
done = False
step = 0

while not done:
    with torch.no_grad():
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        q_values = policy_net(state_tensor)
        action = q_values.argmax().item()

    action_name = "LEFT" if action == 0 else "RIGHT"

    print(
        f"Step {step:3d} | "
        f"Action: {action} ({action_name}) | "
        f"Q-values: {q_values.squeeze().tolist()}"
    )

    state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    step += 1

env.close()




Step   0 | Action: 0 (LEFT) | Q-values: [41.635677337646484, 41.62831115722656]
Step   1 | Action: 1 (RIGHT) | Q-values: [41.72291946411133, 41.74708938598633]
Step   2 | Action: 0 (LEFT) | Q-values: [41.62157440185547, 41.61455535888672]
Step   3 | Action: 1 (RIGHT) | Q-values: [41.709842681884766, 41.734371185302734]
Step   4 | Action: 0 (LEFT) | Q-values: [41.60946273803711, 41.602787017822266]
Step   5 | Action: 1 (RIGHT) | Q-values: [41.6986083984375, 41.72348403930664]
Step   6 | Action: 0 (LEFT) | Q-values: [41.59904861450195, 41.592716217041016]
Step   7 | Action: 1 (RIGHT) | Q-values: [41.68894577026367, 41.71415710449219]
Step   8 | Action: 0 (LEFT) | Q-values: [41.590415954589844, 41.58393859863281]
Step   9 | Action: 1 (RIGHT) | Q-values: [41.68063735961914, 41.7061653137207]
Step  10 | Action: 0 (LEFT) | Q-values: [41.584556579589844, 41.57572555541992]
Step  11 | Action: 1 (RIGHT) | Q-values: [41.67348861694336, 41.69931411743164]
Step  12 | Action: 0 (LEFT) | Q-values: [

In [13]:


env = gym.make("CartPole-v1", render_mode="human")

state, _ = env.reset()
done = False
step = 0

while not done:
    with torch.no_grad():
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        q_values = policy_net(state_tensor)
        probs = F.softmax(q_values, dim=-1)

        action = q_values.argmax().item()

    p_left, p_right = probs.squeeze().tolist()
    action_name = "LEFT" if action == 0 else "RIGHT"

    print(
        f"Step {step:3d} | "
        f"Action: {action} ({action_name}) | "
        f"Q: {q_values.squeeze().tolist()} | "
        f"P: [LEFT={p_left:.2f}, RIGHT={p_right:.2f}]"
    )

    state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    step += 1

env.close()


Step   0 | Action: 1 (RIGHT) | Q: [41.401100158691406, 41.464542388916016] | P: [LEFT=0.48, RIGHT=0.52]
Step   1 | Action: 0 (LEFT) | Q: [41.35016632080078, 41.337806701660156] | P: [LEFT=0.50, RIGHT=0.50]
Step   2 | Action: 1 (RIGHT) | Q: [41.42356491088867, 41.48436737060547] | P: [LEFT=0.48, RIGHT=0.52]
Step   3 | Action: 0 (LEFT) | Q: [41.368709564208984, 41.35772705078125] | P: [LEFT=0.50, RIGHT=0.50]
Step   4 | Action: 1 (RIGHT) | Q: [41.44454574584961, 41.502376556396484] | P: [LEFT=0.49, RIGHT=0.51]
Step   5 | Action: 0 (LEFT) | Q: [41.386837005615234, 41.37571716308594] | P: [LEFT=0.50, RIGHT=0.50]
Step   6 | Action: 1 (RIGHT) | Q: [41.46451950073242, 41.51900100708008] | P: [LEFT=0.49, RIGHT=0.51]
Step   7 | Action: 0 (LEFT) | Q: [41.40493392944336, 41.3922233581543] | P: [LEFT=0.50, RIGHT=0.50]
Step   8 | Action: 1 (RIGHT) | Q: [41.483943939208984, 41.53464889526367] | P: [LEFT=0.49, RIGHT=0.51]
Step   9 | Action: 0 (LEFT) | Q: [41.42338562011719, 41.40763854980469] | P: [LE

In [17]:
import gymnasium as gym
import torch
import torch.nn.functional as F

env = gym.make("CartPole-v1", render_mode="human")

state, _ = env.reset()
done = False
step = 0

while not done:
    with torch.no_grad():
        state_tensor = torch.FloatTensor(state).unsqueeze(0)
        q_values = policy_net(state_tensor)
        action = q_values.argmax().item()

    state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    step += 1

# ---- print reason AFTER episode ends ----
if terminated:
    reason = "TERMINATED: pole fell or cart out of bounds"
elif truncated:
    reason = "TRUNCATED: time limit reached"
else:
    reason = "UNKNOWN"

print(f"Episode finished at step {step}")
print(f"Reason: {reason}")

env.close()


Episode finished at step 251
Reason: TERMINATED: pole fell or cart out of bounds


In [19]:
total_steps = 5000  # total steps across episodes
steps_done = 0

env = gym.make("CartPole-v1") # Removed render_mode="human"

while steps_done < total_steps:
    state, _ = env.reset()
    done = False
    episode_steps = 0

    while not done and steps_done < total_steps:
        with torch.no_grad():
            state_tensor = torch.FloatTensor(state).unsqueeze(0)
            q_values = policy_net(state_tensor)
            action = q_values.argmax().item()

        state, reward, terminated, truncated, _ = env.step(action)
        done = terminated or truncated
        episode_steps += 1
        steps_done += 1

    print(f"Episode finished in {episode_steps} steps | Reason: {'TERMINATED' if terminated else 'TRUNCATED'}")

env.close()

Episode finished in 211 steps | Reason: TERMINATED
Episode finished in 274 steps | Reason: TERMINATED
Episode finished in 203 steps | Reason: TERMINATED
Episode finished in 235 steps | Reason: TERMINATED
Episode finished in 500 steps | Reason: TRUNCATED
Episode finished in 235 steps | Reason: TERMINATED
Episode finished in 500 steps | Reason: TRUNCATED
Episode finished in 187 steps | Reason: TERMINATED
Episode finished in 209 steps | Reason: TERMINATED
Episode finished in 259 steps | Reason: TERMINATED
Episode finished in 276 steps | Reason: TERMINATED
Episode finished in 500 steps | Reason: TRUNCATED
Episode finished in 203 steps | Reason: TERMINATED
Episode finished in 269 steps | Reason: TERMINATED
Episode finished in 423 steps | Reason: TERMINATED
Episode finished in 366 steps | Reason: TERMINATED
Episode finished in 150 steps | Reason: TRUNCATED
