In [None]:


import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
import imageio
import os
import gradio as gr


class QNetwork(nn.Module):
    """Fully connected network mapping a state vector to one Q-value per action.

    Architecture: ``state_size -> 128 -> 128 -> action_size`` with ReLU after
    each of the two hidden layers and no activation on the output.
    """

    def __init__(self, state_size, action_size):
        """Create the three linear layers.

        Args:
            state_size: Number of input features.
            action_size: Number of outputs (one per action).
        """
        super().__init__()
        hidden_units = 128
        # Attribute names double as state_dict keys, so they stay fc1..fc3.
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_size)

    def forward(self, x):
        """Return the raw (un-activated) output of the last layer for ``x``."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)


def _dqn_update(model, target_model, optimizer, criterion, batch, gamma):
    """Run one gradient step of the one-step Q-learning loss on a minibatch.

    Args:
        model: Online Q-network being optimized.
        target_model: Network used to evaluate the bootstrap target.
        optimizer: Optimizer bound to ``model``'s parameters.
        criterion: Loss comparing predicted Q-values with targets.
        batch: Sequence of ``(state, action, reward, next_state, terminal)``
            tuples sampled from the replay buffer.
        gamma: Discount factor.

    Returns:
        The loss value of this step as a Python float.
    """
    states, actions, rewards, next_states, terminals = zip(*batch)

    # Stack into single ndarrays first: building a tensor straight from a
    # tuple of ndarrays is slow and makes PyTorch emit a UserWarning.
    states = torch.as_tensor(np.asarray(states, dtype=np.float32))
    next_states = torch.as_tensor(np.asarray(next_states, dtype=np.float32))
    actions = torch.as_tensor(np.asarray(actions, dtype=np.int64)).unsqueeze(1)
    rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32)).unsqueeze(1)
    # 1.0 where the transition may bootstrap, 0.0 where the state was terminal.
    not_terminal = 1.0 - torch.as_tensor(
        np.asarray(terminals, dtype=np.float32)
    ).unsqueeze(1)

    q_values = model(states).gather(1, actions)
    # The target is a constant for this step; no graph through target_model.
    with torch.no_grad():
        max_next_q = target_model(next_states).max(1)[0].unsqueeze(1)
        target = rewards + gamma * max_next_q * not_terminal

    loss = criterion(q_values, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()


def train_dqn(env_name, model_path, state_size, action_size, episodes=500):
    """Train a DQN agent on a Gymnasium environment and save its weights.

    Uses an epsilon-greedy policy (epsilon decays from 1.0 by a factor of
    0.995 per episode down to 0.01), a replay buffer of up to 10000
    transitions, minibatches of 64, Adam with lr=0.001, MSE loss, and a
    target network that is synchronized every 10 episodes.

    Args:
        env_name: Environment id passed to ``gym.make``.
        model_path: Destination file for the trained ``state_dict``.
        state_size: Length of the observation vector.
        action_size: Number of discrete actions.
        episodes: Number of training episodes.

    Returns:
        None. The trained weights are written to ``model_path`` and progress
        is printed every 50 episodes.
    """
    env = gym.make(env_name)
    model = QNetwork(state_size, action_size)
    target_model = QNetwork(state_size, action_size)
    target_model.load_state_dict(model.state_dict())

    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.MSELoss()
    buffer = deque(maxlen=10000)

    gamma = 0.99
    batch_size = 64
    epsilon = 1.0
    epsilon_min = 0.01
    epsilon_decay = 0.995
    update_target_every = 10

    try:
        for ep in range(episodes):
            state, _ = env.reset()
            total_reward = 0
            done = False

            while not done:
                if random.random() < epsilon:
                    action = env.action_space.sample()
                else:
                    with torch.no_grad():
                        state_tensor = torch.as_tensor(
                            state, dtype=torch.float32
                        ).unsqueeze(0)
                        action = model(state_tensor).argmax().item()

                next_state, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated
                # Store only `terminated` as the terminal flag: a truncated
                # episode was cut off, not finished, so its last transition
                # must still bootstrap from the next state's value.
                buffer.append((state, action, reward, next_state, terminated))
                state = next_state
                total_reward += reward

                if len(buffer) >= batch_size:
                    batch = random.sample(buffer, batch_size)
                    _dqn_update(
                        model, target_model, optimizer, criterion, batch, gamma
                    )

            epsilon = max(epsilon_min, epsilon * epsilon_decay)
            if ep % update_target_every == 0:
                target_model.load_state_dict(model.state_dict())
            if (ep + 1) % 50 == 0:
                print(f"Episode {ep + 1}, Total Reward: {total_reward}, Epsilon: {epsilon:.3f}")
    finally:
        # Release the environment even if training is interrupted or fails.
        env.close()

    torch.save(model.state_dict(), model_path)
    print(f"Training complete. Model saved to {model_path}.")


# Train one agent per environment:
# (env id, checkpoint path, state size, action size, episodes).
for _env_id, _ckpt_path, _n_obs, _n_act, _n_episodes in (
    ("CartPole-v1", "dqn_cartpole.pth", 4, 2, 500),
    ("MountainCar-v0", "dqn_mountaincar.pth", 2, 3, 1000),
):
    train_dqn(_env_id, _ckpt_path, _n_obs, _n_act, episodes=_n_episodes)



def run_agent(env_name, model, episodes=1, agent_type="DQN", video_path="agent_video.mp4"):
    """Roll out an agent, record every rendered frame, and save a video.

    Args:
        env_name: Environment id passed to ``gym.make``.
        model: Q-network used for greedy action selection; not called when
            ``agent_type`` is ``"Random"``.
        episodes: Number of episodes to play.
        agent_type: ``"Random"`` samples actions from the action space; any
            other value acts greedily with respect to ``model``.
        video_path: Output path handed to ``imageio.mimsave``.

    Returns:
        Tuple ``(video_path, total_reward)`` where ``total_reward`` is the sum
        of rewards over all episodes played.
    """
    env = gym.make(env_name, render_mode="rgb_array")
    frames = []
    total_reward = 0
    use_random = agent_type == "Random"

    try:
        for _ in range(episodes):
            state, _info = env.reset()
            done = False

            while not done:
                # Capture the frame for the state the agent is about to act on.
                frames.append(env.render())

                if use_random:
                    action = env.action_space.sample()
                else:
                    with torch.no_grad():
                        state_tensor = torch.FloatTensor(state).unsqueeze(0)
                        action = model(state_tensor).argmax().item()

                state, reward, terminated, truncated, _info = env.step(action)
                done = terminated or truncated
                total_reward += reward
    finally:
        # Always release the environment, even if the rollout raises.
        env.close()

    imageio.mimsave(video_path, frames, fps=30)
    return video_path, total_reward


# Module-level networks shared by the UI callback.
cartpole_model = QNetwork(4, 2)
mountaincar_model = QNetwork(2, 3)

# Restore trained weights where a checkpoint exists; a network without a
# checkpoint keeps its freshly initialized weights.
for _net, _weights_file in (
    (cartpole_model, "dqn_cartpole.pth"),
    (mountaincar_model, "dqn_mountaincar.pth"),
):
    if os.path.exists(_weights_file):
        _net.load_state_dict(torch.load(_weights_file, map_location="cpu"))
        _net.eval()


def gradio_interface(game, agent_type, episodes):
    """Gradio callback: play the chosen game with the chosen agent.

    Args:
        game: ``"CartPole"`` selects CartPole-v1 and its model; any other
            value selects MountainCar-v0 and its model.
        agent_type: Forwarded to ``run_agent`` (``"DQN"`` or ``"Random"``).
        episodes: Number of episodes to play; coerced to ``int``.

    Returns:
        Tuple ``(video_path, score)`` as returned by ``run_agent``.
    """
    is_cartpole = game == "CartPole"
    env_name = "CartPole-v1" if is_cartpole else "MountainCar-v0"
    model = cartpole_model if is_cartpole else mountaincar_model
    model.eval()
    video_path = f"{game}_{agent_type}_demo.mp4"
    # The slider value may arrive as a float; range() in run_agent needs an int.
    episode_count = int(episodes)
    video_path, score = run_agent(env_name, model, episode_count, agent_type, video_path)
    return video_path, score

# Input widgets, in the positional order expected by gradio_interface.
_ui_inputs = [
    gr.Dropdown(["CartPole", "MountainCar"], label="Select Game"),
    gr.Dropdown(["DQN", "Random"], label="Select Agent"),
    gr.Slider(minimum=1, maximum=20, value=5, step=1, label="Episodes"),
]

# Output widgets matching the (video_path, score) tuple the callback returns.
_ui_outputs = [
    gr.Video(label="Gameplay"),
    gr.Number(label="Total Reward"),
]

iface = gr.Interface(
    fn=gradio_interface,
    inputs=_ui_inputs,
    outputs=_ui_outputs,
    title=" RL Agent Playground",
    description="Select a game and agent type, run episodes, and watch the result!",
)

# Start the web UI with debug mode enabled.
iface.launch(debug=True)

  states = torch.FloatTensor(states)


Episode 50, Total Reward: 31.0, Epsilon: 0.778
Episode 100, Total Reward: 12.0, Epsilon: 0.606
Episode 150, Total Reward: 147.0, Epsilon: 0.471
Episode 200, Total Reward: 43.0, Epsilon: 0.367
Episode 250, Total Reward: 246.0, Epsilon: 0.286
Episode 300, Total Reward: 117.0, Epsilon: 0.222
Episode 350, Total Reward: 107.0, Epsilon: 0.173
Episode 400, Total Reward: 204.0, Epsilon: 0.135
Episode 450, Total Reward: 17.0, Epsilon: 0.105
Episode 500, Total Reward: 34.0, Epsilon: 0.082
Training complete. Model saved to dqn_cartpole.pth.
Episode 50, Total Reward: -200.0, Epsilon: 0.778
Episode 100, Total Reward: -200.0, Epsilon: 0.606
Episode 150, Total Reward: -200.0, Epsilon: 0.471
Episode 200, Total Reward: -200.0, Epsilon: 0.367
Episode 250, Total Reward: -200.0, Epsilon: 0.286
Episode 300, Total Reward: -200.0, Epsilon: 0.222
Episode 350, Total Reward: -200.0, Epsilon: 0.173
Episode 400, Total Reward: -200.0, Epsilon: 0.135
Episode 450, Total Reward: -200.0, Epsilon: 0.105
Episode 500, To

