# In [None]:
import os
import random
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import ast  # For safely evaluating strings as Python literals

# Seed every random number generator used below so runs are repeatable
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Prefer CUDA when PyTorch reports it as available, otherwise fall back to CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Read the synthetic transitions from disk
dataset = pd.read_csv("C:\\Users\\hp\\Desktop\\College\\Optimizing Assignment\\synthetic_meal_delivery_dataset_with_previous_time.csv")

# The state columns are stored as text; parse each cell into a Python literal
# (ast.literal_eval only accepts literals, so no arbitrary code is executed)
dataset["Next State"] = dataset["Next State"].map(ast.literal_eval)
dataset["Current State"] = dataset["Current State"].map(ast.literal_eval)

# Fully connected network mapping a state vector to one Q-value per action
class QNetwork(nn.Module):
    """Three-layer perceptron: input_dim -> 128 -> 64 -> output_dim.

    ReLU is applied after the first two linear layers; the last layer's
    output is returned without an activation.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        first_width, second_width = 128, 64
        self.fc1 = nn.Linear(input_dim, first_width)
        self.fc2 = nn.Linear(first_width, second_width)
        self.fc3 = nn.Linear(second_width, output_dim)

    def forward(self, x):
        """Return the output of the final linear layer for input ``x``."""
        activations = x
        for hidden_layer in (self.fc1, self.fc2):
            activations = torch.relu(hidden_layer(activations))
        return self.fc3(activations)

# Experience Replay with Prioritized Sampling
class PrioritizedReplayBuffer:
    """Bounded replay buffer that samples experiences in proportion to priority.

    Experiences and their priorities are kept in two deques with the same
    ``maxlen`` so that they are evicted together and index ``i`` always refers
    to the same transition in both containers.

    Args:
        capacity: Maximum number of experiences retained; the oldest entry is
            dropped when a new one is added to a full buffer.
        alpha: Exponent applied to ``|error| + 1e-5`` when turning an error
            into a priority.
    """

    def __init__(self, capacity, alpha=0.6):
        self.capacity = capacity
        # Both containers must share the same bound; otherwise their lengths
        # diverge once `capacity` is exceeded and sampling breaks.
        self.memory = deque(maxlen=capacity)
        self.priorities = deque(maxlen=capacity)
        self.alpha = alpha

    def _priority(self, error):
        """Convert an error into a strictly positive priority."""
        # The small constant keeps zero-error experiences sampleable.
        return (abs(float(error)) + 1e-5) ** self.alpha

    def add(self, experience, error):
        """Append ``experience`` with a priority derived from ``error``."""
        self.memory.append(experience)
        self.priorities.append(self._priority(error))

    def sample(self, batch_size, beta=0.4):
        """Draw ``batch_size`` experiences (with replacement) by priority.

        Args:
            batch_size: Number of experiences to draw.
            beta: Exponent used for the importance-sampling weights.

        Returns:
            A tuple ``(experiences, weights, indices)`` where ``experiences``
            is a list, ``weights`` is a NumPy array scaled so its maximum is
            1, and ``indices`` is the NumPy array of sampled positions.

        Raises:
            ValueError: If the buffer is empty.
        """
        size = len(self.memory)
        if size == 0:
            raise ValueError("cannot sample from an empty replay buffer")

        priorities = np.asarray(self.priorities, dtype=np.float64)
        probabilities = priorities / priorities.sum()
        indices = np.random.choice(size, batch_size, p=probabilities)
        experiences = [self.memory[i] for i in indices]
        weights = (size * probabilities[indices]) ** (-beta)
        weights /= weights.max()
        return experiences, weights, indices

    def update_priorities(self, indices, errors):
        """Recompute the priority at each of ``indices`` from ``errors``."""
        for i, error in zip(indices, errors):
            self.priorities[i] = self._priority(error)

# Define the RL Agent
class DDQNAgent:
    """Double DQN agent with epsilon-greedy exploration and prioritized replay.

    Two networks with the same architecture are kept: ``main_network`` is
    trained and used to act, ``target_network`` is a copy that is only
    refreshed through :meth:`update_target_network`.

    Args:
        state_dim: Length of the state feature vector.
        action_dim: Number of discrete actions.
        gamma: Discount factor applied to the bootstrapped next-state value.
        lr: Learning rate of the Adam optimizer.
        batch_size: Number of experiences drawn per :meth:`replay` call.
        memory_size: Capacity handed to the replay buffer.
        epsilon: Initial probability of taking a random action.
        epsilon_min: Stored lower bound for ``epsilon``; the agent itself
            never decays ``epsilon``, callers do.
        epsilon_decay: Stored multiplicative decay factor for ``epsilon``.
    """

    def __init__(self, state_dim, action_dim, gamma=0.9, lr=0.001, batch_size=64, memory_size=30000,
                 epsilon=1.0, epsilon_min=0.1, epsilon_decay=0.999):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.gamma = gamma
        self.batch_size = batch_size
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.memory = PrioritizedReplayBuffer(memory_size)
        self.main_network = QNetwork(state_dim, action_dim).to(self.device)
        self.target_network = QNetwork(state_dim, action_dim).to(self.device)
        # Start with both networks holding identical weights.
        self.target_network.load_state_dict(self.main_network.state_dict())
        self.optimizer = optim.Adam(self.main_network.parameters(), lr=lr)
        self.epsilon = epsilon  # Current exploration rate
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay

    def act(self, state):
        """Choose an action for ``state`` using an epsilon-greedy policy.

        Args:
            state: Numeric sequence convertible by ``torch.tensor``.

        Returns:
            An ``int`` action index in ``[0, action_dim)``.
        """
        if random.random() < self.epsilon:
            return random.randint(0, self.action_dim - 1)
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(self.device)
        with torch.no_grad():
            q_values = self.main_network(state)
        return torch.argmax(q_values).item()

    def remember(self, experience, error):
        """Store ``experience`` in the replay buffer with priority from ``error``."""
        self.memory.add(experience, error)

    def replay(self):
        """Run one optimisation step on a prioritized batch.

        Does nothing until the buffer holds at least ``batch_size``
        experiences. Each experience is expected to be a
        ``(state, action, reward, next_state, done)`` tuple.
        """
        if len(self.memory.memory) < self.batch_size:
            return

        experiences, weights, indices = self.memory.sample(self.batch_size)
        states, actions, rewards, next_states, dones = zip(*experiences)
        weights = torch.tensor(weights, dtype=torch.float32).to(self.device)

        states = torch.tensor(states, dtype=torch.float32).to(self.device)
        actions = torch.tensor(actions, dtype=torch.long).to(self.device)
        rewards = torch.tensor(rewards, dtype=torch.float32).to(self.device)
        next_states = torch.tensor(next_states, dtype=torch.float32).to(self.device)
        dones = torch.tensor(dones, dtype=torch.float32).to(self.device)

        # Q-value of the action actually taken in each sampled state
        q_values = self.main_network(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)

        with torch.no_grad():
            # Double DQN: the main network selects the next action and the
            # target network evaluates it, instead of taking the target
            # network's own maximum.
            best_next_actions = self.main_network(next_states).argmax(dim=1, keepdim=True)
            next_q_values = self.target_network(next_states).gather(1, best_next_actions).squeeze(-1)
            # (1 - dones) removes the bootstrap term for terminal transitions.
            target_q_values = rewards + (1 - dones) * self.gamma * next_q_values

        # Importance-weighted squared TD error
        td_errors = q_values - target_q_values
        loss = (weights * td_errors ** 2).mean()
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Refresh priorities with the absolute TD errors of this batch
        errors = td_errors.detach().abs().cpu().numpy()
        self.memory.update_priorities(indices, errors)

    def update_target_network(self):
        """Copy the main network's weights into the target network."""
        self.target_network.load_state_dict(self.main_network.state_dict())

    def save_model(self, path):
        """Write the main network's state dict to ``path``."""
        torch.save(self.main_network.state_dict(), path)

    def load_model(self, path):
        """Load a state dict from ``path`` into both networks."""
        self.main_network.load_state_dict(torch.load(path, map_location=self.device))
        self.target_network.load_state_dict(self.main_network.state_dict())

# Training process
def _state_to_features(state):
    """Flatten a state dictionary into the 3-element feature list fed to the agent."""
    return [
        state["Expected Delivery Time (mins)"],
        state["Distance to Depot"],
        state["Distance to Restaurants"],
    ]


def train_agent(agent, dataset, episodes=1000, target_update_freq=200):
    """Train ``agent`` by replaying the logged transitions in ``dataset``.

    The rows of ``dataset`` are walked in order. An episode consumes rows
    until one whose ``"Done"`` value is truthy, or until it has taken one
    step per dataset row, whichever comes first. The next episode continues
    from the following row, wrapping around at the end of the dataset.

    Args:
        agent: Object providing ``act``, ``remember``, ``replay``,
            ``update_target_network`` and the ``epsilon``, ``epsilon_min``
            and ``epsilon_decay`` attributes.
        dataset: DataFrame with ``"Current State"`` and ``"Next State"``
            columns holding state dictionaries, plus ``"Reward"`` and
            ``"Done"`` columns.
        episodes: Number of episodes to run.
        target_update_freq: The target network is synchronised at the end
            of every episode whose number is a multiple of this value.

    Raises:
        ValueError: If ``dataset`` has no rows.
    """
    total_rows = len(dataset)
    if total_rows == 0:
        raise ValueError("dataset must contain at least one transition")

    # Convert the columns once up front instead of doing a per-step row lookup.
    state_rows = [_state_to_features(s) for s in dataset["Current State"]]
    next_state_rows = [_state_to_features(s) for s in dataset["Next State"]]
    reward_rows = dataset["Reward"].tolist()
    done_rows = dataset["Done"].tolist()

    cursor = 0
    for episode in range(1, episodes + 1):
        done = False
        steps = 0
        # The step cap guarantees termination even if no row is marked done.
        while not done and steps < total_rows:
            features = state_rows[cursor]
            next_features = next_state_rows[cursor]
            reward = reward_rows[cursor]
            done = bool(done_rows[cursor])
            cursor = (cursor + 1) % total_rows
            steps += 1

            # The agent receives the numeric feature list, not the raw dict.
            action = agent.act(features)
            # The reward is used as the initial priority signal for the new transition.
            agent.remember((features, action, reward, next_features, done), error=reward)
            agent.replay()

        # Synchronise once per qualifying episode rather than on every step.
        if episode % target_update_freq == 0:
            agent.update_target_network()

        if agent.epsilon > agent.epsilon_min:
            agent.epsilon *= agent.epsilon_decay

    print("Training completed.")

# Problem dimensions
state_dim = 3  # features: expected delivery time, distance to depot, distance to restaurants
action_dim = 4  # actions: accept, reject, return to depot, move to restaurant

# Learning configuration
gamma = 0.9
learning_rate = 0.001
batch_size = 64
memory_size = 30000
episodes = 1000
target_update_freq = 200

# Build the agent from the configuration above
agent = DDQNAgent(
    state_dim,
    action_dim,
    gamma=gamma,
    lr=learning_rate,
    batch_size=batch_size,
    memory_size=memory_size,
)

# Run training against the loaded dataset
train_agent(agent, dataset, episodes=episodes, target_update_freq=target_update_freq)

# Persist the learned weights of the main network
model_path = "ddqn_per_meal_delivery_model.pth"
agent.save_model(model_path)
print(f"Model saved at {model_path}.")