# 🎓 Train Double DQN for 2D Bin Packing
This notebook trains a Double DQN model with a corrected reward function to avoid unrealistic placements.

In [None]:
# ✅ Install dependencies
!pip install tensorflow numpy matplotlib

In [None]:
# ✅ Environment and agent setup
import numpy as np
import random
from collections import deque
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential, clone_model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import load_model

class BinPacking2DEnv:
    """Episodic bin-packing environment with a fixed set of bins.

    Each item is a ``(width, height)`` pair drawn uniformly from
    ``[0.1, 0.4)``. Items placed in a bin are stacked on top of each other:
    an item fits when the summed heights of the bin's items plus its own
    height do not exceed ``bin_height`` and its width does not exceed
    ``bin_width``.

    Args:
        bin_width: Maximum item width a bin accepts.
        bin_height: Total stacked height a bin can hold.
        num_bins: Number of bins, which is also the number of actions.
        items_per_episode: Number of items presented per episode.
        max_state_values: Number of flattened item dimensions included in
            the observation after the current item. Placed items beyond
            this budget are truncated; unused slots are zero-padded.

    Raises:
        ValueError: If ``max_state_values`` is negative.
    """

    def __init__(
        self,
        bin_width=1.0,
        bin_height=1.0,
        num_bins=5,
        items_per_episode=20,
        max_state_values=30,
    ):
        if max_state_values < 0:
            raise ValueError("max_state_values must be non-negative")
        self.bin_width = bin_width
        self.bin_height = bin_height
        self.num_bins = num_bins
        self.items_per_episode = items_per_episode
        self.max_state_values = max_state_values
        self.reset()

    def reset(self):
        """Empty all bins, draw a fresh item sequence and return the state."""
        self.bins = [[] for _ in range(self.num_bins)]
        self.items = [
            tuple(np.random.uniform(0.1, 0.4, size=2))
            for _ in range(self.items_per_episode)
        ]
        self.current_index = 0
        self.current_item = self.items[self.current_index]
        return self.get_state()

    def _can_place(self, bin_items, item):
        """Return whether ``item`` fits on top of the stack in ``bin_items``."""
        used_height = sum(height for _, height in bin_items)
        fits_height = used_height + item[1] <= self.bin_height
        fits_width = item[0] <= self.bin_width
        return fits_height and fits_width

    def step(self, action):
        """Try to put the current item into bin ``action`` and advance.

        Args:
            action: Index of the target bin.

        Returns:
            A ``(state, reward, done)`` tuple. ``reward`` is ``1`` when the
            item was placed and ``-5`` when it did not fit. ``done`` becomes
            true once every item of the episode has been presented.
        """
        item = self.current_item
        if self._can_place(self.bins[action], item):
            self.bins[action].append(item)
            reward = 1
        else:
            # Strong penalty for an illegal placement. The item is not
            # retried: the episode moves on to the next item either way.
            reward = -5
        self.current_index += 1
        done = self.current_index >= self.items_per_episode
        if not done:
            self.current_item = self.items[self.current_index]
        # When done, current_item is left at the last item, so the terminal
        # observation still has a well-defined shape.
        return self.get_state(), reward, done

    def get_state(self):
        """Return the observation vector.

        The vector is the current item's two dimensions followed by the
        dimensions of all placed items, flattened in bin order, then
        truncated or zero-padded to ``max_state_values`` entries.

        NOTE: the flattening carries no bin boundaries, so the observation
        alone does not say which bin a placed item belongs to.
        """
        capacity = self.max_state_values
        placed = [
            dim
            for bin_items in self.bins
            for rect in bin_items
            for dim in rect
        ][:capacity]
        padding = [0.0] * (capacity - len(placed))
        return np.array(list(self.current_item) + placed + padding)

    def get_bins_used(self):
        """Return the number of bins holding at least one item."""
        return sum(1 for bin_items in self.bins if bin_items)

class DoubleDQNAgent:
    """Epsilon-greedy Double DQN agent with a replay memory.

    Two networks with the same architecture are kept: the online ``model``
    that is trained and used to pick actions, and a ``target_model`` whose
    weights are only refreshed by :meth:`update_target`.

    Args:
        state_size: Length of the observation vector fed to the network.
        action_size: Number of discrete actions (network outputs).
    """

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # oldest transitions are evicted
        self.gamma = 0.95  # discount factor
        self.epsilon = 1.0  # current exploration probability
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995  # multiplicative decay applied per replay
        self.model = self._build_model()
        # clone_model copies the architecture only, so the weights are
        # copied explicitly to start both networks in the same state.
        self.target_model = clone_model(self.model)
        self.target_model.set_weights(self.model.get_weights())

    def _build_model(self):
        """Build and compile the Q-network (two hidden ReLU layers of 128)."""
        model = Sequential()
        model.add(Input(shape=(self.state_size,)))
        model.add(Dense(128, activation='relu'))
        model.add(Dense(128, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(optimizer='adam', loss='mse')
        return model

    def act(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: Observation batch of shape ``(1, state_size)``.

        Returns:
            The chosen action index as a plain ``int``.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        q_values = self.model.predict(state, verbose=0)
        # Cast so both branches return the same type.
        return int(np.argmax(q_values[0]))

    def remember(self, s, a, r, s_, done):
        """Append the transition ``(s, a, r, s_, done)`` to replay memory."""
        self.memory.append((s, a, r, s_, done))

    def replay(self, batch_size):
        """Train the online network on one random minibatch.

        Double DQN targets are used: the online network selects the best
        next action and the target network supplies that action's value.
        The whole minibatch is evaluated with three batched predictions and
        fitted with a single gradient step, instead of issuing separate
        predict/fit calls for every sampled transition.

        Epsilon is decayed once per successful call. If the memory holds
        fewer than ``batch_size`` transitions the call is a no-op.

        Args:
            batch_size: Number of transitions to sample.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        # vstack accepts both (1, state_size) and (state_size,) entries.
        states = np.vstack([t[0] for t in minibatch])
        actions = np.array([t[1] for t in minibatch], dtype=int)
        rewards = np.array([t[2] for t in minibatch], dtype=float)
        next_states = np.vstack([t[3] for t in minibatch])
        dones = np.array([t[4] for t in minibatch], dtype=bool)

        targets = self.model.predict(states, verbose=0)
        next_q_online = self.model.predict(next_states, verbose=0)
        next_q_target = self.target_model.predict(next_states, verbose=0)

        rows = np.arange(batch_size)
        best_next = np.argmax(next_q_online, axis=1)
        bootstrap = next_q_target[rows, best_next]
        # Terminal transitions get the raw reward, without bootstrapping.
        targets[rows, actions] = np.where(
            dones, rewards, rewards + self.gamma * bootstrap
        )

        self.model.fit(
            states, targets, batch_size=batch_size, epochs=1, verbose=0
        )
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    def update_target(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

In [None]:
# ✅ Train Double DQN Agent
# Build the environment and size the agent from it: the network input width
# is the length of one observation, and there is one action per bin.
env = BinPacking2DEnv()
agent = DoubleDQNAgent(len(env.get_state()), env.num_bins)

episodes = 100
replay_batch = 32
target_sync_every = 10

for e in range(episodes):
    # The agent expects observations as a single-row batch.
    state = env.reset().reshape(1, -1)
    done = False
    while True:
        action = agent.act(state)
        next_state, reward, done = env.step(action)
        next_row = next_state.reshape(1, -1)
        agent.remember(state, action, reward, next_row, done)
        state = next_row
        if done:
            break

    # One replay pass per episode, once enough transitions are stored.
    has_enough_memory = len(agent.memory) >= replay_batch
    if has_enough_memory:
        agent.replay(replay_batch)

    # Refresh the target network on episodes 0, 10, 20, ...
    if e % target_sync_every == 0:
        agent.update_target()

    print(f"Episode {e+1} completed")

In [None]:
# ✅ Save model
# Write the trained online network (not the target copy) to disk.
model_path = "double_dqn_episode_final.keras"
agent.model.save(model_path)