In [None]:
pip install gym[box2d]

In [None]:
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from collections import deque
import random

# Seed every random number generator this script draws from. Python's
# built-in `random` is seeded as well because the agent uses it for both
# exploration (`random.randrange`) and replay sampling (`random.sample`);
# seeding only NumPy and TensorFlow leaves those draws non-reproducible.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# --- Hyperparameters -------------------------------------------------------
GAMMA = 0.99               # discount factor applied to the bootstrapped next-state value
LEARNING_RATE = 0.001      # learning rate passed to the Adam optimizer
MEMORY_SIZE = 100000       # maximum number of transitions kept in the replay deque
BATCH_SIZE = 64            # number of transitions sampled per replay step
EPSILON_START = 1.0        # initial exploration probability
EPSILON_END = 0.01         # threshold below which epsilon stops being decayed
EPSILON_DECAY = 0.995      # multiplicative decay applied to epsilon after each replay step
TARGET_UPDATE_FREQ = 100   # number of agent steps between target-network syncs
EPISODES = 500             # number of training episodes

class DQNAgent:
    """Deep Q-Network agent with experience replay and a target network.

    The agent keeps two identically shaped Keras models: ``model`` is trained
    on sampled minibatches, while ``target_model`` provides the bootstrapped
    next-state values and is synchronised with ``model`` every
    ``TARGET_UPDATE_FREQ`` calls to :meth:`update_step`.

    Attributes:
        state_size: Length of the flat observation vector.
        action_size: Number of discrete actions.
        memory: Bounded deque of ``(state, action, reward, next_state, done)``
            tuples (at most ``MEMORY_SIZE`` entries).
        epsilon: Current exploration probability for :meth:`act`.
        step_count: Number of :meth:`update_step` calls so far.
    """

    def __init__(self, state_size, action_size):
        """Create the online/target networks and an empty replay memory.

        Args:
            state_size: Length of the flat observation vector.
            action_size: Number of discrete actions.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=MEMORY_SIZE)
        self.epsilon = EPSILON_START
        self.step_count = 0
        self.model = self._build_model()
        self.target_model = self._build_model()
        # Start with both networks holding identical weights.
        self.update_target_model()

    def _build_model(self):
        """Build and compile the Q-network (two 64-unit ReLU layers, linear head)."""
        model = tf.keras.Sequential([
            layers.Dense(64, input_dim=self.state_size, activation='relu'),
            layers.Dense(64, activation='relu'),
            layers.Dense(self.action_size, activation='linear')
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
                      loss='mse')
        return model

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory.

        When the memory is full the deque silently discards the oldest entry.
        """
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action epsilon-greedily.

        Args:
            state: Observation that can be reshaped to ``(1, state_size)``.

        Returns:
            The chosen action index as a plain Python ``int``.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        state = np.reshape(state, [1, self.state_size])
        act_values = self.model.predict(state, verbose=0)
        # Convert from np.int64 so both branches return the same type.
        return int(np.argmax(act_values[0]))

    def replay(self):
        """Train the online network on one random minibatch.

        Does nothing until at least ``BATCH_SIZE`` transitions are stored.
        For each sampled transition the target for the taken action is
        ``reward`` if the transition is terminal, otherwise
        ``reward + GAMMA * max_a Q_target(next_state, a)``; the targets for
        the other actions are the online network's current predictions, so
        they contribute no error. Epsilon is decayed once per call and never
        drops below ``EPSILON_END``.
        """
        if len(self.memory) < BATCH_SIZE:
            return
        minibatch = random.sample(self.memory, BATCH_SIZE)
        states, actions, rewards, next_states, dones = zip(*minibatch)

        batch_shape = (BATCH_SIZE, self.state_size)
        states = np.reshape(np.asarray(states, dtype=np.float32), batch_shape)
        next_states = np.reshape(np.asarray(next_states, dtype=np.float32), batch_shape)
        actions = np.asarray(actions, dtype=np.intp)
        rewards = np.asarray(rewards, dtype=np.float32)
        dones = np.asarray(dones, dtype=bool)

        # Two batched forward passes instead of up to 2 * BATCH_SIZE
        # single-sample predict() calls; the resulting targets are the same.
        next_q = self.target_model.predict(next_states, verbose=0)
        td_targets = np.where(
            dones, rewards, rewards + GAMMA * np.amax(next_q, axis=1)
        )

        # Copy so the in-place update below never touches a Keras-owned buffer.
        targets = np.array(self.model.predict(states, verbose=0))
        targets[np.arange(BATCH_SIZE), actions] = td_targets

        self.model.fit(states, targets, epochs=1, verbose=0)

        # Clamp so the final decay step cannot undershoot EPSILON_END.
        if self.epsilon > EPSILON_END:
            self.epsilon = max(EPSILON_END, self.epsilon * EPSILON_DECAY)

    def update_step(self):
        """Count one agent step and sync the target network when due."""
        self.step_count += 1
        if self.step_count % TARGET_UPDATE_FREQ == 0:
            self.update_target_model()

def _reset_env(env):
    """Reset *env* and return only the initial observation.

    Newer Gym releases return ``(observation, info)`` from ``reset()`` while
    older ones return the bare observation; both are accepted.
    """
    result = env.reset()
    if isinstance(result, tuple):
        return result[0]
    return result


def _step_env(env, action):
    """Step *env* and normalise the result across Gym API versions.

    Returns:
        ``(next_state, reward, terminated, episode_over)``. With the 5-tuple
        step API ``terminated`` excludes time-limit truncation while
        ``episode_over`` is ``terminated or truncated``; with the 4-tuple API
        both are the single ``done`` flag.
    """
    result = env.step(action)
    if len(result) == 5:
        next_state, reward, terminated, truncated, _ = result
        return next_state, reward, bool(terminated), bool(terminated or truncated)
    next_state, reward, done, _ = result
    return next_state, reward, bool(done), bool(done)


def _run_test_episode(agent, episode):
    """Play one rendered episode with the greedy policy and print its score."""
    test_env = gym.make('LunarLander-v2', render_mode='human')
    saved_epsilon = agent.epsilon
    # Evaluate the learned policy itself rather than the exploring one.
    agent.epsilon = 0.0
    try:
        state = _reset_env(test_env)
        done = False
        test_score = 0
        while not done:
            action = agent.act(state)
            state, reward, _, done = _step_env(test_env, action)
            test_score += reward
            test_env.render()
        print(f"Test Episode {episode}, Score: {test_score:.2f}")
    finally:
        # Restore exploration and release the window even if the episode fails.
        agent.epsilon = saved_epsilon
        test_env.close()


def train_dqn():
    """Train a DQN agent on LunarLander-v2 and return the per-episode scores.

    Runs ``EPISODES`` training episodes. After every environment step the
    transition is stored, one replay/training step is performed and the
    target-network step counter is advanced. Every 50th episode (except
    episode 0) a rendered test episode is played.

    Returns:
        List with the total reward collected in each training episode.
    """
    env = gym.make('LunarLander-v2')
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    agent = DQNAgent(state_size, action_size)
    scores = []

    try:
        for episode in range(EPISODES):
            state = _reset_env(env)
            score = 0
            done = False
            while not done:
                action = agent.act(state)
                next_state, reward, terminated, done = _step_env(env, action)
                # Store true termination only, so a time-limit cut-off does
                # not stop the target from bootstrapping on next_state.
                agent.remember(state, action, reward, next_state, terminated)
                state = next_state
                score += reward
                agent.replay()
                agent.update_step()

            scores.append(score)
            print(f"Episode: {episode+1}/{EPISODES}, Score: {score:.2f}, Epsilon: {agent.epsilon:.2f}")

            if episode % 50 == 0 and episode > 0:
                _run_test_episode(agent, episode)
    finally:
        env.close()

    return scores

if __name__ == "__main__":
    # Script entry point: train the agent, then save the learning curve.
    scores = train_dqn()

    # Imported here so matplotlib is only required when run as a script.
    import matplotlib.pyplot as plt

    figure, axes = plt.subplots()
    axes.plot(scores)
    axes.set(xlabel='Episode', ylabel='Score', title='Training Progress')
    figure.savefig('lunar_lander_training.png')