### This code uses Python 3.12.5, and all libraries used are listed in "requirements.txt"
The gymnasium library must be downgraded to v0.29.1 for the code to work.  
```bash
pip install -r requirements.txt
```



In [1]:
import tensorflow as tf
import keras
from keras import layers
import gymnasium as gym
import ale_py
from gymnasium.wrappers.atari_preprocessing import AtariPreprocessing
from gymnasium.wrappers.frame_stack import FrameStack
import numpy as np

#### Environment setup to use Atari games within the gymnasium library

In [2]:
# Register the ale_py environments with gymnasium before calling gym.make.
gym.register_envs(ale_py)

# Build the environment pipeline: raw game -> Atari preprocessing -> stack of
# 4 frames. The training code in this file treats each observation as a
# (4, 84, 84) array and transposes it to (84, 84, 4) for the Q-network.
env = gym.make("SpaceInvadersNoFrameskip-v4", render_mode="rgb_array")
env = AtariPreprocessing(env)
env = FrameStack(env, 4)


def trigger(t):
    """Return True for every 1000th episode (0, 1000, 2000, ...) so it is recorded."""
    return t % 1000 == 0


env = gym.wrappers.RecordVideo(env, video_folder="videos", episode_trigger=trigger, disable_logger=True)

# Number of discrete actions as a plain integer. The Q-network's output layer
# and tf.one_hot both need a count, not the action-space object itself.
num_actions = env.action_space.n


  logger.warn(


#### Create DQN model

In [None]:
def create_q_model():
    """Build the convolutional Q-network.

    The network takes an (84, 84, 4) input, applies three ReLU convolutions
    followed by a 512-unit ReLU dense layer, and ends in a linear dense layer
    with one output per entry of the module-level ``num_actions``.

    Returns:
        A new, independently initialised ``keras.Sequential`` model.
    """
    # (filters, kernel_size, strides) for each convolutional layer, in order.
    conv_specs = ((32, 8, 4), (64, 4, 2), (64, 3, 1))

    stack = [keras.Input(shape=(84, 84, 4))]
    for filters, kernel, stride in conv_specs:
        stack.append(
            layers.Conv2D(filters, kernel_size=kernel, strides=stride, activation="relu")
        )
    stack.append(layers.Flatten())
    stack.append(layers.Dense(512, activation="relu"))
    # Linear head: raw Q-value estimates, one per action.
    stack.append(layers.Dense(num_actions, activation="linear"))

    return keras.Sequential(stack)

# Online network: used to pick greedy actions and updated by the optimizer.
model = create_q_model()

# Target network: provides the bootstrapped values for the training targets
# and is only refreshed periodically from the online network.
model_target = create_q_model()
# Start the target as an exact copy of the online network. Without this the
# two networks have unrelated random initialisations until the first
# scheduled sync, so the earliest targets come from a different function.
model_target.set_weights(model.get_weights())

# Adam with gradient-norm clipping (clipnorm=1.0) to limit the update size.
optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0)

#### Hyperparameters and history setup

In [None]:
# ---------------------------------------------------------------------------
# Hyperparameters for training
# ---------------------------------------------------------------------------
gamma = 0.99  # discount factor applied to the bootstrapped future value
batch_size = 32  # transitions sampled per gradient step

# Epsilon-greedy exploration: epsilon starts at 1.0 and is lowered towards
# epsilon_min by the training loop.
epsilon_max = 1.0
epsilon_min = 0.1
epsilon = 1.0
epsilon_interval = epsilon_max - epsilon_min

# ---------------------------------------------------------------------------
# Training parameters
# ---------------------------------------------------------------------------
max_episodes = 0  # episode limit; values <= 0 disable the check
max_frames = 1e7  # frame limit; values <= 0 disable the check
max_steps_per_episode = 10000  # bound on steps taken within one episode
epsilon_random_frames = 50000  # frames during which actions are always random
epsilon_greedy_frames = 1e6  # frames over which epsilon is annealed
max_memory_length = 1e5  # cap on the number of stored transitions
update_after_actions = 4  # run a gradient step every this many frames
update_target_network = 10000  # copy weights to the target net every this many frames

# Loss function for more stable training
loss_function = keras.losses.Huber()

# ---------------------------------------------------------------------------
# History tracking: parallel lists, where index i across the five transition
# lists describes one stored transition.
# ---------------------------------------------------------------------------
state_history = []
action_history = []
rewards_history = []
state_next_history = []
done_history = []

# Total reward of each finished episode (used for the running average).
episode_reward_history = []

# Progress counters.
running_reward = episode_count = frame_count = 0

#### Training 

Result:

I trained the model for about 40 hours. Performance initially improved significantly up to around episode 1000, but after about episode 2000 progress stagnated and then slightly worsened. I saved the model at episode 1060, which corresponded to the best score achieved. Despite adjusting hyperparameters for efficiency, I couldn't reach my target running reward of 800. The best running reward I achieved was 500, which was based on the best score at episode 1060. The running reward averaged around 390 before I stopped the training.




In [None]:
# Training
#
# Deep Q-learning loop: act epsilon-greedily, store each transition in the
# history lists, periodically fit the online network on a random minibatch
# against targets from the target network, and periodically sync the target
# network. Runs until the running reward exceeds 800 or an episode/frame
# limit is reached.
import os

# Checkpoints are written below "models/"; create the directory up front so
# the first model.save() cannot fail on a missing folder.
os.makedirs("models", exist_ok=True)

while True:
    observation, _ = env.reset()
    state = np.array(observation)
    episode_reward = 0

    # Take at most max_steps_per_episode steps in this episode.
    for _ in range(max_steps_per_episode):
        frame_count += 1

        # Epsilon-greedy action selection: always random during the warm-up
        # frames, afterwards random with probability epsilon.
        if frame_count < epsilon_random_frames or epsilon > np.random.rand(1)[0]:
            action = env.action_space.sample()
        else:
            # Stored states are channel-first (4, 84, 84); the network takes
            # channel-last input with a leading batch axis.
            state_tensor = tf.convert_to_tensor(state, dtype=tf.float32)
            state_tensor = tf.transpose(state_tensor, [1, 2, 0])
            state_tensor = tf.expand_dims(state_tensor, 0)

            action_probs = model(state_tensor, training=False)
            action = tf.argmax(action_probs[0]).numpy()

        # Anneal epsilon linearly, never going below epsilon_min.
        epsilon -= epsilon_interval / epsilon_greedy_frames
        epsilon = max(epsilon, epsilon_min)

        # The step call returns separate "terminated" and "truncated" flags.
        # Either one ends the episode, but only a true termination should
        # stop bootstrapping in the training target.
        state_next, reward, terminated, truncated, _ = env.step(action)
        state_next = np.array(state_next)
        done = terminated or truncated

        episode_reward += reward

        # Append the transition to the replay history.
        action_history.append(action)
        state_history.append(state)
        state_next_history.append(state_next)
        done_history.append(terminated)
        rewards_history.append(reward)
        state = state_next

        # Gradient step every update_after_actions frames once more than one
        # batch of transitions has been collected.
        if frame_count % update_after_actions == 0 and len(done_history) > batch_size:
            # Sample transition indices uniformly (with replacement).
            indices = np.random.choice(len(done_history), size=batch_size)

            state_sample = np.array([state_history[i] for i in indices])
            state_next_sample = np.array([state_next_history[i] for i in indices])
            rewards_sample = tf.convert_to_tensor(
                [rewards_history[i] for i in indices], dtype=tf.float32
            )
            action_sample = [action_history[i] for i in indices]
            done_sample = tf.convert_to_tensor(
                [float(done_history[i]) for i in indices], dtype=tf.float32
            )

            # (batch, 4, 84, 84) -> (batch, 84, 84, 4), as float32 tensors.
            state_sample_reshaped = tf.convert_to_tensor(
                np.transpose(state_sample, (0, 2, 3, 1)), dtype=tf.float32
            )
            state_next_sample_reshaped = tf.convert_to_tensor(
                np.transpose(state_next_sample, (0, 2, 3, 1)), dtype=tf.float32
            )

            # Call the target network directly: for a single small batch
            # inside a loop this avoids the per-call overhead of predict().
            future_rewards = model_target(state_next_sample_reshaped, training=False)

            # Calculate updated Q-values using Bellman equation
            updated_q_values = rewards_sample + gamma * tf.reduce_max(future_rewards, axis=1)
            # For terminal transitions the target is replaced by a fixed -1.
            updated_q_values = updated_q_values * (1 - done_sample) - done_sample

            # One-hot mask selecting the Q-value of the action actually taken.
            masks = tf.one_hot(action_sample, num_actions)

            with tf.GradientTape() as tape:
                q_values = model(state_sample_reshaped)

                q_action = tf.reduce_sum(tf.multiply(q_values, masks), axis=1)

                loss = loss_function(updated_q_values, q_action)

            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

        # Periodically sync the target network, report progress and checkpoint.
        if frame_count % update_target_network == 0:
            model_target.set_weights(model.get_weights())
            best_recent = np.max(episode_reward_history) if episode_reward_history else 0
            # Report the real window size; the history holds up to 1000 episodes.
            print(f"Best score of last {len(episode_reward_history)}: {best_recent}, "
                  f"running_reward: {running_reward} at episode {episode_count}, frame {frame_count}")
            model.save(f"models/spaceinvaders_qmodel_{episode_count}.keras")

        # Drop the oldest transition once the replay history exceeds its cap.
        if len(rewards_history) > max_memory_length:
            del rewards_history[:1]
            del state_history[:1]
            del state_next_history[:1]
            del action_history[:1]
            del done_history[:1]

        if done:
            break

    # Track episode rewards; the running reward is the mean over at most the
    # 1000 most recent episodes.
    episode_reward_history.append(episode_reward)
    if len(episode_reward_history) > 1000:
        del episode_reward_history[:1]
    running_reward = np.mean(episode_reward_history)

    episode_count += 1
    print(episode_count - 1, episode_reward)

    if running_reward > 800:
        print(f"Solved at episode {episode_count}!")
        model.save(f"models/spaceinvaders_qmodel_solved.keras")
        break

    if (max_episodes > 0 and episode_count >= max_episodes):
        print(f"Stopped at episode {episode_count}!")
        break

    if (max_frames > 0 and frame_count >= max_frames):
        print(f"Stopped at frame {frame_count}!")
        break