<a href="https://colab.research.google.com/github/LoQiseaking69/SM2/blob/main/SephMnotebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports
This section includes all necessary library imports. Ensure that all the libraries used in the notebook are imported here.

In [None]:
# Update system packages and install dependencies
!apt-get update -y
!apt-get install -y python-dev swig python-pygame

# Install gym and the box2d environment
!pip install gym[box2d]


In [4]:
import os
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers, regularizers
from collections import deque
import gym
import random
import logging

# Model Build
Detailed description of the model building process. This section should include information about the architecture, layers used, and any custom components.

In [35]:
class ReplayBuffer:
    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)

    def store_transition(self, transition):
        self.buffer.append(transition)

    def sample_buffer(self, batch_size):
        if len(self.buffer) < batch_size:
            return None
        samples = np.array(random.sample(self.buffer, batch_size), dtype=object)
        return [np.stack(samples[:, i]) for i in range(samples.shape[1])]

class RBMLayer(layers.Layer):
    def __init__(self, num_hidden_units):
        super(RBMLayer, self).__init__()
        self.num_hidden_units = num_hidden_units

    def build(self, input_shape):
        if len(input_shape) != 2:
            raise ValueError("RBMLayer expects input shape of length 2")
        self.rbm_weights = self.add_weight(shape=(input_shape[-1], self.num_hidden_units),
                                           initializer='glorot_uniform',
                                           trainable=True)
        self.biases = self.add_weight(shape=(self.num_hidden_units,),
                                      initializer='zeros',
                                      trainable=True)

    def call(self, inputs):
        activation = tf.matmul(inputs, self.rbm_weights) + self.biases
        return tf.nn.sigmoid(activation)

class QLearningLayer(layers.Layer):
    def __init__(self, action_space_size, learning_rate=0.001, gamma=0.99, epsilon=0.1):
        super(QLearningLayer, self).__init__()
        self.action_space_size = action_space_size
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = 0.995
        self.min_epsilon = 0.01
        self.buffer_index = 0
        self.replay_buffer = ReplayBuffer(100000)
        self.q_network = models.Sequential([
            layers.Dense(128, activation='relu', kernel_initializer='he_uniform', kernel_regularizer=regularizers.l2(0.01)),
            layers.Dense(64, activation='relu', kernel_initializer='he_uniform', kernel_regularizer=regularizers.l2(0.01)),  # Added another dense layer
            layers.Dense(self.action_space_size, activation='tanh', kernel_initializer='glorot_uniform')  # For continuous actions
        ])
        self.q_network.compile(optimizer=optimizers.Adam(learning_rate=self.learning_rate), loss='mse')
        self.target_q_network = models.clone_model(self.q_network)

    def call(self, state):
        return self.q_network(state)

    def update(self, batch_size):
        data = self.replay_buffer.sample_buffer(batch_size)
        if data is None:
            return
        states, actions, rewards, next_states, dones = data
        target_q_values = rewards + (1 - dones) * self.gamma * np.max(self.target_q_network.predict(next_states), axis=1)
        with tf.GradientTape() as tape:
            q_values = tf.reduce_sum(self.q_network(states) * tf.one_hot(actions, self.action_space_size), axis=1)
            loss = tf.reduce_mean(tf.square(target_q_values - q_values))
        grads = tape.gradient(loss, self.q_network.trainable_variables)
        self.q_network.optimizer.apply_gradients(zip(grads, self.q_network.trainable_variables))
        self.buffer_index += 1
        if self.buffer_index % 1000 == 0:
            self.target_q_network.set_weights(self.q_network.get_weights())
        if self.epsilon > self.min_epsilon:
            self.epsilon *= self.epsilon_decay

    def store_transition(self, state, action, reward, next_state, done):
        self.replay_buffer.store_transition((state, action, reward, next_state, done))

    def choose_action(self, state):
      if np.random.rand() < self.epsilon:
        return np.random.randint(self.action_space_size)
      else:
        q_values = self.q_network.predict(state)
        action = np.argmax(q_values[0])
        return np.clip(action, 0, self.action_space_size - 1)


def create_neural_network_model(input_dim, num_hidden_units, action_space_size):
    input_layer = layers.Input(shape=(input_dim,))  # Adjusted for 24-dimensional input

    # A simpler architecture
    x = layers.Dense(128, activation='relu', kernel_initializer='he_uniform')(input_layer)
    x = layers.Dense(64, activation='relu', kernel_initializer='he_uniform')(x)
    x_rbm = RBMLayer(num_hidden_units)(x)  # Including your RBM layer
    q_learning_layer = QLearningLayer(action_space_size)(x_rbm)  # Q-Learning layer

    model = models.Model(inputs=input_layer, outputs=q_learning_layer)
    return model

# Example usage
input_dim = 24  # BipedalWalker observation space dimension
num_hidden_units = 128  # Example value
action_space_size = 4  # BipedalWalker action space dimension

model = create_neural_network_model(input_dim, num_hidden_units, action_space_size)

# Train
This section is dedicated to the training process of the model. It should include details about the training loop, optimization process, and any data preprocessing.

In [36]:
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def train_model_in_bipedalwalker(env_name, q_learning_layer, num_episodes, epsilon=0.1):
    env = gym.make(env_name)

    for episode in range(num_episodes):
        state = env.reset()
        state = np.array(state).reshape(1, -1)
        done = False
        total_reward = 0

        while not done:
            action = q_learning_layer.choose_action(state)  # Use Q-learning layer to choose action

            next_state, reward, done, _ = env.step(action)
            next_state = np.array(next_state).reshape(1, -1)

            # Assuming store_transition and update are methods of the QLearningLayer
            q_learning_layer.store_transition(state, action, reward, next_state, done)
            q_learning_layer.update(batch_size=32)  # Update the Q-learning layer with a batch size of 32

            state = next_state
            total_reward += reward

        logger.info(f'Episode {episode + 1}/{num_episodes}, Total Reward: {total_reward}')

    env.close()

    # Save the trained model
    save_path = 'trained_model.h5'
    q_learning_layer.save_weights(save_path)
    logger.info(f"Model saved successfully at {save_path}")

# Example usage
env_name = 'BipedalWalker-v3'
num_episodes = 1000
epsilon = 0.1  # Define epsilon value for exploration

q_learning_layer = QLearningLayer(action_space_size)
train_model_in_bipedalwalker(env_name, q_learning_layer, num_episodes, epsilon=epsilon)



IndexError: invalid index to scalar variable.

# Eval
Description of the evaluation process. This section should cover how the model is evaluated, including metrics used, test datasets, and interpretation of the results.

In [None]:
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Define or import your custom layers here (RBMLayer, QLearningLayer)
# Ensure these definitions are the same as in your model creation

def load_model(model_path):
    """
    Load the saved model from a specified path.
    """
    try:
        custom_objects = {'RBMLayer': RBMLayer, 'QLearningLayer': QLearningLayer}
        model = models.load_model(model_path, custom_objects=custom_objects)
        logger.info("Model loaded successfully.")
        return model
    except Exception as e:
        logger.error(f"Error loading model: {e}")
        raise

def evaluate_model(model, env_name, num_episodes):
    """
    Evaluate the model on the environment over a number of episodes.
    """
    env = gym.make(env_name)
    total_rewards = []

    for episode in range(num_episodes):
        state = env.reset()
        state = np.array(state).reshape(1, -1)  # Reshape for model input
        done = False
        total_reward = 0

        while not done:
            action = model.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            next_state = np.array(next_state).reshape(1, -1)  # Reshape for model input
            state = next_state
            total_reward += reward

        total_rewards.append(total_reward)
        logger.info(f'Episode: {episode + 1}, Total Reward: {total_reward}')

    env.close()
    avg_reward = np.mean(total_rewards)
    std_reward = np.std(total_rewards)
    logger.info(f'Average Reward over {num_episodes} episodes: {avg_reward}')
    logger.info(f'Standard Deviation of Reward: {std_reward}')
    return avg_reward, std_reward

# Example usage
model_path = 'path_to_your_saved_model_directory/trained_model'  # Adjust this path as needed
loaded_model = load_model(model_path)

env_name = 'BipedalWalker-v3'
num_episodes = 100  # Number of episodes for evaluation

# Evaluate and print model performance
avg_reward, std_reward = evaluate_model(loaded_model, env_name, num_episodes)
print(f'Average Reward: {avg_reward}, Standard Deviation of Reward: {std_reward}')