In [10]:
!pip3 install --upgrade pip
!pip3 install -r requirements.txt



In [11]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# import gymnasium as gym
import gym
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam # adaptive momentum 
import random
import tensorflow as tf
import datetime
# from datetime import datetime
import os


# Tensorboard

In [12]:
%load_ext tensorboard
%tensorboard --logdir logs/fit
# Load the TensorBoard notebook extension

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 82117), started 2 days, 0:08:23 ago. (Use '!kill 82117' to kill it.)

In [13]:
# Clear any logs from previous runs
!rm -rf ./logs/ 

# DQL agent

In [14]:
class DQLAgent(): 
    
    def __init__(self, env, model=None):
        # parameters and hyperparameters
        
        # this part is for neural network or build_model()
        self.state_size = env.observation_space.shape[0] # this is for input of neural network node size
        self.action_size = env.action_space.n # this is for out of neural network node size
        
        # this part is for replay()
        self.gamma = 0.95
        self.learning_rate = 0.001
        
        # this part is for adaptiveEGreedy()
        self.epsilon = 1 # initial exploration rate
        self.epsilon_decay = 0.995
        self.epsilon_min = 0.01
        
        self.memory = deque(maxlen = 1000) # a list with 1000 memory, if it becomes full first inputs will be deleted
        # Use existing model if need it
        self.model = model if model is not None else self.build_model()
    
    def build_model(self):
        # neural network for deep Q learning
        model = Sequential()
        model.add(Dense(48, input_dim = self.state_size, activation = 'tanh')) # first hidden layer
        model.add(Dense(self.action_size, activation = 'linear')) # output layer
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate), metrics=['accuracy', 'mse'])


        current_time =  datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        # Saving the model
        model_name = f"model_{current_time}"
        model.save(f'{model_name}.keras')

        # Define the directory
        directory = './weights/'

        # Saving weights
        filepath = f'./weights/{model_name}.weights.h5'

        if not os.path.exists(directory):
            os.makedirs(directory)
 
        model.save_weights(filepath, overwrite=True)

        return model
    
    def remember(self, state, action, reward, next_state, done):
        # storage
        self.memory.append((state, action, reward, next_state, done))
    
    def act(self, state):
        # acting, exploit or explore
        if random.uniform(0,1) <= self.epsilon:
            return env.action_space.sample()
        else:
            act_values = self.model.predict(state)
            return np.argmax(act_values[0])
            
    
    def replay(self, batch_size):
        # training
        
        if len(self.memory) < batch_size:
            return # memory is still not full
        
        minibatch = random.sample(self.memory, batch_size) # take 16 (batch_size) random samples from memory
        for state, action, reward, next_state, done in minibatch:
            if done: # if the game is over, I dont have next state, I just have reward 
                target = reward
            else:
                target = reward + self.gamma * np.amax(self.model.predict(next_state)[0]) 
                # target = R(s,a) + gamma * max Q`(s`,a`)
                # target (max Q` value) is output of Neural Network which takes s` as an input 
                # amax(): flatten the lists (make them 1 list) and take max value

            train_target = self.model.predict(state) # s --> NN --> Q(s,a)=train_target
            train_target[0][action] = target
            ## Tensorboard Logs
            log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
            tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)
            history = self.model.fit(state, train_target, verbose = 0,callbacks=[tensorboard_callback]) # verbose: dont show loss and epoch
            print(history.history)
    
    def adaptiveEGreedy(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
            


# Train model - Unncoment this to train a new model

In [17]:
if __name__ == "__main__":
    # initialize gym environment and agent
    env = gym.make('CartPole-v1', render_mode='human')
    agent = DQLAgent(env)

    batch_size = 16
    episodes = 5
    for e in range(episodes):
        print(episodes)
        
        # initialize environment
        state = env.reset()
        state = np.reshape(state[0], [1,4])
        
        time = 0 # each second I will get reward, because I want to sustain a balance forever
        while True:
            
            # act
            action = agent.act(state)
            
            # step
            next_state, reward, done, _, _ = env.step(action)
            next_state = np.reshape(next_state, [1,4])
            env.render()
            
            # remember / storage
            agent.remember(state, action, reward, next_state, done)
            
            # update state
            state = next_state
            
            # replay
            agent.replay(batch_size)
            
            # adjust epsilon
            agent.adaptiveEGreedy()
            
            time += 1
            
            if done:
                print('episode: {}, time: {}'.format(e, time))
                break
    

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  if not isinstance(terminated, (bool, np.bool8)):


5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step
[1

# Test preloaded model

In [16]:

import time

from keras.models import load_model

# Specify the directory where models are stored
model_directory = './models'
model_files = sorted([f for f in os.listdir(model_directory) if f.endswith('.keras')],
                     key=lambda x: os.path.getmtime(os.path.join(model_directory, x)),
                     reverse=True)

# Load the most recent model if no specific model is given
if not model_files:
    raise FileNotFoundError("No model files found in the specified directory.")

model_path = os.path.join(model_directory, model_files[0])
model = load_model(model_path)

# Create DQLAgent agent with preloaded mdodel
env = gym.make('CartPole-v1', render_mode='human')
agent = DQLAgent(env, model=model)

trained_model = agent # Now I have trained agent 
state = env.reset() # Game will start with inital random state
state = np.reshape(state[0], [1,4])
time_t = 0

while True:
    env.render()
    action = trained_model.act(state)
    next_state, reward, done, _, _ = env.step(action)
    next_state = np.reshape(next_state, [1,4])
    state = next_state 
    time_t += 1 
    print(time_t) 
    time.sleep(0.01)
    if done:
        env.close()
        break

print('Done')

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
Done
