In [2]:
import os
import datetime
import numpy as np
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' # Avoid TF Debug Warnings

In [96]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import callbacks

### Gather the data

In [7]:
from dataclasses import dataclass

In [8]:
@dataclass
class Memory:
    """One recorded environment step (a single transition).

    Instances are created in the data-gathering loop from the values
    returned by ``env.step`` and are appended to ``memory_list``.
    """
    # Observation the agent had before taking the action.
    prev_obs: int
    # Action that was taken from ``prev_obs``.
    action: int
    # Observation returned by the environment after the action.
    actual_obs: int
    # Reward returned by the environment for this step.
    reward: float
    # Whether the environment reported the episode as finished after this step.
    done: bool

In [13]:
import gym

In [17]:
# Number of episodes played while collecting experience.
EPISODES = 1000
# Receives one Memory record per environment step, across all episodes.
# NOTE: the collection loop lives in a separate cell, so re-running that cell
# without re-running this one appends to the existing list.
memory_list = []

In [76]:
# Environment that experience is collected from.
# NOTE(review): is_slippery=False presumably makes every move deterministic —
# confirm against the Gym FrozenLake documentation.
env = gym.make("FrozenLake-v1", is_slippery=False)

In [77]:
# Play EPISODES episodes with random actions and log every step taken.
for ep in range(EPISODES):
    obs = env.reset()
    while True:
        # Exploration only: pick a random action from the action space.
        action = env.action_space.sample()
        next_obs, reward, done, _ = env.step(action)

        # Store the transition before moving on to the new observation.
        mem = Memory(prev_obs=obs, action=action, actual_obs=next_obs,
                     reward=reward, done=done)
        memory_list.append(mem)

        obs = next_obs
        if done:
            # The environment ended the episode; start the next one.
            break

In [78]:
# Total number of transitions (environment steps) collected so far.
len(memory_list)

15625

### Define the model

In [79]:
# Q-network: takes one scalar observation and outputs one value per action.
# The input shape is written as the tuple (1,) because Keras documents `shape`
# as a shape tuple; a bare int is only accepted by some versions.
model = Sequential(
    [
        Input(shape=(1,), name="input"),
        # Four identical fully connected hidden layers.
        Dense(16, activation="relu", name="hidden0"),
        Dense(16, activation="relu", name="hidden1"),
        Dense(16, activation="relu", name="hidden2"),
        Dense(16, activation="relu", name="hidden3"),
        # One unit per action (the training code indexes this output by action).
        # The sigmoid keeps every predicted value inside (0, 1).
        Dense(4, activation="sigmoid", name="output"),
    ]
)

In [80]:
# Mean-squared error between predicted and target Q-values, optimised with Adam.
# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# newer Keras releases reject.
model.compile(loss="mse", optimizer=Adam(learning_rate=0.1))

### Train the model

In [81]:
import numpy as np
import random

In [82]:
# Number of transitions drawn from memory_list when building a training sample.
batch_size = 64
# Discount factor applied to the best next-observation Q-value inside learn().
gamma = 0.9

In [83]:
# Log directory of the form logs/<YYYYmmdd-HHMMSS>, built from the current time.
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
# Keras TensorBoard callback writing to logdir.
# NOTE(review): no visible cell passes this callback to model.fit, so as shown
# nothing is logged through it — confirm whether learn() should use it.
tensorboard_callback = callbacks.TensorBoard(logdir, histogram_freq=1)

In [93]:
# Split the recorded transitions into one NumPy array per field; every array
# follows the order of memory_list.
prev_obs_arr = np.array([mem.prev_obs for mem in memory_list])
next_obs_arr = np.array([mem.actual_obs for mem in memory_list])
# NOTE(review): done_arr and reward_arr are not read by any cell shown here.
done_arr = np.array([mem.done for mem in memory_list])
reward_arr = np.array([mem.reward for mem in memory_list])

In [85]:
# Current model predictions for every stored previous observation ...
prev_obs_Q = model.predict(prev_obs_arr)
# ... and for every stored next observation.
next_obs_Q = model.predict(next_obs_arr)

In [92]:
# NOTE: this binds a second name to the same array object — no copy is made,
# so any in-place edit of target also changes prev_obs_Q.
target = prev_obs_Q

In [None]:
# Display the target array (this prints the whole array, not a preview).
target

In [90]:
# Draw batch_size transitions from the memory at random, without replacement.
# NOTE(review): no random seed is set in the visible cells, so the sample
# differs from run to run.
memory_sample = random.sample(memory_list, batch_size)


In [94]:
from typing import List

In [97]:
def learn(memory_sample: List[Memory], model: Model):
    """Apply one Q-learning update to ``model`` for every transition given.

    For each transition, the model's current prediction for the previous
    observation is used as the regression target, except for the entry of
    the action that was actually taken. That entry is replaced by:

    * ``reward`` when the transition ended the episode (``mem.done``), or
    * ``reward + gamma * max(Q(actual_obs))`` otherwise.

    The model is then fitted for one epoch on that single
    (observation, target) pair, so later transitions in the sample are
    evaluated with the weights already updated by earlier ones.

    Args:
        memory_sample: Transitions to learn from. An empty list leaves the
            model untouched.
        model: Model exposing ``predict`` and ``fit`` that returns one value
            per action for an observation. It is trained in place.

    Returns:
        The same ``model`` object that was passed in.

    Note:
        The discount factor is read from the notebook-level ``gamma``.
    """
    for mem in memory_sample:
        prev_obs = np.array([mem.prev_obs])
        target = model.predict(prev_obs, verbose=0)

        if mem.done:
            # No future value after the final step of an episode.
            action_target = mem.reward
        else:
            # Wrap the observation in an array, exactly as done for prev_obs,
            # so both predict calls receive the same kind of input.
            next_obs = np.array([mem.actual_obs])
            next_q_values = model.predict(next_obs, verbose=0)  # one value per action
            action_target = mem.reward + gamma * np.max(next_q_values)

        # Update the entry of the action stored in THIS transition; the
        # notebook-level `action` only holds the last action sampled while
        # gathering data.
        target[0, mem.action] = action_target
        model.fit(prev_obs, target, epochs=1, verbose=0)

    return model

In [95]:
# Display the model object (shows its default repr).
model

<tensorflow.python.keras.engine.sequential.Sequential at 0x7f05de853f10>

In [75]:
# Spot check: the values the model currently predicts for observation 14.
model.predict(np.array([14]))

array([[0.48881668, 0.46771443, 0.39897805, 0.4786297 ]], dtype=float32)

In [69]:
# Load the TensorBoard notebook extension (needed before the %tensorboard
# magic in the next cell can be used).
%load_ext tensorboard

In [70]:
import tensorboard
# Open TensorBoard inline on "logs", the parent folder under which the
# timestamped logdir is created.
%tensorboard --logdir logs

In [49]:
# NOTE(review): in the cells shown, Q_actions is only assigned inside learn(),
# where it is a local variable. This cell presumably relied on a notebook-level
# Q_actions from a cell that is no longer present, and would raise NameError on
# a fresh kernel — confirm and remove or redefine.
np.max(Q_actions)

0.5