In [None]:
import gym
import numpy as np
np.set_printoptions(suppress=True)
import IPython.display
from time import sleep
import pickle
import datetime
import tensorflow as tf
from tensorflow import keras as K
from keras.engine import data_adapter
import matplotlib.pyplot as plt
import PIL
%matplotlib inline

In [None]:
def print_stats(env=None, agent=None, i=None, reward=None, clear=False):
    """Print whichever pieces of training state were passed in.

    Args:
        env: Optional environment; its ``render()`` method is called.
        agent: Optional agent; its ``cur_eps`` attribute is printed and its
            ``print_weights()`` method is called.
        i: Optional counter (e.g. the episode index), printed first.
        reward: Optional reward value, printed after ``i``.
        clear: When true, the cell output is cleared before anything is printed.
    """
    if clear:
        # wait=True delays the clear until new output arrives.
        IPython.display.clear_output(wait=True)

    # `i` and `reward` are both plain "print it if it was given" values.
    for value in (i, reward):
        if value is not None:
            print(value)

    if env is not None:
        env.render()

    if agent is not None:
        print('cur_eps', agent.cur_eps)
        agent.print_weights()

In [None]:
# Environment shared by the cells below. The commented-out line is a FrozenLake
# alternative (presumably for the tabular Q-learning cells); CartPole is the active one.
# env = gym.make('FrozenLake-v1',  map_name='4x4', is_slippery=False)#desc=['SH', 'FG'],
env = gym.make('CartPole-v1')


In [None]:
# Smoke test: take a handful of random actions and render the environment after each.
# The previous hard-coded action list was overwritten by a random sample on every
# iteration, so only its length (6) ever mattered.
try:
    env.reset()
    for step in range(6):
        action = env.action_space.sample()
        obs, reward, done, info = env.step(action)
        print_stats(env)
        # sleep(1)
        if done:
            # Stepping a finished episode is undefined until the next reset().
            break
finally:
    env.close()

In [None]:
def eps_gen(set_eps, decay=1.00001):
    """Yield an endless, geometrically decaying sequence of exploration rates.

    The first value yielded is ``set_eps`` itself; every following value is the
    previous one divided by ``decay``.

    Args:
        set_eps: Starting epsilon.
        decay: Divisor applied after each yielded value. The default reproduces
            the previously hard-coded schedule.

    Yields:
        The current epsilon.
    """
    epsilon = set_eps
    while True:
        yield epsilon
        epsilon /= decay

In [None]:
class QLearningAgent:
    """Tabular epsilon-greedy Q-learning agent.

    The Q-table has ``size * size`` rows (one per integer state index) and
    ``moves`` columns (one per action).
    """

    def __init__(self, eps_gen, init_weights=0, size=4, lr=0.45, discount=0.95, moves=4):
        """Create the agent.

        Args:
            eps_gen: Iterator yielding the exploration rate for each non-greedy move.
            init_weights: Initial value of every Q-table entry.
            size: Side length of the state grid; the table gets ``size * size`` rows.
            lr: Learning rate used to blend the old estimate with the new target.
            discount: Discount factor applied to the best next-state value.
            moves: Number of available actions (defaults to the previous fixed 4).
        """
        self.moves = moves
        self.q_table = np.full((size * size, self.moves), init_weights, dtype=float)
        self.lr = lr
        self.discount = discount
        self.eps_gen = eps_gen
        # Set by pick_move(); defined up front so they can be inspected before the first move.
        self.cur_eps = None
        self.lastpos = None
        self.last_action = None

    def set_eps_gen(self, eps_gen):
        """Replace the exploration-rate iterator."""
        self.eps_gen = eps_gen

    def pick_move(self, observation, greedy=False):
        """Choose an action for ``observation`` and remember the state/action pair.

        Args:
            observation: Integer state index (a row of the Q-table).
            greedy: When true, always exploit and leave the epsilon schedule
                untouched, matching ``DeepQLearningAgent.pick_move``.

        Returns:
            The chosen action index.
        """
        self.lastpos = observation
        explore = False
        if not greedy:
            self.cur_eps = next(self.eps_gen)
            explore = np.random.random() < self.cur_eps
        if explore:
            self.last_action = np.random.choice(self.moves)  # explore
        else:
            self.last_action = np.argmax(self.q_table[observation])  # exploit
        return self.last_action

    def update_table(self, observation, reward, done=False):
        """Apply one Q-learning update for the most recent ``pick_move`` call.

        Args:
            observation: State index reached after the last action.
            reward: Reward received for the last action.
            done: Whether ``observation`` is terminal. Terminal states contribute
                no future value, so the target is just ``reward``.

        Raises:
            RuntimeError: If no move has been picked yet.
        """
        if self.lastpos is None or self.last_action is None:
            raise RuntimeError('update_table() called before pick_move()')
        target = reward
        if not done:
            target = target + self.discount * np.max(self.q_table[observation])
        previous = self.q_table[self.lastpos, self.last_action]
        self.q_table[self.lastpos, self.last_action] = (1 - self.lr) * previous + self.lr * target

    def print_weights(self):
        """Print the whole Q-table."""
        print(self.q_table)

In [None]:
# Don't execute this cell if you don't want to use Q-Learning
#
# Tabular Q-learning indexes the Q-table with the observation, so it only works when
# the environment reports integer state indices (e.g. FrozenLake). For any other
# environment the cell is skipped so that "Run All" keeps working.
if isinstance(env.observation_space, gym.spaces.Discrete):
    # Smallest square grid that has a row for every state (2 for a 2x2 map, 4 for 4x4).
    grid_size = int(np.ceil(np.sqrt(env.observation_space.n)))
    agent = QLearningAgent(eps_gen(1), size=grid_size)
    for i in range(1000):
        obs = env.reset()
        done = False
        while not done:
            action = agent.pick_move(obs)
            obs, reward, done, info = env.step(action)
            print_stats(env, agent, i)
            agent.update_table(obs, reward)
    print('Done')
else:
    print('Skipping tabular Q-learning: the environment has no discrete observation space')

In [None]:
class DeepQLearningAgent:
    """Deep Q-learning agent with a ring-buffer replay memory and a target network.

    Two observation encodings are supported: for FrozenLake the integer state is
    turned into a ``(row, column)`` pair; for every other environment the raw
    observation is fed to the network unchanged.
    """

    def __init__(self, eps_gen, env, hidden=None, size=4, lr=0.7, discount=0.95, set_eps=1, replay_memory_capacity=500):
        """Build the online/target networks and allocate the replay memory.

        Args:
            eps_gen: Iterator yielding the exploration rate for each epsilon-greedy move.
            env: Environment; ``action_space.n`` and ``observation_space.shape`` are read.
            hidden: Sizes of the hidden ReLU layers; ``None`` means no hidden layers.
            size: Grid side length used by ``transform_observation``.
            lr: Blend factor between the predicted Q-value and the bootstrapped target.
            discount: Discount factor for future rewards.
            set_eps: Unused; kept so existing calls keep working.
            replay_memory_capacity: Number of transitions the replay memory can hold.
        """
        self.eps_gen = eps_gen
        self.cur_eps = None
        self.moves = env.action_space.n
        print('moves', self.moves)
        self.size = size
        self.discount = discount
        self.lr = lr

        # `unwrapped` strips every wrapper; `env.env` removes only the outermost one and
        # would misclassify FrozenLake whenever more than one wrapper is applied.
        self.cart_pole = not isinstance(env.unwrapped, gym.envs.toy_text.FrozenLakeEnv)

        input_shape = env.observation_space.shape if self.cart_pole else (2,)
        self.model = K.Sequential([K.Input(shape=input_shape)])

        if hidden is None:
            hidden = []
            print('`hidden` was not provided, using 0 hidden layers')
        for units in hidden:
            print(units)
            self.model.add(K.layers.Dense(units, activation="relu"))
        self.model.add(K.layers.Dense(self.moves))
        self._compile_model()

        self.target_model = K.Sequential.from_config(self.model.get_config())
        self.save_target_model()

        obs_shape = env.observation_space.shape
        self.buf_shape = (replay_memory_capacity, )
        self.s_buf = np.zeros(self.buf_shape + obs_shape)
        self.a_buf = np.zeros(self.buf_shape)
        self.r_buf = np.zeros(self.buf_shape)
        self.t_buf = np.zeros(self.buf_shape)
        self.s_prime_buf = np.zeros(self.buf_shape + obs_shape)
        self.buf_index = 0  # next slot to overwrite
        self.buf_count = 0  # number of slots that hold a real transition

    def _compile_model(self):
        """Compile the online network with the optimizer and loss used for training."""
        self.model.compile(optimizer='adam',
                           loss='huber')

    def _as_batch(self, observation):
        """Encode a single observation as a batch of one network input."""
        if not self.cart_pole:
            observation = self.transform_observation(observation)
        return np.expand_dims(observation, 0)

    def set_eps_gen(self, eps_gen):
        """Replace the exploration-rate iterator."""
        self.eps_gen = eps_gen

    def transform_observation(self, observation):
        """Convert integer state indices into stacked ``(row, column)`` grid coordinates."""
        return np.array([observation // self.size, observation % self.size])

    def pick_move(self, observation, random=False, greedy=False, verbose=False):
        """Choose an action.

        Args:
            observation: Current environment observation.
            random: When true, return a uniformly random action and nothing else.
            greedy: When true, always exploit; the epsilon schedule is not advanced.
            verbose: When true, print the predicted Q-values (and, in greedy mode,
                the encoded observation as well).

        Returns:
            The chosen action index.
        """
        if random:
            return np.random.choice(self.moves)

        if not greedy:
            # eps-greedy
            self.cur_eps = next(self.eps_gen)
            if np.random.random() < self.cur_eps:
                return np.random.choice(self.moves)  # explore

        batch = self._as_batch(observation)
        if verbose and greedy:
            print(batch)
        # verbose=0 keeps Keras from printing a progress bar for every single move.
        q_values = self.model.predict(batch, verbose=0)
        if verbose:
            print(q_values)
        return np.argmax(q_values)  # exploit

    def save_sarts(self, sarts): # state, action, reward, terminal, state prime
        """Store one transition, overwriting the oldest entry once the memory is full."""
        state, action, reward, terminal, state_prime = sarts[0], sarts[1], sarts[2], sarts[3], sarts[4]
        slot = self.buf_index
        self.s_buf[slot] = state
        self.a_buf[slot] = action
        self.r_buf[slot] = reward
        self.t_buf[slot] = terminal
        self.s_prime_buf[slot] = state_prime
        capacity = self.buf_shape[0]
        self.buf_index = (slot + 1) % capacity
        self.buf_count = min(self.buf_count + 1, capacity)

    def save_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def update_model(self, size=100, verbose=False):
        """Fit the online network on a random batch drawn from the replay memory.

        Only slots that have actually been written are sampled, so a partially
        filled memory no longer feeds all-zero placeholder transitions to the
        network. With an empty memory the call does nothing.

        Args:
            size: Number of transitions to sample (with replacement).
            verbose: When true, print the batch and the intermediate Q-values.
        """
        if self.buf_count == 0:
            return
        choice = np.random.choice(self.buf_count, size=size)

        if self.cart_pole:
            obs = self.s_buf[choice]
            obs_prime = self.s_prime_buf[choice]
        else:
            obs = self.transform_observation(self.s_buf[choice]).T
            obs_prime = self.transform_observation(self.s_prime_buf[choice]).T
        actions = self.a_buf[choice]
        rewards = self.r_buf[choice]
        done = self.t_buf[choice]

        if verbose:
            for i in obs, actions, rewards, done, obs_prime:
                print(i)

        q_pred = self.model.predict(obs, verbose=0)
        if verbose:
            print('q_pred before:')
            print(q_pred)
        q_pred_prime = self.target_model.predict(obs_prime, verbose=0)
        target_pred = np.max(q_pred_prime, axis=1)
        # Terminal transitions (done == 1) get no bootstrapped future value.
        future_q = rewards + (1 - done) * self.discount * target_pred
        if verbose:
            print('future_q')
            print(future_q)

        # Only the Q-value of the action actually taken is moved towards its target;
        # the other outputs keep their predicted value and so contribute zero error.
        mask = np.arange(q_pred.shape[0]), actions.astype(np.int64)
        q_pred[mask] = (1 - self.lr) * q_pred[mask] + self.lr * future_q
        if verbose:
            print('q_pred after')
            print(q_pred)
        self.model.fit(obs, q_pred)

    def print_weights(self):
        """Print every weight array of the online network."""
        for l in self.model.get_weights():
            print(l)

    def save_agent(self):
        """Pickle the online network's config and weights to a timestamped file.

        Returns:
            The name of the file that was written.
        """
        config = self.model.get_config()
        weights = self.model.get_weights()
        name = 'model' + datetime.datetime.now().isoformat(timespec='seconds').replace(':', ' ') + '.pickle'
        with open(name, 'wb') as f:
            pickle.dump((config, weights), f)
        return name

    def load_agent(self, name):
        """Replace both networks with the model stored in ``name``.

        SECURITY: ``pickle.load`` can execute arbitrary code contained in the file.
        Only load files written by ``save_agent`` on a machine you trust.
        """
        with open(name, 'rb') as f:
            (config, weights) = pickle.load(f)

        self.model = K.Sequential.from_config(config)
        self.target_model = K.Sequential.from_config(config)
        self.model.set_weights(weights)
        # A model rebuilt from its config is not compiled; without this a later
        # update_model() call would fail inside fit().
        self._compile_model()
        self.save_target_model()
        
#for i in range(16):
#    print(dq.pick_move(i, greedy=True))
# for i in range(500):
#     dq.save_sarts(np.random.choice(4, size=5))
# dq.update_model()
#dq.model.summary()

In [None]:
# Create the deep Q-learning agent and the list that collects per-episode results.
replay_memory_capacity = 100000
agent = DeepQLearningAgent(
    eps_gen(1),
    env,
    replay_memory_capacity=replay_memory_capacity,
    lr=0.01,
    hidden=[4],
    size=2,
)
rewards = []

# Pre-fill the replay memory: play random episodes until as many transitions
# have been stored as the memory can hold.
counter = 0
while counter < replay_memory_capacity:
    obs = env.reset()
    done = False
    while not done and counter < replay_memory_capacity:
        # random=True bypasses the network and the epsilon schedule entirely
        action = agent.pick_move(obs, random=True)
        obs_prime, reward, done, info = env.step(action)

        agent.save_sarts([obs, action, reward, int(done), obs_prime])
        obs = obs_prime
        counter += 1

In [None]:
# Print the agent's current weights for inspection.
agent.print_weights()

In [None]:
# Same call as the previous cell: print the agent's weights again.
agent.print_weights()

In [None]:
# Give the agent a fresh exploration schedule that starts at epsilon = 1.
agent.set_eps_gen(eps_gen(1))

In [None]:
# Train the deep Q-learning agent: play one epsilon-greedy episode, store every
# transition, then fit the network once on a sampled batch.
try:
    for episode in range(500):
        obs = env.reset()
        # Sum of all rewards in this episode. Recording only the last step's reward
        # (as before) is always 1.0 on CartPole and says nothing about episode length.
        episode_reward = 0.0
        done = False
        while not done:
            action = agent.pick_move(obs, verbose=True)
            obs_prime, reward, done, info = env.step(action)
            agent.save_sarts([obs, action, reward, int(done), obs_prime])
            obs = obs_prime
            episode_reward += reward
            print_stats(env=None, agent=None, i=episode, clear=True)

        agent.update_model(size=1000, verbose=False)
        rewards.append(episode_reward)
        # Sync the target network every 10 episodes.
        if episode % 10 == 0:
            agent.save_target_model()

    print('Done')
finally:
    env.close()

In [None]:
# Plot the values the training loop appended to `rewards` (one entry per episode).
plt.plot(rewards)#np.cumsum(rewards))# / np.arange(1, len(rewards)+1))

In [None]:
# Evaluation sweep: start the pole at a fixed angle i/N (everything else zero),
# play one episode and report the total reward for that starting angle.
try:
    N = 100
    for i in reversed(range(N)):
        env.reset()
        obs = np.array([0, 0, i/N, 0])
        # The state has to be written to the innermost environment. `env.env` is only
        # the next wrapper layer when several wrappers are stacked, and assigning
        # `state` there would silently leave the real start state unchanged.
        env.unwrapped.state = obs
        r = 0
        done = False
        while not done:
            # action = agent2.pick_move(obs, greedy=True,verbose=False)
            action = env.action_space.sample()
            obs, reward, done, info = env.step(action)
            print_stats(env=env, agent=None)
            r += reward
        print('Done', i/N, r)
finally:
    env.close()

In [None]:
# Build an agent with default arguments (no hidden layers), then replace its
# networks with the model stored in the pickle file.
# NOTE(review): load_agent unpickles this file; only load files you created yourself.
agent2 = DeepQLearningAgent(eps_gen(1), env)
agent2.load_agent('model2022-04-22T13 27 18.pickle')

In [None]:
# Recorded "Done <angle> <total reward>" lines, two runs side by side per row.
# NOTE(review): presumably left = trained agent, right = random actions; confirm.
a = \
'''Done 0.2  147.0 Done 0.2 3.0
Done 0.19 155.0 Done 0.19 24.0
Done 0.18 173.0 Done 0.18 15.0
Done 0.17 174.0 Done 0.17 25.0
Done 0.16 177.0 Done 0.16 12.0
Done 0.15 179.0 Done 0.15 28.0
Done 0.14 215.0 Done 0.14 7.0
Done 0.13 214.0 Done 0.13 29.0
Done 0.12 225.0 Done 0.12 9.0
Done 0.11 224.0 Done 0.11 15.0
Done 0.1  299.0 Done 0.1 26.0
Done 0.09 314.0 Done 0.09 14.0
Done 0.08 316.0 Done 0.08 19.0
Done 0.07 315.0 Done 0.07 27.0
Done 0.06 321.0 Done 0.06 18.0
Done 0.05 326.0 Done 0.05 35.0
Done 0.04 336.0 Done 0.04 18.0
Done 0.03 500.0 Done 0.03 19.0
Done 0.02 500.0 Done 0.02 11.0
Done 0.01 500.0 Done 0.01 14.0
Done 0.0  500.0 Done 0.0 12.0'''

# Reprint each row as three tab-separated columns: angle, first reward, second reward.
for line in a.splitlines():
    _, angle, reward, _, _, r2 = line.split()
    print('\t\t\t'.join((angle, reward, r2)))