# 0. Install Dependencies

In [1]:
!pip install tensorflow==2.3.0
!pip install gym
!pip install keras
!pip install keras-rl2

Collecting tensorflow==2.3.0
  Downloading tensorflow-2.3.0-cp38-cp38-manylinux2010_x86_64.whl (320.5 MB)
[K     |████████████████████████████████| 320.5 MB 2.2 kB/s  eta 0:00:01  |▉                               | 8.4 MB 2.4 MB/s eta 0:02:13     |██▊                             | 27.6 MB 2.4 MB/s eta 0:02:05    |█████████████████████████████▊  | 298.1 MB 9.8 MB/s eta 0:00:03
Collecting numpy<1.19.0,>=1.16.0
  Downloading numpy-1.18.5-cp38-cp38-manylinux1_x86_64.whl (20.6 MB)
[K     |████████████████████████████████| 20.6 MB 22.0 MB/s eta 0:00:01B 22.0 MB/s eta 0:00:01
Collecting scipy==1.4.1
  Downloading scipy-1.4.1-cp38-cp38-manylinux1_x86_64.whl (26.0 MB)
[K     |████████████████████████████████| 26.0 MB 65.4 MB/s eta 0:00:01     |████████████████████████▉       | 20.2 MB 65.4 MB/s eta 0:00:01
Collecting tensorflow-estimator<2.4.0,>=2.3.0
  Downloading tensorflow_estimator-2.3.0-py2.py3-none-any.whl (459 kB)
[K     |████████████████████████████████| 459 kB 30.6 MB/s eta 0:00:01

# 1. Test Random Environment with OpenAI Gym

In [2]:
from gym import Env
from gym.spaces import Discrete, Box
import numpy as np
import random




In [5]:
a = np.array([0,1,2,3,4,5,6,7,8,9])
a[(6 - 6%4):(6 - 6%4 + 4)] = [0,0,0,0]
a

array([0, 1, 2, 3, 0, 0, 0, 0, 8, 9])

In [259]:
import sympy

class MathEquationEnv(Env):
    def __init__(self):
        self.action_space = Discrete(40)
        
        self.observation_space = Discrete(40)
        
        self.state = self.random_state()
        
        self.generator_length = 60
        
        x,y = sympy.symbols('x y')
        self.x = x
        self.y = y
    
    def random_state(self):
        random_state = np.array([]).astype(int)
        for i in range(10):
            a = np.array([1,0,0,0]).astype(int)
            random.shuffle(a)
            random_state = np.concatenate((random_state, a))
        return random_state
    
    def state_to_equation(self, eqNr):        
        if eqNr == 0:
            start, end = 0, 16
        elif eqNr == 1:
            start, end = 20, 36
            
        left = '0'
        for i in range(start, end):
            if self.state[i] == 1:
                if i % 4 == 1:
                    left+=('+'+str(random.randint(1,10)))
                elif i % 4 == 2:
                    left+='+x'
                elif i % 4 == 3:
                    left+='+y'
                    
        right = 0
        if self.state[end] == 1:
            right = 0
        elif self.state[end+1] == 1:
            right = random.randint(1,10)
        elif self.state[end+2] == 1:
            right = self.x
        elif self.state[end+3] == 1:
            right = self.y
            
        return left, right
            
    def step(self, action):
        self.generator_length-=1
        info = {}
        reward = -1
        if self.state[action] == 1:
            reward-=1
            #return self.state, reward, False, info
        else:
            self.state[(action - action%4) : (action - action%4 + 4)] = [0,0,0,0]
            self.state[action] = 1
            
            
        #x,y = sympy.symbols('x y')
        left1, right1 = self.state_to_equation(0)
        eq1 = sympy.Eq(sympy.sympify(left1), right1)
                
        left2, right2 = self.state_to_equation(1)
        eq2 = sympy.Eq(sympy.sympify(left2), right2)
        
        done = False      
        if self.generator_length <= 0:
            done = True
        #print(sympy.sympify(left1), right1)
        #print(sympy.sympify(left2), right2)
        #print('RESULT', eq1, eq2)
        if eq1 == True or eq2 == True or eq1 == False or eq2 == False:
            return self.state, reward, done, info
        
        result = sympy.linsolve([eq1, eq2], (self.x, self.y))
        if type(result) == sympy.sets.sets.EmptySet:
            reward-=1
        else:
            reward=8
            done = True
            
        return self.state, reward, done, info
        
    def render(self):
        # Implement viz
        pass
    
    def reset(self):
        init_state = np.array([]).astype(int)
        for i in range(10):
            a = np.array([1,0,0,0]).astype(int)
            random.shuffle(a)
            init_state = np.concatenate((init_state, a))
        self.state = init_state
        
        self.generator_length = 60
        
        return self.state

In [260]:
env = MathEquationEnv()

In [261]:
env.observation_space.sample()

10

In [262]:
episodes = 40
for episode in range(1, episodes+1):
    state = env.reset()
    done = False
    score = 0 
    
    while not done:
        #env.render()
        #print('hmm')
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score+=reward
    print('Episode:{} Score:{}'.format(episode, score))

Episode:1 Score:8
Episode:2 Score:4
Episode:3 Score:8
Episode:4 Score:2
Episode:5 Score:4
Episode:6 Score:8
Episode:7 Score:6
Episode:8 Score:8
Episode:9 Score:-6
Episode:10 Score:8
Episode:11 Score:8
Episode:12 Score:8
Episode:13 Score:8
Episode:14 Score:8
Episode:15 Score:8
Episode:16 Score:6
Episode:17 Score:5
Episode:18 Score:-9
Episode:19 Score:7
Episode:20 Score:8
Episode:21 Score:8
Episode:22 Score:8
Episode:23 Score:8
Episode:24 Score:8
Episode:25 Score:8
Episode:26 Score:8
Episode:27 Score:8
Episode:28 Score:8
Episode:29 Score:8
Episode:30 Score:7
Episode:31 Score:8
Episode:32 Score:8
Episode:33 Score:8
Episode:34 Score:8
Episode:35 Score:8
Episode:36 Score:1
Episode:37 Score:8
Episode:38 Score:2
Episode:39 Score:8
Episode:40 Score:8


# 2. Create a Deep Learning Model with Keras

In [263]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam

In [264]:
env.observation_space.n

40

In [265]:
states = env.observation_space.shape
print(states)
actions = env.action_space.n

()


In [266]:
env.action_space

Discrete(40)

In [267]:
def build_model(states, actions):
    model = Sequential()    
    model.add(Dense(40, activation='relu', input_shape=(1,40) )  )
    model.add(Dense(24, activation='relu'))
    model.add(Dense(actions, activation='linear'))
    model.add(Flatten())
    return model

In [268]:
del model 

In [269]:
model = build_model(states, actions)

In [270]:
model.output_shape

(None, 40)

In [271]:
model.summary()

Model: "sequential_26"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_65 (Dense)             (None, 1, 40)             1640      
_________________________________________________________________
dense_66 (Dense)             (None, 1, 24)             984       
_________________________________________________________________
dense_67 (Dense)             (None, 1, 40)             1000      
_________________________________________________________________
flatten_6 (Flatten)          (None, 40)                0         
Total params: 3,624
Trainable params: 3,624
Non-trainable params: 0
_________________________________________________________________


# 3. Build Agent with Keras-RL

In [272]:
from rl.agents import DQNAgent
from rl.policy import BoltzmannQPolicy
from rl.memory import SequentialMemory

In [273]:
def build_agent(model, actions):
    policy = BoltzmannQPolicy()
    memory = SequentialMemory(limit=50000, window_length=1)
    dqn = DQNAgent(model=model, memory=memory, policy=policy, 
                  nb_actions=actions, nb_steps_warmup=10, target_model_update=1e-2)
    return dqn

In [274]:
tuple(model.input.shape)

(None, 1, 40)

In [276]:
dqn = build_agent(model, actions)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])
dqn.fit(env, nb_steps=5000, visualize=False, verbose=1)

Training for 5000 steps ...
Interval 1 (0 steps performed)


<tensorflow.python.keras.callbacks.History at 0x7f2ec16b6d90>

In [277]:
scores = dqn.test(env, nb_episodes=50, visualize=False)
print(np.mean(scores.history['episode_reward']))

Testing for 50 episodes ...
Episode 1: reward: 8.000, steps: 1
Episode 2: reward: 8.000, steps: 1
Episode 3: reward: 8.000, steps: 1
Episode 4: reward: 8.000, steps: 1
Episode 5: reward: 8.000, steps: 1
Episode 6: reward: 8.000, steps: 1
Episode 7: reward: 8.000, steps: 1
Episode 8: reward: 8.000, steps: 1
Episode 9: reward: 8.000, steps: 1
Episode 10: reward: 8.000, steps: 1
Episode 11: reward: 8.000, steps: 1
Episode 12: reward: 8.000, steps: 1
Episode 13: reward: 8.000, steps: 1
Episode 14: reward: 8.000, steps: 1
Episode 15: reward: 8.000, steps: 1
Episode 16: reward: 8.000, steps: 1
Episode 17: reward: 8.000, steps: 1
Episode 18: reward: 8.000, steps: 1
Episode 19: reward: 8.000, steps: 1
Episode 20: reward: 8.000, steps: 1
Episode 21: reward: 8.000, steps: 1
Episode 22: reward: 8.000, steps: 1
Episode 23: reward: 6.000, steps: 2
Episode 24: reward: 8.000, steps: 1
Episode 25: reward: 8.000, steps: 1
Episode 26: reward: 7.000, steps: 2
Episode 27: reward: 8.000, steps: 1
Episode 2

# 4. Reloading Agent from Memory

In [30]:
dqn.save_weights('dqn_weights.h5f', overwrite=True)

In [31]:
del model
del dqn
del env

In [9]:
env = gym.make('CartPole-v0')
actions = env.action_space.n
states = env.observation_space.shape[0]
model = build_model(states, actions)
dqn = build_agent(model, actions)
dqn.compile(Adam(lr=1e-3), metrics=['mae'])

In [10]:
dqn.load_weights('dqn_weights.h5f')

In [11]:
_ = dqn.test(env, nb_episodes=5, visualize=True)

Testing for 5 episodes ...
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Episode 1: reward: 200.000, steps: 200
Episode 2: reward: 200.000, steps: 200
Episode 3: reward: 200.000, steps: 200
Episode 4: reward: 200.000, steps: 200
Episode 5: reward: 200.000, steps: 200
