In [6]:
import numpy as np
import gym
import random
from IPython.display import clear_output
from time import sleep


def print_frames(frames):
    for i, frame in enumerate(frames):
        clear_output(wait=True)
        print(frame['frame'].getvalue())
        print(f"Timestep: {i + 1}")
        print(f"State: {frame['state']}")
        print(f"Action: {frame['action']}")
        print(f"Reward: {frame['reward']}")
        print(f"Total Reward: {frame['total reward']}")
    
        sleep(.5)
 

# Q-Learning - Taksi problem 

<img src="taxi.png" height='200' width='200'>

## Hiperparametri

In [7]:
env = gym.make('Taxi-v2')
lr = 0.6
gamma = 0.8
decay_rate = 0.01
n_episodes = 5000

## Treniranje - apdejtovanje q-table

In [None]:
q_table = np.zeros((env.env.nS, env.env.nA))
for episode in range(n_episodes):
    s = env.reset()
    epsilon = np.exp(-decay_rate*episode)
    total_reward = 0  
    while True:
        if epsilon < random.uniform(0,1):
            a = np.argmax(q_table[s,:])
        else:
            a = env.action_space.sample()
        new_s, reward, done, info = env.step(a)
        q_table[s,a] += lr * (reward + gamma*np.max(q_table[new_s,:]) - q_table[s,a])
        s = new_s
        total_reward += reward
        if done:
            break
    if episode % 1000 == 0:
        print('Epizoda: {}; nagrada: {}'.format(episode, total_reward))
        

## Primer netrenirane igre

In [15]:
frames = []
done = False
s = env.reset()
total_reward = 0
while not done:
    a = env.action_space.sample()
    next_s, reward, done, info = env.step(a)
    total_reward += reward
    frames.append({
        'frame': env.render(mode='ansi'),
        'state': s,
        'action': a,
        'reward':reward,
        'total reward': total_reward
    })
    s = next_s
    if len(frames) == 25:
        break

print_frames(frames)

+---------+
|[34;1mR[0m: | : :G|
| :[43m [0m: : : |
| : : : : |
| | : | : |
|[35mY[0m| : |B: |
+---------+
  (North)

Timestep: 25
State: 222
Action: 1
Reward: -1
Total Reward: -97


## Primer trenirane igre

In [17]:
frames = []
done = False
s = env.reset()
total_reward = 0
while not done:
    a = np.argmax(q_table[s,:])
    next_s, reward, done, info = env.step(a)
    total_reward += reward
    frames.append({
        'frame': env.render(mode='ansi'),
        'state': s,
        'action': a,
        'reward':reward,
        'total reward': total_reward
    })
    s = next_s

print_frames(frames)

+---------+
|R: | : :G|
| : : : : |
| : : : : |
| | : | : |
|[35m[42mY[0m[0m| : |B: |
+---------+
  (Dropoff)

Timestep: 11
State: 418
Action: 5
Reward: 20
Total Reward: 10


# Q-Learning - Frozen Lake

<img src="frozenlake.png" height='200' width='200'>

In [64]:
env = gym.make('FrozenLake-v0')

In [74]:
q_table = np.zeros((env.env.nS, env.env.nA))
n_episodes = 15000
lr = 0.8
gamma = 0.9
decay_rate = 0.005

In [75]:
for episode in range(n_episodes):
    s = env.reset()
    total_reward = 0
    epsilon = np.exp(-decay_rate*episode)
    frames = []
    done = False
    while not done:
        if epsilon < random.uniform(0,1):
            a = np.argmax(q_table[s,:])
        else:
            a = env.action_space.sample()

        new_s, reward, done, info = env.step(a)
        q_table[s,a] += lr * (reward + gamma*np.max(q_table[new_s,:]) - q_table[s,a])
        s = new_s
        total_reward += reward
                    
    if episode % 1000 == 0:
        print('Epizoda {}: nagrada {}'.format(episode, total_reward))

    

Epizoda 0: nagrada 0.0
Epizoda 1000: nagrada 0.0
Epizoda 2000: nagrada 1.0
Epizoda 3000: nagrada 0.0
Epizoda 4000: nagrada 1.0
Epizoda 5000: nagrada 1.0
Epizoda 6000: nagrada 1.0
Epizoda 7000: nagrada 1.0
Epizoda 8000: nagrada 0.0
Epizoda 9000: nagrada 1.0
Epizoda 10000: nagrada 1.0
Epizoda 11000: nagrada 1.0
Epizoda 12000: nagrada 1.0
Epizoda 13000: nagrada 0.0
Epizoda 14000: nagrada 0.0


In [76]:
s = env.reset()
step = 0
frames = []
done = False
while not done:
    a = np.argmax(q_table[s,:])
    next_s, reward, done, info = env.step(a)
    s = next_s
    step += 1
    frames.append({
            'frame': env.render(mode='ansi'),
            'state': s,
            'action': a,
            'reward': reward,
            'total reward': total_reward
        })
    

In [77]:
print_frames(frames)

  (Down)
SFFF
FHFH
FFFH
HFF[41mG[0m

Timestep: 29
State: 15
Action: 1
Reward: 1.0
Total Reward: 0.0


# Space Invaders - Tensorflow

<img src='spaceinvaders.gif' height='200' width='200'>

In [1]:
import tensorflow as tf
import numpy as np
import gym

from skimage import transform
from skimage.color import rgb2gray

import matplotlib.pyplot as plt

from collections import deque

import random
import warnings
warnings.filterwarnings('ignore')


In [2]:
env = gym.make('SpaceInvaders-v4')
print('Velicina frejma: {}'.format(env.observation_space))
print('Broj mogucih akcija: {}'.format(env.action_space.n))
possible_actions = np.array(np.identity(env.action_space.n, dtype=int).tolist())

Velicina frejma: Box(210, 160, 3)
Broj mogucih akcija: 6


## Hyperparameters

In [3]:
### PREPROCESSING HYPERPARAMETERS
stack_size = 4

### MODEL HYPERPARAMETERS
state_size = [110, 84, stack_size]
action_size = env.action_space.n
lr = 2.5e-4

### Q-Learning HYPERPARAMETERS
gamma = 0.9

### Exploration/Exploatation HYPERPARAMETERS
decay_rate = 1e-5

### TRAINING HYPERPARAMETERS
n_episodes = 50
max_steps = 50000
batch_size = 64

### MEMORY HYPERPARAMETERS
pretrain_length = batch_size
memory_size = int(1e6)

### DISPLAY HYPERPARAMETERS
training = False
episode_render = False

## Preprocessing

In [4]:
def preprocess_frame(frame):
    gray = rgb2gray(frame)
    cropped_frame = gray[8:-12, 4:-12]
    norm_frame = cropped_frame/255.
    preprocessed_frame = transform.resize(norm_frame, [110,84])
    
    return preprocessed_frame    

In [5]:
stacked_frames = deque([np.zeros((110,84), dtype=int) for i in range(stack_size)], maxlen=4)

def stack_frames(stacked_frames, state, is_new_episode):
    frame = preprocess_frame(state)
    if is_new_episode:
        stacked_frames = stacked_frames = deque([np.zeros((110,84), dtype=int) for i in range(stack_size)], maxlen=4)
        for _ in range(4):
            stacked_frames.append(frame)
    else:
        stacked_frames.append(frame)
    stacked_state = np.stack(stacked_frames, axis=2)
    return stacked_state, stacked_frames


In [6]:
class DQNetwork:
    def __init__(self, state_size, action_size, learning_rate, name='DQNetwork'):
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        
        with tf.variable_scope(name):
            # We create the placeholders
            # *state_size means that we take each elements of state_size in tuple hence is like if we wrote
            # [None, 84, 84, 4]
            self.inputs_ = tf.placeholder(tf.float32, [None, *state_size], name="inputs")
            self.actions_ = tf.placeholder(tf.float32, [None, self.action_size], name="actions_")
            
            # Remember that target_Q is the R(s,a) + ymax Qhat(s', a')
            self.target_Q = tf.placeholder(tf.float32, [None], name="target")
            
            """
            First convnet:
            CNN
            ELU
            """
            # Input is 110x84x4
            self.conv1 = tf.layers.conv2d(inputs = self.inputs_,
                                         filters = 32,
                                         kernel_size = [8,8],
                                         strides = [4,4],
                                         padding = "VALID",
                                          kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                         name = "conv1")
            
            self.conv1_out = tf.nn.elu(self.conv1, name="conv1_out")
            
            """
            Second convnet:
            CNN
            ELU
            """
            self.conv2 = tf.layers.conv2d(inputs = self.conv1_out,
                                 filters = 64,
                                 kernel_size = [4,4],
                                 strides = [2,2],
                                 padding = "VALID",
                                kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                 name = "conv2")

            self.conv2_out = tf.nn.elu(self.conv2, name="conv2_out")            
            
            """
            Third convnet:
            CNN
            ELU
            """
            self.conv3 = tf.layers.conv2d(inputs = self.conv2_out,
                                 filters = 64,
                                 kernel_size = [3,3],
                                 strides = [2,2],
                                 padding = "VALID",
                                kernel_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                                 name = "conv3")

            self.conv3_out = tf.nn.elu(self.conv3, name="conv3_out")
            
            self.flatten = tf.contrib.layers.flatten(self.conv3_out)
            
            self.fc = tf.layers.dense(inputs = self.flatten,
                                  units = 512,
                                  activation = tf.nn.elu,
                                       kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                name="fc1")
            
            self.output = tf.layers.dense(inputs = self.fc, 
                                           kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                          units = self.action_size, 
                                        activation=None)
            

  
            # Q is our predicted Q value.
            self.Q = tf.reduce_sum(tf.multiply(self.output, self.actions_))
            
            # The loss is the difference between our predicted Q_values and the Q_target
            # Sum(Qtarget - Q)^2
            self.loss = tf.reduce_mean(tf.square(self.target_Q - self.Q))
            
            self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)

In [7]:
# Reset the graph
tf.reset_default_graph()

# Instantiate the DQNetwork
DQNetwork = DQNetwork(state_size, action_size, lr)

In [8]:
class Memory():
    def __init__(self, max_size):
        self.buffer = deque(maxlen = max_size)
    
    def add(self, experience):
        self.buffer.append(experience)
    
    def sample(self, batch_size):
        buffer_size = len(self.buffer)
        index = np.random.choice(np.arange(buffer_size),
                                size = batch_size,
                                replace = False)
        
        return [self.buffer[i] for i in index]

In [15]:
action

array([0, 0, 0, 0, 1, 0])

In [12]:
# Instantiate memory
memory = Memory(max_size = memory_size)
for i in range(pretrain_length):
    # If it's the first step
    if i == 0:
        state = env.reset()
        state, stacked_frames = stack_frames(stacked_frames, state, True)
      

  
    # Get the next_state, the rewards, done by taking a random action
    choice = random.randint(1,len(possible_actions))-1
    action = possible_actions[choice]
    next_state, reward, done, _ = env.step(action)
    
    #env.render()
    
    # Stack the frames
    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)
    
    
    # If the episode is finished (we're dead 3x)
    if done:
        # We finished the episode
        next_state = np.zeros(state.shape)
        
        # Add experience to memory
        memory.add((state, action, reward, next_state, done))
        
        # Start a new episode
        state = env.reset()
        
        # Stack the frames
        state, stacked_frames = stack_frames(stacked_frames, state, True)
        
    else:
        # Add experience to memory
        memory.add((state, action, reward, next_state, done))
        
        # Our new state is now the next_state
        state = next_state

TypeError: only size-1 arrays can be converted to Python scalars

# DOOM

In [16]:
import tensorflow as tf
import numpy as np
from vizdoom import *
import random
import time
from skimage import transform
from collections import deque
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'vizdoom'