# Library Imports

In [1]:
import gym
import numpy as np
from gym import wrappers
import random as rand
# Module-level Gym environment (Breakout with RAM observations). The Q-table
# helpers, the training loop and the evaluation cell below all read this global.
env = gym.make('Breakout-ram-v0')

# Main Programme

In [2]:
# Hyperparameters for the tabular Q-learning below. Observations are
# discretised into bins so that they can be used as Q-table keys.
# MAX_STATES: number of zero-padded state keys that get_all_states_string()
# generates to pre-seed the Q-table.
MAX_STATES = 10**1
GAMMA = 0.7   # discount factor applied to the best next-state Q-value
ALPHA = 0.001 # learning rate (step size of each Q-value update)

# Q-table dictionary lookup: returns the action with the highest Q-value, together with that value.
def q_states_dict(q):
    """Return the ``(key, value)`` pair of ``q`` holding the largest value.

    ``q`` is a mapping (here: action -> Q-value). When several entries share
    the largest value, the first one in iteration order is returned. An empty
    mapping makes ``max`` raise ``ValueError``.
    """
    best_key = max(q, key=q.get)
    return best_key, q[best_key]
# our inputs
def create_bins():
    """Return the bin edges used to discretise observation values.

    Returns:
        A 1-D NumPy array of 10 evenly spaced edges running from -1 to 256
        (both end points included).
    """
    # np.linspace allocates the result itself; no pre-allocated buffer needed.
    return np.linspace(-1, 256, 10)
#digitize bins since we are converting a continuous state to a discrete state

def assign_bins(observation, bins):
    """Map every observation value to the index of the bin it falls into.

    Args:
        observation: array-like of numeric observation values.
        bins: ordered bin edges, e.g. the array returned by ``create_bins()``.

    Returns:
        A NumPy integer array with one bin index per observation value, as
        computed by ``np.digitize`` (index ``i`` means
        ``bins[i-1] <= value < bins[i]`` for increasing edges).
    """
    # np.digitize builds its own output array, so nothing is pre-allocated.
    return np.digitize(observation, bins)

def get_state_as_string(env_state):
    """Concatenate the integer form of every entry of ``env_state``.

    Each element is converted with ``int`` and its decimal representation is
    appended in order, without any separator, to form the Q-table key.
    """
    pieces = [str(int(value)) for value in env_state]
    return ''.join(pieces)

def get_all_states_string():
    """Return the state keys used to pre-seed the Q-table.

    The keys are the integers ``0 .. MAX_STATES - 1`` written as decimal
    strings and left-padded with zeros to a width of at least eight characters.
    """
    return ['{:08d}'.format(index) for index in range(MAX_STATES)]

def initialise_q():
    """Build the initial Q-table.

    Returns:
        A dict with one entry per key from ``get_all_states_string()``; every
        entry is itself a dict mapping each action index in
        ``range(env.action_space.n)`` to 0.
    """
    return {
        state_key: {action: 0 for action in range(env.action_space.n)}
        for state_key in get_all_states_string()
    }

def add_state_to_table(new_state,table):
    """Insert ``new_state`` into ``table`` with every action value set to 0.

    ``table`` is modified in place (an existing entry for ``new_state`` is
    replaced); nothing is returned.
    """
    zeroed_actions = dict.fromkeys(range(env.action_space.n), 0)
    table.update({new_state: zeroed_actions})
    
def play_a_game(bins, Q, eps):
    """Play one episode with an epsilon-greedy policy, updating ``Q`` in place.

    Args:
        bins: bin edges handed to ``assign_bins`` to discretise observations.
        Q: Q-table, a dict mapping state strings to ``{action: value}`` dicts.
            States seen for the first time are added with all values at 0.
        eps: probability of taking a random action instead of the greedy one.

    Returns:
        A tuple ``(total_reward, count, new_states_created, Q)``: the sum of
        the raw environment rewards, the number of steps taken, the number of
        states added to ``Q`` during the episode, and the ``Q`` object that
        was passed in.
    """
    observation = env.reset()
    done = False
    count = 0
    new_states_created = 0
    total_reward = 0
    state = get_state_as_string(assign_bins(observation, bins))
    if Q.get(state) is None:
        add_state_to_table(state, Q)
        new_states_created += 1
    while not done:
        count += 1
        # Epsilon-greedy action selection.
        if np.random.uniform(0, 1) < eps:
            act = env.action_space.sample()  # explore: random action
        else:
            act, _ = q_states_dict(Q[state])  # exploit: best known action
        observation, reward, done, _ = env.step(act)
        total_reward += reward  # accumulated before shaping, so it stays raw

        # Reward shaping: the value used for learning is overridden based on
        # the step count and the raw score so far.
        # NOTE(review): the first two branches match whenever count >= 20 and
        # total_reward is below 18 or above 22, so the "count >= 100" branches
        # can only fire for 18 <= total_reward <= 22 and the last one never
        # fires - confirm these thresholds are intended for this environment.
        if count >= 20 and total_reward < 18:
            reward = -5
        elif count >= 20 and total_reward > 22:
            reward = 10
        elif count >= 100 and total_reward < 50:
            reward = -20
        elif count >= 100 and total_reward >= 90:
            reward = 25
        if done and reward < 1:
            reward = -10  # penalise ending the episode without a reward

        new_state = get_state_as_string(assign_bins(observation, bins))
        if Q.get(new_state) is None:
            new_states_created += 1
            add_state_to_table(new_state, Q)
        _, max_q_s1a1 = q_states_dict(Q[new_state])
        # Q-learning (Bellman) update.
        # NOTE(review): this bootstraps from new_state even when done is True;
        # confirm whether the terminal target should drop the GAMMA term.
        Q[state][act] = ((1 - ALPHA) * Q[state][act]) + ALPHA * (reward + GAMMA * max_q_s1a1)
        state = new_state
    return total_reward, count, new_states_created, Q

def play_multiple(bins, N):
    """Train a fresh Q-table over ``N`` episodes.

    The exploration rate for episode ``n`` (0-based) is ``1 / sqrt(n + 1)``.
    Every 100th episode a progress line is printed with the episode index,
    that episode's reward and the running total of states added to the table.

    Args:
        bins: bin edges forwarded to ``play_a_game``.
        N: number of episodes to play.

    Returns:
        A tuple ``(count, reward, Q)``: the list of episode lengths, the list
        of episode rewards, and the trained Q-table (returned even if
        ``N`` is 0, in which case it is just the initial table).
    """
    Q = initialise_q()
    count = []
    new_state_array = 0
    reward = []
    for n in range(N):
        # Decaying exploration rate; starts at 1.0 for the first episode.
        epsilon = 1 / np.sqrt(n + 1)
        # play_a_game updates Q in place, so its returned table is not needed.
        ep_reward, ep_length, states_created, _ = play_a_game(bins, Q, epsilon)
        new_state_array += states_created
        if n % 100 == 0:
            print(n, ep_reward, new_state_array)
        count.append(ep_length)
        reward.append(ep_reward)
    return count, reward, Q

# Launch Training

In [None]:
bins = create_bins()  # bin edges used to discretise observations
# Train for 1000 episodes: x = per-episode step counts,
# y = per-episode rewards, table = the learned Q-table.
x,y,table = play_multiple(bins,1000)

# Test Learned values against environment

In [None]:
# Play 20 rendered episodes using the learned Q-table: take the best known
# action for states present in the table, and a random action otherwise.
for episode in range(20):
    observation = env.reset()
    done = False
    print("-----------------------")
    count = 0
    state = get_state_as_string(assign_bins(observation, bins))
    while not done:
        env.render()
        if table.get(state) is None:
            act = env.action_space.sample()  # state never seen in training
        else:
            act, _ = q_states_dict(table[state])
        observation, reward, done, _ = env.step(act)
        state = get_state_as_string(assign_bins(observation, bins))
        count += 1
    # The loop only exits once the episode is done; report its length.
    print("counter", count)

env.close()