In [2]:
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from random import randint
import random

ROWS      = 5
COLUMNS   = 7
WALL_SIZE = 7
ACTIONS   = 4
STATES    = ROWS*COLUMNS 
NEXT_STATES = 4
γ         = 0.9  # γάμμα

# This is my world
M =([0,  0,  0, 0, 0,  0, 0],
    [0,  1, -1, 0, 0,  0, 0],
    [0, -1, -1, 0, 0,  0, 0],
    [0,  0, -1, 0, 0, -1, 0],
    [0,  0,  0, 0, 1, -1, 2])

# Actions
A=["E", "W", "N","S"] # East, West, North, South

# States
S = ((0, 0), (0, 1), (0, 2), (0, 3), (0, 4), (0, 5), (0, 6),
     (1, 0), (1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6),
     (2, 0), (2, 1), (2, 2), (2, 3), (2, 4), (2, 5), (2, 6),
     (3, 0), (3, 1), (3, 2), (3, 3), (3, 4), (3, 5), (3, 6),
     (4, 0), (4, 1), (4, 2), (4, 3), (4, 4), (4, 5), (4, 6),
     )

# Start and Goal
starting_state = S[8]
goal_state     = S[34]

# Wall 
W=((1, 2), (2, 1), (2, 2), (2, 1), (3, 2), (3, 5), (4, 5))

In [5]:
def ConvertIndexToTuple(state):
    return(tuple(int(x) for x in np.base_repr(state, 7, 1)[-2::]))

def ConvertTupleToIndex(state):
    return(int("".join(str(x) for x in state), 7))

def getNextState(state, action):
    # Check if we reached goal state
    if state == goal_state:
        return state
    
    if action == "E":
        next_state_index=ConvertTupleToIndex(state)+1
    elif action == "W":
        next_state_index=ConvertTupleToIndex(state)-1 
    elif action == "N":
        next_state_index=ConvertTupleToIndex(state)-COLUMNS
    elif action == "S":
        next_state_index=ConvertTupleToIndex(state)+COLUMNS 
             
    # Check if next state hits the wall.
    for i in range(0,WALL_SIZE):
        if next_state_index == ConvertTupleToIndex(W[i]):
            return state
            
    # Check if next state is within Maze.
    if next_state_index >=0 and next_state_index <=34:
        next_state=ConvertIndexToTuple(next_state_index)
        return next_state
    else:
        return state    
def getPossibleStates(state, action):
    next_states = [0 for x in range(NEXT_STATES)]
    next_states_index = [0 for x in range(NEXT_STATES)]
    
    for i in range(NEXT_STATES):
        next_states[i]=state
    
    # Check if we reached goal state
    if state == goal_state:
        return next_states
    
    next_states_index[0]=ConvertTupleToIndex(state)+1 # E
    next_states_index[1]=ConvertTupleToIndex(state)-1 # W
    next_states_index[2]=ConvertTupleToIndex(state)-COLUMNS # N
    next_states_index[3]=ConvertTupleToIndex(state)+COLUMNS # S
             
    # Check if next state hits the wall.
    for i in range(0,NEXT_STATES):
        for j in range(0,WALL_SIZE):
            if next_states_index[i] == ConvertTupleToIndex(W[j]):
                next_states_index[i]=ConvertTupleToIndex(state)

        # Check if next state is within Maze.
        if next_states_index[i] <0 or next_states_index[i] >34:
            next_states_index[i]=ConvertTupleToIndex(state)

        next_states[i]=ConvertIndexToTuple(next_states_index[i])
 
    return next_states
def AllStatesTransition(current_state, action):
    next_states_probability = [0 for x in range(NEXT_STATES)]
    
    for i in range(0,NEXT_STATES):
        next_states_probability[i]=0
    
    if action == "E":
        next_states_probability[0] = 0.7
    elif action == "W":
        next_states_probability[1] = 0.7
    elif action == "N":
        next_states_probability[2] = 0.7
    elif action == "S":
        next_states_probability[3] = 0.7 
    
    for i in range(0,NEXT_STATES):
        if next_states_probability[i]==0:
            next_states_probability[i]=0.1
    return next_states_probability        

def getReward( action, next_states):
    next_states_reward = [0 for x in range(NEXT_STATES)]
    
    for i in range(0,NEXT_STATES):
        if next_states[i] == goal_state:
            next_states_reward[i]=10
        else:
            next_states_reward[i]=0
    return next_states_reward       

def StateTransition(current_state, action, next_state):
    #next_state = getNextState( current_state, action) 
    if current_state == next_state:
        return 1
    else:
        return 0.7
    
# def getReward(current_state, action, next_state):
#     if next_state == goal_state:
#         return 10
#     else:
#         return 0

# Might keep it.
def weighted_choice(weights):
    totals = []
    running_total = 0

    for w in weights:
        running_total += w
        totals.append(running_total)

    rnd = random.random() * running_total
    for i, total in enumerate(totals):
        if rnd < total:
            return i    
# for i in range (0,100):        
#     print(weighted_choice([7,1,1,1]))  

In [44]:
def PolicyEvaluation(P,R,V,π,γ):
    Δ=0
    θ=2
    while Δ<θ:
        for s in range (0, STATES):
            v=V[s]
            for a in range (0, ACTIONS):
                for n in range (0, NEXT_STATES):
                    V[s]+=π[s][a]*P[s][a][n] * (R[s][a][n]+ γ*V[n])
                    Δ=max(Δ, abs(v-V[s]))
        return V

def PolicyImprovement(P,R,V,π,γ):
    policy_stable=True
    for s in range (0, STATES):
        for a in range (0, ACTIONS):
            b=π[s][a]
            temp=[0 for x in range(NEXT_STATES)]
            for n in range (0, NEXT_STATES):
                temp[n]+=P[s][a][n] * (R[s][a][n]+ γ*V[n])
                #np.argmax()
            max_index_action= np.argmax(temp)      
            π[s][a]= temp[max_index_action]
            if b!=π[s][a]:
                policy_stable=False
    return policy_stable
#     if policy_stable==True:
#         return π
#     else:
#         V=PolicyEvaluation(P,R,V,π,γ)


def PolicyImprovementRet(P,R,V,π,γ):
    policy_stable=True
    for s in range (0, STATES):
        for a in range (0, ACTIONS):
            b=π[s][a]
            temp=[0 for x in range(NEXT_STATES)]
            for n in range (0, NEXT_STATES):
                temp[n]+=P[s][a][n] * (R[s][a][n]+ γ*V[n])
                #np.argmax()
            max_index_action= np.argmax(temp)      
            π[s][a]= temp[max_index_action]
    #     print("Policy Next ", π[s], "\n\n")
            if b!=π[s][a]:
                policy_stable=False
    #             print("False")
    return π

In [45]:
# Define P and R matrices.
# P = [[0 for x in range(ACTIONS)] for y in range(STATES)]  # Probabilty
R = [[[0 for z in range(NEXT_STATES)] for x in range(ACTIONS)] for y in range(STATES)]  # Reward
# N = [[0 for x in range(ACTIONS)] for y in range(STATES)]  # Next State
N = [[[0 for z in range(NEXT_STATES)] for x in range(ACTIONS)] for y in range(STATES)]  # Next State
P = [[[0 for z in range(NEXT_STATES)] for x in range(ACTIONS)] for y in range(STATES)]  # Probabilty_new

# Initialize P, R and N
for i in range (0, STATES):
    for j in range (0, ACTIONS):
        current_state = S[i]
        action = A[j]
        next_states = getPossibleStates(current_state, action)
        N[i][j] = next_states
        P[i][j] = AllStatesTransition(current_state, action)
        R[i][j] = getReward( action, next_states)


In [53]:
# Initialize V(s)=0
V = [0 for x in range(STATES)] 
Q = [[0 for x in range(ACTIONS)] for y in range(STATES)] 

# Initialize π(s,α)=0.25
π = [[0.25 for x in range(ACTIONS)] for y in range(STATES)]  

# change gamma
γ=0.9

stability=True
counter=0

V=PolicyEvaluation(P,R,V,π,γ)
print(γ,π, "\n\n", V,"\n")
stability=PolicyImprovement(P,R,V,π,γ)
print(γ,π, "\n\n", V,"\n")
print(stability,"\n")

for i in range (0,10):
    V=PolicyEvaluation(P,R,V,π,γ)
    stability=PolicyImprovement(P,R,V,π,γ)
    

# while stability == False:
#     V=PolicyEvaluation(P,R,V,π,γ)
#     stability=PolicyImprovement(P,R,V,π,γ)
#     print(γ,π, "\n\n", V,"\n")
#     print(stability,"\n")
    

print(γ,π, "\n\n", V,"\n", counter)
π

0.9 [[0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25]] 

 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0

[[0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [1.0, 1.0, 1.0, 7.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0],
 [7.0, 1.0, 1.0, 1.0],
 [7.0, 7.0, 7.0, 7.0]]

In [52]:
# Greedy

# Initialize V(s)=0
V = [0 for x in range(STATES)] 
Q = [[0 for x in range(ACTIONS)] for y in range(STATES)] 

# Initialize π(s,α)=0.25
π = [[0.25 for x in range(ACTIONS)] for y in range(STATES)]  
π

[[0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25],
 [0.25, 0.25, 0.25, 0.25]]