In [None]:
# Q-learning table사용
import scipy
import scipy.stats
import numpy as np
import random
import pylab
from keras.layers import Dense
from keras.models import Sequential
from collections import defaultdict

MINPRICE = 3000
MAXPRICE = 4600
INITIALPRICE = 3800


# 환경
class Env:
    """Pricing environment whose state is the current market price.

    The agent nudges the price up or down (or holds it) and is rewarded by
    the number of passenger/driver matches at the new price multiplied by
    that price (in thousands).  Prices outside ``[MINPRICE, MAXPRICE]`` are
    penalised with a fixed reward of -100.

    An episode is considered done once the price has been held unchanged
    for ``_MAX_IDLE_STEPS`` consecutive steps.  The environment tracks that
    streak itself, so callers do not have to maintain the counter.
    """

    # Price change applied by each action index (see ``action_space``).
    _PRICE_DELTAS = {0: -80, 1: -20, 2: 0, 3: 20, 4: 80}
    # Divisor applied to the offer/preference gap before the normal CDF.
    _ACCEPT_SCALE = 1600
    # Consecutive unchanged-price steps after which an episode is done.
    _MAX_IDLE_STEPS = 10

    def __init__(self):
        # Actions in order: -80, -20, 0, +20, +80.
        self.action_space = ['u', 'l', 'c', 'r', 'd']
        self.n_actions = len(self.action_space)  # 5 actions
        self.marketPrice = INITIALPRICE
        # Preferred prices of the passengers (500 initially).
        self.p_array = np.random.normal(self.marketPrice, 190, 500)
        self.p_count = len(self.p_array)
        # Preferred prices of the drivers (500 initially).
        self.d_array = np.random.normal(self.marketPrice, 190, 500)
        self.d_count = len(self.d_array)
        # Number of consecutive steps in which the price was held.
        self._idle_steps = 0

    @classmethod
    def _count_accepted(cls, gaps):
        """Return how many agents accept, given their price ``gaps``.

        Each agent accepts with probability ``Phi(gap / _ACCEPT_SCALE)``
        where ``Phi`` is the standard normal CDF.  One uniform draw is taken
        per agent, in order, from NumPy's global random state.
        """
        gaps = np.asarray(gaps, dtype=float)
        probabilities = scipy.stats.norm.cdf(gaps / cls._ACCEPT_SCALE)
        draws = np.random.rand(gaps.size)
        return int(np.count_nonzero(draws <= probabilities))

    def acceptP(self, offer, preferred):
        """Return the number of passengers accepting ``offer``.

        Args:
            offer: Offered price.
            preferred: Array-like of the passengers' preferred prices.

        Returns:
            int: Count of accepting passengers; a passenger is more likely
            to accept the further ``offer`` lies below their preference.
        """
        return self._count_accepted(np.asarray(preferred, dtype=float) - offer)

    def acceptD(self, offer, preferred):
        """Return the number of drivers accepting ``offer``.

        Args:
            offer: Offered price.
            preferred: Array-like of the drivers' preferred prices.

        Returns:
            int: Count of accepting drivers; a driver is more likely to
            accept the further ``offer`` lies above their preference.
        """
        return self._count_accepted(offer - np.asarray(preferred, dtype=float))

    def step(self, action, count):
        """Apply ``action`` to the market price and score the new price.

        Args:
            action: Action index 0..4 (-80, -20, 0, +20, +80).  Any other
                value leaves the price and the idle counters untouched.
            count: Caller-side count of consecutive unchanged-price steps
                taken before this one.  It is still honoured, but since the
                updated value cannot be handed back, the environment also
                keeps its own streak so a stale ``count`` does not prevent
                the episode from finishing.

        Returns:
            tuple: ``(next_state, reward, done)`` where ``next_state`` is
            the new market price.
        """
        delta = self._PRICE_DELTAS.get(action)
        if delta == 0:
            # Holding the price extends the idle streak.
            count += 1
            self._idle_steps += 1
        elif delta is not None:
            count = 0
            self._idle_steps = 0
            self.marketPrice += delta

        next_state = self.marketPrice

        # Reward = (number of matches at the new price) * (price / 1000).
        p_ok = self.acceptP(next_state, self.p_array)
        d_ok = self.acceptD(next_state, self.d_array)
        match = min(p_ok, d_ok)
        reward = match * (next_state / 1000)

        # Leaving the allowed price band overrides the reward with -100.
        if next_state < MINPRICE or next_state > MAXPRICE:
            reward = -100

        # Done after the price has been held for enough consecutive steps.
        done = (count >= self._MAX_IDLE_STEPS
                or self._idle_steps >= self._MAX_IDLE_STEPS)

        return next_state, reward, done

    def reset(self):
        """Restore the initial price, resample preferences, return the price."""
        self.marketPrice = INITIALPRICE
        self.p_array = np.random.normal(self.marketPrice, 190, 500)
        self.d_array = np.random.normal(self.marketPrice, 190, 500)
        self._idle_steps = 0

        return self.marketPrice
        
# 가격 Agent
class priceActionModel:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    The Q-table maps a state key to a list of five action values, one per
    action index (price change of -80, -20, 0, +20, +80 in that order).
    """

    def __init__(self, actions):
        """Store the action indices and initialise the hyper-parameters."""
        self.actions = actions
        self.learning_rate = 0.01
        self.discount_factor = 0.9
        # Exploration rate: starts fully random and decays on every update.
        self.epsilon = 1.
        self.epsilon_decay = .9999
        self.epsilon_min = 0.01
        # Unseen states start with a zero value for each of the 5 actions.
        self.q_table = defaultdict(lambda: [0.0] * 5)

    def learn(self, state, action, reward, next_state):
        """Update the Q-value of ``(state, action)`` from one transition.

        Epsilon is decayed first (while it is above its floor), then the
        Bellman optimality target ``reward + gamma * max_a Q(next_state, a)``
        is blended into the stored value with the learning rate.
        """
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

        row = self.q_table[state]
        current = row[action]
        target = reward + self.discount_factor * max(self.q_table[next_state])
        row[action] += self.learning_rate * (target - current)

    def get_action(self, state):
        """Return an action for ``state`` using the epsilon-greedy policy."""
        explore = np.random.rand() < self.epsilon
        if explore:
            return np.random.choice(self.actions)

        # Greedy branch: show the values being compared, then pick the best.
        print(state, self.q_table[state])
        return self.arg_max(self.q_table[state])

    @staticmethod
    def arg_max(state_action):
        """Return the index of the largest value, breaking ties at random."""
        best = state_action[0]
        for value in state_action:
            if value > best:
                best = value
        tied = [idx for idx, value in enumerate(state_action) if value == best]
        return random.choice(tied)
    
if __name__ == "__main__":
    # Train the tabular Q-learning price agent against the environment and
    # save a plot of the visited prices after every episode.

    env = Env()
    agent = priceActionModel(actions=list(range(env.n_actions)))  # [0, 1, 2, 3, 4]

    max_steps = 50000  # upper bound on the number of steps per episode
    global_step = 0
    scores, episodes = [], []

    for episode in range(20):
        state = env.reset()
        count = 0  # consecutive steps in which the price did not change

        for i in range(max_steps):
            global_step += 1

            # Choose an action for the current state.
            action = agent.get_action(str(state))

            # Apply it; receive the next state, the reward and the done flag.
            next_state, reward, done = env.step(action, count)

            # Update the Q-function from the <s, a, r, s'> sample.
            agent.learn(str(state), action, reward, str(next_state))

            # Keep the idle counter in sync: step() receives it by value and
            # cannot hand the updated count back.
            count = count + 1 if next_state == state else 0
            state = next_state

            scores.append(state)
            episodes.append(episode * max_steps + i)

            # The episode ends once the price has stayed put long enough.
            if done:
                break

        # Redraw once per episode; plotting the whole history on every step
        # piles up line artists and rewrites the image file each time.
        pylab.clf()
        pylab.plot(episodes, scores, 'b')
        pylab.savefig("./DP_graph.png")
        print("episode:", episode, "  score:", state, " global_step:",
              global_step, "  epsilon:", agent.epsilon)
            

Using TensorFlow backend.


episode: 0   score: 3880  global_step: 1   epsilon: 0.9999
