In [1]:
import torch                                  
import torch.nn as nn                          
import torch.nn.functional as F  

import numpy as np      

import time
import flappy_bird_gym

import matplotlib.pyplot as plt

In [2]:
# Hyper-parameters read as module-level globals by the DQN class and the training loop.
# Number of replay rows drawn for each call to DQN.learn().
BATCH_SIZE = 32
# Learning rate handed to the Adam optimizer.
LR = 0.001
# Probability of taking the GREEDY action in DQN.choose_action (a uniform draw
# below EPSILON selects argmax Q); the random-action probability is 1 - EPSILON.
EPSILON = 0.999
# Discount factor applied to the bootstrapped target-network value.
GAMMA = 0.9
# The target network is re-synchronised with the eval network every this many learn() calls.
TARGET_REPLACE_ITER = 100
# Number of rows in the ring-buffer replay memory.
MEMORY_CAPACITY = 1000

# Environment shared by every later cell; creating it here is a module-level side effect.
env = flappy_bird_gym.make("FlappyBird-v0")
# Size of the discrete action space.
N_ACTIONS = env.action_space.n
# Length of the observation vector (first dimension of the observation space shape).
N_STATES = env.observation_space.shape[0]

In [3]:
class Net(nn.Module):
    """Fully connected Q-value network: three ReLU hidden layers plus a linear head.

    The attribute names (``fc1``, ``fc2``, ``fc3``, ``out``) determine the keys of
    ``state_dict()``, so they must stay stable for saved parameters to load.
    """

    def __init__(self, n_states, n_actions, n_hidden):
        """Build the layers.

        Args:
            n_states: size of the input observation vector.
            n_actions: number of outputs (one Q-value per action).
            n_hidden: width shared by all three hidden layers.
        """
        super().__init__()
        # Layers are created in forward order so seeded initialisation is reproducible.
        self.fc1 = nn.Linear(in_features=n_states, out_features=n_hidden)
        self.fc2 = nn.Linear(in_features=n_hidden, out_features=n_hidden)
        self.fc3 = nn.Linear(in_features=n_hidden, out_features=n_hidden)
        self.out = nn.Linear(in_features=n_hidden, out_features=n_actions)

    def forward(self, x):
        """Return the un-activated Q-values for a batch of observations ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(layer(hidden))
        return self.out(hidden)
                        

In [4]:
class DQN(object):
    """Deep Q-learning agent with a ring-buffer replay memory and n-step targets.

    Two ``Net`` instances are kept: ``eval_net`` is trained on every ``learn()``
    call, ``target_net`` is a periodically refreshed copy used for bootstrapping.

    All hyper-parameters come from the constructor arguments and are stored on
    the instance; the class does not read notebook-level globals.
    """

    def __init__(self, n_states, n_actions, n_hidden,
                 batch_size, lr, epsilon, gamma, target_replace_iter, memory_capacity,
                 n_step=3):
        """Create the networks, optimizer and an empty replay memory.

        Args:
            n_states: length of an observation vector.
            n_actions: number of discrete actions.
            n_hidden: hidden-layer width passed to ``Net``.
            batch_size: number of transitions sampled per ``learn()`` call.
            lr: Adam learning rate.
            epsilon: probability of choosing the greedy action in
                ``choose_action`` (a random action is taken otherwise).
            gamma: discount factor.
            target_replace_iter: ``target_net`` is synchronised with
                ``eval_net`` every this many ``learn()`` calls.
            memory_capacity: number of rows in the replay ring buffer.
            n_step: maximum number of consecutive rewards accumulated into one
                target (defaults to 3, the value previously hard-coded in
                ``learn``).

        Raises:
            ValueError: if ``n_step`` is smaller than 1.
        """
        if n_step < 1:
            raise ValueError("n_step must be >= 1")

        self.n_states = n_states
        self.n_actions = n_actions
        self.batch_size = batch_size
        self.epsilon = epsilon
        self.gamma = gamma
        self.target_replace_iter = target_replace_iter
        self.memory_capacity = memory_capacity
        self.n_step = n_step

        self.eval_net = Net(n_states, n_actions, n_hidden)
        self.target_net = Net(n_states, n_actions, n_hidden)

        self.learn_step_counter = 0
        self.memory_counter = 0

        # Row layout: [state (n_states) | action | reward | next_state (n_states)].
        self.memory = np.zeros((memory_capacity, n_states * 2 + 2))
        # Terminal flag per row, kept separately so the row layout above is unchanged.
        self.memory_done = np.zeros(memory_capacity, dtype=bool)

        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=lr)
        self.loss_func = nn.MSELoss()

        # Python floats, one per learn() call.
        self.loss_record = []

    def choose_action(self, x):
        """Pick an action for observation ``x``.

        With probability ``epsilon`` the action with the highest ``eval_net``
        Q-value is returned, otherwise a uniformly random action.

        Returns:
            The chosen action index as an int.
        """
        x = torch.unsqueeze(torch.FloatTensor(x), 0)
        if np.random.uniform() < self.epsilon:
            # Inference only: no autograd graph is needed for action selection.
            with torch.no_grad():
                actions_value = self.eval_net(x)
            action = torch.max(actions_value, 1)[1].item()
        else:
            action = np.random.randint(0, self.n_actions)
        return action

    def store_transition(self, state, action, reward, next_state, done=False):
        """Write one transition into the ring buffer, overwriting the oldest row.

        Args:
            state: observation before the action.
            action: action index that was taken.
            reward: scalar reward received.
            next_state: observation after the action.
            done: True when ``next_state`` ended the episode. Optional; when it
                is left at False the target still bootstraps from ``next_state``.
        """
        transition = np.hstack((state, [action, reward], next_state))

        index = self.memory_counter % self.memory_capacity
        self.memory[index, :] = transition
        self.memory_done[index] = bool(done)
        self.memory_counter = self.memory_counter + 1

    def _n_step_transition(self, start):
        """Build the n-step transition that begins at buffer row ``start``.

        Rewards of up to ``n_step`` consecutive rows are accumulated with
        discounting. Accumulation stops early when a row is flagged terminal,
        when the row is the most recently written one (the following slot holds
        unrelated data), or when the following row's state differs from this
        row's next_state (taken as a sign that a different episode starts there).

        Returns:
            ``(state, action, n_step_return, bootstrap_state, bootstrap_discount)``
            where ``bootstrap_discount`` is ``gamma ** steps_used`` or 0.0 when a
            terminal row was reached.
        """
        n = self.n_states
        newest = (self.memory_counter - 1) % self.memory_capacity

        row = start
        n_step_return = 0.0
        discount = 1.0
        for step in range(self.n_step):
            n_step_return += discount * self.memory[row, n + 1]
            discount *= self.gamma
            if self.memory_done[row]:
                # Nothing follows a terminal transition, so do not bootstrap.
                discount = 0.0
                break
            successor = (row + 1) % self.memory_capacity
            is_last_step = step == self.n_step - 1
            contiguous = row != newest and np.array_equal(
                self.memory[successor, :n], self.memory[row, -n:])
            if is_last_step or not contiguous:
                break
            row = successor

        # The bootstrap state is the next_state of the LAST row whose reward was used.
        return (self.memory[start, :n], self.memory[start, n],
                n_step_return, self.memory[row, -n:], discount)

    def learn(self):
        """Run one gradient step on a batch of n-step transitions.

        Does nothing while the replay memory is empty. Only rows that have
        actually been written are sampled. The scalar loss is appended to
        ``loss_record``.
        """
        stored = min(self.memory_counter, self.memory_capacity)
        if stored == 0:
            return

        if self.learn_step_counter % self.target_replace_iter == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter = self.learn_step_counter + 1

        sample_index = np.random.choice(stored, self.batch_size)

        states = np.zeros((self.batch_size, self.n_states))
        actions = np.zeros((self.batch_size, 1), dtype=np.int64)
        returns = np.zeros((self.batch_size, 1))
        next_states = np.zeros((self.batch_size, self.n_states))
        discounts = np.zeros((self.batch_size, 1))
        for i, start in enumerate(sample_index):
            (states[i], actions[i, 0], returns[i, 0],
             next_states[i], discounts[i, 0]) = self._n_step_transition(start)

        b_s = torch.as_tensor(states, dtype=torch.float32)
        b_a = torch.as_tensor(actions, dtype=torch.int64)
        b_r = torch.as_tensor(returns, dtype=torch.float32)
        b_s_ = torch.as_tensor(next_states, dtype=torch.float32)
        b_discount = torch.as_tensor(discounts, dtype=torch.float32)

        # Q(s, a) of the actions that were actually taken.
        q_eval = self.eval_net(b_s).gather(1, b_a)
        q_next = self.target_net(b_s_).detach()
        # n-step target: discounted reward sum + gamma**k * max_a' Q_target(s_k, a').
        q_target = b_r + b_discount * q_next.max(1)[0].view(-1, 1)

        loss = self.loss_func(q_eval, q_target)

        # Store a plain float: keeping the tensor would retain its autograd graph
        # and cannot be converted to numpy for plotting.
        self.loss_record.append(loss.item())

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def plot_loss(self):
        """Plot the recorded loss of every ``learn()`` call."""
        plt.plot(np.arange(len(self.loss_record)), self.loss_record, 'r')
        plt.title("Loss Record") # title
        plt.ylabel("Loss") # y label
        plt.xlabel("Step") # x label
        plt.show()

    def save_params(self):
        """Save the ``eval_net`` weights to ``params.pkl`` in the working directory."""
        torch.save(self.eval_net.state_dict(), 'params.pkl')

    def load_params(self):
        """Load ``eval_net`` weights from ``params.pkl`` and mirror them into ``target_net``.

        NOTE(review): ``torch.load`` unpickles the file; only load files you trust.
        """
        self.eval_net.load_state_dict(torch.load('params.pkl'))
        # Keep the bootstrap network consistent with the freshly loaded weights.
        self.target_net.load_state_dict(self.eval_net.state_dict())

In [5]:
# Agent used by the training and plotting cells; every hyper-parameter comes from
# the configuration cell, only the hidden width (128) is chosen here.
dqn = DQN(
    n_states=N_STATES,
    n_actions=N_ACTIONS,
    n_hidden=128,
    batch_size=BATCH_SIZE,
    lr=LR,
    epsilon=EPSILON,
    gamma=GAMMA,
    target_replace_iter=TARGET_REPLACE_ITER,
    memory_capacity=MEMORY_CAPACITY,
)

In [6]:
# Training loop with a shaped reward.
score_record = []   # final info['score'] of every finished episode
reward_record = []  # summed shaped reward of every finished episode

episode = 500


def _shaping_reward(next_state):
    """Return the position-based shaping term for one observation.

    The term is 0 unless ``next_state[0] < 0.2``. Inside that range it is
    positive (0.3 times the smaller magnitude) when ``next_state[1] > 0`` and
    ``next_state[2] < 0``, and negative otherwise.
    NOTE(review): presumably elements 1 and 2 are signed offsets to the two
    pipe edges, so the positive case means "inside the gap" - confirm against
    the environment's observation layout.
    """
    if not next_state[0] < 0.2:
        return 0
    if next_state[1] > 0 and next_state[2] < 0:
        return 0.3 * min(abs(next_state[1]), abs(next_state[2]))
    if next_state[1] < 0 and next_state[2] < 0:
        return -0.3 * abs(next_state[1])
    return -0.3 * abs(next_state[2])


# Last score reported by the environment. It is refreshed after EVERY step so
# that it follows the environment when the score starts over in a new episode;
# otherwise the pass bonus would stay disabled until an episode beats the
# best score of all earlier episodes.
score = 0

try:
    for i in range(episode):

        print('<<<<<<<<<Episode: %s' % (i+1))
        state = env.reset()

        episode_reward_sum = 0

        while True:

            action = dqn.choose_action(state)
            next_state, reward, done, info = env.step(action)

            in_area = _shaping_reward(next_state)

            # Penalty on the step that ends the episode.
            is_crash = -1 if done else 0

            # Bonus on the step where the reported score increases.
            is_pass = 1 if info['score'] > score else 0
            score = info['score']

            new_reward = in_area + reward*0.0001 + is_pass + is_crash

            dqn.store_transition(state, action, new_reward, next_state)

            episode_reward_sum = episode_reward_sum + new_reward

            state = next_state

            # Start learning once the replay memory has been filled.
            if dqn.memory_counter > MEMORY_CAPACITY:
                dqn.learn()

            if done:
                print(next_state, episode_reward_sum, info)
                score_record.append(info['score'])
                reward_record.append(episode_reward_sum)
                break
finally:
    # Release the environment even when training is interrupted or fails.
    env.close()

# Only reached after all episodes completed.
dqn.save_params()

<<<<<<<<<Episode: 1
[0.45486111 0.33765625 0.18765625 1.        ] -0.9968 {'score': 0}
<<<<<<<<<Episode: 2
[0.45486111 0.58179687 0.43179688 1.        ] -0.9968 {'score': 0}
<<<<<<<<<Episode: 3
[0.45486111 0.37085938 0.22085938 1.        ] -0.9968 {'score': 0}
<<<<<<<<<Episode: 4
[0.45486111 0.49390625 0.34390625 1.        ] -0.9968 {'score': 0}
<<<<<<<<<Episode: 5
[0.45486111 0.37867188 0.22867188 1.        ] -0.9968 {'score': 0}
<<<<<<<<<Episode: 6
[0.45486111 0.365      0.215      1.        ] -0.9968 {'score': 0}
<<<<<<<<<Episode: 7
[0.45486111 0.55445312 0.40445313 1.        ] -0.9968 {'score': 0}
<<<<<<<<<Episode: 8
[0.45486111 0.58179687 0.43179688 1.        ] -0.9968 {'score': 0}
<<<<<<<<<Episode: 9
[0.45486111 0.58765625 0.43765625 1.        ] -0.9968 {'score': 0}
<<<<<<<<<Episode: 10
[0.45486111 0.3728125  0.2228125  1.        ] -0.9968 {'score': 0}
<<<<<<<<<Episode: 11
[0.45486111 0.46460938 0.31460938 1.        ] -0.9968 {'score': 0}
<<<<<<<<<Episode: 12
[0.45486111 0.351328

KeyboardInterrupt: 

In [None]:
# Final score of each finished episode, in the order the episodes were played.
score_axis = np.arange(len(score_record))
plt.plot(score_axis, score_record)
plt.xlabel("Episode")
plt.ylabel("Score")
plt.title("Score Record")
plt.show()

In [None]:
# Summed shaped reward of each finished episode, drawn in green.
reward_axis = np.arange(len(reward_record))
plt.plot(reward_axis, reward_record, 'g')
plt.xlabel("Episode")
plt.ylabel("Reward Sum")
plt.title("Reward Sum Record")
plt.show()

In [None]:
# Plot the loss values the agent collected in dqn.loss_record (one per learn() call).
dqn.plot_loss()

In [None]:
# Summary over the episodes that actually finished. Iterating the recorded
# lists (rather than range(episode)) keeps this cell working when training
# was interrupted before all planned episodes were played.
max_score = max([0] + list(score_record))  # never below 0, as before
score_sum = sum(score_record)
reward_sum = sum(reward_record)

print(max_score)
print(score_sum)
print(reward_sum)
