In [1]:
import gym
import random
import os
import numpy as np
import tensorflow
from tensorflow.keras.optimizers import Adam 
from keras.models import Sequential 
from keras.layers import Dense
from collections import deque

In [2]:
# Instantiate the 'CartPole-v0' environment through gym's registry.
env = gym.make('CartPole-v0')

In [3]:
# Length of one observation vector: the first entry of the observation
# space's shape (4 in the recorded output of this cell).
state_size = env.observation_space.shape[0]
state_size

4

In [4]:
# Number of discrete actions reported by the action space
# (2 in the recorded output of this cell).
action_size = env.action_space.n
action_size

2

In [5]:
# Training configuration shared by the cells below.
output_dir = "model_output"  # directory created for saved model weights
n_episodes = 1001            # number of episodes the training loop plays
batch_size = 32              # transitions sampled per replay pass


In [6]:
# Create the checkpoint directory. exist_ok=True removes the race between the
# existence check and the creation, and keeps the cell safe to re-run.
os.makedirs(output_dir, exist_ok=True)

In [10]:
class DQNAgent:
    """Deep Q-learning agent with epsilon-greedy exploration and experience replay.

    The agent stores transitions in a bounded replay buffer and fits a small
    fully connected network that maps a state to one Q-value per action.
    """

    def __init__(self, state_size, action_size):
        """Set the hyperparameters and build the Q-network.

        Args:
            state_size: Number of input features of the network.
            action_size: Number of actions (output units of the network).
        """
        self.state_size = state_size
        self.action_size = action_size
        # Bounded replay buffer: once 2000 transitions are stored, appending
        # a new one discards the oldest.
        self.memory = deque(maxlen=2000)
        self.epsilon = 1.0           # probability of taking a random action
        self.gamma = 0.95            # discount factor applied to the next state's best Q-value
        self.epsilon_decay = 0.995   # multiplicative decay applied once per replay() call
        self.epsilon_min = 0.01      # lower bound for epsilon
        self.learning_rate = 0.001   # step size handed to the Adam optimizer
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the Q-network (two hidden ReLU layers, linear output).

        Returns:
            The compiled Keras model (MSE loss, Adam optimizer).
        """
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated and rejected by newer Keras releases.
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    # Backward-compatible alias for the original (misspelled) method name.
    remeber = remember

    def act(self, state):
        """Choose an action with an epsilon-greedy policy.

        Args:
            state: Model input for a single state (a batch of one).

        Returns:
            A random action index with probability ``epsilon``, otherwise the
            index of the largest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        # verbose=0 keeps Keras from printing a progress bar on every step.
        act_values = self.model.predict(state, verbose=0)
        return np.argmax(act_values[0])

    def replay(self, batch_size):
        """Fit the network on a random minibatch of stored transitions.

        Each sampled transition triggers one single-sample fit towards its
        Q-learning target. Afterwards epsilon is decayed once, never dropping
        below ``epsilon_min``.

        Args:
            batch_size: Number of transitions to sample. If the buffer holds
                fewer transitions than this, the call does nothing.
        """
        if len(self.memory) < batch_size:
            # random.sample would raise ValueError; wait until enough
            # experience has been collected instead.
            return

        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            # Terminal transition: the return is exactly the observed reward.
            target = reward
            if not done:
                # Otherwise add the discounted estimate of the best future value.
                next_q = self.model.predict(next_state, verbose=0)[0]
                target = reward + self.gamma * np.amax(next_q)

            # Start from the current predictions and overwrite only the entry
            # of the action actually taken, so the other outputs contribute
            # zero error to the MSE loss.
            target_f = self.model.predict(state, verbose=0)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            # Clamp so the decay can never undershoot the configured minimum.
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def load(self, name):
        """Load network weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save network weights to the file ``name``."""
        self.model.save_weights(name)

In [51]:
# Create the agent; its constructor also builds the Q-network via _build_model().
agent = DQNAgent(state_size,action_size)

In [52]:

# Training loop: play one episode while storing every transition, then run a
# single replay (learning) pass, and checkpoint the weights every 50 episodes.
done = False
for e in range(n_episodes):
    state = env.reset()
    state = np.reshape(state, [1, state_size])

    for time in range(1000):  # safety cap on the number of steps per episode

        #env.render()

        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)

        # Keep the environment's reward for every step that does not end the
        # episode; the step that ends it is penalised with -10 instead.
        reward = reward if not done else -10
        next_state = np.reshape(next_state, [1, state_size])
        agent.remeber(state, action, reward, next_state, done)
        state = next_state
        if done:
            print("episode:{}/{},score:{},e:{:.2}".format(e, n_episodes, time, agent.epsilon))
            break

    # Learn only once the buffer holds more transitions than one minibatch.
    if len(agent.memory) > batch_size:
        agent.replay(batch_size)

    if e % 50 == 0:
        # Name the checkpoint after the episode index (not the step counter of
        # the last episode) so files are ordered and do not overwrite each
        # other, and write it into the configured output directory. The
        # '.weights.h5' suffix is accepted by both older and current Keras.
        agent.save(os.path.join(output_dir, "weights_{:04d}.weights.h5".format(e)))

episode:0/1001,score:12,e:1.0
episode:1/1001,score:11,e:1.0
episode:2/1001,score:41,e:1.0
episode:3/1001,score:53,e:0.99
episode:4/1001,score:12,e:0.99
episode:5/1001,score:12,e:0.99
episode:6/1001,score:32,e:0.98
episode:7/1001,score:23,e:0.98
episode:8/1001,score:29,e:0.97
episode:9/1001,score:16,e:0.97
episode:10/1001,score:14,e:0.96
episode:11/1001,score:33,e:0.96
episode:12/1001,score:20,e:0.95
episode:13/1001,score:14,e:0.95
episode:14/1001,score:15,e:0.94
episode:15/1001,score:55,e:0.94
episode:16/1001,score:13,e:0.93
episode:17/1001,score:10,e:0.93
episode:18/1001,score:13,e:0.92
episode:19/1001,score:35,e:0.92
episode:20/1001,score:10,e:0.91
episode:21/1001,score:34,e:0.91
episode:22/1001,score:37,e:0.9
episode:23/1001,score:45,e:0.9
episode:24/1001,score:13,e:0.9
episode:25/1001,score:26,e:0.89
episode:26/1001,score:19,e:0.89
episode:27/1001,score:9,e:0.88
episode:28/1001,score:67,e:0.88
episode:29/1001,score:33,e:0.87
episode:30/1001,score:13,e:0.87
episode:31/1001,score:14,

KeyboardInterrupt: 