In [18]:
import gym
import numpy as np
import random
import torch 
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from collections import namedtuple
import random
from torch.autograd import Variable
import torch.utils.data as utils_data
import sklearn.datasets as datasets
import network

from itertools import count

In [2]:
Transition = namedtuple('Transition',('state','action','next_state','reward')) 

In [5]:
class ReplayMemory(object):
    
    def __init__(self,capacity):
        self.capacity = capacity
        self.memory = []
        self.position=0
    
    def push(self,*args):
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position +1)%self.capacity
    
    def sample(self,batch_size):
        return random.sample(self.memory,batch_size)
    
    def __len__(self):
        return len(self.memory)

In [49]:
class DQN:
    def __init__(self,layers):
        self.policy_net=network.Network(layers) # policy network
        self.memory = ReplayMemory(1000)
        self.optimizer = optim.RMSprop(self.policy_net.parameters())
        self.state_dimention = layers[0]
        self.n_action = layers[-1]
        self.nlayers = layers
        self.gamma = 0.95
        
    def select_action(self,state,eps=0.01):
        sample=random.random()
        state
        if sample<eps:
            return random.choice(range(self.n_action))
        else:
            return torch.max(self.policy_net(state),0)[1].tolist()
    
    def learn_model(self):
        if len(self.memory)<BATCH_SIZE:
            return
        transitions = self.memory.sample(BATCH_SIZE)

        batch = Transition(*zip(*transitions))
        state_batch = torch.tensor(batch.state)
        nextState_batch = torch.tensor(batch.next_state)
        reward_batch = torch.tensor(batch.reward)
        action_batch = torch.tensor(batch.action)
        
        state_action_values = self.policy_net(state_batch.float())
        
        next_state_value = self.policy_net(nextState_batch.float()).max(1)[0].detach()
        
        expected_rewards=next_state_value*self.gamma+reward_batch.float()
        
        target_action_values= state_action_values.clone()
        
        r_idx=torch.arange(target_action_values.size(0)).long()
        
        target_action_values[r_idx,action_batch] = expected_rewards
        
        loss = F.smooth_l1_loss(state_action_values,target_action_values)
        
        self.optimizer.zero_grad()
        
        loss.backward()
        
        for param in self.policy_net.parameters():#Gradient cliping to 
            param.grad.data.clamp_(-1,1)
            
        self.optimizer.step()
        
    def remember(self,state,action,next_state,reward):
        self.memory.push(state,action,next_state,reward)

In [50]:
class DDQN:
    def __init__(self,layers):
        self.policy_net=network.Network(layers) # policy network
        self.target_net=network.Network(layers) # target network 
        self.memory = ReplayMemory(1000)
        self.optimizer = optim.RMSprop(self.policy_net.parameters())
        self.state_dimention = layers[0]
        self.n_action = layers[-1]
        self.nlayers = layers
        self.gamma = 0.95
        
    
    def copy_policy_net_To_target(self):
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
    
    def select_action(self,state,eps=0.01):
        sample=random.random()
        
        if sample<eps:
            return random.choice(range(self.n_action))
        else:
            return torch.max(self.policy_net(state),0)[1].tolist()
    
    def learn_model(self):
        if len(self.memory)<BATCH_SIZE:
            return
        transitions = self.memory.sample(BATCH_SIZE)
        batch = Transition(*zip(*transitions))


        state_batch = torch.tensor(batch.state)
        nextState_batch = torch.tensor(batch.next_state)
        reward_batch = torch.tensor(batch.reward)
        action_batch = torch.tensor(batch.action)
        
        state_action_values = self.policy_net(state_batch.float())
        
        next_state_value = self.target_net(nextState_batch.float()).max(1)[0].detach()
        
        expected_rewards=next_state_value*GAMMA+reward_batch.float()
        
        target_action_values= state_action_values.clone()
        
        r_idx=torch.arange(target_action_values.size(0)).long()
        
        target_action_values[r_idx,action_batch] = expected_rewards
        
        loss = F.smooth_l1_loss(state_action_values,target_action_values)
        
        self.optimizer.zero_grad()
        
        loss.backward()
        
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1,1)
            
        self.optimizer.step()
    
    def remember(self,state,action,next_state,reward):
        self.memory.push(state,action,next_state,reward)

In [86]:
import matplotlib
import matplotlib.pyplot as plt
from gym import wrappers
from datetime import datetime

import os
env = gym.make('CartPole-v0').env
is_ipython = 'inline' in matplotlib.get_backend()

if is_ipython:
    from IPython import display
plt.ion()
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent=DDQN([state_size,10,10,action_size])
done = False
BATCH_SIZE = 128
argv='monitor'
filename="test"

In [91]:
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
done=False
t=0
num_episodes = 100000
if 'monitor' in argv:
    #filename = os.path.basename(__file__).split('.')[0]
    monitor_dir = './' + filename + '_' + str(datetime.now())
    env = wrappers.Monitor(env, monitor_dir)
for e in range(num_episodes):
    state =env.reset()
    t=0
    while not done and t<10000:
        eps_threshold = EPS_END + (EPS_START - EPS_END)* math.exp(-1. * t / EPS_DECAY)
        action = agent.select_action(torch.from_numpy(state).float())
        next_state,reward,done,_ = env.step(action)
        
        reward = reward if not done else -10
        
        agent.remember(state.tolist(),action,next_state.tolist(),reward)
        state = next_state
        agent.learn_model()
        print(t)
        if done and t<199:
            reward =-300
        t += 1
    
    
env.close()

0
1
2
3
4
5
6
7
8


In [94]:
agent.memory.position

17

In [77]:
os.path.dirname("test").split('.')[0]


''

In [72]:
os.path.basename("./")

''

In [30]:
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 2000
import math
for t  in range(100):
    print(EPS_END + (EPS_START - EPS_END)* math.exp(-1. * t / EPS_DECAY))

0.9
0.899575106232294
0.8991504248583688
0.8987259557720543
0.8983016988672331
0.8978776540378411
0.8974538211778671
0.8970302001813527
0.8966067909423928
0.896183593355135
0.89576060731378
0.8953378327125813
0.894915269445845
0.8944929174079306
0.8940707764932498
0.8936488465962676
0.8932271276115016
0.8928056194335218
0.8923843219569512
0.8919632350764655
0.8915423586867929
0.8911216926827145
0.8907012369590634
0.8902809914107261
0.8898609559326409
0.8894411304197992
0.8890215147672444
0.8886021088700727
0.8881829126234326
0.8877639259225251
0.8873451486626033
0.886926580738973
0.8865082220469924
0.8860900724820715
0.8856721319396732
0.8852544003153122
0.8848368775045556
0.8844195634030227
0.8840024579063851
0.8835855609103663
0.883168872310742
0.8827523920033402
0.8823361198840407
0.8819200558487755
0.8815041997935286
0.881088551614336
0.8806731112072855
0.8802578784685173
0.879842853294223
0.8794280355806463
0.8790134252240828
0.8785990221208799
0.878184826167437
0.8777708372602048

In [21]:
s=env.reset()
s.tolist()

[-0.037493615117251355,
 -0.029122303595574574,
 0.012974990787389215,
 -0.033191755329343015]

In [95]:
tt=net(state_batch.float())

In [19]:
env.step(0)

(array([ 0.02547568, -0.18396779, -0.0259671 ,  0.2724869 ]), 1.0, False, {})

In [107]:
vals=net2(state_batch.float()).max(1)[0].detach()+reward_batch.float()

In [114]:
cln=tt.clone()

In [109]:
action_batch

tensor([2, 0, 2, 1, 0, 1, 0])

In [115]:
cln

tensor([[ 0.3086, -0.4095, -0.2798],
        [ 0.2973, -0.2378, -0.4622],
        [ 0.3305, -0.1605, -0.5672],
        [ 0.2659, -0.3189, -0.3751],
        [ 0.3075, -0.2131, -0.4892],
        [ 0.3332, -0.4646, -0.2286],
        [ 0.2872, -0.2626, -0.4353]], grad_fn=<CloneBackward>)

In [112]:
cln[:,action_batch] = vals

In [116]:
vals

tensor([ 0.2281,  4.3208,  8.4637,  2.2620,  5.3565, -0.7846,  3.2904])

In [4]:
mem=ReplayMemory(10)

In [76]:
[mem.push([1,2,i],random.choice(range(3)),i+1,i-1)  for i in range(10)]

[None, None, None, None, None, None, None, None, None, None]

In [77]:
mem.memory

[Student(state=[1, 2, 9], action=2, next_state=10, reward=8),
 Student(state=[1, 2, 0], action=1, next_state=1, reward=-1),
 Student(state=[1, 2, 1], action=2, next_state=2, reward=0),
 Student(state=[1, 2, 2], action=0, next_state=3, reward=1),
 Student(state=[1, 2, 3], action=1, next_state=4, reward=2),
 Student(state=[1, 2, 4], action=0, next_state=5, reward=3),
 Student(state=[1, 2, 5], action=0, next_state=6, reward=4),
 Student(state=[1, 2, 6], action=0, next_state=7, reward=5),
 Student(state=[1, 2, 7], action=2, next_state=8, reward=6),
 Student(state=[1, 2, 8], action=1, next_state=9, reward=7)]

In [78]:
batch=mem.sample(7)

In [79]:
batch = Transition(*zip(*batch))

In [125]:
BATCH_SIZE=32

In [118]:
j=torch.arange(cln.size(0)).long()

In [120]:
cln[j,action_batch] = vals

In [121]:
cln

tensor([[ 0.3086, -0.4095,  0.2281],
        [ 4.3208, -0.2378, -0.4622],
        [ 0.3305, -0.1605,  8.4637],
        [ 0.2659,  2.2620, -0.3751],
        [ 5.3565, -0.2131, -0.4892],
        [ 0.3332, -0.7846, -0.2286],
        [ 3.2904, -0.2626, -0.4353]], grad_fn=<IndexPutBackward>)