In [7]:
import numpy as np
import gym


In [8]:
import math
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
from random import seed
from sklearn import preprocessing
from torch.nn.init import kaiming_uniform_
# Seed every random number generator this notebook draws from, not just the
# stdlib one: action sampling uses np.random and layer initialisation uses torch.
seed(1)
np.random.seed(1)
torch.manual_seed(1)

In [9]:
def decay_schedule(init_value,
                   min_value,
                   decay_ratio,
                   max_steps,
                   log_start = -2,
                   log_base=10
):
    """Build a per-step schedule that decays from ``init_value`` to ``min_value``.

    The first ``int(max_steps * decay_ratio)`` entries follow a reversed
    log-spaced curve rescaled to run from ``init_value`` down to ``min_value``;
    every remaining entry is held at ``min_value``.

    Args:
        init_value: value of the first entry of the schedule.
        min_value: value the schedule decays to and then holds.
        decay_ratio: fraction of ``max_steps`` spent decaying (clamped to [0, 1]).
        max_steps: total length of the returned array.
        log_start: exponent of the smallest point of the log-spaced curve.
        log_base: base of the log-spaced curve.

    Returns:
        1-D float ``np.ndarray`` of length ``max_steps``.
    """
    max_steps = max(int(max_steps), 0)
    # Clamp so that a ratio outside [0, 1] cannot yield a negative segment length.
    decay_steps = min(max(int(max_steps * decay_ratio), 0), max_steps)
    rem_steps = max_steps - decay_steps

    tail = np.full(rem_steps, min_value, dtype=float)
    if decay_steps == 0:
        # No room for a decay phase: the schedule is flat at its floor.
        return tail

    values = np.logspace(log_start,
                         0,
                         decay_steps,
                         base = log_base,
                         endpoint = True)[::-1]
    span = values.max() - values.min()
    if span > 0:
        values = (values - values.min()) / span
    else:
        # A single point (or a degenerate curve) cannot be normalised; start at the top.
        values = np.ones_like(values)

    # Map the normalised [0, 1] curve onto [min_value, init_value].
    values = (init_value - min_value) * values + min_value
    return np.concatenate([values, tail])

In [10]:
def generate_trajectory(env, select_action, Q, epsilon, max_steps=200):
    """Roll out one episode from ``env.reset()`` until the environment reports done.

    Args:
        env: environment exposing ``reset()`` and a four-value
            ``step(action) -> (next_state, reward, done, info)``.
        select_action: callable ``(state, Q, epsilon) -> action``.
        Q: value estimator handed through to ``select_action`` unchanged.
        epsilon: exploration rate handed through to ``select_action`` unchanged.
        max_steps: accepted for interface compatibility only. The previous
            nested-loop implementation never truncated an episode (it simply
            kept stepping in chunks of ``max_steps``), so neither does this one.

    Returns:
        Object-dtype ``np.ndarray`` with one ``(state, action, reward, next_state)``
        row per step taken.
    """
    trajectory = []
    state = env.reset()
    done = False
    while not done:
        action = select_action(state, Q, epsilon)
        next_state, reward, done, _ = env.step(action)
        trajectory.append((state, action, reward, next_state))
        state = next_state
    # The ``np.object`` alias was removed in NumPy 1.24; the builtin is the same dtype.
    return np.array(trajectory, dtype=object)

In [115]:
class linearApproximator(nn.Module):
    """Small fully connected network producing one output per action.

    Despite the name the mapping is not linear: a ReLU sits between the two
    ``nn.Linear`` layers.

    Args:
        state_shape: number of input features (length of a state vector).
        n_fc1: width of the hidden layer.
        action_n: number of outputs (one per action).
    """

    def __init__(self,state_shape,n_fc1,action_n):
        super().__init__()
        self.input_size = state_shape
        self.n_fc1 = n_fc1
        self.out = action_n
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.fc1  = nn.Linear(self.input_size,self.n_fc1)
        self.fc2  = nn.Linear(self.n_fc1,self.out)
        self.to(self.device)

    def forward(self, state_shape):
        """Return the network output for a state or batch of states.

        ``state_shape`` is, despite its name, the state data itself. It may be
        a tensor, NumPy array or nested list; it is converted to the dtype and
        device of the layer weights so callers holding CPU data do not hit a
        device mismatch when the model lives on the GPU.
        """
        weight = self.fc1.weight
        # Read device/dtype from the parameters rather than self.device so the
        # conversion stays right even if the module is moved after construction.
        x = torch.as_tensor(state_shape, dtype=weight.dtype, device=weight.device)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

In [116]:
# Environment instance shared by the cells below; it is the argument of the nfq_(env) training call.
env = gym.make("CartPole-v0")


In [85]:
# NOTE(review): no top-level cell shown in this notebook assigns `state`; this
# display appears to rely on leftover kernel state — confirm it survives Restart & Run All.
state

array([ 0.0139616 , -0.01148015, -0.04848314,  0.01808707])

In [123]:
def select_action(state, model, epsilon):
    """Pick an action epsilon-greedily from the model's outputs for ``state``.

    Args:
        state: a single observation (array-like of floats).
        model: callable mapping a float tensor to one value per action.
        epsilon: probability of taking a uniformly random action.

    Returns:
        The chosen action index as a plain Python ``int``.
    """
    # Put the input where the model's weights live; a CPU tensor fed to a
    # CUDA-resident module raises a device-mismatch error.
    device = None
    if isinstance(model, torch.nn.Module):
        first_param = next(model.parameters(), None)
        if first_param is not None:
            device = first_param.device
    state = torch.as_tensor(state, dtype=torch.float32, device=device)

    with torch.no_grad():
        q_values = model(state).cpu().numpy()
    # atleast_1d keeps len() valid when the model has a single output.
    q_values = np.atleast_1d(q_values.squeeze())

    if np.random.rand() > epsilon:
        return int(np.argmax(q_values))
    return int(np.random.randint(len(q_values)))

In [208]:
def _nfq_fit_batch(model, optimizer, loss_fn, batch, gamma, n_iters):
    """Run ``n_iters`` fitted-Q regression steps of ``model`` on one batch.

    ``batch`` is a list of ``(state, action, next_state, reward, done)`` tuples.
    """
    states, actions, next_states, rewards, dones = zip(*batch)
    device = next(model.parameters()).device

    # Stack through NumPy first (a tensor built from a list of arrays is slow),
    # and build the batch once instead of once per fitting iteration.
    t_states = torch.as_tensor(np.array(states), dtype=torch.float32, device=device)
    t_next_states = torch.as_tensor(np.array(next_states), dtype=torch.float32, device=device)
    t_rewards = torch.as_tensor(np.array(rewards), dtype=torch.float32, device=device)
    # Column vector of indices so gather(1, ...) picks one Q-value per row.
    t_actions = torch.as_tensor(np.array(actions), dtype=torch.long, device=device).unsqueeze(1)
    # 0.0 for transitions that ended the episode: nothing to bootstrap from there.
    t_continues = 1.0 - torch.as_tensor(np.array(dones), dtype=torch.float32, device=device)

    for _ in range(n_iters):
        # Targets are recomputed every iteration from the current weights and
        # held fixed (no gradient) for that step.
        with torch.no_grad():
            max_next_q = model(t_next_states).max(1)[0]  # Q-learning: greedy bootstrap
        td_target = t_rewards + gamma * max_next_q * t_continues

        q_selected = model(t_states).gather(1, t_actions).squeeze(1)

        optimizer.zero_grad()
        # Regress Q(s, a) onto the TD target itself, not onto the TD error.
        batch_loss = loss_fn(q_selected, td_target)
        batch_loss.backward()
        optimizer.step()


def nfq_(env,
         gamma=0.99,
         init_alpha=0.5,
         min_alpha=0.01,
         alpha_decay_ratio=0.5,
         init_epsilon=1.0,
         min_epsilon=0.1,
         epsilon_decay_ratio=0.9,
         k=7,
         n_ep=3000):
    """Train a Q-network on ``env`` with a neural-fitted-Q style loop.

    Transitions are collected with an epsilon-greedy policy. Whenever more than
    ``k`` transitions have accumulated, the network is fitted to them for ``k``
    iterations and the buffer is cleared.

    Args:
        env: environment with ``reset()``, a four-value ``step()`` and a
            discrete ``action_space.n``.
        gamma: discount factor.
        init_alpha, min_alpha, alpha_decay_ratio: accepted for interface
            compatibility; currently unused because Adam runs with a fixed
            learning rate of 1e-4.
        init_epsilon, min_epsilon, epsilon_decay_ratio: exploration schedule
            parameters passed to ``decay_schedule``.
        k: number of fitting iterations per batch, and the buffer-size
            threshold that triggers a fit.
        n_ep: number of training episodes.

    Returns:
        The trained ``linearApproximator``.
    """
    epsilons = decay_schedule(
        init_epsilon,
        min_epsilon,
        epsilon_decay_ratio,
        n_ep)

    action_space = env.action_space.n
    # One throwaway reset just to learn the observation length.
    state = env.reset()

    model = linearApproximator(len(state), 100, action_space)
    loss = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(),lr=1e-4)

    experience = []

    for e in range(n_ep):
        state = env.reset()
        done = False
        while not done:
            action = select_action(state, model, epsilons[e])
            next_state, reward, done, _ = env.step(action)
            experience.append((state, action, next_state, reward, done))
            if len(experience) > k:
                _nfq_fit_batch(model, optimizer, loss, experience, gamma, k)
                experience = []
            state = next_state

    return model

In [209]:
# Train on CartPole with the default hyperparameters (3000 episodes).
# The recorded output below shows this run ended in a KeyboardInterrupt.
nfq_(env)

KeyboardInterrupt: 

In [None]:
#ntwrk <class 'numpy.int64'>

In [49]:
# Scratch data: ten (position, value) pairs for the tuple-unpacking experiments below.
exp = list()
for t, i in enumerate(range(10)):
    v = (t, i)
    exp += [v]

In [57]:
# Keep only the first element of every pair in exp.
s = [first for first, _second in exp]

ValueError: not enough values to unpack (expected 3, got 2)

In [40]:
# Collect the second element of every pair in exp.
s = list()
for k, v in exp:
    s += [v]

In [41]:
type(s)

list

In [54]:
s

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [101]:
type(1)

int