In [7]:
import numpy as np
import gym


In [8]:
import math
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
from random import seed
from sklearn import preprocessing
from torch.nn.init import kaiming_uniform_
seed(1)

In [9]:
def decay_schedule(init_value,
                   min_value,
                   decay_ratio,
                   max_steps,
                   log_start = -2,
                   log_base=10
):
    decay_steps = int(max_steps * decay_ratio)
    rem_steps = max_steps - decay_steps
    
    values = np.logspace(log_start,
                        0,
                        decay_steps,
                        base = log_base,
                        endpoint = True)[::-1]
    #print(value)
    values = (values - values.min())/(values.max() - values.min())
    values = np.pad(values, (0, rem_steps), 'edge')
    return values

In [10]:
def generate_trajectory(env, select_action, Q, epsilon, max_steps=200): #generate single trajectory from start to terminal state
    done, trajectory = False, []
    state = env.reset()
    while not done:
        #print(max_steps)
        for t in range(max_steps):
            action = select_action(state, Q, epsilon)
            next_state, reward, done, _ = env.step(action)
            experience = (state, action, reward, next_state)
            trajectory.append(experience)
            if done == True:
                break
            state = next_state
    return np.array(trajectory, np.object)

In [115]:
class linearApproximator(nn.Module):
    def __init__(self,state_shape,n_fc1,action_n):
        super(linearApproximator, self).__init__()
        self.input_size = state_shape
        self.n_fc1 = n_fc1
        self.out = action_n
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        
        self.fc1  = nn.Linear(self.input_size,self.n_fc1)
        self.fc2  = nn.Linear(self.n_fc1,self.out)
        self.to(self.device)
        
    def forward(self, state_shape):
        x = self.fc1(state_shape)
        x = F.relu(x)
        
        out = self.fc2(x)
        #print('out shape: ', out.shape)
        return out

In [116]:
env = gym.make("CartPole-v0")


In [85]:
state

array([ 0.0139616 , -0.01148015, -0.04848314,  0.01808707])

In [123]:
def select_action(state, model, epsilon):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    state = torch.tensor(state).float()
    with torch.no_grad():
        q_values= model(state).cpu().detach()
        q_values = q_values.data.numpy().squeeze()
    if np.random.rand() > epsilon:
        action = np.argmax(q_values)
    else:
        action = np.random.randint(len(q_values))
    return action

In [142]:
def nfq_(env,
         gamma=0.99,
         init_alpha=0.5,
         min_alpha=0.01,
         alpha_decay_ratio=0.5,
         init_epsilon=1.0,
         min_epsilon=0.1,
         epsilon_decay_ratio=0.9,
         k=7,
         n_ep=3000):
    
    alphas = decay_schedule(
    init_alpha,
    min_alpha,
    alpha_decay_ratio,
    n_ep)
    
    epsilons = decay_schedule(
    init_epsilon,
    min_epsilon,
    epsilon_decay_ratio,
    n_ep)
    
    action_space = env.action_space.n
    state = env.reset()
    state_space = np.zeros((len(state)))
    
    model = linearApproximator(len(state), 100, 2)
    loss = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(),lr=1e-4)
    
    experience = []
    
    for e in range(n_ep):
        state = env.reset()
        while True:
            action = select_action(state, model, epsilons[e])
            next_state, reward, done, _ = env.step(action)
            exp_tuple = (state, action, next_state, reward, done)
            experience.append(exp_tuple)
            if len(experience) > k:
                t_next_states = [ns for s, a, ns, r, d in experience]
                t_rewards = [r for s, a, ns, r, d in experience]
                t_states = [s for s, a, ns, r, d in experience]
                t_done = [d for s, a, ns, r, d in experience]
                t_actions = [a for s, a, ns, r, d in experience]
                
                t_next_states = torch.tensor(t_next_states).float()
                t_states = torch.tensor(t_states).float()
                t_actions = torch.tensor(t_actions).float()
                
                q_sp = model(t_next_states).detach()
                max_sq = q_sp.max(1)[0]
                
                q_sa = model(t_states).detatch()
                q_sa = q_sa.gather(1)
                
                
                print( "values", max_sq.shape )
                
                t_next_states
            state = next_state
            if done == True:
                break

In [143]:
nfq_(env)

values torch.Size([8])
values torch.Size([9])
values torch.Size([10])
values torch.Size([11])
values torch.Size([12])
values torch.Size([13])
values torch.Size([14])
values torch.Size([15])
values torch.Size([16])
values torch.Size([17])
values torch.Size([18])
values torch.Size([19])
values torch.Size([20])
values torch.Size([21])
values torch.Size([22])
values torch.Size([23])
values torch.Size([24])
values torch.Size([25])
values torch.Size([26])
values torch.Size([27])
values torch.Size([28])
values torch.Size([29])
values torch.Size([30])
values torch.Size([31])
values torch.Size([32])
values torch.Size([33])
values torch.Size([34])
values torch.Size([35])
values torch.Size([36])
values torch.Size([37])
values torch.Size([38])
values torch.Size([39])
values torch.Size([40])
values torch.Size([41])
values torch.Size([42])
values torch.Size([43])
values torch.Size([44])
values torch.Size([45])
values torch.Size([46])
values torch.Size([47])
values torch.Size([48])
values torch.Size(

values torch.Size([375])
values torch.Size([376])
values torch.Size([377])
values torch.Size([378])
values torch.Size([379])
values torch.Size([380])
values torch.Size([381])
values torch.Size([382])
values torch.Size([383])
values torch.Size([384])
values torch.Size([385])
values torch.Size([386])
values torch.Size([387])
values torch.Size([388])
values torch.Size([389])
values torch.Size([390])
values torch.Size([391])
values torch.Size([392])
values torch.Size([393])
values torch.Size([394])
values torch.Size([395])
values torch.Size([396])
values torch.Size([397])
values torch.Size([398])
values torch.Size([399])
values torch.Size([400])
values torch.Size([401])
values torch.Size([402])
values torch.Size([403])
values torch.Size([404])
values torch.Size([405])
values torch.Size([406])
values torch.Size([407])
values torch.Size([408])
values torch.Size([409])
values torch.Size([410])
values torch.Size([411])
values torch.Size([412])
values torch.Size([413])
values torch.Size([414])


values torch.Size([705])
values torch.Size([706])
values torch.Size([707])
values torch.Size([708])
values torch.Size([709])
values torch.Size([710])
values torch.Size([711])
values torch.Size([712])
values torch.Size([713])
values torch.Size([714])
values torch.Size([715])
values torch.Size([716])
values torch.Size([717])
values torch.Size([718])
values torch.Size([719])
values torch.Size([720])
values torch.Size([721])
values torch.Size([722])
values torch.Size([723])
values torch.Size([724])
values torch.Size([725])
values torch.Size([726])
values torch.Size([727])
values torch.Size([728])
values torch.Size([729])
values torch.Size([730])
values torch.Size([731])
values torch.Size([732])
values torch.Size([733])
values torch.Size([734])
values torch.Size([735])
values torch.Size([736])
values torch.Size([737])
values torch.Size([738])
values torch.Size([739])
values torch.Size([740])
values torch.Size([741])
values torch.Size([742])
values torch.Size([743])
values torch.Size([744])


KeyboardInterrupt: 

In [None]:
#ntwrk <class 'numpy.int64'>

In [49]:
exp = []
for t, i in enumerate(range(10)):
    v = (t,i)
    exp.append(v)

In [57]:
s = [s for s, a  in exp]

ValueError: not enough values to unpack (expected 3, got 2)

In [40]:
s = []
for k, v in exp:
    s.append(v)

In [41]:
type(s)

list

In [54]:
s

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [101]:
type(1)

int