In [1]:
from carUtils import *
from linesUtils import *
from env import *

In [2]:
from collections import namedtuple, deque
from itertools import count
import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
BATCH_SIZE = 128
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
TARGET_UPDATE = 10
SEED=10

np.random.seed(SEED)
random.seed(SEED)
torch.manual_seed(SEED)

env=Env()

In [4]:
def computeReward(env):
    """
    right now we are going to use a simple reward of the deviation from the line, well the negative of it
    """
    return -env.distance_from_track()

In [5]:
possible_motor_percentages=[-60,60]
states_to_MotorPer=np.empty((len(possible_motor_percentages)**2,2))
for i in range(len(possible_motor_percentages)):
    for j in range(len(possible_motor_percentages)):
        states_to_MotorPer[i*len(possible_motor_percentages)+j]=[possible_motor_percentages[i],
                                                                 possible_motor_percentages[j]]

In [6]:
states_to_MotorPer

array([[-60., -60.],
       [-60.,  60.],
       [ 60., -60.],
       [ 60.,  60.]])

In [7]:
class DQN(nn.Module):
    def __init__(self, n_output_states):
        super(DQN, self).__init__()
        self.f1=nn.Linear(8,n_output_states)
        
    def forward(self,x):
        return self.f1(x)

In [8]:
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))

class ReplayMemory(object):

    def __init__(self, capacity):
        self.memory = deque([],maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

In [9]:
n_actions=states_to_MotorPer.shape[0]

policy_net = DQN(n_actions).to(device)
target_net = DQN(n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters())
memory = ReplayMemory(10000)

In [10]:
steps_done = 0


def select_action(state):
    global steps_done
    sample = np.random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        np.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            # t.max(1) will return largest column value of each row.
            # second column on max result is index of where max element was
            # found, so we pick action with the larger expected reward.
            print(policy_net(state))
            return torch.argmax(policy_net(state))
    else:
        return torch.tensor([[np.random.randint(n_actions)]], device=device, dtype=torch.long)

In [11]:
def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))

    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                          batch.next_state)), device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state
                                                if s is not None])
    
    #print(len(batch.state))
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1)[0].
    # This is merged based on the mask, such that we'll have either the expected
    # state value or 0 in case the state was final.
    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()

In [12]:
num_episodes = 50
for i_episode in range(num_episodes):
    # Initialize the environment and state
    env.reset()
    last_screen = torch.tensor(env.move_car(0,0), device=device,
                                     dtype=torch.float)
    current_screen = torch.tensor(env.move_car(0,0), device=device,
                                     dtype=torch.float)
    state = current_screen - last_screen
    for t in count():
        # Select and perform an action
        action = select_action(state)
        # Observe old state
        last_screen = current_screen
        
        #move car
        #print(action.shape)
        current_screen = torch.tensor(env.move_car(*states_to_MotorPer[action.item()]), device=device,
                                     dtype=torch.float)
        #check if too far off track
        done=env.off_track()
        #check distance and compute reward
        reward = torch.tensor([computeReward(env)], device=device)
        
        if not done:
            next_state = current_screen - last_screen
        else:
            next_state = None

        # Store the transition in memory
        print("state")
        print(state)
        print("action")
        print(action)
        print("next_state")
        print(next_state)
        print("reward")
        print(reward)
        print("------------------------------")
        memory.push(state, action, next_state, reward)

        # Move to the next state
        state = next_state

        # Perform one step of the optimization (on the policy network)
        optimize_model()
        if done:
            episode_durations.append(t + 1)
            plot_durations()
            break
    # Update the target network, copying all weights and biases in DQN
    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())


torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
tensor([ 0.1897,  0.1199,  0.1562, -0.1953], device='cuda:0')
torch.Size([])
torch.Size([1, 1])
torch.Size([1, 1])
tensor([ 0.1897,  0.1199,  0.1562, -0.1953], device='cuda:0')
torch.Size([])
torch.Size([1, 1])
tensor([-0.1211,  0.1688,  0.3423, -0.1176], device='cuda:0')
torch.Size([])
torch.Size([1, 1])
torch.Size([1, 1])
tensor([ 0.5014,  0.0709, -0.0303, -0.2759], device='cuda:0')
torch.Size([])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
tensor([ 0.5005,  0.0709, -0.0295, -0.2797], device='cuda:0')
torch.Size([])
torch.S

RuntimeError: zero-dimensional tensor (at position 1) cannot be concatenated

In [None]:
env.move_car(*states_to_MotorPer[action.item()])

In [None]:
torch.double