In [None]:
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import argparse
import numpy as np
import math
from collections import deque
import matplotlib.pyplot as plt
import gym
from gym import spaces
import numpy as np
import carla
from stable_baselines3 import PPO2
from stable_baselines3.ppo.policies import MlpPolicy

from gym.utils.env_checker import check_env
import csv
import random
from time import sleep


In [None]:
# Distance cutoff compared against the vehicle-to-waypoint distance in
# CarlaInstance.step() and CarlaInstance.get_reward()
# (presumably meters, as returned by carla.Location.distance — TODO confirm).
threshold = 30
# Waypoint file loaded in main() through read_waypoints(); each row is parsed as x, y, yaw.
CSV_FILE = 'waypoints.csv'

The value and policy networks

In [None]:
class value_network(nn.Module):
	'''
	Value Network: takes a state as input and gives a scalar value as output.
	Used as a baseline in Policy Gradient (PG) algorithms.
	'''
	def __init__(self,state_dim):
		'''
			state_dim (int): state dimension
		'''
		super(value_network, self).__init__()
		# Attribute names l1..l7 (and their creation order) are kept so that
		# state_dict keys and seeded initialisation stay unchanged.
		self.l1 = nn.Linear(state_dim, 128)
		self.l2 = nn.Linear(128, 256)
		self.l3 = nn.Linear(256, 256)
		self.l4 = nn.Linear(256, 256)
		self.l5 = nn.Linear(256, 128)
		self.l6 = nn.Linear(128, 64)
		self.l7 = nn.Linear(64,1)

	def forward(self,state):
		'''
		Input: State (tensor whose last dimension is state_dim)
		Output: Value of state (same leading dimensions, last dimension 1)
		'''
		v = state
		# Six tanh hidden layers; torch.tanh replaces the deprecated F.tanh.
		for hidden in (self.l1, self.l2, self.l3, self.l4, self.l5, self.l6):
			v = torch.tanh(hidden(v))
		# Linear output head: the value estimate is unbounded.
		return self.l7(v)
	
class policy_network(nn.Module):
	'''
	Policy Network: Designed for continuous action space, where given a
	state, the network outputs the mean and standard deviation of a
	diagonal Gaussian over actions. The standard deviation is a learned,
	state-independent parameter (log_std).
	'''
	def __init__(self, state_dim, action_dim, log_std = 0.0):
		"""
			state_dim (int): state dimension
			action_dim (int): action dimension
			log_std (float): initial log of standard deviation (std)
		"""
		super(policy_network, self).__init__()
		self.state_dim = state_dim
		self.action_dim = action_dim
		# Attribute names (l1..l7, mean, log_std) are kept so that
		# state_dict keys of saved checkpoints still match.
		self.l1 = nn.Linear(state_dim,64)
		self.l2 = nn.Linear(64,128)
		self.l3 = nn.Linear(128,256)
		self.l4 = nn.Linear(256, 256)
		self.l5 = nn.Linear(256, 256)
		self.l6 = nn.Linear(256, 128)
		self.l7 = nn.Linear(128, 64)
		self.mean = nn.Linear(64,action_dim)
		self.log_std = nn.Parameter(torch.ones(1, action_dim) * log_std)

	def forward(self,state):
		'''
		Input: State
		Output: Mean, log_std and std of action (all shaped like the mean)
		'''
		a = state
		# Seven tanh hidden layers; torch.tanh replaces the deprecated F.tanh.
		for hidden in (self.l1, self.l2, self.l3, self.l4, self.l5, self.l6, self.l7):
			a = torch.tanh(hidden(a))
		a_mean = self.mean(a)

		# Broadcast the single (1, action_dim) log_std row over the batch.
		a_log_std = self.log_std.expand_as(a_mean)
		a_std = torch.exp(a_log_std)
		return a_mean, a_log_std, a_std

	def select_action(self, state):
		'''
		Input: State
		Output: Sample drawn from a normal distribution with mean and std
		'''
		a_mean, _, a_std = self.forward(state)
		# Pass the std explicitly: torch.normal(a_mean) alone samples with
		# std=1 and ignores the learned log_std, which makes the sampling
		# distribution disagree with get_log_prob().
		action = torch.normal(a_mean, a_std)

		return action

	def get_log_prob(self, state, action):
		'''
		Input: State, Action
		Output: log probabilities of the actions under the current policy,
		summed over the action dimension (shape: batch x 1)
		'''
		mean, log_std, std = self.forward(state)
		var = std.pow(2)
		# Per-dimension log density of a Gaussian N(mean, std^2).
		log_density = -(action - mean).pow(2) / (2 * var) - 0.5 * math.log(2 * math.pi) - log_std
		return log_density.sum(1, keepdim=True)

The Policy Gradient agent - updated using the reward-to-go return (Gt) with a value-network baseline ("GTBaseline")

In [None]:
class PGAgent():
	'''
	Policy Gradient (PG) agent with a learned value baseline.

	Trajectories are sampled with the policy on the CPU; update() then
	computes the discounted reward-to-go, fits the value network to it and
	takes one policy-gradient step weighted by the normalized advantages.
	'''
	def __init__(self,
		env,
		batch_size,
		discount=0.99,
		lr=1e-3,
		gpu_index=1,
		seed=0,
		):
		'''
			env: environment whose reset() returns (state, info) and whose
				step(action) returns (next_state, reward, done, info)
			batch_size (int): accepted for interface compatibility; the
				batch size actually used is the one passed to sample_traj()
			discount (float): discount factor
			lr (float): learning rate of both Adam optimizers
			gpu_index (int): CUDA device index used when CUDA is available
			seed (int): stored on the agent; not used for seeding here
		'''
		self.env = env
		self.state_dim = 5
		self.action_dim = 2
		self.discount = discount
		self.lr = lr
		if torch.cuda.is_available():
			# Fall back to GPU 0 when the requested index does not exist
			# (e.g. the default gpu_index=1 on a single-GPU machine), which
			# would otherwise fail on the first .to(self.device).
			n_gpus = torch.cuda.device_count()
			index = gpu_index if 0 <= gpu_index < n_gpus else 0
			self.device = torch.device('cuda', index=index)
		else:
			self.device = torch.device('cpu')
		self.seed = seed
		self.policy = policy_network(self.state_dim,self.action_dim)
		self.value = value_network(self.state_dim)
		self.optimizer_policy = torch.optim.Adam(self.policy.parameters(), lr=self.lr)
		self.optimizer_value = torch.optim.Adam(self.value.parameters(), lr=self.lr)

	def sample_traj(self, batch_size, evaluate = False, max_steps = 1000):
		'''
		Roll out whole episodes until at least batch_size transitions exist.

			batch_size (int): minimum number of transitions to collect; the
				last episode is always finished, so more may be returned
			evaluate (bool): take the mean action instead of sampling, and
				return only the mean episode reward
			max_steps (int): cap on the number of steps per episode

		Returns (states, actions, rewards, n_dones, mean_episode_reward), or
		just mean_episode_reward when evaluate is True. n_dones[i] is 0 on
		the last transition of every episode and 1 otherwise.
		'''
		if max_steps < 1:
			# With no steps per episode the collection loop could never finish.
			raise ValueError("max_steps must be at least 1")
		self.policy.to("cpu") #Move network to CPU for sampling
		states = []
		actions = []
		rewards = []
		n_dones = []
		curr_reward_list = []
		while len(states) < batch_size:
			state, _ = self.env.reset()
			curr_reward = 0
			for t in range(max_steps):
				state_ten = torch.from_numpy(np.asarray(state)).float().unsqueeze(0)
				with torch.no_grad():
					if evaluate:
						action = self.policy(state_ten)[0][0].numpy() # Take mean action during evaluation
					else:
						action = self.policy.select_action(state_ten)[0].numpy() # Sample from distribution during training

				action = action.astype(np.float64)
				n_state,reward,done,_ = self.env.step(action) # Execute action in the environment
				states.append(state)
				actions.append(action)
				rewards.append(reward)
				n_dones.append(0 if done else 1)
				state = n_state
				curr_reward += reward
				if done:
					break
			# An episode cut off by max_steps ends without done=True. Mark its
			# last transition as an episode boundary anyway, otherwise the
			# reward-to-go in update() leaks in rewards of the next episode.
			if n_dones[-1]:
				n_dones[-1] = 0
			curr_reward_list.append(curr_reward)
		if evaluate:
			return np.mean(curr_reward_list)
		return states,actions,rewards,n_dones, np.mean(curr_reward_list)

	def update(self,states,actions,rewards,n_dones):
		'''
		One value-network step and one policy-gradient step on a batch.

			states, actions, rewards, n_dones: per-transition sequences as
				returned by sample_traj() (same length, time-ordered)
		'''
		self.policy.to(self.device) #Move policy to GPU
		self.value.to(self.device)

		# Convert everything to float32 once, so the networks (float32
		# parameters) never see mixed int64/float64 inputs.
		states_ten = torch.from_numpy(np.asarray(states, dtype=np.float32)).to(self.device)
		action_ten = torch.from_numpy(np.asarray(actions, dtype=np.float32)).to(self.device)
		rewards_ten = torch.from_numpy(np.asarray(rewards, dtype=np.float32)).to(self.device)
		n_dones_ten = torch.from_numpy(np.asarray(n_dones, dtype=np.float32)).to(self.device)

		# Baseline values are treated as constants in the advantage.
		with torch.no_grad():
			values_adv = self.value(states_ten)

		# Compute reward-to-go (gt) backwards in time; n_dones is 0 on the
		# last step of an episode, which resets the running return there.
		gt = torch.zeros(rewards_ten.shape[0], 1, device=self.device)
		g = 0.0
		for i in reversed(range(rewards_ten.size(0))):
			g = rewards_ten[i] + self.discount * g * n_dones_ten[i]
			gt[i] = g

		advantages = gt - values_adv

		# Normalize advantages. std() is NaN for a single sample and 0 for
		# constant advantages; guard both so the policy never receives NaNs.
		if advantages.numel() > 1:
			adv_std = advantages.std()
		else:
			adv_std = torch.zeros((), device=self.device)
		advantages = (advantages - advantages.mean()) / (adv_std + 1e-8)

		# Update value network to predict gt for each state (L2 norm)
		loss = torch.nn.MSELoss()
		value_loss = loss(self.value(states_ten), gt)
		self.optimizer_value.zero_grad()
		value_loss.backward()
		self.optimizer_value.step()

		# Compute log probabilities using states_ten and action_ten
		log_probs = self.policy.get_log_prob(states_ten, action_ten)

		# Compute policy loss (using advantages) and update the policy
		self.optimizer_policy.zero_grad()
		policy_loss = (-log_probs * advantages.detach()).mean()
		policy_loss.backward()
		self.optimizer_policy.step()


Some additional functions

In [None]:
def read_waypoints(file_path):
    """Load waypoints from a CSV file.

    Each non-blank row must hold exactly three numeric fields, read as
    ``x, y, yaw``. Blank rows (e.g. an empty line in the file) are skipped
    instead of crashing the unpacking.

    Args:
        file_path: path of the CSV file.

    Returns:
        A list of ``(x, y, yaw)`` float tuples in file order.

    Raises:
        ValueError: if a non-blank row does not have three numeric fields.
    """
    waypoints = []
    # newline='' is how the csv module expects files to be opened.
    with open(file_path, 'r', newline='') as csvfile:
        for row in csv.reader(csvfile):
            if not any(cell.strip() for cell in row):
                continue
            x, y, yaw = map(float, row)
            waypoints.append((x, y, yaw))

    return waypoints


def setupCarla(host='localhost', port=2000, timeout=10.0):
    """Connect to a CARLA server and return its world.

    Called without arguments this connects to ``localhost:2000`` with a
    10 second timeout, exactly as before; the parameters only make the
    endpoint configurable.

    Args:
        host: host name or IP address of the CARLA server.
        port: port the CARLA server listens on.
        timeout: value passed to ``client.set_timeout``.

    Returns:
        The object returned by ``client.get_world()``.
    """
    # Connect to CARLA server
    client = carla.Client(host, port)
    client.set_timeout(timeout)

    return client.get_world()


The CARLA wrapper based on gym

In [None]:
class CarlaInstance(gym.Env):
    """Gym-style wrapper around one CARLA vehicle following a waypoint list.

    Observation: float64 array ``[x, y, yaw, speed, distance_to_next_waypoint]``.
    Action: array-like ``[throttle, steer]``; brake is always 0.

    ``reset()`` returns ``(observation, info)`` and ``step()`` returns the
    4-tuple ``(observation, reward, done, info)``.
    """
    metadata = {}

    def __init__(self, world, waypoints):
        """Spawn the vehicle and its sensors in ``world``.

        Args:
            world: CARLA world the actors are spawned in.
            waypoints: non-empty sequence of ``(x, y, yaw)`` entries.

        Raises:
            ValueError: if ``waypoints`` is empty.
            RuntimeError, KeyboardInterrupt: re-raised after the actors of
                the world have been destroyed.
        """
        super(CarlaInstance, self).__init__()
        if len(waypoints) == 0:
            raise ValueError("waypoints must contain at least one (x, y, yaw) entry")

        self.waypoints = waypoints
        self.current_waypoint_index = 0
        # Set by the sensor callbacks, read by step()/get_reward(), cleared
        # by reset(). Must start as False: the type object `bool` is truthy
        # and would end (and penalise) every single step.
        self.problem = False

        # step() indexes the action as [throttle, steer], so the action
        # space is one Box of shape (2,), not a Tuple of two Boxes.
        self.action_space = spaces.Box(low=np.array([0.0, -1.0]),
                                       high=np.array([1.0, 1.0]),
                                       dtype=np.float64)
        self.observation_space = spaces.Box(low=-500000, high=500000, shape=(5,), dtype=np.float64)

        try:
            self.blueprint_library = world.get_blueprint_library()
            world.set_weather(carla.WeatherParameters.ClearNoon)

            # Spawn vehicle
            vehicle_bp = random.choice(self.blueprint_library.filter('wrangler_rubicon'))
            spawn_point = carla.Transform(carla.Location(x=-23.6,y=137.5,z=1),carla.Rotation(yaw=0))
            self.vehicle = world.spawn_actor(vehicle_bp, spawn_point)
            self.vehicle.apply_control(carla.VehicleControl(throttle=1.0, brake=0.0, steer=0.0))
            sleep(2.0)

            sensor_transform = carla.Transform(carla.Location(x=2.5, z=0.7))

            # Attach Lane Invasion Sensor to car
            lane_bp = self.blueprint_library.find('sensor.other.lane_invasion')
            self.lane_invasion_sensor = world.spawn_actor(lane_bp, sensor_transform, attach_to=self.vehicle)
            self.lane_invasion_sensor.listen(lambda event: self.on_lane_invasion(event))

            # Attach Collision Sensor to car
            collision_bp = self.blueprint_library.find('sensor.other.collision')
            self.collision_sensor = world.spawn_actor(collision_bp, sensor_transform, attach_to=self.vehicle)
            self.collision_sensor.listen(lambda event: self.on_collision(event))

        except (RuntimeError, KeyboardInterrupt):
            # `except A or B` only catches A; a tuple catches both.
            print("entered init exception")
            for actor in world.get_actors():
                actor.destroy()
            # Re-raise: a half-initialised env (no vehicle) would only fail
            # later with a less helpful AttributeError.
            raise

    def on_lane_invasion(self, event):
        """Sensor callback: flag the current episode as failed.

        Only the flag is set here. step() reports ``done`` and the caller
        resets the env; calling reset() from the sensor callback would
        teleport the car in the middle of a step.
        """
        self.problem = True

    def on_collision(self, event):
        """Sensor callback for collisions; handled like a lane invasion."""
        self.on_lane_invasion(event)

    def step(self, action):
        """Apply ``[throttle, steer]`` and advance the waypoint bookkeeping.

        Returns:
            ``(observation, reward, done, info)``. ``done`` is True when the
            last waypoint has become the target or a sensor flagged a problem.
        """
        # Replace NaN/inf and clip to the valid control range: actions come
        # from an unbounded Gaussian policy.
        action = np.nan_to_num(np.asarray(action, dtype=np.float64).reshape(-1))
        throttle = float(np.clip(action[0], 0.0, 1.0))
        steer = float(np.clip(action[1], -1.0, 1.0))
        brake = 0.0

        control = carla.VehicleControl(throttle=throttle, brake=brake, steer=steer)
        self.vehicle.apply_control(control)

        next_waypoint = self.waypoints[self.current_waypoint_index]
        next_waypoint_location = carla.Location(x=next_waypoint[0], y=next_waypoint[1])
        current_location = self.vehicle.get_location()
        distance = current_location.distance(next_waypoint_location)

        # Target the next waypoint once the current one is close enough,
        # wrapping around at the end of the list.
        if distance < threshold:
            self.current_waypoint_index += 1
            if self.current_waypoint_index >= len(self.waypoints):
                self.current_waypoint_index = 0

        reward = self.get_reward(distance, control)
        done = False
        if self.current_waypoint_index == len(self.waypoints) - 1 or self.problem:
            done = True
        info = {}
        sleep(0.05)

        return self.get_observation(), reward, done, info

    def get_observation(self):
        """Return ``[x, y, yaw, speed, distance_to_next_waypoint]`` as float64."""
        location = self.vehicle.get_location()
        orientation = self.vehicle.get_transform().rotation.yaw
        velocity = self.vehicle.get_velocity()
        speed = np.sqrt(velocity.x**2 + velocity.y**2 + velocity.z**2)
        next_waypoint = self.waypoints[self.current_waypoint_index]
        next_waypoint_location = carla.Location(x=next_waypoint[0], y=next_waypoint[1])
        distance = location.distance(next_waypoint_location)

        return np.array([location.x, location.y, orientation, speed, distance], dtype=np.float64)

    def get_reward(self, distance, control):
        """Reward for one step.

        +5 for positive throttle, +100 when within ``threshold`` of the
        target waypoint, -1000 when a sensor flagged a problem.
        """
        reward = 0
        if control.throttle > 0:
            reward += 5
        if distance < threshold:
            reward += 100
        # Apply a large negative reward for lane invasion / collision
        if self.problem:
            reward -= 1000  # Adjust the value as needed
        return reward

    def close(self):
        """Destroy the sensors and the vehicle.

        Safe to call more than once and on a partially constructed env.
        """
        # Sensors first: they are attached to the vehicle.
        for name in ('lane_invasion_sensor', 'collision_sensor'):
            sensor = getattr(self, name, None)
            if sensor is not None:
                sensor.stop()
                sensor.destroy()
                setattr(self, name, None)
        vehicle = getattr(self, 'vehicle', None)
        if vehicle is not None:
            vehicle.destroy()
            self.vehicle = None

    def reset(self, seed=None, options=None):
        """Stop the vehicle, move it back to the start pose, clear the episode state.

        ``seed`` and ``options`` are accepted but not used.

        Returns:
            ``(observation, info)``.
        """
        self.vehicle.set_target_velocity(carla.Vector3D(0, 0, 0))
        transform = carla.Transform(carla.Location(x=-23.6,y=137.5),carla.Rotation(yaw=0))
        self.vehicle.set_transform(transform)
        self.current_waypoint_index = 0
        # Start the new episode with a clean failure flag; otherwise one
        # lane invasion would end every following episode immediately.
        self.problem = False

        info = {}

        return self.get_observation(), info

    def render(self):
        """No-op: nothing is rendered by this wrapper."""
        pass


The training function

In [None]:
def main():
    """Train the policy-gradient agent on the CARLA waypoint environment.

    Reads the waypoints from CSV_FILE, connects to CARLA via setupCarla(),
    and alternates sampling, updating and evaluating for ``n_iter``
    iterations. The policy weights are saved whenever the moving average of
    the evaluation reward improves. The environment is always closed at the
    end, also when training is interrupted or fails.
    """
    seed=0          # Sets PyTorch, Numpy and `random` seeds
    n_iter = 900      # Maximum number of training iterations
    discount=0.99   # Discount factor
    batch_size=100 # Training samples in each batch of training
    lr=8e-3       # Learning rate
    gpu_index=1		# GPU index
    log_interval = 10  # Print progress every this many iterations

    # Setting seeds before the environment is built: CarlaInstance already
    # draws from `random` when it picks the vehicle blueprint.
    torch.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

    waypoints = read_waypoints(CSV_FILE)
    world = setupCarla()

    # Creating the carla environment instance
    env = CarlaInstance(world, waypoints)
    #check_env(env)

    try:
        kwargs = {
            "batch_size": batch_size,
            "discount":discount,
            "lr":lr,
            "gpu_index":gpu_index,
            "seed":seed,
            "env":env
        }
        learner = PGAgent(**kwargs) # Creating the PG learning agent
        average_rewards=[]  # Moving-average evaluation reward per iteration
        moving_window = deque(maxlen=10)
        # Start at -inf: rewards can be far below -1 (a lane invasion costs
        # 1000), and with -1 no checkpoint would ever be written.
        old_reward = float('-inf')
        for e in range(n_iter):
            states,actions,rewards,n_dones,train_reward = learner.sample_traj(batch_size=batch_size)
            learner.update(states,actions,rewards,n_dones)
            eval_reward= learner.sample_traj(batch_size, evaluate=True)
            moving_window.append(eval_reward)
            average_reward = np.mean(moving_window)
            if e % log_interval == 0: print('Training Iteration {} Training Reward: {:.2f} Evaluation Reward: {:.2f} \
        Average Evaluation Reward: {:.2f}'.format(e,train_reward,eval_reward,average_reward))

            average_rewards.append(average_reward)

            # Keep the weights of the best policy seen so far.
            if average_reward > old_reward:
                old_reward = average_reward
                torch.save(learner.policy.state_dict(), ('CARLA_PPO_checkpoint1.pth'))
    finally:
        # Always remove the spawned vehicle and sensors from the simulator,
        # otherwise they stay in the world after this function exits.
        env.close()

In [None]:
# Start training: main() reads CSV_FILE and connects to a CARLA server through setupCarla().
main()