In [41]:
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
import ast
import csv

In [42]:
import simple_custom_taxi_env # the custom environment

In [43]:
fuel_limit = 5000
action_size = 6
Envs = [simple_custom_taxi_env.SimpleTaxiEnv(fuel_limit = 5000)] # modified from gym Taxi-v3
def sign(x):
    return np.sign(x)

A = {"Move South":0, "Move North":1, "Move East":2, "Move West":3, "Pick Up":4, "Drop Off":5}

In [44]:
class Q_table_learning():
    def __init__(self, grid_size = 7, obstacle_count = 0, Env_index=0):
        self.Q_table = {}
        self.alpha = 0.1
        self.gamma = 0.6
        self.epsilon_end = 0.1
        self.epsilon_start = 1.0
        self.epsilon_decay = 0.99995
        #self.grid_size = grid_size
        #self.fuel_limit = fuel_limit
        self.env = Envs[Env_index]

        #for testing
        self.global_current_passenger_picked = False
        self.global_current_state = None
        self.global_current_obs = None
        self.global_action = None
        self.test_table = {}

        self.target_counter = 0
        self.targets = []
    
    def get_state(self, current_state, next_obs, action, current_passenger_picked):
        """
        Convert observations into a structured state for Q-learning.
        Tracks passenger pickup and drop-off correctly.
        """
        taxi_row, taxi_col, S1x, S1y, S2x, S2y, S3x, S3y, S4x, S4y, obstacle_north, obstacle_south, obstacle_east, obstacle_west, passenger_look, destination_look = next_obs
        # Compute directions towards the stations
        if len(self.targets) == 0:
            self.targets.append((S1x, S1y))
            self.targets.append((S2x, S2y))
            self.targets.append((S3x, S3y))
            self.targets.append((S4x, S4y))
        on_station = (taxi_row, taxi_col) in [(S1x, S1y), (S2x, S2y), (S3x, S3y), (S4x, S4y)]

        reached = False

        if (taxi_row, taxi_col) == self.targets[self.target_counter]:
            reached = True
            
        tar_dir = (np.sign(self.targets[self.target_counter][0] - taxi_row), np.sign(self.targets[self.target_counter][1] - taxi_col))
        passenger_picked = current_passenger_picked
        
        if action == 4 and passenger_look and not current_passenger_picked and on_station:
            passenger_picked = True  # Passenger is now inside the taxi

        state = (
            tar_dir[0], tar_dir[1], 
            passenger_look, destination_look,
            obstacle_north, obstacle_south, obstacle_east, obstacle_west, action,
            passenger_picked
        )
       
        return state, on_station, reached

    def Train(self, total_episodes):
        rewards_per_episode = []
        steps_to_end = []
        passenger_pick_per_episode = []
        epsilon = self.epsilon_start
        for episode in range(total_episodes):
            current_obs, info = self.env.reset()
            self.targets.clear()
            self.target_counter = 0
            #print(self.env.grid_size)
            done = False
            pre_action = -1
            stepCnt = 0
            total_reward_episode = 0
            current_passenger_picked = False  # Track if the taxi has the passenger
            cur_state, _ , _= self.get_state(None, current_obs, None, current_passenger_picked)
            while not done:
                if cur_state not in self.Q_table:
                    self.Q_table[cur_state] = np.zeros(action_size)
                    #self.q_init(cur_state)
                if np.random.rand() < epsilon:
                    action = np.random.choice([0,1,2,3,4,5])  # Random action
                else:
                    action = np.argmax(self.Q_table[cur_state])
                next_obs, reward, done, _ = self.env.step(action)
                next_state, on_station, reached_target = self.get_state(cur_state, next_obs, action, current_passenger_picked)
                passenger_picked = next_state[-1]
                stepCnt += 1
                # Reward shaping
                shaped_reward = 0
                shaped_reward -= 0.1
                
                if action == 4 and not current_passenger_picked and passenger_picked:
                    if on_station:
                        passenger_pick_per_episode.append(stepCnt)
                        shaped_reward += 80
                        #print('picked')

                if reached_target:
                    shaped_reward += 5

                if (pre_action == 0 and action == 1) or (pre_action == 1 and action == 0) or (pre_action == 2 and action == 3) or (pre_action == 3 and action == 2):
                    shaped_reward -= 0.5
                    
                if on_station and current_passenger_picked and cur_state[3] == 1 and action == 5 and done:
                    shaped_reward += 500
                    #print("done")
                
                if not on_station and action == 4 and not passenger_picked:
                    shaped_reward -= 5

                if (not current_passenger_picked or not cur_state[3] or not on_station) and action == 5 :
                    shaped_reward -= 15
                #don't hit the wall plz
                if (cur_state[4] == 1 and action == 1) or (cur_state[5] == 1 and action == 0) or (cur_state[6] == 1 and action == 2) or (cur_state[7] == 1 and action == 3):
                    shaped_reward -= 10

                ctarget_diry, ctarget_dirx = cur_state[0:2]
                if ctarget_diry > 0 and action == 0:
                    shaped_reward += 0.04
                if ctarget_diry < 0 and action == 1:
                    shaped_reward += 0.04
                if ctarget_dirx > 0 and action == 2:
                    shaped_reward += 0.04
                if ctarget_dirx < 0 and action == 3:
                    shaped_reward += 0.04

                if reached_target:
                    self.target_counter = (self.target_counter+1)%4

                reward += shaped_reward
                total_reward_episode += reward

                # Update Q-table
                if next_state not in self.Q_table:
                    self.Q_table[next_state] = np.zeros(action_size)
                    #self.q_init(next_state)
                
                self.Q_table[cur_state][action] += self.alpha * (reward + self.gamma * np.max(self.Q_table[next_state]) - self.Q_table[cur_state][action])

                cur_state = next_state
                current_obs = next_obs  # Move to the next state
                current_passenger_picked = passenger_picked
                stepCnt += 1
                pre_action = action

            rewards_per_episode.append(total_reward_episode)
            steps_to_end.append(stepCnt)
            epsilon = max(self.epsilon_end, epsilon * self.epsilon_decay)

            if (episode + 1) % 100 == 0:
                avg_rewards = np.mean(rewards_per_episode[-100:])
                avg_steps = np.mean(steps_to_end[-100:])
                avg_passenger_picked = np.mean(passenger_pick_per_episode[-100:])
                print(f"🚀 Episode {episode + 1}/{total_episodes}, Epsilon: {epsilon:.3f}")
                print(f"Average Reward: {avg_rewards:.2f}")
                print(f"Average passenger: {avg_passenger_picked:.2f}")
                print(f"Steps to end: {avg_steps:.2f}")
            
        self.extractTable()

    def isreset(self,current_obs, obs):
        return current_obs == None or current_obs[2:10] != obs[2:10]

    def get_action(self, obs):
        if self.isreset(self.global_current_obs, obs):
            self.global_current_passenger_picked = False
        
        next_state,_ = self.get_state(self.global_current_state, obs, self.global_action, self.global_current_passenger_picked)
        if next_state not in self.test_table:
            self.global_action = np.random.choice([0,1,2,3,4,5])
        else:
            self.global_action = np.argmax(self.test_table[next_state])
            #if next_state[-1] == True:
                #print('picked')

        self.global_current_obs = obs
        self.global_current_state = next_state
        self.global_current_passenger_picked = self.global_current_state[-1]
        return self.global_action


    def Test(self, total_episodes=100):
        rewards_per_episode = []
        steps_to_end = []
        actions_cnt = [0,0,0,0,0,0]

        self.test_table = self.loadTable()
        for episode in range(total_episodes):
            obs, info = self.env.reset()
            done = False
            total_reward_episode = 0
            stepCnt = 0
            while not done:
                action = self.get_action(obs)
                obs, reward, done, _ = self.env.step(action)
                total_reward_episode += reward
                stepCnt += 1
            rewards_per_episode.append(total_reward_episode)
            steps_to_end.append(stepCnt)
            #epsilon = max(self.epsilon_end, epsilon * self.epsilon_decay)

        return rewards_per_episode, steps_to_end
        
    def extractTable(self):
        with open('q_table.csv', 'w', newline='') as f:
            writer = csv.writer(f)
            for key, value in self.Q_table.items():
                writer.writerow([key] + list(value))  # Store key as first column, values as rest

    def loadTable(self):
        with open('q_table.csv', 'r') as f:
            reader = csv.reader(f)
            Q_table = {eval(row[0]): list(map(float, row[1:])) for row in reader}
        return Q_table
    
    def run_env(self):
        self.env.run_agent()




In [45]:
Q_learning_wrapper = Q_table_learning()

In [46]:
Q_learning_wrapper.Train(total_episodes = 35000)

🚀 Episode 100/40000, Epsilon: 0.995
Average Reward: -26405.11
Average passenger: 621.62
Steps to end: 6108.88
🚀 Episode 200/40000, Epsilon: 0.990
Average Reward: -28063.56
Average passenger: 736.20
Steps to end: 6463.66
🚀 Episode 300/40000, Epsilon: 0.985
Average Reward: -26210.60
Average passenger: 775.00
Steps to end: 6151.60
🚀 Episode 400/40000, Epsilon: 0.980
Average Reward: -28027.65
Average passenger: 930.96
Steps to end: 6545.26
🚀 Episode 500/40000, Epsilon: 0.975
Average Reward: -26950.74
Average passenger: 667.92
Steps to end: 6312.90
🚀 Episode 600/40000, Epsilon: 0.970
Average Reward: -25397.02
Average passenger: 724.54
Steps to end: 6095.46
🚀 Episode 700/40000, Epsilon: 0.966
Average Reward: -25523.95
Average passenger: 766.10
Steps to end: 6063.50
🚀 Episode 800/40000, Epsilon: 0.961
Average Reward: -23820.95
Average passenger: 732.16
Steps to end: 5771.34
🚀 Episode 900/40000, Epsilon: 0.956
Average Reward: -25487.31
Average passenger: 522.44
Steps to end: 6211.48
🚀 Episode 

In [None]:
rewards_per_episode, steps_to_end = Q_learning_wrapper.Test(total_episodes=100)
print(np.mean(rewards_per_episode), np.mean(steps_to_end))