# COMP 579 - ASSIGNMENT 3
[Ling Fei Zhang](https://github.com/Ling01234), 260985358

Sevag Baghdassarian, ID

Brandon Ma, ID

In [21]:
# imports
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
import gymnasium as gym
from tqdm import tqdm, trange
import random
import time
import matplotlib.colors as mcolors
from scipy.stats import sem
from sklearn.linear_model import LogisticRegression


# Q-Learning Agent

In [23]:
# Action encoding:
#   0 -> left
#   1 -> right

# Best hyperparameters, used to initialise the agent below.
ALPHA = 0.25      # step size of the Q-learning update
EPSILON = 0.25    # probability of picking a uniformly random action
GAMMA = 0.95      # discount applied to the next state's max Q-value
BINS = 10         # number of bin edges per state dimension
EPISODES = 1000   # training episodes per agent
RUNS = 10         # not referenced in this section of the notebook


class Qlearning:
    """Tabular Q-learning agent over a discretized 4-dimensional state.

    The agent keeps a Q-table indexed by four bin indices plus an action
    index, trains it with epsilon-greedy exploration, and can afterwards act
    as the expert for simple imitation learning (logistic regression fitted
    on observed ``(state, action)`` pairs).

    All of the agent's own randomness (Q-table initialisation, exploration,
    tie-breaking) is drawn from ``self.rng``, a NumPy generator seeded with
    ``seed``. The environments themselves are not seeded here.
    """

    def __init__(self, env, alpha, gamma, epsilon, num_episodes, num_bins, seed) -> None:
        """
        Set up the Q-table and the bin edges used for discretization.

        Args:
            env: Environment used for training; must expose
                ``observation_space.low/high`` and ``action_space.n``
            alpha (float): Step size of the Q-learning update
            gamma (float): Discount factor
            epsilon (float): Probability of taking a random action
            num_episodes (int): Number of training episodes
            num_bins (int): Number of bin edges per state dimension
            seed (int): Seed for the agent's random number generator
        """
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.num_episodes = num_episodes
        self.num_bins = num_bins

        # Work on copies: writing into the arrays owned by
        # env.observation_space would silently alter the environment's space.
        self.lowerbound = np.array(env.observation_space.low)
        self.upperbound = np.array(env.observation_space.high)
        # Dimensions 1 and 3 get hand-picked finite limits (presumably
        # because the environment reports them as unbounded -- TODO confirm).
        self.lowerbound[1] = -3.5
        self.lowerbound[3] = -10
        self.upperbound[1] = 3.5
        self.upperbound[3] = 10

        self.seed = seed
        # Every random draw of the agent goes through this generator, so the
        # same seed gives the same agent-side randomness.
        self.rng = np.random.default_rng(seed)

        self.num_action = env.action_space.n
        self.reward = []
        self.Qvalues = self.rng.uniform(
            low=-0.001, high=0.001,
            size=(num_bins, num_bins, num_bins, num_bins, self.num_action))
        self.behavior_episodes = []
        self.random_episodes = []
        self.bins = [
            np.linspace(self.lowerbound[i], self.upperbound[i], self.num_bins)
            for i in range(4)
        ]

    def discritize_state(self, state):
        """
        Discritize continuous state into a discrete state

        Args:
            state (sequence of length 4): Current continuous state of agent

        Returns:
            state (4-tuple of int): Bin index of each state component;
                values below the first edge are clipped into bin 0
        """
        return tuple(
            max(int(np.digitize(state[i], self.bins[i])) - 1, 0)
            for i in range(4)
        )

    def _greedy_action(self, discrete_state):
        """Return an action with maximal Q-value, breaking ties at random."""
        q_row = self.Qvalues[discrete_state]
        best_actions = np.flatnonzero(q_row == np.max(q_row))
        return int(self.rng.choice(best_actions))

    def select_action(self, state, episode):
        """
        Select an epsilon-greedy action given a state

        Note that once ``episode`` exceeds 700, epsilon is multiplied by 0.99
        on every call, i.e. once per environment step during training.

        Args:
            state (sequence of length 4): Current state of the agent, continuous
            episode (int): Current episode of the run

        Returns:
            int: Action chosen by the agent
        """
        # lower exploration rate as we run many episodes
        if episode > 700:
            self.epsilon *= 0.99

        # explore: uniformly choose an action
        if self.rng.random() < self.epsilon:
            return int(self.rng.integers(self.num_action))

        # exploit: greedy selection
        return self._greedy_action(self.discritize_state(state))

    def simulate_episodes(self):
        """
        Train the agent for ``self.num_episodes`` episodes on ``self.env``.

        The integer return of each episode is appended to ``self.reward``.
        An episode ends when the environment reports either termination or
        truncation; only true termination removes the bootstrap term from
        the Q-learning target.
        """
        for episode in range(1, self.num_episodes + 1):
            state, _ = self.env.reset()

            episode_reward = 0
            done = False
            while not done:
                discrete_state = self.discritize_state(state)
                action = self.select_action(state, episode)
                next_state, reward, terminated, truncated, _ = self.env.step(
                    action)
                episode_reward += reward

                q_max = np.max(self.Qvalues[self.discritize_state(next_state)])
                self.qlearning_update(
                    terminated, reward, action, discrete_state, q_max)

                state = next_state
                done = terminated or truncated

            self.reward.append(int(episode_reward))

    def qlearning_update(self, terminal, reward, action, state, q_max):
        """
        Qlearning update rule

        Args:
            terminal (bool): True if at terminal state, False otherwise
            reward (float): Reward of the agent at current state
            action (int): Action taken by agent
            state (4-tuple): Discrete state of the agent
            q_max (float): Max Q value of the next state
        """
        entry = state + (action,)
        # No bootstrapping from the next state once the episode has terminated.
        target = reward if terminal else reward + self.gamma * q_max
        self.Qvalues[entry] += self.alpha * (target - self.Qvalues[entry])

    def visualize(self, games):
        """
        Visualize the game played for a specified number of games.
        Prints out the reward for each game.

        Args:
            games (int): Number of games to be played
        """
        env = gym.make("CartPole-v1", render_mode="human")
        try:
            for game in range(games):
                state, _ = env.reset()
                env.render()
                rewards = 0

                for _ in range(500):
                    action = self._greedy_action(self.discritize_state(state))
                    state, reward, terminated, truncated, _ = env.step(action)
                    rewards += int(reward)
                    time.sleep(0.05)

                    if terminated or truncated:
                        time.sleep(1)
                        break
                print(f"reward for game {game}: {rewards}")
        finally:
            # Release the render window even if a step raises.
            env.close()

    def _rollout_pairs(self, env, policy, num_episodes):
        """Play ``num_episodes`` episodes with ``policy`` and return the
        visited ``(state, action)`` pairs."""
        pairs = []
        for _ in trange(num_episodes):
            state, _ = env.reset()
            done = False
            while not done:
                action = policy(state)
                next_state, _, terminated, truncated, _ = env.step(action)
                pairs.append((state, action))
                state = next_state
                done = terminated or truncated
        return pairs

    def _imitate_and_score(self, env, samples, num_games):
        """Fit logistic regression on ``samples`` and return the episode
        returns the fitted model obtains over ``num_games`` games."""
        x = np.array([observation for observation, _ in samples])
        y = np.array([action for _, action in samples])

        model = LogisticRegression()
        model.fit(x, y)

        scores = []
        for _ in range(num_games):
            state, _ = env.reset()
            episode_reward = 0
            done = False
            while not done:
                action = model.predict(state.reshape(1, -1))[0]
                state, reward, terminated, truncated, _ = env.step(action)
                episode_reward += reward
                done = terminated or truncated
            scores.append(episode_reward)
        return scores

    def gather_episodes_agent(self, num_episodes=500, num_games=10):
        """
        Train the agent, record its greedy behaviour and imitate it.

        Runs ``simulate_episodes``, collects ``(state, action)`` pairs from
        greedy rollouts into ``self.behavior_episodes``, fits a logistic
        regression on all pairs collected so far and evaluates that model.

        Args:
            num_episodes (int): Number of greedy episodes to record
            num_games (int): Number of evaluation games for the fitted model

        Returns:
            list: Episode return of the imitation model for each game
        """
        env = gym.make("CartPole-v1")
        try:
            self.simulate_episodes()
            print(f"reward after simulate_episode: {self.reward[-20:]}")
            self.behavior_episodes.extend(self._rollout_pairs(
                env,
                lambda state: self._greedy_action(self.discritize_state(state)),
                num_episodes))
            model_reward = self._imitate_and_score(
                env, self.behavior_episodes, num_games)
        finally:
            env.close()
        print(f"The model reward is {model_reward}")
        return model_reward

    def gather_episodes_random(self, num_episodes=500, num_games=10):
        """
        Record a uniformly random policy and imitate it.

        Collects ``(state, action)`` pairs from random rollouts into
        ``self.random_episodes``, fits a logistic regression on all pairs
        collected so far and evaluates that model.

        Args:
            num_episodes (int): Number of random episodes to record
            num_games (int): Number of evaluation games for the fitted model

        Returns:
            list: Episode return of the imitation model for each game
        """
        env = gym.make("CartPole-v1")
        try:
            self.random_episodes.extend(self._rollout_pairs(
                env, lambda _state: env.action_space.sample(), num_episodes))
            model_reward = self._imitate_and_score(
                env, self.random_episodes, num_games)
        finally:
            env.close()
        print(f"The random agent reward is {model_reward}")
        return model_reward
        
        

def test_model():
    """
    Compare imitation of the trained Q-learning agent with imitation of a
    random agent.

    Builds a Q-learning agent with the module-level hyperparameters and seed
    123, then runs ``gather_episodes_agent`` followed by
    ``gather_episodes_random``; both print their results.

    Returns:
        tuple: ``(model_rewards, random_rewards)``, the lists returned by the
            two gather methods
    """
    env = gym.make("CartPole-v1")
    try:
        qlearning = Qlearning(env, ALPHA, GAMMA, EPSILON, EPISODES, BINS, 123)
        model_rewards = qlearning.gather_episodes_agent()
        random_rewards = qlearning.gather_episodes_random()
    finally:
        # Close the training environment even if a stage fails.
        env.close()
    return model_rewards, random_rewards
    
# Run the imitation-learning comparison (expert agent vs. random agent).
test_model()    


reward after simulate_episode: [96, 141, 128, 95, 132, 146, 107, 111, 109, 98, 107, 86, 112, 142, 132, 91, 118, 144, 131, 103]


100%|██████████| 10/10 [00:00<00:00, 317.12it/s]


The model reward is [45.0, 153.0, 69.0, 155.0, 51.0, 138.0, 181.0, 60.0, 141.0, 61.0]


100%|██████████| 500/500 [00:00<00:00, 2042.38it/s]

The random agent reward is [9.0, 11.0, 8.0, 9.0, 10.0, 9.0, 10.0, 9.0, 10.0, 8.0]





Here, we have pre-trained our Q-Learning agent for 1000 episodes, as we did in the previous assignment. We then use the trained Q-Learning agent as our expert in simple imitation learning and use logistic regression to imitate the action observed in each state. We can see that simple imitation learning gives decent results, as it scores in the mid-100s about half the time over a 10-game test. In fact, we observe that some of the returns obtained by simple imitation are even higher than the latest 20 returns of our Q-Learning agent (the expert) itself. However, simple imitation learning is not very consistent, as its returns are below 100 about half the time. On the other hand, we can see the returns received by the random agent. Unsurprisingly, these returns are very low.