# COMP 579 - ASSIGNMENT 3
[Ling Fei Zhang](https://github.com/Ling01234), 260985358

Sevag Baghdassarian, ID

Brandon Ma, ID

In [8]:
# imports
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
import gymnasium as gym
from tqdm import tqdm
import random
import time
import matplotlib.colors as mcolors
from scipy.stats import sem
from sklearn.linear_model import LogisticRegression


# Q-Learning Agent

In [17]:
# Action encoding:
#   0 -> left
#   1 -> right

# Best hyperparameters found; used as the default initialization below.
ALPHA, EPSILON, GAMMA = 0.25, 0.25, 0.95  # step size, exploration rate, discount
BINS = 10        # bins per state dimension
EPISODES = 1000  # training episodes per run
RUNS = 10        # independent runs


class Qlearning:
    """Tabular Q-learning agent over a discretized 4-dimensional state.

    The continuous observation is mapped onto a grid of ``num_bins`` bins per
    dimension and a Q-table of shape
    ``(num_bins, num_bins, num_bins, num_bins, num_actions)`` is learned with
    an epsilon-greedy behaviour policy.

    Attributes set by ``__init__``:
        reward: list of integer returns, one per simulated training episode.
        Qvalues: the Q-table (numpy array).
        behavior_episodes: ``(state, action)`` pairs collected by
            ``gather_episodes``.
        bins: list of four arrays of bin edges, one per state dimension.
        rng: ``numpy.random.Generator`` seeded with ``seed``; all random
            draws made by the agent come from it.
    """

    def __init__(self, env, alpha, gamma, epsilon, num_episodes, num_bins, seed) -> None:
        """
        Build the agent and its Q-table.

        Args:
            env: Environment exposing ``observation_space.low/high``,
                ``action_space.n``, ``reset`` and ``step`` (gymnasium API)
            alpha (float): Step size of the Q-learning update
            gamma (float): Discount factor
            epsilon (float): Initial exploration probability
            num_episodes (int): Number of training episodes to simulate
            num_bins (int): Number of bins per state dimension
            seed (int): Seed for the agent's random number generator and for
                the first environment reset
        """
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.num_episodes = num_episodes
        self.num_bins = num_bins

        # Copy the bounds before editing them: assigning into
        # env.observation_space.low/high directly would silently modify the
        # environment's own observation space.
        self.lowerbound = np.array(env.observation_space.low)
        self.upperbound = np.array(env.observation_space.high)
        # Dimensions 1 and 3 get hand-picked finite limits so that linspace
        # produces usable bin edges (presumably these are the velocity
        # components, whose declared bounds are too wide -- confirm).
        self.lowerbound[1], self.upperbound[1] = -3.5, 3.5
        self.lowerbound[3], self.upperbound[3] = -10, 10

        self.seed = seed
        # Kept for backward compatibility: the stdlib generator is still
        # seeded once here, although the agent itself no longer relies on it.
        random.seed(self.seed)
        # All of the agent's randomness comes from this generator, so a given
        # seed reproduces the same Q-table initialisation and action draws.
        self.rng = np.random.default_rng(self.seed)

        self.num_action = env.action_space.n
        self.reward = []
        self.Qvalues = self.rng.uniform(
            low=-0.001, high=0.001,
            size=(num_bins, num_bins, num_bins, num_bins, self.num_action))
        self.behavior_episodes = []
        self.bins = [
            np.linspace(self.lowerbound[i], self.upperbound[i], self.num_bins)
            for i in range(4)
        ]

    def discritize_state(self, state):
        """
        Discretize a continuous state into a discrete state

        Args:
            state (sequence of length 4): Current continuous state of agent

        Returns:
            state (4-tuple): Bin index of each state component. Values below
            the first bin edge are clamped to index 0.
        """
        return tuple(
            np.maximum(np.digitize(state[i], self.bins[i]) - 1, 0)
            for i in range(4)
        )

    def _greedy_action(self, discrete_state):
        """Return a greedy action for a discrete state, breaking ties at random."""
        q_row = self.Qvalues[discrete_state]
        best_actions = np.flatnonzero(q_row == np.max(q_row))
        return int(self.rng.choice(best_actions))

    def select_action(self, state, episode):
        """
        Select an action with an epsilon-greedy policy

        Args:
            state (sequence of length 4): Current state of the agent, continuous
            episode (int): Current episode of the run

        Returns:
            int: Action chosen by the agent
        """
        # lower exploration rate as we run many episodes
        # NOTE(review): this decay is applied on every call (i.e. every
        # environment step) once episode > 700, not once per episode, and it
        # permanently changes self.epsilon. Kept as is because the tuned
        # hyperparameters depend on it -- confirm this is intended.
        if episode > 700:
            self.epsilon *= 0.99

        # epsilon greedy: explore uniformly with probability epsilon
        if self.rng.random() < self.epsilon:
            return int(self.rng.integers(self.num_action))

        # greedy selection
        return self._greedy_action(self.discritize_state(state))

    def simulate_episodes(self):
        """
        Simulate ``self.num_episodes`` training episodes on ``self.env``

        Each episode runs until the environment reports termination or
        truncation. The integer return of every episode is appended to
        ``self.reward``.
        """
        for episode in range(1, self.num_episodes + 1):
            # Seed the environment on the first reset only; later resets
            # continue from the environment's own random stream.
            if episode == 1:
                state, _ = self.env.reset(seed=self.seed)
            else:
                state, _ = self.env.reset()

            episode_reward = 0
            done = False
            while not done:
                discrete_state = self.discritize_state(state)
                action = self.select_action(state, episode)
                next_state, reward, terminated, truncated, _ = self.env.step(action)
                episode_reward += reward

                next_discrete_state = self.discritize_state(next_state)
                q_max = np.max(self.Qvalues[next_discrete_state])
                # Only a true termination removes the bootstrap term; a
                # truncated (time-limited) step still bootstraps.
                self.qlearning_update(
                    terminated, reward, action, discrete_state, q_max)

                state = next_state
                # Without the truncation check an episode would keep running
                # past the environment's time limit.
                done = terminated or truncated

            self.reward.append(int(episode_reward))

    def qlearning_update(self, terminal, reward, action, state, q_max):
        """
        Q-learning update rule

        Args:
            terminal (bool): True if the transition ended in a terminal state
            reward (float): Reward received for the transition
            action (int): Action taken by agent
            state (4-tuple): Discrete state the action was taken in
            q_max (float): Max Q value of the next state (ignored if terminal)
        """
        index = state + (action,)
        target = reward if terminal else reward + self.gamma * q_max
        self.Qvalues[index] += self.alpha * (target - self.Qvalues[index])

    def visualize(self, games):
        """
        Render the greedy policy for a specified number of games.
        Prints out the reward for each game.

        Args:
            games (int): Number of games to be played
        """
        env = gym.make("CartPole-v1", render_mode="human")
        try:
            for game in range(games):
                state, _ = env.reset()
                env.render()
                rewards = 0

                for _ in range(500):
                    action = self._greedy_action(self.discritize_state(state))
                    state, reward, terminated, truncated, _ = env.step(action)
                    rewards += int(reward)
                    time.sleep(0.05)  # slow the animation down

                    if terminated or truncated:
                        time.sleep(1)
                        break
                print(f"reward for game {game}: {rewards}")
        finally:
            # Release the render window even if a step raises.
            env.close()

    def gather_episodes(self, num_behavior_episodes=500, num_test_games=10):
        """
        Train the agent, record its greedy behaviour, and fit an imitation model

        Runs ``simulate_episodes``, then plays greedy episodes on a fresh
        CartPole-v1 environment while appending every ``(state, action)`` pair
        to ``self.behavior_episodes``. A logistic regression is fitted on
        those pairs and evaluated by letting it play full games.

        Args:
            num_behavior_episodes (int): Number of greedy episodes to record
            num_test_games (int): Number of games played by the fitted model

        Returns:
            list: Total reward obtained by the fitted model in each test game
        """
        env = gym.make("CartPole-v1")
        try:
            self.simulate_episodes()
            print(f"reward after simulate_episode: {self.reward[-100:]}")

            for episode in range(1, num_behavior_episodes + 1):
                print(f"episode {episode}")
                state, _ = env.reset()
                done = False
                while not done:
                    action = self._greedy_action(self.discritize_state(state))
                    next_state, _, terminated, truncated, _ = env.step(action)
                    self.behavior_episodes.append((state, action))
                    state = next_state
                    # Stop at the time limit too; otherwise a good greedy
                    # policy keeps the loop running far beyond one episode.
                    done = terminated or truncated

            # preprocess data: continuous states are features, actions are labels
            x = np.array([s for s, _ in self.behavior_episodes])
            y = np.array([a for _, a in self.behavior_episodes])

            model = LogisticRegression()
            model.fit(x, y)

            # test the fitted model
            model_reward = []
            for game in range(num_test_games):
                print(f"game {game}")
                state, _ = env.reset()
                episode_reward = 0
                done = False
                while not done:
                    action = int(model.predict(state.reshape(1, -1))[0])
                    state, reward, terminated, truncated, _ = env.step(action)
                    episode_reward += reward
                    done = terminated or truncated

                model_reward.append(episode_reward)
        finally:
            env.close()

        print(f"The model reward is {model_reward}")
        return model_reward
        
        

def test_model():
    """
    Train a Q-learning agent with the default hyperparameters and evaluate
    the imitation model fitted on its behaviour

    Returns:
        list: Rewards returned by ``Qlearning.gather_episodes``
    """
    env = gym.make("CartPole-v1")
    try:
        qlearning = Qlearning(env, ALPHA, GAMMA, EPSILON, EPISODES, BINS, 123)
        return qlearning.gather_episodes()
    finally:
        # The training environment is owned by this function; release it
        # whether or not training succeeds.
        env.close()


test_model()


# def train_qlearning():
#     best_average_reward = []

#     x = np.arange(1000)
#     colors = [mcolors.TABLEAU_COLORS["tab:blue"],
#               mcolors.TABLEAU_COLORS["tab:green"], mcolors.TABLEAU_COLORS["tab:orange"]]
#     average_reward = []
#     for seed in range(RUNS):
#         env = gym.make("CartPole-v1")
#         env.reset()
#         qlearning = Qlearning(
#             env, ALPHA, GAMMA, EPSILON, EPISODES, BINS, seed)
#         qlearning.simulate_episodes()
#         rewards = qlearning.reward
#         average_reward.append(rewards)

#     average_reward = np.mean(average_reward, axis=0)
#     max_reward = np.empty(1000)
#     max_reward.fill(np.max(average_reward))
#     err = sem(average_reward)
#     plt.plot(x, average_reward,
#              label=f"ALPHA = {ALPHA}", color=colors[0])
#     plt.plot(
#         x, max_reward, color=colors[0], linestyle="dashed", label=f"y = {int(max_reward[0])}")
#     plt.fill_between(
#         x, average_reward - err, average_reward + err, color=colors[0], alpha=0.5)

#     if ALPHA == 0.25 and EPSILON == 0.25:
#         best_average_reward = average_reward

#         plt.legend(bbox_to_anchor=(1, 0.5), loc="best")
#         plt.title(f"Training Q-learning with epsilon = {EPSILON:.2f}")
#         plt.ylabel("Return")
#         plt.yscale("log")
#         plt.xlabel("Episode")
#         plt.show()

#     return best_average_reward


reward after simulate_episode: [126, 131, 119, 122, 153, 132, 119, 145, 130, 140, 127, 119, 137, 122, 131, 128, 149, 157, 153, 140, 125, 152, 82, 139, 141, 83, 114, 126, 150, 119, 149, 83, 121, 126, 72, 125, 127, 88, 131, 129, 73, 142, 148, 82, 143, 181, 110, 131, 144, 86, 67, 90, 106, 134, 120, 137, 124, 147, 123, 157, 144, 134, 150, 140, 132, 142, 220, 128, 145, 153, 134, 148, 137, 140, 119, 142, 150, 139, 120, 122, 136, 150, 129, 71, 68, 122, 87, 131, 129, 154, 120, 129, 120, 144, 123, 75, 205, 131, 118, 152]
episode 1
episode 2
episode 3
episode 4
episode 5
episode 6
episode 7
episode 8
episode 9
episode 10
episode 11
episode 12
episode 13
episode 14
episode 15
episode 16
episode 17
episode 18
episode 19
episode 20
episode 21
episode 22
episode 23
episode 24
episode 25
episode 26
episode 27
episode 28
episode 29
episode 30
episode 31
episode 32
episode 33
episode 34
episode 35
episode 36
episode 37
episode 38
episode 39
episode 40
episode 41
episode 42
episode 43
episode 44
episode