# COMP 579 - Assignment 3
[Ling Fei Zhang](https://github.com/Ling01234), 260985358

Sevag Baghdassarian, ID

Brandon Ma, ID

In [1]:
# imports
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
import gymnasium as gym
from tqdm import tqdm, trange
import random
import time
import matplotlib.colors as mcolors
from scipy.stats import sem
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neural_network import MLPRegressor


# Q-Learning Agent

In [2]:
# Actions:
# 0: left
# 1: right

# best params initialization:
ALPHA = 1/4
EPSILON = 0.25
GAMMA = 0.95
BINS = 10
EPISODES = 1000
RUNS = 10
SEED = 123
random.seed(SEED)


class Qlearning:
    def __init__(self, env, alpha, gamma, epsilon, num_episodes, num_bins, seed) -> None:
        self.env = env
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.num_episodes = num_episodes
        self.num_bins = num_bins
        self.lowerbound = env.observation_space.low
        self.lowerbound[1] = -3.5
        self.lowerbound[3] = -10
        self.upperbound = env.observation_space.high
        self.upperbound[1] = 3.5
        self.upperbound[3] = 10
        # self.env.seed(seed)
        self.seed = seed
        random.seed(self.seed)
        self.num_action = env.action_space.n
        self.reward = []
        self.Qvalues = np.random.uniform(low=-0.001, high=0.001,
                                         size=(num_bins, num_bins, num_bins, num_bins, self.num_action))
        self.behavior_episodes1 = []
        self.behavior_episodes2 = []
        self.behavior_episodes3 = []
        self.random_episodes1 = []
        self.random_episodes2 = []
        self.random_episodes3 = []
        self.bins = []
        for i in range(4):
            self.bins.append(np.linspace(
                self.lowerbound[i], self.upperbound[i], self.num_bins))
            
    # def get_behavior_episodes(self):
    #     return np.apply_along_axis(np.argmax, 4, self.Qvalues) #shape (10, 10, 10, 10)
        

    def discritize_state(self, state):
        """
        Discritize continuous state into a discrete state

        Args:
            state (list of length 4): Current continuous state of agent

        Returns:
            state (4-tuple): Current discritized state of agent
        """
        new_state = []
        for i in range(4):
            index = np.maximum(np.digitize(state[i], self.bins[i]) - 1, 0)
            new_state.append(index)

        return tuple(new_state)

    def select_action(self, state, episode):
        """
        Select action given a state

        Args:
            state (4-tuple): Current state of the agent, continuous
            episode (int): Current episode of the run

        Returns:
            int: Action chosen by the agent
        """
        random.seed(self.seed)

        # lower exploration rate as we run many episodes
        if episode > 700:
            self.epsilon *= 0.99

        # epsilon greedy
        number = np.random.random()
        if number < self.epsilon:  # uniformly choose action
            return np.random.choice(self.num_action)

        # greedy selection
        state = self.discritize_state(state)
        best_actions = np.where(
            self.Qvalues[state] == np.max(self.Qvalues[state]))[0]
        return np.random.choice(best_actions)

    def simulate_episodes(self):
        """
        Simulate a specified number of episodes
        """
        for episode in range(1, self.num_episodes+1):
            # reset env
            (state, _) = self.env.reset()
            state = list(state)

            # run episode
            episode_reward = 0
            terminal = False
            while not terminal:
                discritized_state = self.discritize_state(state)
                action = self.select_action(state, episode)
                (next_state, reward, terminal, _, _) = self.env.step(action)
                episode_reward += reward

                next_discritized_state = self.discritize_state(
                    list(next_state))

                q_max = np.max(self.Qvalues[next_discritized_state])
                self.qlearning_update(
                    terminal, reward, action, discritized_state, q_max)

                state = next_state

            self.reward.append(int(episode_reward))

    def qlearning_update(self, terminal, reward, action, state, q_max):
        """
        Qlearning update rule

        Args:
            terminal (bool): True if at terminal state, False otherwise
            reward (int): Reward of the agent at current state
            action (int): Action taken by agent
            state (4-tuple): Discrete state of the agent
            q_max (float): Max Q value of the next state
        """
        if not terminal:
            loss = reward + self.gamma * q_max - \
                self.Qvalues[state + (action,)]
        else:
            loss = reward - self.Qvalues[state + (action,)]

        self.Qvalues[state + (action,)] += self.alpha * loss

    def visualize(self, games):
        """
        Visualize the game played for a specified number of games.
        Prints out the reward for each game.

        Args:
            games (int): Number of games to be played
        """
        random.seed(self.seed)
        env = gym.make("CartPole-v1", render_mode="human")
        for game in range(games):
            (state, _) = env.reset()
            env.render()
            rewards = 0

            for _ in range(500):
                discritized_state = self.discritize_state(state)
                best_actions = np.where(self.Qvalues[discritized_state] == np.max(
                    self.Qvalues[discritized_state]))[0]
                action = np.random.choice(best_actions)
                (state, reward, terminal, _, _) = env.step(action)
                rewards += int(reward)
                time.sleep(0.05)

                if terminal:
                    time.sleep(1)
                    break
            print(f"reward for game {game}: {rewards}")
        env.close()
        
    def gather_episodes_agent(self, num_episodes):
        """
        Gather num_episodes behavior episodes for simple imitation learning

        Args:
            num_episodes (int): number of behavior episodes desired

        Returns:
            int: return of simple imitation learning using Q-Learning Agent
            as expert.
        """
        self.simulate_episodes()
        print(f"reward after simulate_episode: {self.reward[-20:]}")
        for episode in trange(1, num_episodes+1):
            # if episode % 10 == 0:
            #     print(f"gather episode {episode}")
            state, _ = self.env.reset()
            terminal = False
            while not terminal:
                discritized_state = self.discritize_state(state)
                best_actions = np.where(self.Qvalues[discritized_state] == np.max(
                    self.Qvalues[discritized_state]))[0]
                action = np.random.choice(best_actions)
                (next_state, reward, terminal, _, _) = self.env.step(action)
                
                if episode <= 100:
                    self.behavior_episodes1.append((state, action, reward, next_state, terminal))
                    self.behavior_episodes2.append((state, action, reward, next_state, terminal))
                if episode <= 250:
                    self.behavior_episodes2.append((state, action, reward, next_state, terminal))
                self.behavior_episodes3.append((state, action, reward, next_state, terminal))
                
                state = next_state
                
        # preprocess data
        x = np.array([data[0] for data in self.behavior_episodes3])
        y = np.array([data[1] for data in self.behavior_episodes3])
                
        episode_reward = simple_imitation(x, y)
        return episode_reward
        
        

    
    def gather_episodes_random(self, num_episodes):
        """
        Gather num_episodes behavior episodes with a random agent

        Args:
            num_episodes (int): number of behavior episodes desired for simple imitation

        Returns:
            int: return by a random agent. 
        """
        # print(f"reward after simulate_episode: {self.reward[-20:]}")
        for episode in trange(1, num_episodes+1):
            # if episode % 10 == 0:
            #     print(f"random agent episode {episode}")
            state, _ = self.env.reset()
            terminal = False
            while not terminal:
                discritized_state = self.discritize_state(state)
                action = self.env.action_space.sample()
                (next_state, reward, terminal, _, _) = self.env.step(action)
                
                if episode <= 100:
                    self.random_episodes1.append((state, action, reward, next_state, terminal))
                    self.random_episodes2.append((state, action, reward, next_state, terminal))
                if episode <= 250:
                    self.random_episodes2.append((state, action, reward, next_state, terminal))
                self.random_episodes3.append((state, action, reward, next_state, terminal))
                
                state = next_state
                
        # preprocess data
        x = np.array([data[0] for data in self.random_episodes3])
        y = np.array([data[1] for data in self.random_episodes3])
        
        episode_reward = simple_imitation(x, y)
        return episode_reward
        

def simple_imitation(x, y):
    """
    Simple imitation estimator

    Args:
        x (list): list of states from dataset
        y (list): list of actions from dataset

    Returns:
        int: return observed by the agent during testing
    """
    env = gym.make("CartPole-v1")
    model = LogisticRegression()
    model.fit(x, y)
    
    state, _ = env.reset()
    episode_reward = 0
    terminal = False
    while not terminal:
        action = model.predict(state.reshape(1, -1))[0]
        state, reward, terminal, *_ = env.step(action)
        episode_reward += reward
    env.close()
    return episode_reward


Here, we will run a test run on the model, and see how it performs

In [3]:
def test_model():
    env = gym.make("CartPole-v1")
    qlearning = Qlearning(env, ALPHA, GAMMA, EPSILON, EPISODES, BINS, SEED)
    model_rewards = qlearning.gather_episodes_agent(500)
    random_rewards = qlearning.gather_episodes_random(500)
    print(f"Simple imitation with expert agent reward: {model_rewards}")
    print(f"Simple imitation with random agent reward: {random_rewards}")
    env.close()
    return qlearning.behavior_episodes1, qlearning.random_episodes1, qlearning.behavior_episodes2, qlearning.random_episodes2, qlearning.behavior_episodes3, qlearning.random_episodes3
    
data1, data2, data3, data4, data5, data6 = test_model()    

reward after simulate_episode: [153, 122, 131, 148, 119, 167, 119, 163, 144, 140, 142, 151, 140, 149, 192, 143, 133, 175, 150, 111]


100%|██████████| 500/500 [00:02<00:00, 219.52it/s]
100%|██████████| 500/500 [00:00<00:00, 1875.01it/s]

Simple imitation with expert agent reward: 175.0
Simple imitation with random agent reward: 12.0





We have pre-trained our Q-Learning agent with 1000 episodes, as we did in the previous assignment. We then use the trained Q-Learning agent as our expert in simple imitation learning and used logisitc regression to imitate the action observed in each state. The results above were produced using 500 behavior episodes. We can see that we can get decent results from simple imitation learning. On the other hand, we can see the returns received by the random agent. Without surprise, the returns are very low. 

# Datasets
Below, we will create the 9 datasets we need to perform our analysis. 

In [5]:
# expert Q-learning data
# 100 episodes of expert data
x1 = np.array([data[0] for data in data1])
y1 = np.array([data[1] for data in data1])

# 250 episodes of expert data
x3 = np.array([data[0] for data in data3])
y3 = np.array([data[1] for data in data3])

# 500 episodes of expert data
x5 = np.array([data[0] for data in data5])
y5 = np.array([data[1] for data in data5])



# random agent data
# 100 episodes of random data
x2 = np.array([data[0] for data in data2])
y2 = np.array([data[1] for data in data2])

# 250 episodes of random data
x4 = np.array([data[0] for data in data4])
y4 = np.array([data[1] for data in data4])

# 500 episodes of random data
x6 = np.array([data[0] for data in data6])
y6 = np.array([data[1] for data in data6])

# Shuffled data
shuffled_data = data5 + data6
random.shuffle(shuffled_data)

# 100 episodes of mixed data
data7 = shuffled_data[:int(len(shuffled_data)/10)]
x7 = np.array([data[0] for data in data7])
y7 = np.array([data[1] for data in data7])

# 250 episodes of mixed data
data8 = shuffled_data[:int(len(shuffled_data)/4)]
x8 = np.array([data[0] for data in data8])
y8 = np.array([data[1] for data in data8])

# 500 episodes of mixed data
data9 = shuffled_data[:int(len(shuffled_data)/2)]
x9 = np.array([data[0] for data in data9])
y9 = np.array([data[1] for data in data9])


# Fitted Q-Learning

In [62]:
class FittedQLearning:
    def __init__(self, env, buffer, gamma=0.99, num_episodes=500, batch_size=64, buffer_size=5000, approximator="linear"):
        self.env = env
        self.gamma = gamma
        self.num_episodes = num_episodes
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.buffer = buffer
        self.state_space = env.observation_space.shape[0]
        self.action_space = env.action_space.n
        self.approximator = approximator

        # build model
        if approximator == "linear":
            self.model = LinearRegression()
        elif approximator == "mlp":
            self.model = MLPRegressor(hidden_layer_sizes=(
                64,), activation="relu", solver="adam", max_iter=1000)
        else:
            raise Exception("The approximator must be 'linear' or 'mlp'")

        # initial fit of the approximator
        self.initial_fit()

        # initial buffer
        for data in self.buffer[:self.buffer_size]:
            self.remember(*data)

    def initial_fit(self):
        if self.approximator == "linear":
            weights = np.random.uniform(low=-0.001, high=0.001,
                                        size=(1, self.state_space + 1))
            weights_inter = np.random.uniform(low=-0.001, high=0.001,
                                        size=(1,))
            
            self.model.coef_ = weights
            self.model.intercept_ = weights_inter

        else: # TO MODIFY WITH PROPER SIZE
            layer_sizes = self.model.hidden_layer_sizes + (1,)
            weights = []
            for i in range(len(layer_sizes) - 1):
                w = [np.random.uniform(low=-0.001, high=0.001,
                                       size=(layer_sizes[i], layer_sizes[i+1]))]
                weights.append(w)
            self.model.coefs_ = weights
        # self.model.fit(self.buffer[:, :4], self.buffer[:, 4])

    def predict(self, state):
        # qvalue = self.model.predict(np.array(state).reshape(1, -1))[0]
        action0 = np.array(0)
        action1 = np.array(1)
        qvalue1 = self.model.predict(np.hstack((state, action0)).reshape(1,-1))[0][0]
        qvalue2 = self.model.predict(np.hstack((state, action1)).reshape(1,-1))[0][0]
        qvalues = [qvalue1, qvalue2]
        print(f"qvalue in predict: {qvalues}")
        return qvalues

    def update(self):
        if len(self.buffer) > self.batch_size:
            batch_index = np.random.choice(
                len(self.buffer), size=self.batch_size)
            batch = [self.buffer[i] for i in batch_index]
            states, actions, rewards, next_states, dones = zip(*batch)
            states = np.array(states)
            actions = np.array(actions)
            rewards = np.array(rewards)
            next_states = np.array(next_states)
            dones = np.array([int(done) for done in dones])

            # axis 1 checks for each next_state
            next_qvalues = []
            for index, next_state in enumerate(next_states):
                # print(f"in enumerate: {index}")
                if dones[index] == 1:
                    qvalues = 0
                else:
                    # print(f"in else")
                    qvalues = self.predict(next_state)
                    qvalues = np.max(qvalues)
                next_qvalues.append(qvalues)
            print(f"next_qvalues in update: {next_qvalues}")
            target_qvalues = rewards + self.gamma * (1 - dones) * next_qvalues

            if self.approximator == "linear":
                x = np.hstack((states, actions.reshape(-1, 1)))
                # x = states
            else:
                x = states
            y = target_qvalues.reshape(-1, 1)

            self.model.fit(x, y)

    def select_action(self, state):
        qvalues = self.predict(state)
        action = np.argmax(qvalues)
        print(f"action in select_action: {action}")
        return action

    def remember(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def train(self):
        for episode in trange(1, self.num_episodes+1):
            state, _ = self.env.reset()
            done = False
            while not done:
                action = self.select_action(state)
                next_state, reward, done, *_ = self.env.step(action)
                self.remember(state, action, reward, next_state, done)
                state = next_state

            self.update()

            # if episode % 10 == 0:
            #     print(f"episode {episode} in training")

    def test(self):
        state, _ = self.env.reset()
        episode_reward = 0
        done = False
        while not done:
            action = self.select_action(state)
            state, reward, done, *_ = self.env.step(action)
            episode_reward += reward

        return episode_reward
                    
def test_model_fitted():
    env = gym.make("CartPole-v1")
    model = FittedQLearning(env, data5)
    model.train()
    reward = model.test()
    print(f"testing reward: {reward}")
    env.close()
    
test_model_fitted()

            

 10%|█         | 50/500 [00:00<00:01, 245.55it/s]

qvalue in predict: [-0.00017478052302397296, -8.816216734760255e-05]
action in select_action: 1
qvalue in predict: [0.00019716559578658566, 0.0002837839514629561]
action in select_action: 1
qvalue in predict: [0.0005734312154691432, 0.0006600495711455136]
action in select_action: 1
qvalue in predict: [0.0009555466229063211, 0.0010421649785826914]
action in select_action: 1
qvalue in predict: [0.0013450007836589956, 0.001431619139335366]
action in select_action: 1
qvalue in predict: [0.0017432008231040423, 0.0018298191787804127]
action in select_action: 1
qvalue in predict: [0.00215142585518183, 0.0022380442108582005]
action in select_action: 1
qvalue in predict: [0.0025707720104088177, 0.002657390366085188]
action in select_action: 1
qvalue in predict: [0.00046216169216037795, 0.0005487800478367484]
qvalue in predict: [0.0008065415579306196, 0.00089315991360699]
qvalue in predict: [-0.000988278397881941, -0.0009016600422055705]
qvalue in predict: [-0.0007364474648213586, -0.00064982910

 15%|█▌        | 75/500 [00:00<00:01, 243.41it/s]

qvalue in predict: [64.91791034916939, 61.65074560702188]
action in select_action: 0
qvalue in predict: [66.60817501000746, 63.34101026785994]
action in select_action: 0
qvalue in predict: [68.28002333135171, 65.0128585892042]
action in select_action: 0
qvalue in predict: [69.95222828513374, 66.68506354298623]
action in select_action: 0
qvalue in predict: [71.64243635911134, 68.37527161696383]
action in select_action: 0
qvalue in predict: [73.36684651867387, 70.09968177652635]
action in select_action: 0
qvalue in predict: [75.13986056399098, 71.87269582184346]
action in select_action: 0
qvalue in predict: [76.9736635623095, 73.70649882016198]
action in select_action: 0
qvalue in predict: [78.87773732246426, 75.61057258031674]
action in select_action: 0
qvalue in predict: [66.78135877937964, 63.51419403723213]
qvalue in predict: [42.54513222272499, 39.27796748057747]
qvalue in predict: [61.04635541633288, 57.779190674185365]
qvalue in predict: [50.757369029048675, 47.49020428690116]
qva

 25%|██▌       | 125/500 [00:00<00:01, 241.58it/s]

qvalue in predict: [139.51262316889444, 154.88079621815]
qvalue in predict: [114.7791848800658, 130.14735792932134]
qvalue in predict: [128.28162509630852, 143.6497981455641]
qvalue in predict: [185.21518326020703, 200.5833563094626]
qvalue in predict: [144.29688709550524, 159.66506014476082]
qvalue in predict: [99.00541163084297, 114.37358468009855]
qvalue in predict: [98.1051745794109, 113.47334762866647]
qvalue in predict: [205.9526567057393, 221.32082975499486]
qvalue in predict: [99.98244717716862, 115.3506202264242]
qvalue in predict: [116.35086593825847, 131.719038987514]
qvalue in predict: [201.430585356258, 216.79875840551358]
qvalue in predict: [117.23647401759021, 132.60464706684579]
qvalue in predict: [188.19863076368796, 203.56680381294353]
qvalue in predict: [156.423161386044, 171.79133443529958]
qvalue in predict: [168.18658232596115, 183.55475537521673]
qvalue in predict: [91.80992167260641, 107.17809472186198]
qvalue in predict: [107.77107297718791, 123.13924602644349]

 35%|███▍      | 174/500 [00:00<00:01, 234.96it/s]

qvalue in predict: [405.9332336810851, 421.18785975881406]
qvalue in predict: [365.0116499033048, 380.2662759810337]
qvalue in predict: [275.6706201920959, 290.92524626982487]
qvalue in predict: [405.4927112892612, 420.74733736699017]
qvalue in predict: [394.58645132963466, 409.8410774073636]
qvalue in predict: [382.4710724795329, 397.72569855726186]
qvalue in predict: [454.9919073764539, 470.2465334541829]
qvalue in predict: [339.6267533829046, 354.8813794606336]
qvalue in predict: [416.7363722006271, 431.9909982783561]
qvalue in predict: [315.5786070818763, 330.8332331596053]
qvalue in predict: [262.3122454340142, 277.56687151174316]
qvalue in predict: [240.59891737620566, 255.85354345393463]
qvalue in predict: [438.78150487468747, 454.03613095241644]
qvalue in predict: [407.9197944129005, 423.1744204906295]
qvalue in predict: [389.19499702520477, 404.44962310293374]
qvalue in predict: [441.2651411680091, 456.5197672457381]
qvalue in predict: [414.8454746317422, 430.1001007094712]
qv

 44%|████▍     | 222/500 [00:00<00:01, 226.01it/s]

qvalue in predict: [601.3347463435571, 604.5802500895651]
qvalue in predict: [284.1082758703966, 287.35377961640455]
qvalue in predict: [594.4695366603513, 597.7150404063592]
qvalue in predict: [607.6493645398393, 610.8948682858473]
qvalue in predict: [282.7242731804869, 285.96977692649483]
qvalue in predict: [372.8117543799103, 376.0572581259183]
qvalue in predict: [460.745641619744, 463.9911453657519]
qvalue in predict: [371.17537117913355, 374.42087492514145]
next_qvalues in update: [64.08174558022222, 514.4249246394382, 617.7631664894387, 28.950098098618014, 119.91313056580788, 599.2359848707589, 522.3276877940983, 617.4618812453864, 610.5170262113978, 89.95202019247847, 634.1948915091458, 577.9183365420503, 598.4343359995842, 317.3052654626495, 554.6727192283996, 573.2625640891998, 615.9505081034469, 325.02376553392685, 623.4106141336447, 598.957146959327, 585.0211583044088, 462.05204172151707, 578.7467828733057, 389.9618515114466, 591.8124045963772, 451.0374285836025, 561.8175874

 54%|█████▍    | 270/500 [00:01<00:01, 228.99it/s]

qvalue in predict: [2162.5776515498783, 1992.0252935291808]
qvalue in predict: [2138.150182994801, 1967.5978249741036]
qvalue in predict: [1765.097914723921, 1594.5455567032232]
qvalue in predict: [1992.5961240389558, 1822.043766018258]
qvalue in predict: [2277.9069636538165, 2107.354605633119]
qvalue in predict: [2564.666675492747, 2394.114317472049]
qvalue in predict: [2244.493337935936, 2073.940979915238]
qvalue in predict: [1732.0683506228486, 1561.5159926021508]
qvalue in predict: [2023.6334990478072, 1853.0811410271094]
qvalue in predict: [1923.4172680019444, 1752.8649099812467]
qvalue in predict: [2645.6842848421934, 2475.1319268214957]
qvalue in predict: [1842.974068672046, 1672.4217106513483]
qvalue in predict: [2457.269180759636, 2286.7168227389384]
qvalue in predict: [2437.9213033714823, 2267.3689453507845]
qvalue in predict: [2586.4319028819255, 2415.8795448612277]
next_qvalues in update: [2579.3362044163578, 2009.043871769795, 2084.1886230846385, 2121.2287742467565, 2136.2

 64%|██████▍   | 319/500 [00:01<00:00, 232.00it/s]

qvalue in predict: [3534.677739273414, 3383.537392316289]
qvalue in predict: [2156.541130071766, 2005.400783114641]
qvalue in predict: [2428.4073905729097, 2277.267043615784]
qvalue in predict: [3157.0684739937565, 3005.9281270366314]
qvalue in predict: [3031.8469400513004, 2880.706593094175]
qvalue in predict: [2939.717096591846, 2788.5767496347207]
qvalue in predict: [3158.2581314883005, 3007.117784531175]
qvalue in predict: [1468.7709069849925, 1317.6305600278672]
qvalue in predict: [3627.29768204991, 3476.1573350927847]
qvalue in predict: [2988.494886119418, 2837.354539162293]
qvalue in predict: [3319.5044418171615, 3168.3640948600364]
qvalue in predict: [2249.154377704696, 2098.0140307475704]
qvalue in predict: [3313.650292546884, 3162.5099455897584]
qvalue in predict: [2376.2034382018337, 2225.0630912447086]
qvalue in predict: [973.052132969894, 821.911786012769]
qvalue in predict: [3327.8831804811275, 3176.7428335240024]
qvalue in predict: [936.2857057379224, 785.1453587807973]


 74%|███████▍  | 369/500 [00:01<00:00, 237.47it/s]

qvalue in predict: [15034.67968856613, 17297.47950925322]
action in select_action: 1
qvalue in predict: [16310.659765229431, 18573.45958591652]
action in select_action: 1
qvalue in predict: [17695.597060257984, 19958.396880945067]
action in select_action: 1
qvalue in predict: [19195.940086393774, 21458.73990708086]
action in select_action: 1
qvalue in predict: [20818.69765206703, 23081.497472754116]
action in select_action: 1
qvalue in predict: [22571.26950595635, 24834.069326643432]
action in select_action: 1
qvalue in predict: [24461.241865555032, 26724.04168624212]
action in select_action: 1
qvalue in predict: [26496.129394012947, 28758.92921470003]
action in select_action: 1
qvalue in predict: [28683.06068663521, 30945.8605073223]
action in select_action: 1
qvalue in predict: [10155.660291918059, 12418.460112605146]
qvalue in predict: [6323.583699099222, 8586.38351978631]
qvalue in predict: [4519.826701068534, 6782.626521755621]
qvalue in predict: [5348.642770971424, 7611.442591658

 84%|████████▍ | 419/500 [00:01<00:00, 232.59it/s]

qvalue in predict: [59800.23810079556, 58643.000302652734]
qvalue in predict: [51006.267191875595, 49849.02939373276]
qvalue in predict: [57128.894190035746, 55971.65639189292]
qvalue in predict: [58672.613601853875, 57515.37580371104]
qvalue in predict: [53253.628688766476, 52096.39089062365]
qvalue in predict: [63303.39201265922, 62146.154214516384]
qvalue in predict: [35987.65896048379, 34830.421162340965]
qvalue in predict: [67378.321158341, 66221.08336019817]
qvalue in predict: [59836.61719460829, 58679.37939646546]
qvalue in predict: [60614.628935279594, 59457.391137136765]
qvalue in predict: [54295.846067266175, 53138.608269123346]
qvalue in predict: [53918.27628998569, 52761.03849184286]
qvalue in predict: [61536.56544793976, 60379.32764979693]
qvalue in predict: [46320.57853633755, 45163.34073819471]
qvalue in predict: [63569.68493239927, 62412.44713425644]
qvalue in predict: [19124.452216137695, 17967.214417994866]
qvalue in predict: [63136.873124699516, 61979.63532655669]
qv

 94%|█████████▍| 469/500 [00:01<00:00, 238.88it/s]

qvalue in predict: [150629.3185952006, 141730.22114912854]
qvalue in predict: [76756.92530768835, 67857.8278616163]
qvalue in predict: [146179.48178979865, 137280.38434372662]
qvalue in predict: [111259.50333423939, 102360.40588816733]
qvalue in predict: [82368.41569196625, 73469.31824589419]
qvalue in predict: [114317.88086047577, 105418.78341440372]
qvalue in predict: [102970.73823373503, 94071.64078766298]
qvalue in predict: [156433.3783198292, 147534.28087375715]
qvalue in predict: [142146.4690805747, 133247.37163450266]
qvalue in predict: [55851.3031123798, 46952.20566630775]
qvalue in predict: [177559.9041231428, 168660.80667707074]
qvalue in predict: [114090.25884160615, 105191.1613955341]
qvalue in predict: [72857.55578285405, 63958.458336782]
qvalue in predict: [134534.9805935624, 125635.88314749034]
qvalue in predict: [110590.65601917528, 101691.55857310325]
qvalue in predict: [69168.79532358766, 60269.6978775156]
qvalue in predict: [140011.1033896693, 131112.00594359724]
qva

100%|██████████| 500/500 [00:02<00:00, 235.63it/s]

qvalue in predict: [250414.43920739053, 215407.800221219]
action in select_action: 0
qvalue in predict: [270856.1316966162, 235849.49271044464]
action in select_action: 0
qvalue in predict: [294442.99063829135, 259436.35165211983]
action in select_action: 0
qvalue in predict: [321277.08455733815, 286270.4455711666]
action in select_action: 0
qvalue in predict: [351476.1829792188, 316469.54399304726]
action in select_action: 0
qvalue in predict: [385171.2570811708, 350164.6180949992]
action in select_action: 0
qvalue in predict: [422503.28488978744, 387496.64590361586]
action in select_action: 0
qvalue in predict: [463619.2551539403, 428612.6161677688]
action in select_action: 0
qvalue in predict: [508667.00490032113, 473660.36591414956]
action in select_action: 0
qvalue in predict: [557788.8175610856, 522782.17857491394]
action in select_action: 0
qvalue in predict: [164066.32793918782, 129059.68895301627]
qvalue in predict: [185405.71053169123, 150399.0715455197]
qvalue in predict: [2


