In [14]:
class FXEnv:
    """Gym-style trading environment that replays FX rate bars.

    Actions (``Discrete(5)``):
        0 - hold, 1 - open a long position (stored in ``askPositions``),
        2 - open a short position (stored in ``bidPositions``),
        3 - settle the short positions, 4 - settle the long positions.

    An observation is a window of ``span`` consecutive rows of ``self.data``
    (row-to-row differences of open/high/low/close plus min-max scaled
    tick_volume and spread) with three account columns appended: budget
    ratio, summed relative unrealised gain of the long positions, and the
    same for the short positions.  With ``featureFirst=True`` the array is
    transposed to ``(features, span)``.
    """

    # Rate file loaded when the caller does not pass ``data``.
    DEFAULT_RATES_CSV = '/mnt/landisk/data/fx/NextBoaderPossibility/fx_USDJPY_5_2020-08-03T23-05-00_to_2021-12-04T07-50-00.csv'
    # Number of account columns appended to every observation window.
    N_ACCOUNT_FEATURES = 3

    def __init__(self, isTraining = True, data = None, featureFirst=False, randomizeInitialState = True):
        """Load the rates, derive the feature table and reset the episode.

        Args:
            isTraining: when true the episode range is the first 70% of the
                rows, otherwise the remaining rows.
            data: optional DataFrame of rates; it must provide the columns
                open, high, low, close, tick_volume and spread.  When omitted
                the default CSV is read.
            featureFirst: return observations as ``(features, span)``.
            randomizeInitialState: start each episode at a random row of the
                episode range instead of its first row.
        """
        # ``is None`` is required: ``DataFrame == None`` is element-wise and
        # cannot be used as a condition.
        if data is None:
            rates = pd.read_csv(self.DEFAULT_RATES_CSV, header=0, index_col=0, parse_dates=True)
        else:
            rates = data
        self.rowdata = rates
        diff_array = rates.diff()
        self.data = pd.DataFrame(diff_array, columns=['time', 'open', 'high', 'low', 'close', 'tick_volume', 'spread', 'real_volume'])
        self.data = self.data.drop(columns=['time', 'real_volume'])
        # Volume and spread are scaled from the raw values, not the differences.
        self.data['tick_volume'], _, _ = self.minmaxNormalization(rates.tick_volume)
        self.data['spread'], self.minSp, self.maxSp = self.minmaxNormalization(rates.spread)
        # The first difference row is NaN; drop it from both tables so that
        # positional indices keep addressing the same bar in each of them.
        self.data = self.data[1:]
        self.rowdata = self.rowdata[1:]
        self.FF = featureFirst
        self.RIS = randomizeInitialState
        length = len(self.data)

        self.budget_org = 100000
        self.leverage = 25
        self.volume_point = 10000
        self.point = 0.001

        self.dataRange = datetime.timedelta(days=2)
        self.dims = 5
        INTERVAL_DAYS = 3
        MINUTES_SPAN = 5

        totalMinutes = INTERVAL_DAYS * 24 * 60
        self.span  = int(totalMinutes/MINUTES_SPAN)+1

        # Shuffled window end positions, used only by the dataset-style
        # accessors (__getitem__ and friends).  Sampling len(population)
        # items keeps this valid (an empty list) when the data is short.
        window_ends = range(self.span, length - self.span - 1)
        self.indices = random.sample(window_ends, k=len(window_ends))

        # step() reads the price one row behind the cursor and, once the
        # cursor passes toIndex, still needs a valid price row, so the last
        # usable cursor position is length - 2.
        last_usable = length - 2
        if isTraining:
            self.fromIndex = self.span
            self.toIndex = min(int(length*0.7), last_usable)
        else:
            # Never start before a full window of history is available.
            self.fromIndex = max(int(length*0.7)+1, self.span)
            self.toIndex = last_usable

        # For reinforcement learning
        self.action_space = gym.spaces.Discrete(3+2)
        n_features = self.data.shape[1] + self.N_ACCOUNT_FEATURES
        obs_shape = (n_features, self.span) if self.FF else (self.span, n_features)
        self.observation_space = gym.spaces.Box(
            low=-1,
            high=1,
            shape=obs_shape
        )
        self.reward_range = [-1, 1]
        self.VALID_REWARD = 0
        self.INVALID_REWARD = -.001
        self.reset()

    def __len__(self):
        """Return the number of rows in the episode range."""
        return self.toIndex - self.fromIndex

    def reset(self):
        """Clear positions and counters, pick the start row, return the first observation."""
        self.askPositions = []
        self.bidPositions = []
        self.budget = self.budget_org
        self.coin = 0
        if self.RIS:
            self.index = random.randint(self.fromIndex, self.toIndex)
        else:
            self.index = self.fromIndex
        self.stepCount = 0
        self.rewards = 0
        self.winCount = 0
        self.orderCount = 0
        self.pl = 0
        self.budget_history = [1 for _ in range(self.span)]
        self.ask_diff_history = [0 for _ in range(self.span)]
        self.bid_diff_history = [0 for _ in range(self.span)]
        return self._build_observation(self.index + self.stepCount)

    def getSymbolInfo(self, symbol='USDJPY'):
        """Return the contract constants for ``symbol`` (only 'USDJPY' is known; otherwise None)."""
        if symbol == 'USDJPY':
             return {
                 "point": 0.001,
                 "min":0.1,
                 "rate":100000
             }

    def _quote_row(self):
        """Return the ``rowdata`` position used for pricing, or None past the end of the data."""
        cursor = self.index + self.stepCount
        if cursor < len(self.rowdata):
            return cursor - 1
        return None

    def GET_CURRENT_ASK(self):
        """Return close + spread * point for the current bar, or None past the end of the data."""
        row = self._quote_row()
        if row is None:
            return None
        return self.rowdata.close.iloc[row]  + self.rowdata["spread"].iloc[row]*self.point

    def GET_CURRENT_BID(self):
        """Return close - spread * point for the current bar, or None past the end of the data."""
        row = self._quote_row()
        if row is None:
            return None
        return self.rowdata.close.iloc[row]  - self.rowdata["spread"].iloc[row]*self.point

    def _window_rows(self, table, end):
        """Return the ``span`` rows of ``table`` ending at position ``end`` (inclusive) as nested lists."""
        return table[end+1-self.span:end+1].values.tolist()

    def __getRowData__(self, ndx):
        """Return raw-rate windows.

        A slice selects entries of the shuffled ``indices`` and yields one
        window per entry; any other value is used directly as the window end
        position.
        """
        if isinstance(ndx, slice):
            return [self._window_rows(self.rowdata, index) for index in self.indices[ndx]]
        return self._window_rows(self.rowdata, ndx)

    def __getInputs__(self, ndx):
        """Return feature windows addressed through the shuffled ``indices``.

        An integer returns a single window, a slice a list of windows.

        Raises:
            IndexError: an integer ``ndx`` selects no entry of ``indices``.
            TypeError: ``ndx`` is neither an integer nor a slice.
        """
        if isinstance(ndx, (int, numpy.integer)):
            selected = self.indices[ndx:ndx + 1]
            return self._window_rows(self.data, selected[0])
        if isinstance(ndx, slice):
            return [self._window_rows(self.data, index) for index in self.indices[ndx]]
        raise TypeError(f"index must be an int or a slice, got {type(ndx)}")

    def __getActialIndex__(self,ndx):
        """Return the window end position(s) that ``indices`` holds at ``ndx``."""
        if isinstance(ndx, slice):
            return list(self.indices[ndx])
        return self.indices[ndx]

    def __getitem__(self, ndx):
        """Dataset-style access: float32 array of the window(s) selected by ``__getInputs__``."""
        return numpy.array(self.__getInputs__(ndx), dtype=numpy.dtype('float32'))

    def minmaxNormalization(self, data):
        """Scale ``data`` to [0, 1] using its non-NaN minimum and maximum.

        Returns:
            (normalized data, minimum, maximum).  A constant input maps to
            zeros instead of dividing by zero.
        """
        if isinstance(data, numpy.ndarray):
            temp_data = data[~numpy.isnan(data)]
        elif isinstance(data, pd.Series):
            temp_data = data.dropna()
        else:
            print(f"unkown type: {type(data)}")
            temp_data = data
        X_max, X_min = numpy.max(temp_data), numpy.min(temp_data)
        value_range = X_max - X_min
        if value_range == 0:
            return data - X_min, X_min, X_max
        data_norm = (data - X_min) / value_range
        return data_norm, X_min, X_max

    def denormalization(self, value, X_min, X_max):
        """Invert ``minmaxNormalization`` for ``value``."""
        return value * (X_max - X_min) + X_min

    def badget_in_use_and_diff(self):
        """Return (margin in use, summed relative gain of longs, summed relative gain of shorts)."""
        budget_in_use = 0
        ask_diff = 0
        bid_diff = 0
        sell_price = self.GET_CURRENT_BID()
        ask_price = self.GET_CURRENT_ASK()
        for position in self.askPositions:
            ask_diff += (sell_price - position['price'])/position['price']
            budget_in_use += position['volume']*sell_price*self.volume_point/self.leverage
        for position in self.bidPositions:
            bid_diff += (position['price'] - ask_price)/ask_price
            budget_in_use += position['volume']*ask_price*self.volume_point/self.leverage
        return budget_in_use, ask_diff, bid_diff

    def evaluate(self, action):
        """Apply ``action`` to the account and return its reward.

        Raises:
            Exception: ``action`` is not one of 0..4.
        """
        if action == 1:
            # Open a long position; an invalid order gets the negative reward.
            return self.__buy__()
        if action == 2:
            # Open a short position; an invalid order gets the negative reward.
            return self.__sell__()
        if action == 0:
            # Hold: the reward is shaped from the open positions.
            return self.__stay__()
        if action == 3:
            # Buy back (settle) the short positions.
            return self.__settlement__("buy")
        if action == 4:
            # Sell off (settle) the long positions.
            return self.__settlement__("sell")
        raise Exception(f"The action number {action} exeeds the lengths in evaluate function.")

    @staticmethod
    def _push(history, value):
        """Shift ``history`` left by one in place and store ``value`` in its last slot."""
        history[:-1] = history[1:]
        history[-1] = value

    def _record_history(self, ask_diff, bid_diff):
        """Append the current budget ratio and position gains to the rolling histories."""
        self._push(self.budget_history, self.budget/self.budget_org)
        self._push(self.ask_diff_history, ask_diff)
        # Each history shifts from itself; the bid history used to be
        # overwritten with the ask history here.
        self._push(self.bid_diff_history, bid_diff)

    def _build_observation(self, position):
        """Return the observation whose market window ends at row ``position`` (inclusive).

        The window is taken sequentially from ``self.data`` so that it lines
        up with the bar the next order is priced on, rather than going
        through the shuffled ``indices``.

        Raises:
            IndexError: the window does not fit inside the data.
        """
        start, stop = position + 1 - self.span, position + 1
        if start < 0 or stop > len(self.data):
            raise IndexError(
                f"observation window [{start}, {stop}) is outside the data (length {len(self.data)})"
            )
        market = numpy.array(self.data.iloc[start:stop].values, dtype=numpy.dtype('float32'))
        observations = numpy.hstack([
            market,
            numpy.atleast_2d(self.budget_history).T,
            numpy.atleast_2d(self.ask_diff_history).T,
            numpy.atleast_2d(self.bid_diff_history).T,
        ])
        if self.FF:
            observations = observations.T
        return observations

    def step(self, action): # execute the action and return the result
        """Advance one bar.

        Returns:
            ``(observation, reward, done, info)``.  Inside the episode range
            ``info`` is ``[budget ratio, ask_diff, bid_diff]`` and ``done``
            becomes true once the accumulated profit/loss falls below -10.
            After the range is exhausted the action is ignored and
            ``(observation, 0, True, {})`` is returned.
        """
        self.stepCount += 1
        position = self.index + self.stepCount
        if position <= self.toIndex:
            reward = self.evaluate(action)
            self.rewards = self.rewards + reward

            _, ask_diff, bid_diff = self.badget_in_use_and_diff()
            option = [self.budget/self.budget_org, ask_diff, bid_diff]
            self._record_history(ask_diff, bid_diff)
            observations = self._build_observation(position)

            done = bool(self.pl < -10)
            return observations, reward, done, option

        _, ask_diff, bid_diff = self.badget_in_use_and_diff()
        self._record_history(ask_diff, bid_diff)
        observations = self._build_observation(position - 1)
        print("index end")
        return observations, 0, True, {}

    def render(self, mode='human', close=False):
        """Print the free budget, the margin in use, the profit/loss and the win rate."""
        budget_in_use, _, _ = self.badget_in_use_and_diff()
        if self.orderCount > 0:
            winRate = self.winCount/self.orderCount
        else:
            winRate = -1
        print (f"budget:{self.budget} + {budget_in_use}, pl:{self.pl}, winRate:{winRate}")

    def _holding_weight(self, position, S, T):
        """Return the weight ``__stay__`` applies to one open position.

        NOTE(review): this compares the step at which the position was opened
        with ``S`` and yields a non-positive weight in the first branch; a
        ramp over the holding time (stepCount - position['step']) may have
        been intended.  Kept as is because the intent is not clear.
        """
        if position['step'] < S:
            return (position['step'] - self.stepCount)/S * T
        return T

    def __stay__(self, S=5, T=1/2):
        """Return the hold reward: weighted sum of the relative gains of all open positions."""
        reward = 0.0
        sell_price = self.GET_CURRENT_BID()
        for position in self.askPositions:
            ask_diff = (sell_price - position['price'])/position['price']
            reward += self._holding_weight(position, S, T) * ask_diff
        ask_price = self.GET_CURRENT_ASK()
        for position in self.bidPositions:
            bid_diff = (position['price'] - ask_price)/ask_price
            reward += self._holding_weight(position, S, T) * bid_diff
        return reward

    def close(self):
        """Nothing to release."""
        pass

    def seed(self, seed=None):
        """Seed the ``random`` module (1017 when no seed is given)."""
        if seed is None:
            random.seed(1017)
        else:
            random.seed(seed)

    def __settlement__(self, type, price=None):
        """Close every position on one side and return the reward.

        Args:
            type: "buy" settles the short positions at the ask price,
                "sell" settles the long positions at the bid price.  Any
                other value does nothing and returns 0.
            price: unused.

        A losing position contributes its (negative) relative gain twice.
        """
        reward = 0
        # NOTE(review): settlement is only accepted while positions are open
        # on BOTH sides.  The original comment read "full-amount trades only,
        # and no two-sided holding", which suggests a different condition was
        # intended; kept as is because the intent is not clear.
        both_sides_open = len(self.bidPositions) > 0 and len(self.askPositions) > 0
        if type == "buy":
            if not both_sides_open:
                return self.INVALID_REWARD
            reward = self.VALID_REWARD
            current_buy_rate = self.GET_CURRENT_ASK()
            for position in self.bidPositions:
                relative_gain = (position['price'] - current_buy_rate)/current_buy_rate
                reward += relative_gain
                self.budget += (position['volume'] * self.volume_point * current_buy_rate)/self.leverage
                pl = position['price'] - current_buy_rate
                self.pl += pl
                if pl > 0:
                    self.winCount += 1
                else:
                    # count a loss twice
                    reward += relative_gain
            self.bidPositions = []
        elif type == "sell":
            if not both_sides_open:
                return self.INVALID_REWARD
            reward = self.VALID_REWARD
            current_sell_rate = self.GET_CURRENT_BID()
            for position in self.askPositions:
                relative_gain = (current_sell_rate - position['price'])/position['price']
                reward += relative_gain
                self.budget += (position['volume'] * self.volume_point * current_sell_rate)/self.leverage
                pl = current_sell_rate - position['price']
                self.pl += pl
                if pl > 0:
                    self.winCount += 1
                else:
                    # count a loss twice
                    reward += relative_gain
            self.askPositions = []
        return reward

    def __buy__(self, volume=0.1):
        """Open a long position at the ask price unless one is already open.

        Returns ``VALID_REWARD`` when the order is placed, otherwise
        ``INVALID_REWARD``.  The required margin is deducted from the budget.
        """
        current_buy_rate = self.GET_CURRENT_ASK()
        required_budget = (volume * self.volume_point * current_buy_rate)/self.leverage
        if len(self.askPositions) != 0:
            return self.INVALID_REWARD
        position = {'volume': volume, 'price': current_buy_rate, 'step':self.stepCount}
        self.askPositions.append(position)
        self.budget = self.budget -  required_budget
        self.orderCount +=1
        return self.VALID_REWARD

    def __sell__(self, volume=0.1):
        """Open a short position at the bid price unless one is already open.

        Returns ``VALID_REWARD`` when the order is placed, otherwise
        ``INVALID_REWARD``.  The required margin is deducted from the budget.
        """
        current_sell_rate = self.GET_CURRENT_BID()
        required_budget = (volume * self.volume_point * current_sell_rate)/self.leverage
        if len(self.bidPositions) != 0:
            return self.INVALID_REWARD
        position = {'volume': volume, 'price': current_sell_rate, 'step':self.stepCount}
        self.bidPositions.append(position)
        self.budget = self.budget -  required_budget
        self.orderCount +=1
        return self.VALID_REWARD

In [15]:
# Build the environment on the default rate file. featureFirst=True makes
# reset()/step() return observations transposed to (features, span).
env = FXEnv(featureFirst=True)

In [3]:
import pfrl
import torch.nn.functional as F
import torch.nn as nn
import torch
import random
import numpy
import datetime

In [4]:
import torch
import torch.nn as nn
from torch.optim import SGD
import math
import numpy as np

class PredictorSimple(nn.Module):
    """Q-network: two 1-D convolutions followed by three linear layers.

    Inputs are expected channel-first, ``(batch, features, size)``.  When
    ``removeHistoryData`` is true the last ``ActionHistoryDim`` feature
    channels are cut off before the first convolution, so ``inputDim`` has to
    be the channel count that remains after the cut.
    """

    def __init__(self, size, inputDim, n_actions, removeHistoryData = True):
        """Create the layers.

        Args:
            size: length of the sequence axis of the input.
            inputDim: number of channels fed to the first convolution.
            n_actions: number of action values produced.
            removeHistoryData: drop the trailing history channels in forward().
        """
        super().__init__()
        self.size = size
        self.rhd = removeHistoryData
        self.inDim = inputDim
        self.ActionHistoryDim = 3

        widened = inputDim * 3
        narrowed = inputDim * 2
        # kernel_size=3 with padding=1 leaves the sequence length unchanged,
        # which the flattened size of fc1 relies on.
        self.conv1 = nn.Conv1d(inputDim, widened, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(widened, narrowed, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(narrowed * size, inputDim * size)
        self.fc2 = nn.Linear(inputDim * size, size)
        self.output_layer = nn.Linear(size, n_actions)
        # Not applied in forward(); kept as an attribute.
        self.softmax = nn.Softmax(1)

    def forward(self, inputs):
        """Return the action values for ``inputs`` wrapped in a pfrl DiscreteActionValue."""
        feature_len, _seq_len = inputs.shape[1], inputs.shape[2]
        if self.rhd:
            hidden = inputs[:, 0: feature_len - self.ActionHistoryDim, :]
        else:
            hidden = inputs
        for conv in (self.conv1, self.conv2):
            hidden = F.relu(conv(hidden))
        # Flatten (channels, sequence) into one vector per sample.
        hidden = hidden.view(-1, self.inDim*2*self.size)
        for dense in (self.fc1, self.fc2):
            hidden = F.relu(dense(hidden))
        q_values = self.output_layer(hidden)
        return pfrl.action_value.DiscreteActionValue(q_values)

In [5]:
import torch
import torch.nn as nn
from torch.optim import SGD
import math
import numpy as np

class Predictor(nn.Module):
    """Q-network: an LSTM over the sequence followed by one linear layer.

    Inputs are expected batch-first, ``(batch, sequence, features)``; the
    last ``ActionHistoryDim`` features are cut off before the LSTM.  Both
    layers are moved to the module-level ``device`` on construction.
    """

    def __init__(self, inputDim, hiddenDim,  n_actions):
        """Create the LSTM (``inputDim`` -> ``hiddenDim``) and the output layer (``hiddenDim`` -> ``n_actions``)."""
        super().__init__()
        self.ActionHistoryDim = 3
        self.rnn = nn.LSTM(
            input_size=inputDim,
            hidden_size=hiddenDim,
            batch_first=True,
        )
        self.rnn.to(device)
        # Sized for the LSTM output only; the history features are not
        # concatenated back in.
        self.output_layer = nn.Linear(hiddenDim, n_actions)
        self.output_layer.to(device)

    def forward(self, inputs, hidden0=None):
        """Return the action values for ``inputs`` wrapped in a pfrl DiscreteActionValue."""
        _seq_len, feature_len = inputs.shape[1], inputs.shape[2]
        market_inputs = inputs[:, :, 0: feature_len - self.ActionHistoryDim]
        # History features of the last time step; currently not used further.
        last_actions = inputs[:, -1, -self.ActionHistoryDim:]
        sequence_out, (_hidden, _cell) = self.rnn(market_inputs, hidden0)
        # Only the output of the last time step feeds the linear layer.
        final_out = sequence_out[:, -1, :]
        q_values = self.output_layer(final_out)
        return pfrl.action_value.DiscreteActionValue(q_values)

In [6]:
training_size = 1000 # number of training samples
epochs_num = 10000 # number of training epochs
hidden_size = 500 # dimensionality of the LSTM hidden layer
batch_size = 32

In [7]:
# Take one initial observation so the network can be sized from it.
obs = env.reset()
# Length of the second axis; env was built with featureFirst=True, so this is
# the window length rather than the feature count.
size = obs.shape[1]

In [8]:
# Display the observation shape: (features, window length).
obs.shape

(9, 865)

In [9]:
# Instantiate the model: 9 input channels, 5 actions, history channels kept.
model = PredictorSimple(size, 9, 5, False)

#model = Predictor(6, hidden_size, 5) # alternative LSTM model
criterion = nn.MSELoss() # loss function

In [16]:
#optimizer = SGD(model.parameters(), lr=0.0001) # alternative optimizer
optimizer = torch.optim.Adam(model.parameters(), eps=1e-9)
# Set the discount factor that discounts future rewards.
gamma = 0.9

# Use epsilon-greedy for exploration: with probability 0.3 a random action
# is sampled from the environment's action space.
explorer = pfrl.explorers.ConstantEpsilonGreedy(
    epsilon=0.3, random_action_func=env.action_space.sample)

# DQN uses Experience Replay.
# Specify a replay buffer and its capacity (100 transitions).
replay_buffer = pfrl.replay_buffers.ReplayBuffer(capacity=10**2)

# The observations returned by the environment are not guaranteed to be
# float32, while PyTorch expects numpy.float32 by default, so specify
# a converter as a feature extractor function phi.
phi = lambda x: x.astype(numpy.float32, copy=False)

# Set the device id to use GPU. To use CPU only, set it to -1.
gpu = 0

# Now create an agent that will interact with the environment.
agent = pfrl.agents.DoubleDQN(
    model,
    optimizer,
    replay_buffer,
    gamma,
    explorer,
    replay_start_size=40,
    update_interval=5,
    target_update_interval=20,
    phi=phi,
    gpu=gpu,
)

In [18]:
# Train for n_episodes episodes; an episode is cut off after max_step_len
# steps if the environment has not finished it earlier.
n_episodes = 1000
max_step_len = 10000
print(datetime.datetime.now(),'start episodes')
for i in range(1, n_episodes + 1):
    obs = env.reset()
    #obs = obs.to('cpu').detach().numpy().copy()
    R = 0  # return (sum of rewards)
    t = 0  # time step
    while True:
        # Uncomment to print the account state on every step
        #env.render()
        action = agent.act(obs)
        obs, reward, done, ops = env.step(action)
        #obs = obs.to('cpu').detach().numpy().copy()
        R += reward
        t += 1
        # True when the episode is cut off by the step limit.
        reset = t == max_step_len
        agent.observe(obs, reward, done, reset)
        if reset:
            print("max steps")
            break
        elif done:
            break
    # Print the account summary at the end of the episode.
    env.render()
    #print(datetime.datetime.now(),'episode:', i, 'R:', R)
    print('statistics:', agent.get_statistics())
    #if i % 50 == 0:
print('Finished.')

2022-02-20 18:55:32.947121 start episodes
max steps
budget:95797.4 + 4362.56, pl:-0.4600000000002069, winRate:0.4152542372881356
statistics: [('average_q', 0.0035779413), ('average_loss', 1.7016625214694159e-07), ('cumulative_steps', 53583), ('n_updates', 10709), ('rlen', 100)]
max steps
budget:91417.28000000001 + 8777.92, pl:-4.353999999999658, winRate:0.39300411522633744
statistics: [('average_q', -0.0044779414), ('average_loss', 1.6221727225484983e-07), ('cumulative_steps', 63583), ('n_updates', 12709), ('rlen', 100)]
budget:95700.35999999999 + 4220.76, pl:-10.078999999999851, winRate:0.38596491228070173
statistics: [('average_q', -0.00041051424), ('average_loss', 1.913931938446467e-07), ('cumulative_steps', 71057), ('n_updates', 14204), ('rlen', 100)]
max steps
budget:95656.80000000002 + 4369.639999999999, pl:-3.8310000000003157, winRate:0.35730337078651686
statistics: [('average_q', 0.0006851731), ('average_loss', 2.9632868553619573e-07), ('cumulative_steps', 81057), ('n_updates',

max steps
budget:95850.75999999988 + 4376.28, pl:-5.363999999999933, winRate:0.402
statistics: [('average_q', -0.0033291234), ('average_loss', 2.4917634313226243e-07), ('cumulative_steps', 372131), ('n_updates', 74419), ('rlen', 100)]
index end
budget:95618.19999999985 + 4411.88, pl:-2.376999999999782, winRate:0.3953488372093023
statistics: [('average_q', -0.0016946463), ('average_loss', 2.8808021866666424e-07), ('cumulative_steps', 376853), ('n_updates', 75363), ('rlen', 100)]
max steps
budget:95900.96000000024 + 4426.2, pl:-5.203000000000031, winRate:0.39068825910931176
statistics: [('average_q', 0.00061754824), ('average_loss', 5.201560186662846e-07), ('cumulative_steps', 386853), ('n_updates', 77363), ('rlen', 100)]
index end
budget:95548.92000000011 + 4411.88, pl:-0.3389999999998565, winRate:0.422680412371134
statistics: [('average_q', -0.0033119754), ('average_loss', 1.4417408848999004e-07), ('cumulative_steps', 388602), ('n_updates', 77713), ('rlen', 100)]
max steps
budget:91659

max steps
budget:95638.99999999999 + 4369.64, pl:-5.615000000000123, winRate:0.3967611336032389
statistics: [('average_q', -0.0012249196), ('average_loss', 4.1413808794743544e-07), ('cumulative_steps', 656042), ('n_updates', 131201), ('rlen', 100)]
max steps
budget:95735.71999999993 + 4229.040000000001, pl:-4.911000000000186, winRate:0.438
statistics: [('average_q', 0.0021941152), ('average_loss', 1.1340470596366003e-07), ('cumulative_steps', 666042), ('n_updates', 133201), ('rlen', 100)]
max steps
budget:95874.23999999996 + 4425.4400000000005, pl:-4.741999999999933, winRate:0.4451219512195122
statistics: [('average_q', 0.0025449542), ('average_loss', 2.2091905664822776e-07), ('cumulative_steps', 676042), ('n_updates', 135201), ('rlen', 100)]
max steps
budget:95792.8400000002 + 4158.44, pl:-4.601000000000681, winRate:0.42038216560509556
statistics: [('average_q', 0.0002538865), ('average_loss', 1.7081644458016853e-07), ('cumulative_steps', 686042), ('n_updates', 137201), ('rlen', 100)]

index end
budget:95570.36000000012 + 4412.12, pl:-2.8900000000000574, winRate:0.3844086021505376
statistics: [('average_q', -0.0020054618), ('average_loss', 1.6062422460549898e-07), ('cumulative_steps', 958013), ('n_updates', 191595), ('rlen', 100)]
max steps
budget:95702.71999999972 + 4149.76, pl:-5.306000000000125, winRate:0.41164658634538154
statistics: [('average_q', -0.0024042176), ('average_loss', 8.946788830677121e-08), ('cumulative_steps', 968013), ('n_updates', 193595), ('rlen', 100)]
max steps
budget:95706.56000000006 + 4379.48, pl:-4.121000000000279, winRate:0.41
statistics: [('average_q', 0.005302142), ('average_loss', 1.6348978611802068e-07), ('cumulative_steps', 978013), ('n_updates', 195595), ('rlen', 100)]
max steps
budget:95580.07999999977 + 4373.84, pl:-7.52499999999965, winRate:0.4168336673346693
statistics: [('average_q', 0.0032454287), ('average_loss', 1.7264865096677794e-07), ('cumulative_steps', 988013), ('n_updates', 197595), ('rlen', 100)]
index end
budget:9555

max steps
budget:95498.7200000001 + 4350.120000000001, pl:-4.1369999999997304, winRate:0.38271604938271603
statistics: [('average_q', 0.0005958809), ('average_loss', 1.2926436852467304e-07), ('cumulative_steps', 1237951), ('n_updates', 247583), ('rlen', 100)]
max steps
budget:91308.15999999989 + 8822.480000000001, pl:-2.685000000000173, winRate:0.41372141372141374
statistics: [('average_q', 0.0024263188), ('average_loss', 3.1158424544486253e-07), ('cumulative_steps', 1247951), ('n_updates', 249583), ('rlen', 100)]
max steps
budget:95708.56000000011 + 4161.960000000001, pl:-3.234000000000023, winRate:0.40124740124740127
statistics: [('average_q', 0.002187111), ('average_loss', 2.727357897391869e-07), ('cumulative_steps', 1257951), ('n_updates', 251583), ('rlen', 100)]
max steps
budget:91697.79999999987 + 8419.760000000002, pl:-4.316999999999879, winRate:0.40388349514563104
statistics: [('average_q', -0.0008533632), ('average_loss', 1.0188081613904387e-06), ('cumulative_steps', 1267951),

max steps
budget:91238.00000000007 + 8714.400000000001, pl:-5.759000000000171, winRate:0.36065573770491804
statistics: [('average_q', -0.0038206314), ('average_loss', 1.8066107980274636e-07), ('cumulative_steps', 1529650), ('n_updates', 305923), ('rlen', 100)]
max steps
budget:95904.91999999981 + 4387.44, pl:-5.242000000000118, winRate:0.41855670103092785
statistics: [('average_q', -0.0019853553), ('average_loss', 1.768194502282938e-07), ('cumulative_steps', 1539650), ('n_updates', 307923), ('rlen', 100)]
max steps
budget:95774.71999999996 + 4210.76, pl:-7.622000000000355, winRate:0.41164241164241167
statistics: [('average_q', -0.0026291055), ('average_loss', 1.350059971372275e-07), ('cumulative_steps', 1549650), ('n_updates', 309923), ('rlen', 100)]
max steps
budget:95775.60000000014 + 4216.52, pl:-3.270999999999759, winRate:0.3675564681724846
statistics: [('average_q', -0.0010062668), ('average_loss', 1.3278508230740728e-07), ('cumulative_steps', 1559650), ('n_updates', 311923), ('rl

KeyboardInterrupt: 

### env.observation

In [None]:
# Display the shape of the most recent observation.
obs.shape

In [None]:
# Display whether the last step of the training loop hit max_step_len.
reset