In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools
import yfinance as yf

In [51]:
# Index whose returns are used as the reward signal, kept separate from the
# constituent stocks that form the state features.
bvsp = '^BVSP'
tickers = ['PETR4.SA', 'VALE3.SA', 'ITUB4.SA', 'USIM3.SA']

# auto_adjust=False is passed explicitly because the 'Adj Close' column selected
# below only exists when prices are not auto-adjusted, and the library default
# for that flag differs between yfinance releases.
# period='max' is spelled out so the amount of history does not depend on the
# installed version's default window (a 1000-row hold-out is taken later).
df0 = yf.download(tickers + [bvsp], period='max', auto_adjust=False)['Adj Close']

[*********************100%***********************]  5 of 5 completed


In [52]:
# Log-returns for every column at once: log(P_t) - log(P_{t-1}).
# The first row is NaN by construction (there is no previous price to diff against).
df_returns = np.log(df0).diff()

In [53]:
# Keep only the rows where every column has a return value.
df_returns = df_returns.dropna()

In [54]:
# Hold out the most recent Ntest rows for evaluation; everything before them is
# used for training.
Ntest = 1000

# Fail loudly instead of silently producing an empty training set when there
# are not enough rows of returns for the hold-out window.
assert len(df_returns) > Ntest, (
    f"Need more than {Ntest} rows of returns, got {len(df_returns)}"
)

train_data = df_returns.iloc[:-Ntest]
test_data = df_returns.iloc[-Ntest:]

In [59]:
class Env:
    """Minimal trading environment that walks row by row through a DataFrame.

    The state at each step is the row of feature columns at the current index;
    the reward is the value of the reward column at that index while the agent
    is invested, and 0 otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both the feature columns and the reward column.
    feature_columns : list of str, optional
        Columns used as the state vector. Defaults to the notebook-level
        ``tickers`` list.
    reward_column : str, optional
        Column used as the reward. Defaults to the notebook-level ``bvsp``.
    """

    def __init__(self, df, feature_columns=None, reward_column=None):
        self.df = df
        self.n = len(df)
        self.current_idx = 0
        self.action_space = [0, 1, 2]  # BUY, SELL, HOLD
        self.invested = 0

        # Fall back to the notebook globals so existing `Env(df)` calls keep working.
        if feature_columns is None:
            feature_columns = tickers
        if reward_column is None:
            reward_column = bvsp

        self.states = self.df[list(feature_columns)].to_numpy()
        self.rewards = self.df[reward_column]

    def reset(self):
        """Start a new episode and return the first state."""
        self.current_idx = 0
        # Clear the position as well, otherwise a BUY from the previous episode
        # would keep earning rewards in the new one.
        self.invested = 0
        return self.states[self.current_idx]

    def step(self, action):  # sourcery skip: raise-specific-error
        """Advance one row and apply `action` (0 = BUY, 1 = SELL, 2 = HOLD).

        Returns
        -------
        tuple
            ``(next_state, reward, done)`` where ``done`` is True once the
            last row has been reached.

        Raises
        ------
        Exception
            If called after the last row has already been reached.
        """
        self.current_idx += 1
        if self.current_idx >= self.n:
            raise Exception("Episodio finalizado!")

        if action == 0:  # BUY
            self.invested = 1
        elif action == 1:  # SELL
            self.invested = 0
        # Any other action (HOLD) leaves the position unchanged.

        # Compute the reward. `.iloc` is required because `self.rewards` keeps
        # the frame's own index, so a plain `[i]` would be a label lookup.
        reward = self.rewards.iloc[self.current_idx] if self.invested else 0

        # State transition
        next_state = self.states[self.current_idx]

        done = (self.current_idx == self.n - 1)

        return next_state, reward, done

In [60]:
class StateMapper:
    """Discretises continuous state vectors into tuples of bin indices.

    Bin boundaries are estimated per dimension from states collected by
    stepping `env` with random actions.

    Parameters
    ----------
    env : object
        Must expose ``reset()``, ``step(action)`` returning
        ``(state, reward, done)``, and an ``action_space`` sequence.
    n_bins : int
        Number of boundaries per dimension (giving ``n_bins + 1`` buckets).
    n_samples : int
        Number of environment steps used to collect sample states.
    """

    def __init__(self, env, n_bins=6, n_samples=10000):
        # Collect sample states by playing random actions, restarting the
        # episode whenever the environment reports it is done.
        state = env.reset()
        self.D = len(state)
        samples = [state]

        for _ in range(n_samples):
            action = np.random.choice(env.action_space)
            next_state, _, done = env.step(action)
            samples.append(next_state)
            if done:
                samples.append(env.reset())
        samples = np.array(samples)

        # Build the bin boundaries for each dimension
        self.bins = []
        for d in range(self.D):
            # Column d across all samples (not the first d rows).
            column = np.sort(samples[:, d])
            # Use the real number of collected points: resets add extra rows,
            # so it can exceed n_samples.
            n_points = len(column)

            # Boundary k sits at the (k + 0.5) / n_bins quantile of the column.
            boundaries = [
                column[int(n_points / n_bins * (k + 0.5))] for k in range(n_bins)
            ]
            self.bins.append(boundaries)

    def transform(self, state):
        """Return the tuple of integer bin indices for every dimension of `state`."""
        return tuple(
            int(np.digitize(state[d], self.bins[d])) for d in range(self.D)
        )

    # Original (misspelled) method name, kept so existing callers do not break.
    trasnform = transform

    def all_possible_states(self):
        """Return an iterator over every possible tuple of bin indices."""
        # digitize yields values in 0..len(bins), hence the + 1.
        per_dimension = [range(len(self.bins[d]) + 1) for d in range(self.D)]
        return itertools.product(*per_dimension)

In [None]:
class Agent:
    def __init__(self, action_size, state_mapper):
        self.action_size = action_size
        self.gamma = 0.8
        self.epsilon = 0.1
        self.learmimg_rate = 1e-1
        self.state_mapper = state_mapper

        #inicializando Q-table aleatoriamente
        self.Q = {}
        for s in self.state_mapper.all_properties():
            s = tuple(s)
            for a in range(self.action_size):
                self.Q[(s,a)] = np.random.randn()

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return np.random.choice(self.action_size)
        s = self.state_mapper.transform(state)
        act_values = [self.Q[(s,a)] for a in range(self.action_size)]
        return np.argmax(act_values)

    def train(self, state, action, reward, next_state, done):
        s = self.state_mapper.transform(state)
        s2 = self.sate_mapper.transform(next_state)

        if done:
            target = reward
        else:
            act_values = [self.Q[(s2, a)] for a in range(self.action_size)]
            target = reward + self.gamma + np.amax(act_values)

        