Passos:
1. Considerar que eu tenho uma tabela dataframe em pandas com W+1 linhas e C+1 colunas de uma determinada moeda_X
2. Separar esse dataframe em 2, um com W linhas e C+1 colunas e o outro com a nossa linha que queremos atacar.
3. Criar uma função de bagging e a partir dela gerar outros $\gamma$ dataframes com n linhas
4. Criar uma função que recebe um dataframe e te devolve um conjunto de K outros, sendo aplicado Purge K-Fold Cross Validation
5. Criar uma função que aplica o modelo de LSTM para cada tabela que ela recebe, treinando, gerando um HP e retornando o valor esperado.

In [259]:
import pandas as pd
import numpy as np
import yfinance as yf

# Parâmetro w ajustável
w = 30      #####     HIPERMARÂMTRO       #####

# Definindo o período dos últimos 5 anos
end_date = pd.to_datetime("today").strftime("%Y-%m-%d")
start_date = (pd.to_datetime("today") - pd.DateOffset(years=5)).strftime("%Y-%m-%d")

# Função para calcular o retorno logarítmico
def log_return(series):
    return np.log(series / series.shift(1))

# Baixando os dados históricos do Ethereum, Bitcoin e Solana
eth_data = yf.download("ETH-USD", start=start_date, end=end_date)
btc_data = yf.download("BTC-USD", start=start_date, end=end_date)
sol_data = yf.download("SOL-USD", start=start_date, end=end_date)

# Calculando o retorno logarítmico diário e fazendo o shift para corrigir as datas
eth_data['Return'] = log_return(eth_data['Close']).shift(-2)
btc_data['Return'] = log_return(btc_data['Close']).shift(-2)
sol_data['Return'] = log_return(sol_data['Close']).shift(-2)

# Sincronizando os dados para garantir que todas as moedas tenham os mesmos dias
combined_data = pd.concat([eth_data['Return'], btc_data['Return'], sol_data['Return']], axis=1, keys=['ETH_Return', 'BTC_Return', 'SOL_Return']).dropna()

# Criando o dataframe final
rows = []
for i in range(w, len(combined_data)):
    eth_returns = combined_data['ETH_Return'].iloc[i-w:i].values
    btc_returns = combined_data['BTC_Return'].iloc[i-w:i].values
    sol_returns = combined_data['SOL_Return'].iloc[i-w:i].values
    eth_age = (combined_data.index[i] - eth_data.index[0]).days
    eth_return_today = combined_data['ETH_Return'].iloc[i]
    
    # Concatenando os dados na linha
    row = np.concatenate([eth_returns, btc_returns, sol_returns, [eth_age], [eth_return_today]])
    rows.append(row)

# Nome das colunas
columns = [f"ETH_Return_Day_{j+1}" for j in range(w)] + \
          [f"BTC_Return_Day_{j+1}" for j in range(w)] + \
          [f"SOL_Return_Day_{j+1}" for j in range(w)] + \
          ['ETH_Age', 'ETH_Return_Today']

# Criando o DataFrame final com a data correta como índice
df_final = pd.DataFrame(rows, columns=columns, index=combined_data.index[w:])

df_final.tail()
df_final.shape


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


(1616, 92)

## Agora temos esse dataframe, de modo reduzido, para a moeda etherium.

### Proxima etapa é uma função que itere sobre esse df, para cada iteração criando outro df que contem os ultimos W dias.

In [260]:
# Criando a função First Transformation
def FirstTransform(df, W):
    vet = []
    Linhas, Colunas = df.shape
    for i in range(Linhas, W+1, -1):
        vet.append(df.iloc[(i-W-1):i])
    return vet

In [261]:
df = df_final
W = 300      ####     HIPER PARAMETRO     #####
vetor  = FirstTransform(df, W)
len(vetor)
vetor[0]

Unnamed: 0_level_0,ETH_Return_Day_1,ETH_Return_Day_2,ETH_Return_Day_3,ETH_Return_Day_4,ETH_Return_Day_5,ETH_Return_Day_6,ETH_Return_Day_7,ETH_Return_Day_8,ETH_Return_Day_9,ETH_Return_Day_10,...,SOL_Return_Day_23,SOL_Return_Day_24,SOL_Return_Day_25,SOL_Return_Day_26,SOL_Return_Day_27,SOL_Return_Day_28,SOL_Return_Day_29,SOL_Return_Day_30,ETH_Age,ETH_Return_Today
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-12-16,0.001021,0.025109,0.004478,-0.043031,0.063677,-0.001073,0.009143,0.001566,-0.010187,-0.017537,...,0.033321,-0.059832,-0.026299,0.031976,0.066321,-0.043947,0.015147,-0.034238,1524.0,0.009422
2023-12-17,0.025109,0.004478,-0.043031,0.063677,-0.001073,0.009143,0.001566,-0.010187,-0.017537,0.010754,...,-0.059832,-0.026299,0.031976,0.066321,-0.043947,0.015147,-0.034238,0.046058,1525.0,-0.017930
2023-12-18,0.004478,-0.043031,0.063677,-0.001073,0.009143,0.001566,-0.010187,-0.017537,0.010754,-0.009516,...,-0.026299,0.031976,0.066321,-0.043947,0.015147,-0.034238,0.046058,-0.018426,1526.0,0.010977
2023-12-19,-0.043031,0.063677,-0.001073,0.009143,0.001566,-0.010187,-0.017537,0.010754,-0.009516,0.011085,...,0.031976,0.066321,-0.043947,0.015147,-0.034238,0.046058,-0.018426,0.118729,1527.0,0.016946
2023-12-20,0.063677,-0.001073,0.009143,0.001566,-0.010187,-0.017537,0.010754,-0.009516,0.011085,0.016709,...,0.066321,-0.043947,0.015147,-0.034238,0.046058,-0.018426,0.118729,0.132700,1528.0,0.038104
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-10-07,0.026287,0.013100,-0.021035,0.009333,0.033240,-0.009469,-0.041233,-0.011097,0.020024,0.011895,...,-0.049210,-0.034338,-0.025981,0.045637,-0.003222,0.025805,-0.016876,-0.004157,1820.0,-0.029767
2024-10-08,0.013100,-0.021035,0.009333,0.033240,-0.009469,-0.041233,-0.011097,0.020024,0.011895,0.039316,...,-0.034338,-0.025981,0.045637,-0.003222,0.025805,-0.016876,-0.004157,-0.028858,1821.0,0.006555
2024-10-09,-0.021035,0.009333,0.033240,-0.009469,-0.041233,-0.011097,0.020024,0.011895,0.039316,0.038335,...,-0.025981,0.045637,-0.003222,0.025805,-0.016876,-0.004157,-0.028858,-0.003071,1822.0,0.021848
2024-10-10,0.009333,0.033240,-0.009469,-0.041233,-0.011097,0.020024,0.011895,0.039316,0.038335,0.021166,...,0.045637,-0.003222,0.025805,-0.016876,-0.004157,-0.028858,-0.003071,0.045688,1823.0,0.016288


## Agora temos vetor, que é um vetor cujos elementos são tabelas com W+1 linhas e C+1 Colunas

### Proxima etapa é separar o alvo e depois realizar um bagging para cada elemento desse vetor, criando $\gamma$ outras tabelas

In [262]:
vetor_alvos = []
for i in range(len(vetor)):
    vetor_alvos.append((pd.DataFrame(vetor[i].iloc[-1])).T)
    vetor[i] = vetor[i].drop(vetor[i].index[-1])

In [263]:
def Bagging(df, n, gamma):
    df_bagged = []
    for i in range(gamma):
        aux = df.sample(n = n, random_state = i)
        aux = aux.sort_index()
        df_bagged.append(aux)
    return df_bagged


In [264]:
n = 270       #####       HIPERPARÂMETRO      #####
gamma = 50      #####       HIPERPARÂMETRO      #####
X_inicial = vetor[0] # depois para backtest basta fazer um for e plotar num grafico sei la
alvo = vetor_alvos[0]
X_bagged = Bagging(X_inicial, n, gamma)
len(X_bagged)
X_bagged[0]
df = X_bagged[0][['ETH_Return_Day_30', 'BTC_Return_Day_30', 'SOL_Return_Day_30','ETH_Return_Today']].copy()
df

Unnamed: 0_level_0,ETH_Return_Day_30,BTC_Return_Day_30,SOL_Return_Day_30,ETH_Return_Today
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-12-16,-0.013766,-0.020943,-0.034238,0.009422
2023-12-17,0.009422,0.029980,0.046058,-0.017930
2023-12-18,-0.017930,-0.008317,-0.018426,0.010977
2023-12-19,0.010977,0.032165,0.118729,0.016946
2023-12-20,0.016946,0.004957,0.132700,0.038104
...,...,...,...,...
2024-10-06,-0.007471,-0.009313,-0.016876,0.007423
2024-10-07,0.007423,-0.001684,-0.004157,-0.029767
2024-10-08,-0.029767,-0.025261,-0.028858,0.006555
2024-10-09,0.006555,-0.005090,-0.003071,0.021848


## Agora temos o X_bagged, um vetor com gamma elementos, que cada elemento contem n linhas e C colunas

### Proxima etapa é para cada um desses elementos de X_bagged, separar em K tuplas de Training e Validation, em uma proporção 90-10.

In [265]:
# Etapa Intermediaria, para cada um desses elementos de X_bagged, vamos calcular o valor previsto do dia 11
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [266]:
df

Unnamed: 0_level_0,ETH_Return_Day_30,BTC_Return_Day_30,SOL_Return_Day_30,ETH_Return_Today
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-12-16,-0.013766,-0.020943,-0.034238,0.009422
2023-12-17,0.009422,0.029980,0.046058,-0.017930
2023-12-18,-0.017930,-0.008317,-0.018426,0.010977
2023-12-19,0.010977,0.032165,0.118729,0.016946
2023-12-20,0.016946,0.004957,0.132700,0.038104
...,...,...,...,...
2024-10-06,-0.007471,-0.009313,-0.016876,0.007423
2024-10-07,0.007423,-0.001684,-0.004157,-0.029767
2024-10-08,-0.029767,-0.025261,-0.028858,0.006555
2024-10-09,0.006555,-0.005090,-0.003071,0.021848


In [267]:
X = df.drop(columns=['ETH_Return_Today']).values
y = df['ETH_Return_Today'].values

scaler_X = MinMaxScaler()
scaler_y = MinMaxScaler()
X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))


# Converta X_final e y_final em arrays numpy
X_array = np.array(X_scaled)
y_array = np.array(y_scaled)
# Crie as sequências com o número de timesteps
X_lstm = []
y_lstm = []
timesteps = 30
for i in range(timesteps, len(X_array)):
    X_lstm.append(X_array[i-timesteps+1:i+1, :])  # Pega 'timesteps' linhas anteriores
    y_lstm.append(y_array[i])  # Alvo é o valor do dia seguinte

# Converta listas para arrays numpy
X_lstm = np.array(X_lstm)
y_lstm = np.array(y_lstm)

# Separar o X_train (todos exceto o último elemento)
X_train_and_val = X_lstm[:-1]
y_train_and_val = y_lstm[:-1]

# Separar o X_test (apenas o último elemento)
X_test = X_lstm[-1:]
y_test = y_lstm[-1:]

# Separar o X_train_and_val em X_val e X_train
X_train = X_train_and_val
y_train = y_train_and_val


y_real = scaler_y.inverse_transform(y_test)[0][0]
y_real


0.016287544019312178

In [268]:
def Model(df):
    X = df.drop(columns=['ETH_Return_Today']).values
    y = df['ETH_Return_Today'].values
    
    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()
    X_scaled = scaler_X.fit_transform(X)
    y_scaled = scaler_y.fit_transform(y.reshape(-1, 1))


    # Converta X_final e y_final em arrays numpy
    X_array = np.array(X_scaled)
    y_array = np.array(y_scaled)
    # Crie as sequências com o número de timesteps
    X_lstm = []
    y_lstm = []
    timesteps = 30
    for i in range(timesteps, len(X_array)):
        X_lstm.append(X_array[i-timesteps+1:i+1, :])  # Pega 'timesteps' linhas anteriores
        y_lstm.append(y_array[i])  # Alvo é o valor do dia seguinte
    
    # Converta listas para arrays numpy
    X_lstm = np.array(X_lstm)
    y_lstm = np.array(y_lstm)
    # Separar o X_train (todos exceto o último elemento)
    X_train_and_val = X_lstm
    y_train_and_val = y_lstm
    # Separar o X_train_and_val em X_val e X_train
    X_train = X_train_and_val
    y_train = y_train_and_val

    # Defina o modelo
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=False, input_shape=(X_train.shape[1], X_train.shape[2]))) # [samples, time steps, features]
    model.add(Dense(units=1))
    model.compile(optimizer='adam', loss='mean_squared_error')

    # Treine o modelo
    model.fit(X_train, y_train, epochs=50, batch_size=239)

    y_hat_scaled = model.predict(X_test)
    y_hat = scaler_y.inverse_transform(y_hat_scaled)[0][0]

    y_real = scaler_y.inverse_transform(y_test)[0][0]

    return y_hat
    


In [284]:
X_inicial = vetor[1] # X_inicial eh uma matriz de W linhas e C+1 colunas
alvo = vetor_alvos[1]
valor = alvo.iloc[-1, -1]
alvo


Unnamed: 0,ETH_Return_Day_1,ETH_Return_Day_2,ETH_Return_Day_3,ETH_Return_Day_4,ETH_Return_Day_5,ETH_Return_Day_6,ETH_Return_Day_7,ETH_Return_Day_8,ETH_Return_Day_9,ETH_Return_Day_10,...,SOL_Return_Day_23,SOL_Return_Day_24,SOL_Return_Day_25,SOL_Return_Day_26,SOL_Return_Day_27,SOL_Return_Day_28,SOL_Return_Day_29,SOL_Return_Day_30,ETH_Age,ETH_Return_Today
2024-10-10,0.009333,0.03324,-0.009469,-0.041233,-0.011097,0.020024,0.011895,0.039316,0.038335,0.021166,...,0.045637,-0.003222,0.025805,-0.016876,-0.004157,-0.028858,-0.003071,0.045688,1823.0,0.016288


In [285]:
import math
df = X_bagged[0][['ETH_Return_Day_30', 'BTC_Return_Day_30', 'SOL_Return_Day_30','ETH_Return_Today']].copy()
n = 270       #####       HIPERPARÂMETRO      #####
gamma = 50      #####       HIPERPARÂMETRO      #####
ganho = 1
for i in range(len(vetor)):
    X_inicial = vetor[i] # X_inicial eh uma matriz de W linhas e C+1 colunas
    alvo = vetor_alvos[i][['ETH_Return_Day_30', 'BTC_Return_Day_30', 'SOL_Return_Day_30','ETH_Return_Today']].copy()
    valor_alvo = alvo.iloc[-1, -1]
    X_bagged = Bagging(X_inicial, n, gamma) # X_bagged vira um vetor em que cada elemento eh uma coluna de n linhas e C+1 colunas, que são os baggeds de um elemento do vetor VETOR
    sum_y_hat = 0
    for j in range(len(X_bagged)):
        df = X_bagged[i].copy()
        y_hat = Model(df)
        sum_y_hat += y_hat
    y_hat = sum_y_hat / len(X_bagged)   # y_hat é o retorno logaritmo previsto
    
    y_hat_perc = math.exp(y_hat) - 1
    if(y_hat_perc > 0):
        y_real_perc = math.exp(valor_alvo) - 1
        ganho *= (1 + y_real_perc)
print(ganho)

  super().__init__(**kwargs)


Epoch 1/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - loss: 0.1584
Epoch 2/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0721 
Epoch 3/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0151 
Epoch 4/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0811 
Epoch 5/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0600 
Epoch 6/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 0.0151 
Epoch 7/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 0.0579 
Epoch 8/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 0.0744 
Epoch 9/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0323 
Epoch 10/50
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 0.0162 
Epoch 11/50
[1m2/2

ValueError: Exception encountered when calling LSTMCell.call().

[1mDimensions must be equal, but are 3 and 91 for '{{node sequential_20_1/lstm_16_1/lstm_cell_1/MatMul}} = MatMul[T=DT_FLOAT, grad_a=false, grad_b=false, transpose_a=false, transpose_b=false](sequential_20_1/lstm_16_1/strided_slice_1, sequential_20_1/lstm_16_1/lstm_cell_1/Cast/ReadVariableOp)' with input shapes: [1,3], [91,200].[0m

Arguments received by LSTMCell.call():
  • inputs=tf.Tensor(shape=(1, 3), dtype=float32)
  • states=('tf.Tensor(shape=(1, 50), dtype=float32)', 'tf.Tensor(shape=(1, 50), dtype=float32)')
  • training=False

In [96]:
# Faça previsões
y_pred_scaled = model.predict(X_test)

# Inverter a normalização para obter os valores reais
y_pred = scaler_y.inverse_transform(y_pred_scaled)

# Visualize algumas previsões
print(f"Previsões: {y_pred[:5]}")
print(f"Valores reais: {scaler_y.inverse_transform(y_test[:5])}")

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step
Previsões: [[ 0.0065013 ]
 [-0.00666994]
 [-0.00018413]
 [ 0.00321763]
 [-0.0009015 ]]
Valores reais: [[ 0.05008975]
 [-0.00712229]
 [-0.06526123]
 [ 0.04934977]
 [ 0.00062212]]


In [32]:
"""val_size = 0.1*len(X_bagged)
for i in range(len(X_bagged)):
    for j in range(int(len(X_bagged) - val_size) + 1):
        validation = X_bagged[i].iloc[j : j + val_size]
        train = X_bagged[i].drop(X_bagged.index[j : j + val_size])
validation"""


'val_size = 0.1*len(X_bagged)\nfor i in range(len(X_bagged)):\n    for j in range(int(len(X_bagged) - val_size) + 1):\n        validation = X_bagged[i].iloc[j : j + val_size]\n        train = X_bagged[i].drop(X_bagged.index[j : j + val_size])\nvalidation'