In [2]:
import pandas as pd
import numpy as np
import yfinance as yf
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import MinMaxScaler
import time
import matplotlib.pyplot as plt
import math
import copy
from concurrent.futures import ThreadPoolExecutor
from concurrent.futures import ProcessPoolExecutor
from tensorflow.keras.layers import Input
from sklearn.preprocessing import StandardScaler
from keras.layers import Dropout

# Índice

1. Coleta de dados
   1. Escolha das Criptomoedas que serão trabalhadas
   2. Criação do dicionário 'data'
   3. Criação do dicionário 'data_complete'
2. Tratamento das tabelas
   1. Criar vetor de dias - First Transform
   2. Separar em treinamento e teste - Permitir Bagging
   3. Bagging
3. Machine Learning
   1. Função do Modelo
   2. Treinamento com Paralelismo e Geração dos Expected Values
4. Preparação para Portifólio
   1. Preparar data_complete para conter volatilidade
   2. Criar vetor de médias
   3. Criar matriz de covariâncias
5. Alocação de Portifólio
   1. Para cada intervalo de tempo, selecionar pesos de portifólio
   2. Calcular retorno desse portifólio no intervalo de tempo
   3. Salvar retorno e analisar resultados

In [3]:
# HIPERPARÂMETROS:
start_date = '2016-08-21'
W = 80  # Lembrando que o tamanho do treinamento será W - w
w = 50

# 1. Coleta de dados

## 1.1 Escolha das Criptomoedas que serão trabalhadas

In [4]:
# Criptomoedas que vamos escolher para nosso portifólio
cryptos = [
    'BTC-USD', 'ETH-USD','LTC-USD', 'ADA-USD',
    'DOT-USD', 'LINK-USD','SOL-USD',
    'TRX-USD'
]

## 1.2 Criação do dicionário 'data'

In [5]:
## Função para calcular o retorno logarítmico
def log_return(series):
    return np.log(series['Close'] / series['Open'])

In [6]:
data = {}
for crypto in cryptos:
    # Baixar dados e calcular retorno logarítmico
    df = yf.download(crypto, start=start_date, end=pd.to_datetime("today").strftime("%Y-%m-%d"), interval='1d')
    df['Return'] = log_return(df)

    # Construir DataFrame final, pegando o retorno do dia anterior e o atual
    df_final = pd.DataFrame({
        #'Volume': df['Volume'],
        'Crypto_Return_Today': df['Return']
    }).dropna()

    data[crypto] = df_final

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


In [7]:
data['ETH-USD']

Unnamed: 0_level_0,Crypto_Return_Today
Date,Unnamed: 1_level_1
2017-11-09,0.038888
2017-11-10,-0.069126
2017-11-11,0.052501
2017-11-12,-0.021787
2017-11-13,0.031076
...,...
2024-10-28,0.023570
2024-10-29,0.027884
2024-10-30,0.007453
2024-10-31,-0.054687


In [8]:
"""aux = yf.download('BTC-USD', start=start_date, end=pd.to_datetime("today").strftime("%Y-%m-%d"), interval='1wk')
aux['Return'] = log_return(aux)
aux = pd.DataFrame({'BTC-Return': aux['Return']})
aux"""


'aux = yf.download(\'BTC-USD\', start=start_date, end=pd.to_datetime("today").strftime("%Y-%m-%d"), interval=\'1wk\')\naux[\'Return\'] = log_return(aux)\naux = pd.DataFrame({\'BTC-Return\': aux[\'Return\']})\naux'

In [9]:
"""for crypto in cryptos:
    data[crypto] = pd.merge(aux, data[crypto],  left_index=True, right_index=True, how='outer').dropna()"""

"for crypto in cryptos:\n    data[crypto] = pd.merge(aux, data[crypto],  left_index=True, right_index=True, how='outer').dropna()"

In [10]:
print(len(data['ETH-USD']))
data['LTC-USD']

2550


Unnamed: 0_level_0,Crypto_Return_Today
Date,Unnamed: 1_level_1
2016-08-21,-0.001200
2016-08-22,0.010953
2016-08-23,0.070155
2016-08-24,-0.018953
2016-08-25,-0.014212
...,...
2024-10-28,-0.005068
2024-10-29,0.045062
2024-10-30,-0.029316
2024-10-31,-0.037382


In [11]:
"""global data_std
data_std = {}
global data_mean
data_mean = {}
for crypto in cryptos:
    data_std[crypto] = data[crypto].std()
    data_mean[crypto] = data[crypto].mean()
    data[crypto] = (data[crypto] - data[crypto].mean()) / data[crypto].std()"""

'global data_std\ndata_std = {}\nglobal data_mean\ndata_mean = {}\nfor crypto in cryptos:\n    data_std[crypto] = data[crypto].std()\n    data_mean[crypto] = data[crypto].mean()\n    data[crypto] = (data[crypto] - data[crypto].mean()) / data[crypto].std()'

In [12]:
print(len(data['ETH-USD']))
data['ETH-USD']

2550


Unnamed: 0_level_0,Crypto_Return_Today
Date,Unnamed: 1_level_1
2017-11-09,0.038888
2017-11-10,-0.069126
2017-11-11,0.052501
2017-11-12,-0.021787
2017-11-13,0.031076
...,...
2024-10-28,0.023570
2024-10-29,0.027884
2024-10-30,0.007453
2024-10-31,-0.054687


## 1.3 Criação do dicionário 'data_complete'

In [13]:
data_complete = {}
for crypto in cryptos:
    # Baixar dados e calcular retorno logarítmico
    df = yf.download(crypto, start=start_date, end=pd.to_datetime("today").strftime("%Y-%m-%d"), interval='1d')
    df['Return'] = log_return(df)

    # Construir DataFrame final, pegando o retorno do dia anterior e o atual
    df_final = pd.DataFrame({
        #'Volume': df['Volume'],
        'Crypto_Return_Today': df['Return']
    }).dropna()

    # Adicionar a coluna 'exp_value' com valores NaN
    df_final['exp_value'] = np.nan

    # Armazenar no dicionário
    data_complete[crypto] = df_final

[*********************100%%**********************]  1 of 1 completed


[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed


In [14]:
print(len(data_complete['ETH-USD']))
data_complete['ETH-USD']

2550


Unnamed: 0_level_0,Crypto_Return_Today,exp_value
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-11-09,0.038888,
2017-11-10,-0.069126,
2017-11-11,0.052501,
2017-11-12,-0.021787,
2017-11-13,0.031076,
...,...,...
2024-10-28,0.023570,
2024-10-29,0.027884,
2024-10-30,0.007453,
2024-10-31,-0.054687,


In [15]:
"""data_complete_std = {}
data_complete_mean = {}
for crypto in cryptos:
    data_complete_std[crypto] = data_complete[crypto].std()
    data_complete_mean[crypto] = data_complete[crypto].mean()
    data_complete[crypto] = (data_complete[crypto] - data_complete[crypto].mean()) / data_complete[crypto].std()"""

'data_complete_std = {}\ndata_complete_mean = {}\nfor crypto in cryptos:\n    data_complete_std[crypto] = data_complete[crypto].std()\n    data_complete_mean[crypto] = data_complete[crypto].mean()\n    data_complete[crypto] = (data_complete[crypto] - data_complete[crypto].mean()) / data_complete[crypto].std()'

In [16]:
print(len(data_complete['ETH-USD']))
data_complete['ETH-USD']

2550


Unnamed: 0_level_0,Crypto_Return_Today,exp_value
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2017-11-09,0.038888,
2017-11-10,-0.069126,
2017-11-11,0.052501,
2017-11-12,-0.021787,
2017-11-13,0.031076,
...,...,...
2024-10-28,0.023570,
2024-10-29,0.027884,
2024-10-30,0.007453,
2024-10-31,-0.054687,


# 2. Tratamento das tabelas

## 2.1 Criar vetor de dias - First Transform

In [17]:
# Criando a função First Transformation, que recebe uma tabela contendo os retornos de todos os dias dos ultimos X anos e retorno um vetor, que cada elemento
# é uma tabela do retorno de W+1 dias.
def FirstTransform(df, W):
    vet = []
    Linhas, Colunas = df.shape
    for i in range(Linhas, W+1, -1):
        vet.append(df.iloc[(i-W-1):i])
    return vet

In [18]:
#W = 
for crypto in cryptos:
    data[crypto] = FirstTransform(data[crypto], W)

In [19]:
print(len(data['ETH-USD']))
data['ETH-USD'][0]

2469


Unnamed: 0_level_0,Crypto_Return_Today
Date,Unnamed: 1_level_1
2024-08-13,-0.007602
2024-08-14,-0.015158
2024-08-15,-0.035470
2024-08-16,0.008948
2024-08-17,0.008211
...,...
2024-10-28,0.023570
2024-10-29,0.027884
2024-10-30,0.007453
2024-10-31,-0.054687


## 2.2 Separar em treinamento e teste - Permitir Bagging

In [20]:
test_target = {}
for crypto in cryptos:
    row = []
    for i in range(len(data[crypto])):
        row.append((pd.DataFrame(data[crypto][i].iloc[-1])).T)
        data[crypto][i] = data[crypto][i].drop(data[crypto][i].index[-1])
    test_target[crypto] = row

In [21]:
data['ETH-USD'][0], test_target['ETH-USD'][0]

(            Crypto_Return_Today
 Date                           
 2024-08-13            -0.007602
 2024-08-14            -0.015158
 2024-08-15            -0.035470
 2024-08-16             0.008948
 2024-08-17             0.008211
 ...                         ...
 2024-10-27             0.010431
 2024-10-28             0.023570
 2024-10-29             0.027884
 2024-10-30             0.007453
 2024-10-31            -0.054687
 
 [80 rows x 1 columns],
             Crypto_Return_Today
 2024-11-01            -0.001585)

## 2.3 Bagging

In [22]:
# Definindo uma função bagging, que recebe um dataframe e retorna um vetor de dataframes.
def Bagging(df, n, gamma):
    df_bagged = []
    for i in range(gamma):
        aux = df.sample(n = n, random_state = i)
        aux = aux.sort_index()
        df_bagged.append(aux)
    return df_bagged

# 3. Machine Learning

* vamos peimeiro fazer do modo 1 para todos os dias
* depois aplicar para os ultimos 290 dias o modo 2 (contém bagging e purge K-Fold-CV)

## 3.1 Função do Modelo


In [23]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2


# Função para criar o modelo
def create_model(a, b):
    model = Sequential()
    model.add(Input(shape=(a, b)))  # Define a camada de entrada
    model.add(LSTM(units=1, return_sequences=False))  # Primeira camada LSTM
    model.add(Dropout(0.2))
    model.add(Dense(units=1))  # Camada de saída
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model


In [24]:
# Função do modelo LSTM
def Model(df, time_step, features, model=None):
    # Normalizando o dataframe
    #scaler = StandardScaler()
    #df_scaled = scaler.fit_transform(df)
    #df_array = np.array(df_scaled)
    df_array = np.array(df)

    # Transformando no formato LSTM
    X_lstm = []
    y_lstm = []
    for i in range(time_step, len(df_array)):
        X_lstm.append(df_array[i-time_step:i, :])  # Pega 'timesteps' linhas anteriores
        y_lstm.append(df_array[i])  # Alvo é o valor do dia seguinte
    X_lstm = np.array(X_lstm)
    y_lstm = np.array(y_lstm)

    # Separar o X_train (todos exceto o último elemento)
    X_train = X_lstm[:-1]
    y_train = y_lstm[:-1]
    y_train = pd.DataFrame(y_train)[features - 1].to_numpy()
    y_train = y_train.reshape((len(y_train), 1))
    y_train.shape

    # X_test e y_test
    X_test = X_lstm[-1:]
    y_test = y_lstm[-1:]
    y_test = y_lstm[-1:]
    y_test = pd.DataFrame(y_test)[features - 1].to_numpy()
    y_test = y_test.reshape((len(y_test), 1))

    # Treinar o modelo específico para a criptomoeda
    model.fit(X_train, y_train, epochs=50, verbose=0)

    # Fazer a previsão
    y_hat_scaled = model.predict(X_test)
    

    return y_hat_scaled

##  3.2 Treinamento com Paralelismo e Geração dos Expected Values

In [25]:
def Add(y_hat, data_complete, ind, crypto):
    data_complete[crypto].at[ind, 'exp_value'] = y_hat

In [26]:
import tensorflow as tf
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
    print("Usando GPU:", physical_devices[0])
else:
    print("Usando CPU")

Usando CPU


In [27]:
for crypto in cryptos:
    print(len(data[crypto]))
    print(len(test_target[crypto]))
len(data['ETH-USD'][0])

2914
2914
2469
2469
2914
2914
2469
2469
1454
1454
2469
2469
1586
1586
2469
2469


80

In [30]:
test_target['ETH-USD'][2131]

Unnamed: 0,Crypto_Return_Today
2019-01-01,0.05399


In [29]:
# 1250, 1350

In [31]:
# Função que processa cada criptomoeda (paralelismo será aplicado aqui)
def process_crypto(crypto, data, test_target, data_complete, time_step, features):
    # Criar um modelo para cada criptomoeda
    for day in range(2001, 2132, 1):
        if len(data[crypto]) > day:
            model = create_model(time_step, features)

            df_train = data[crypto][day].copy()  # df é uma tabela de 300 linhas e uma coluna
            df_test = test_target[crypto][day].copy()  # df_test é uma tabela com 1 linha e uma coluna
            df = pd.concat([df_train, df_test], ignore_index=True)  # Unindo as duas tabelas

            scaler = StandardScaler()
            df = scaler.fit_transform(df)

            y_hat_scaled = Model(df, time_step, features, model=model)  # Chamar o modelo para previsão

            y_hat_scaled = np.repeat(y_hat_scaled, features, axis=1)
            y_hat_scaled = pd.DataFrame(y_hat_scaled, columns=['Predictions_1'])
            
            y_hat = scaler.inverse_transform(y_hat_scaled)[0][features - 1]

            Add(y_hat, data_complete, df_test.index[0], crypto)  # Adicionar o resultado na tabela final

    print("\nMoeda: ", crypto, " processada")
       
# Número de samples que vamos testar o modelo
time_step = w 
features = 1

# Criar o executor para rodar as criptomoedas em paralelo
with ThreadPoolExecutor() as executor:
    # Executar o processo para cada criptomoeda simultaneamente
    futures = [
        executor.submit(process_crypto, crypto, data, test_target, data_complete, time_step, features)
        for crypto in cryptos
    ]

    # Aguardar a conclusão de todas as threads
    for future in futures:
        future.result()

print("Processamento em paralelo finalizado.")


Moeda:  DOT-USD  processada

Moeda:  SOL-USD  processada
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 334ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 345ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 470ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 482ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 465ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 528ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 333ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 355ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 383ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 533ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 483ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 522ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 275m

In [32]:
df = data_complete['ETH-USD']
df.dropna()

Unnamed: 0_level_0,Crypto_Return_Today,exp_value
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-01,0.053990,-0.000987
2019-01-02,0.091295,0.000280
2019-01-03,-0.039837,-0.000948
2019-01-04,0.037363,-0.000117
2019-01-05,0.008395,-0.000147
...,...,...
2019-05-07,-0.015362,-0.013512
2019-05-08,0.006126,0.010823
2019-05-09,-0.003883,0.000736
2019-05-10,0.016479,-0.002201


In [33]:
data_complete['SOL-USD'].dropna()

Unnamed: 0_level_0,Crypto_Return_Today,exp_value
Date,Unnamed: 1_level_1,Unnamed: 2_level_1


In [34]:
data_aux = {}
for crypto in cryptos:
    data_aux[crypto] = pd.read_csv(f"{crypto}.csv", index_col='Date')
    #data_complete[crypto] = pd.read_csv(f"{crypto}.csv")

In [35]:
data_aux['ETH-USD'].dropna()

Unnamed: 0_level_0,Crypto_Return_Today,exp_value
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-05-11,0.115329,0.001932
2019-05-12,-0.035851,0.006155
2019-05-13,0.049082,0.004553
2019-05-14,0.098707,0.007870
2019-05-15,0.130064,0.027431
...,...,...
2024-10-16,0.001948,0.009319
2024-10-17,-0.002653,0.001030
2024-10-18,0.014005,0.000691
2024-10-19,0.002710,0.001672


In [41]:
data_aux['ETH-USD'][-1990:]
#data_complete['ETH-USD'][:-2002]

Unnamed: 0_level_0,Crypto_Return_Today,exp_value
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-05-11,0.115329,0.001932
2019-05-12,-0.035851,0.006155
2019-05-13,0.049082,0.004553
2019-05-14,0.098707,0.007870
2019-05-15,0.130064,0.027431
...,...,...
2024-10-16,0.001948,0.009319
2024-10-17,-0.002653,0.001030
2024-10-18,0.014005,0.000691
2024-10-19,0.002710,0.001672


In [42]:
for crypto in cryptos:
    df_aux = data_complete[crypto]
    df_aux = pd.concat([df_aux[:-2002], data_aux[crypto][-1990:]])
    df_aux.index = pd.to_datetime(df_aux.index)
    data_complete[crypto] = df_aux
    

In [43]:
for crypto in cryptos:
    data_complete[crypto]= data_complete[crypto][~data_complete[crypto].index.duplicated(keep='first')]

In [44]:
data_complete['ETH-USD'].dropna()

Unnamed: 0_level_0,Crypto_Return_Today,exp_value
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-01,0.053990,-0.000987
2019-01-02,0.091295,0.000280
2019-01-03,-0.039837,-0.000948
2019-01-04,0.037363,-0.000117
2019-01-05,0.008395,-0.000147
...,...,...
2024-10-16,0.001948,0.009319
2024-10-17,-0.002653,0.001030
2024-10-18,0.014005,0.000691
2024-10-19,0.002710,0.001672


In [45]:
for crypto in cryptos:
    # Define o nome do arquivo como o nome da criptomoeda seguido de .csv
    filename = f"{crypto}.csv"
    
    # Salva o DataFrame data_complete[crypto] no arquivo .csv
    data_complete[crypto].to_csv(filename, index=True)

    print(f"Arquivo {filename} salvo com sucesso!")

Arquivo BTC-USD.csv salvo com sucesso!
Arquivo ETH-USD.csv salvo com sucesso!
Arquivo LTC-USD.csv salvo com sucesso!
Arquivo ADA-USD.csv salvo com sucesso!
Arquivo DOT-USD.csv salvo com sucesso!
Arquivo LINK-USD.csv salvo com sucesso!
Arquivo SOL-USD.csv salvo com sucesso!
Arquivo TRX-USD.csv salvo com sucesso!


## 3.3 Método 2 para os ultimos X intervalos

* Aplicar bagging
* Aplicar CV e Otimização de HP
* Fazer treinamento para uma janela W maior de tempo

In [None]:
# for day in data[crypto]:
#       day = bagging(day)

# for day in data[crypto]:
#       for sample in day:
#               sample = CV(sample) # transforma sample em um vetor de folds. Ultimos 10% do fold são validation e 90% são pure train

# for day in data[crypto]:
#       for sample in day:
#               for fold in sample:
#                       y_hat = model(fold)
#               y_hat_mean = y_hat / len(sample)
#       y_hat_mean_mean = y_hat_mean / len(day)