#Técnica de OPTICS testadas no Google Cloud

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
np.random.seed(1) # NumPy
import random
random.seed(3) # Python
import tensorflow as tf
tf.random.set_seed(2) # Tensorflow
#session_conf = tf.ConfigProto(intra_op_parallelism_threads=1,
#                              inter_op_parallelism_threads=1)
from keras import backend as K
#sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
#K.set_session(sess)


import pandas as pd
import pickle
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from sklearn.manifold import TSNE

from keras.layers import Dense, Flatten, LSTM, Dropout

import sys
import os
import datetime

In [3]:
tf.config.list_physical_devices('GPU')

[]

In [4]:
sys.path.append('/content/drive/MyDrive/Python/TCC-Cloud/Classes')
pasta = str("/content/drive/MyDrive/Python/TCC-Cloud/Data/")

In [5]:
%load_ext autoreload
%aimport class_SeriesAnalyser, class_Trader, class_DataProcessor
%autoreload 1

series_analyser = class_SeriesAnalyser.SeriesAnalyser()
trader = class_Trader.Trader()
data_processor = class_DataProcessor.DataProcessor()

In [6]:
preços = pd.read_csv(pasta + "Preços.csv",index_col='Data',parse_dates=True)
retornos = pd.read_csv(pasta + "Retornos.csv",index_col='Data',parse_dates=True)
semestres = pd.read_csv(pasta + "Semestres.csv",parse_dates=True)

Datas = pd.DataFrame(retornos.index)
Datas['indice'] = Datas.index
log_data = np.log(preços)

In [7]:
min_half_life = 0 # number of points in a day
max_half_life = 2520 #~number of points in a year: 78*252
subsample = 0

no_pairs = 5
duration_limit = 50
years = 2023 - 1995
threshold = 2

alpha = 0.05

days, num_assets = np.shape(retornos)
Rpair = np.zeros((days, no_pairs))

stop_loss = float('-inf')


# Funções

In [8]:
# MQO para encontrar o coeficiente de cointegração e criando a serie do spread
def OLS(data_ticker1, data_ticker2):
    spread = sm.OLS(data_ticker1,data_ticker2)
    spread = spread.fit()
    return data_ticker1 + (data_ticker2 * -spread.params[0]), spread.params[0]


# ADF test
def ADF(spread):
    return ts.adfuller(spread) # H0: Raiz unitária.


# Encontra o coeficiente de cointegração e realiza o ADF test
def ADF_test(data_ticker1, data_ticker2):
    ols = OLS(data_ticker1, data_ticker2)
    spread = ols[0]
    gamma = ols[1]
    return ADF(spread),gamma


# Encontra os pares cointegrados
def find_cointegrated_pairs_mod(data):
    try:
        print(f"Finding cointegrated pairs for shape {np.shape(data)}")
        n = data.shape[1]
        pvalue_matrix = np.ones((n, n))
        gammas_matrix = np.ones((n, n))
        keys = data.keys()
        for i in range(n):
            for j in range(i+1, n):
                S1 = keys[i]
                S2 = keys[j]
                #print(f"Testing cointegration for pairs {S1} and {S2}")
                result = ADF_test(data[S1], data[S2])
                gammas_matrix[i, j] =result[1] # gamma
                pvalue = result[0][1] # pvalue
                pvalue_matrix[i, j] = pvalue
            if(i%50 == 0):
                print(f"Finished pairs for stock {keys[i]}")
        return pvalue_matrix, gammas_matrix
    except Exception as e:
        print(e)


# Ordenando os melhores pares
def top_coint_pairs(data,pvalue_matrix,gamma, alpha,semestre,n):
#alpha = nivel de significancia para o teste ADF
#n = top n ativos com o menor pvalue
    alpha_filter = np.where(pvalues < alpha)
    pvalues_f = pvalues[alpha_filter] # pvalores menores que alpha
    #print(f"Alpha filter rows len {len(alpha_filter[0])} | cols len {len(alpha_filter[1])} | value {alpha_filter}")
    #print(f"pvalues_f len: {np.shape(pvalues_f)} | value: {pvalues_f}")
    stock_a = data.columns[list(alpha_filter)[0]] # relacionando o pvalor com a ação A
    stock_b = data.columns[list(alpha_filter)[1]] # relacionando o pvalor com a ação B
    gammas_f = gammas[alpha_filter] # relacionando o pvalor com o gamma
    N = len(list(alpha_filter[0])) # quantidade de pares cointegrados

    d = []
    for i in range(N):
        pair_dict = {
            'Stock A': stock_a[i],
            'Stock B': stock_b[i],
            'P-Values': pvalues_f[i],
            'Gamma': gammas_f[i],
            'Semestre' : semestre
        }
        #if(i%2000 == 0):
        #print(f"Appending pair dict: {pair_dict}")
        d.append(pair_dict)

    return pd.DataFrame(d).sort_values(by="P-Values").iloc[:n,]


# Calcula os retornos da carteira e armazenando em um data frame
def calculate_profit(pair, spread, threshold, par1, par2, resumo, semester, gamma):
    #print(f"Calculating profits for pair {par1}-{par2}")

    #print(f"Spread series: {spread}")

    date_format = "%Y-%m-%d"

    log_ret = spread.diff() # log return eh o incremento
    #print(f"log_ret len: {len(log_ret)} | variable: {log_ret}")
    dias = spread.index
    #print(f"Dias len: {len(dias)} | variable: {dias}")
    z_score = (spread-spread.mean())/spread.std()
    z_score.plot()
    #print(f"Z-score index: {z_score.index} | variable: {z_score}")
    portfolio_return = []
    pos = 0 # 0: sem posição aberta
            # 1: Comprei o meu portfolio h = (1,-gamma)
            # -1: Vendi o meu portfolio h = -(1,-gamma)

    dias_abertura = []
    dias_fechamento = []

    count = 0
    dia_abertura = 0
    dia_fechamento = 0

    closing_threshold = 0.0

    for i in range(1, len(z_score)):

        if (z_score.iloc[i][0] > threshold) and (pos == 0):
            # Posição fechada no par e com sinal short 1st e long 2nd
            pos = -1

            count += 1
            dia_abertura = dias[i] - dias[0]
            retornos_op = []


        elif (z_score.iloc[i][0] < -threshold)  and (pos == 0):
            # Posição fechada no par e com sinal de long 1st e short 2nd
            pos = 1

            count += 1
            dia_abertura = dias[i] - dias[0]
            retornos_op = []

        else:
            #print(f"Dia {i} | Pos {pos} | log_ret {log_ret[i]} | S1 return {returns[par1][i]} | S2 return {returns[par2][i]} | Net {pos*(returns[par1][i] - gamma*returns[par2][i])}")
            if (pos != 0) and ((dias[i] - dias[0] - dia_abertura) == duration_limit):
                #Fechando operações maiores que 50 dias
                portfolio_return.append(log_ret.iloc[i][0]*pos)
                pos = 0
                dia_fechamento = dias[i] - dias[0]
                delta_dias = dia_fechamento - dia_abertura
                if sum(retornos_op) < stop_loss:
                    print(f"Sum: {sum(retornos_op)}")
                    retorno_op = stop_loss
                    print(f"Retorno_op: {retorno_op}")
                else:
                    retornos_op.append(log_ret.iloc[i][0]*pos)
                    retorno_op = pd.Series(retornos_op).sum()

                #Rpair[i-1, pair] = log_ret.iloc[i][0]*pos

                resumo.append([count, semester, dia_abertura, dia_fechamento, delta_dias, retorno_op, par1, par2, True])


            elif (pos == 1) and (z_score.iloc[i][0] >= -closing_threshold or sum(retornos_op) < stop_loss):
                # Posição vendida aberta no par e com sinal de convergência
                portfolio_return.append(log_ret.iloc[i][0]*pos)
                pos = 0

                dia_fechamento = dias[i] - dias[0]
                delta_dias = dia_fechamento - dia_abertura
                if sum(retornos_op) < stop_loss:
                    print(f"Sum: {sum(retornos_op)}")
                    retorno_op = stop_loss
                    print(f"Retorno_op: {retorno_op}")
                else:
                    retornos_op.append(log_ret.iloc[i][0]*pos)
                    retorno_op = pd.Series(retornos_op).sum()

                #Rpair[i-1, pair] = log_ret.iloc[i][0]*pos

                resumo.append([count, semester, dia_abertura, dia_fechamento, delta_dias, retorno_op, par1, par2, True])

            elif (pos == -1) and (z_score.iloc[i][0] <= closing_threshold or sum(retornos_op) < stop_loss):
                portfolio_return.append(log_ret.iloc[i][0]*pos)
                pos = 0

                dia_fechamento = dias[i] - dias[0]
                delta_dias = dia_fechamento - dia_abertura
                if sum(retornos_op) < stop_loss:
                    print(f"Sum: {sum(retornos_op)}")
                    retorno_op = stop_loss
                    print(f"Retorno_op: {retorno_op}")
                else:
                    retornos_op.append(log_ret.iloc[i][0]*pos)
                    retorno_op = pd.Series(retornos_op).sum()

                #Rpair[i-1, pair] = log_ret.iloc[i][0]*pos

                resumo.append([count, semester, dia_abertura, dia_fechamento, delta_dias, retorno_op, par1, par2, True])

            elif (pos == 1) and (z_score.iloc[i][0] < -closing_threshold):
                # Posição vendidada aberta no par aberta e sem convergência
                portfolio_return.append(log_ret.iloc[i][0]*pos)
                retornos_op.append(log_ret.iloc[i][0]*pos)

                #Rpair[i-1, pair] = log_ret.iloc[i][0]*pos

            elif (pos == -1) and (z_score.iloc[i][0] > closing_threshold):
                portfolio_return.append(log_ret.iloc[i][0]*pos)
                retornos_op.append(log_ret.iloc[i][0]*pos)

                #Rpair[i-1, pair] = log_ret.iloc[i][0]*pos

            else:
                # Sem posição aberta e nem sinal de entrada

                if pos != 0:
                    dia_fechamento = dias[i] - dias[0]
                    delta_dias = dia_fechamento - dia_abertura
                    retornos_op.append(log_ret.iloc[i][0]*pos)
                    retorno_op = pd.Series(retornos_op).sum()

                    #Rpair[i-1, pair] = log_ret.iloc[i][0]*pos

                    resumo.append([count, semester, dia_abertura, dia_fechamento, delta_dias, retorno_op, par1, par2, True])

                pos = 0

    if pos != 0:
        # Operação sem convergência
        pos = 0

        dia_fechamento = dias[i] - dias[0]
        delta_dias = dia_fechamento - dia_abertura
        retorno_op = pd.Series(retornos_op).sum()
        print(f"Par {par1}-{par2} sem convergência, retorno_op: {retorno_op}")
        resumo.append([count, semester, dia_abertura, dia_fechamento, delta_dias, retorno_op, par1, par2, False])

    #print(f"Total return: {sum(pair_returns)} | Pair returns: {pair_returns}")
    #print(f"Conversão do par: {pos}")
    total_ret = pd.Series(portfolio_return).sum()

    return total_ret, resumo


# Calcula o expoente de hurst
def get_hurst_exponent(time_series):

    # Definindo o intervalo de taus
    max_tau = round(len(time_series)/4)
    taus = range(2, max_tau)

    # Calculando a variável k
    k = [np.std(np.subtract(time_series[tau:], time_series[:-tau])) for tau in taus]

    'To calculate the Hurst exponent, we first calculate the standard deviation of the differences between a series and its lagged version, for a range of possible lags.'

    # Calculate the slope of the log plot -> the Hurst Exponent
    reg = np.polyfit(np.log(taus), np.log(k), 1)

    'We then estimate the Hurst exponent as the slope of the log-log plot of the number of lags versus the mentioned standard deviations.'

    return reg[0]

In [9]:
def cluster_size(counts):
    plt.figure()
    plt.barh(counts.index+1, counts.values)
    #plt.title('Cluster Member Counts')
    plt.yticks(np.arange(1, len(counts)+1, 1))
    plt.xlabel('ETFs within cluster', size=12)
    plt.ylabel('Cluster Id', size=12);

In [10]:
def plot_TSNE(X, clf, clustered_series_all):
    """
    This function makes use of t-sne to visualize clusters in 2d.
    """

    X_tsne = TSNE(learning_rate=1000, perplexity=25, random_state=1337).fit_transform(X)

    # visualization
    fig = plt.figure(1, facecolor='white', figsize=(15,15), frameon=True, edgecolor='black')
    plt.clf()

    # axis in the middle
    ax = fig.add_subplot(1, 1, 1, alpha=0.9)
    # Move left y-axis and bottim x-axis to centre, passing through (0,0)
    ax.spines['left'].set_position('center')
    ax.spines['left'].set_alpha(0.3)
    ax.spines['bottom'].set_position('center')
    ax.spines['bottom'].set_alpha(0.3)
    # Eliminate upper and right axes
    ax.spines['right'].set_color('none')
    ax.spines['top'].set_color('none')
    # Show ticks in the left and lower axes only
    ax.xaxis.set_ticks_position('bottom')
    ax.yaxis.set_ticks_position('left')
    ax.tick_params(which='major', labelsize=18)
    #plt.axis('off')

    # etfs in cluster
    labels = clf.labels_
    x = X_tsne[(labels!=-1), 0]
    y = X_tsne[(labels!=-1), 1]
    tickers = list(clustered_series_all[clustered_series_all != -1].index)
    plt.scatter(
        x,
        y,
        s=300,
        alpha=0.75,
        c=labels[labels!=-1],
        cmap=cm.Paired
    )
    for i, ticker in enumerate(tickers):
        plt.annotate(ticker, (x[i]-20, y[i]+12), size=15)

    # remaining etfs, not clustered
    x = X_tsne[(clustered_series_all==-1).values, 0]
    y = X_tsne[(clustered_series_all==-1).values, 1]
    tickers = list(clustered_series_all[clustered_series_all == -1].index)

    plt.scatter(
        x,
        y,
        s=150,
        alpha=0.20,
        c='black'
    )

    plt.title('OPTICS clusters visualized with t-SNE', size=16);
    #plt.xlabel('t-SNE Dim. 1', position=(0.92,0), size=20)
    #plt.ylabel('t-SNE Dim. 2', position=(0,0.92), size=20)
    ax.set_xticks(range(-300, 301, 600))
    ax.set_yticks(range(-300, 301, 600))
    plt.savefig('OPTICS_2013_2017.png', bbox_inches='tight', pad_inches=0.1)
    plt.show()

# Aplicando os Métodos

In [11]:
big_loop = 0
i = 0
pairs_df = []
past_days= 0
N_PRIN_COMPONENTS = 10

for big_loop in range(0, len(semestres) - 3):
    print(f"Starting period {big_loop} | Past days: {past_days}")

    # Listando os dias dos intervalos
    inicio = (Datas == semestres['Data'][big_loop]).query("Data == True").index[0]
    twelve_months = (Datas == semestres['Data'][big_loop + 2]).query("Data == True").index[0]
    six_months = (Datas == semestres['Data'][big_loop + 3]).query("Data == True").index[0]

    # Limpeza das ações não listadas no período
    df_prices_train = log_data.iloc[inicio:twelve_months,:].dropna(how="any",axis=1)
    df_prices_test = log_data.iloc[(twelve_months+1):six_months,:]

    # Selecionar só as colunas presentes no treino
    df_prices_test_filtered = df_prices_test[[col for col in df_prices_test.columns if col in df_prices_train.columns]]

    df_returns = data_processor.get_return_series(df_prices_train)

    print('Total number of possible pairs: ', len(df_returns.columns)*(len(df_returns.columns)-1)/2)


    X, explained_variance = series_analyser.apply_PCA(N_PRIN_COMPONENTS, df_returns,
                                                  random_state=0)
    print(explained_variance)

    # DBSCAN
    '''clustered_series_all, clustered_series, counts, clf = series_analyser.apply_DBSCAN(0.15,
                                                                                       3,
                                                                                       X,
                                                                                       df_returns)
    '''

    # OPTICS
    clustered_series_all, clustered_series, counts, clf = series_analyser.apply_OPTICS(X, df_returns, min_samples=3,
                                                                                       #max_eps=5,
                                                                                       cluster_method='xi')

    cluster_size(counts)

    print(counts,'Average cluster size: ', np.mean(counts), 'Total stocks: ', np.sum(counts))

    for label_n in range(len(counts)):
      elements_cluster_n = list(clustered_series[clustered_series == label_n].index)
      print('\nCluster {}:\n{}'.format(label_n,elements_cluster_n))

    plot_TSNE(X,clf, clustered_series_all)

    plt.rcParams.update({'font.size': 12})
    for clust in range(len(counts)):
        symbols = list(clustered_series[clustered_series==clust].index)
        means = np.log(df_prices_train[symbols].mean())
        series = np.log(df_prices_train[symbols]).sub(means)
        series.plot(figsize=(10,5))#title='ETFs Time Series for Cluster %d' % (clust+1))
        #plt.ylabel('Normalized log prices', size=12)
        #plt.xlabel('Date', size=12)
        plt.savefig('cluster_{}.png'.format(str(clust+1)), bbox_inches='tight', pad_inches=0.1)

    pairs_unsupervised, unique_tickers = series_analyser.get_candidate_pairs(clustered_series=clustered_series,
                                                                pricing_df_train=df_prices_train,
                                                                pricing_df_test=df_prices_test,
                                                                min_half_life=min_half_life,
                                                                max_half_life=max_half_life,
                                                                min_zero_crosings=12,
                                                                p_value_threshold=0.10,
                                                                hurst_threshold=0.5,
                                                                subsample=subsample
                                                                )
    #Semestre, par, valores
    pares = []

    for i in range(0,len(pairs_unsupervised)):
      print(pairs_unsupervised[i][2]['hurst_exponent'])

      score = pairs_unsupervised[i][2]['coint_coef']
      S1_name = pairs_unsupervised[i][0]
      S2_name = pairs_unsupervised[i][1]
      coint_coef = pairs_unsupervised[i][2]['coint_coef']
      pares.append([score, S1_name, S2_name, coint_coef])

    colss = ["score", "S1_name", "S2_name","coint_coef"]
    pares = pd.DataFrame(pares, columns = colss)
    sorted_pairs = pares.sort_values(by="score", ascending=True)

    top_20_pairs = sorted_pairs[:no_pairs]  # Select top 20

    # Remover índice antigo e criar novo índice padrão
    top_20_pairs = top_20_pairs.reset_index(drop=True)

    pairs_df.append(top_20_pairs)

Output hidden; open in https://colab.research.google.com to view.

# Fazendo testes de transações:

In [12]:
# Vendo o Lucro da estratégia igual Dist. e Coint.
past_days = 0
resumos = []

for big_loop in range(0, len(semestres) - 3):
    print(f"Starting period {big_loop} | Past days: {past_days}")

    pairs_sem = pairs_df[big_loop]

    # Listando os dias dos intervalos
    inicio = (Datas == semestres['Data'][big_loop]).query("Data == True").index[0]
    twelve_months = (Datas == semestres['Data'][big_loop + 2]).query("Data == True").index[0]
    six_months = (Datas == semestres['Data'][big_loop + 3]).query("Data == True").index[0]

    resultado = []
    print('Pares no Semestre:',len(pairs_sem))

    for i in range(0,len(pairs_sem)):
        S1_name = pairs_sem['S1_name'][i]
        S2_name = pairs_sem['S2_name'][i]
        gamma_1_2 = pairs_sem['coint_coef'][i]

        #Aqui tá o problema das datas:
        S1 = log_data[S1_name].iloc[twelve_months:six_months] # periodo de teste
        S2 = log_data[S2_name].iloc[twelve_months:six_months] # periodo de teste

        #spread, convertendo Datas para nºs de dias
        spread_ = S1 - gamma_1_2*S2
        spread_.name = "spread"
        spread = pd.merge(spread_,Datas, on="Data")
        spread.index = spread['indice']
        spread.drop(['Data','indice'],inplace=True,axis=1)

        print(spread)
        # Pegando o resultado da estratégia
        #resultado.append(np.exp(calculate_profit(spread,1.65)))
        ret, resumos = calculate_profit(i, spread, threshold, S1_name, S2_name, resumos, big_loop, gamma_1_2)
    print("-------------------------------------------------")

Output hidden; open in https://colab.research.google.com to view.

# Salvando os resultados:

In [13]:
#os.makedirs("..\Distancia-Cointegração/cointegration_results", exist_ok=True)

pasta = str("/content/drive/MyDrive/Python/TCC-Cloud/Clustering/result_clustering/")
pasta

#pd.DataFrame(coint_pairs_df).to_csv(pasta + "pares_cointegrados.csv")

cols = ['Operação', 'Semestre', 'Abertura', 'Fechamento', 'Dias', 'Retorno total', 'Ticker 1', 'Ticker 2', 'Converged']
df_results = pd.DataFrame(resumos, columns = cols)
df_results['Index'] = df_results['Ticker 1'].astype(str) + '-' + df_results['Ticker 2'].astype(str) + '-' + df_results['Operação'].astype(str)
df_results['Retorno total - exp'] = np.exp(df_results['Retorno total'])
df_results.to_csv(pasta + "operations_CT.csv", sep=',', index=False)

#pd.DataFrame(Rpair).to_csv(pasta + "Rpair.csv", header=None, index=False)
#daily_returns = np.sum(Rpair, axis=1)
#pd.DataFrame(daily_returns).to_csv(pasta + "daily_returns.csv", header=None, index=False)


# Rascunho

In [14]:
pairs_unsupervised[0]

('VALE3',
 'SLCE3',
 {'t_statistic': -2.8008405417463753,
  'critical_val': {'1%': -3.4557539868570775,
   '5%': -2.8727214497041422,
   '10%': -2.572728476331361},
  'p_value': 0.058153802937034746,
  'coint_coef': 0.28322297932308454,
  'zero_cross': 25,
  'half_life': 12,
  'hurst_exponent': 0.22615876751417416,
  'spread': Data
  2022-06-30    1.738960
  2022-07-01    1.745093
  2022-07-04    1.754614
  2022-07-05    1.724062
  2022-07-06    1.729492
                  ...   
  2023-06-23    1.795347
  2023-06-26    1.790352
  2023-06-27    1.754600
  2023-06-28    1.759933
  2023-06-29    1.771691
  Length: 261, dtype: float64,
  'Y_train': Data
  2022-06-30    2.929155
  2022-07-01    2.929835
  2022-07-04    2.937729
  2022-07-05    2.905770
  2022-07-06    2.913856
                  ...   
  2023-06-23    2.964500
  2023-06-26    2.960105
  2023-06-27    2.927721
  2023-06-28    2.923968
  2023-06-29    2.939162
  Name: SLCE3, Length: 261, dtype: float64,
  'X_train': Data
  202

In [15]:
pairs_unsupervised[0][2]['coint_coef']

0.28322297932308454

In [16]:
unique_tickers

array(['BBSE3', 'BEEF3', 'BRAP4', 'EGIE3', 'GEPA4', 'GGBR4', 'GOAU4',
       'IGTI11', 'MULT3', 'PETR3', 'PETR4', 'PRIO3', 'RRRP3', 'SLCE3',
       'TIMS3', 'TRPL4', 'VALE3', 'VIVT3'], dtype='<U6')

In [17]:
pairs_df

[      score S1_name    S2_name  coint_coef
 0  0.462191   ITSA4  BRDT4-old    0.462191
 1  0.508117   OIBR3      PALF3    0.508117
 2  0.559543   OIBR3      TMAR6    0.559543
 3  0.569854   PETR3      BBDC3    0.569854
 4  0.665784   OIBR3      OIBR4    0.665784,
       score S1_name    S2_name  coint_coef
 0  0.647789   CSTB4  BRDT4-old    0.647789
 1  0.719898   OIBR4      TMAR6    0.719898
 2  0.962546   OIBR3      OIBR4    0.962546
 3  1.285927   OIBR3      TMAR6    1.285927
 4  1.491355   OIBR3      PALF3    1.491355,
       score S1_name    S2_name  coint_coef
 0  0.270856   ITSA4  BRDT4-old    0.270856
 1  0.458211   PCAR4      TMAR6    0.458211
 2  0.483826   LIGT3      WHMT3    0.483826
 3  0.604644   PCAR4      WHMT3    0.604644
 4  0.705735   PALF3      TMAR6    0.705735,
       score S1_name    S2_name  coint_coef
 0  0.160269   CSNA3      KLBN4    0.160269
 1  0.321075   INEP4      PALF3    0.321075
 2  0.338580   CSTB4  BRDT4-old    0.338580
 3  0.416659   ACES4      TMA

In [18]:
# Vendo o Lucro da estratégia Com Classes
past_days = 0
resumos = []

for big_loop in range(0, len(semestres) - 3):
    print(f"Starting period {big_loop} | Past days: {past_days}")

    pairs_sem = pairs[pairs['Semestre'] == big_loop]

    # Listando os dias dos intervalos
    inicio = (Datas == semestres['Data'][big_loop]).query("Data == True").index[0]
    twelve_months = (Datas == semestres['Data'][big_loop + 2]).query("Data == True").index[0]
    six_months = (Datas == semestres['Data'][big_loop + 3]).query("Data == True").index[0]

    train_val_split = twelve_months

    # interday
    n_years_val = round(len(df_prices_train)/(240))

    train_results_without_costs, train_results_with_costs, performance_threshold_train = \
            trader.apply_trading_strategy(pairs_unsupervised,
                                           strategy='fixed_beta',
                                            entry_multiplier=2,
                                            exit_multiplier=0,
                                            test_mode=True,
                                            train_val_split=train_val_split
                                            )

    sharpe_results_threshold_train_nocosts, cum_returns_threshold_train_nocosts = train_results_without_costs
    sharpe_results_threshold_train_w_costs, cum_returns_threshold_train_w_costs = train_results_with_costs

    results, pairs_summary = trader.summarize_results(sharpe_results_threshold_train_nocosts,
                                                      cum_returns_threshold_train_nocosts,
                                                      performance_threshold_train,
                                                      pairs_unsupervised,
                                                      n_years_val)

Starting period 0 | Past days: 0


NameError: name 'pairs' is not defined

In [None]:
inicio = (Datas == semestres['Data'][0]).query("Data == True").index[0]
inicio

In [None]:
ps.iloc[0].name

In [None]:
inicio = (Datas == semestres['Data'][0]).query("Data == True").index[0]
inicio

In [None]:
df_prices_train = log_data.iloc[inicio:twelve_months,:].dropna(how="any",axis=1)

In [None]:
df_prices_train

In [None]:
clustered_series

In [None]:
df_prices_test

In [None]:
    # Limpeza das ações não listadas no período
    df_prices_train = log_data.iloc[inicio:twelve_months,:].dropna(how="any",axis=1)
    df_prices_test = log_data.iloc[(twelve_months+1):six_months,:]



In [None]:

# Select only those columns in df_prices_test
df_prices_test_filtered = df_prices_test[[col for col in df_prices_test.columns if col in df_prices_train.columns]]

df_prices_test_filtered

In [None]:
df_prices_test_filtered['ALPA4']

In [None]:
df_returns

In [None]:
# prompt: Usando o DataFrame df_returns: contar valores nan  valores do df_returns

df_returns.isnull().sum().sum()


In [None]:
X

In [None]:
Xa = pd.DataFrame(X)

In [None]:
Xa.isna().sum()

In [None]:
Xa.isin([np.inf, -np.inf]).sum()

In [None]:
Xa.describe()