## Importação das bibliotecas

In [11]:
import pandas as pd
import numpy as np

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

import xgboost as xgb

## Aquisição dos dados

In [12]:
# Load the processed Ibovespa history; the `ds` column is parsed as dates and used as the index.
input_path = '../data/processed/dados_historicos_ibovespa_2015-2025_processed.csv'

df = pd.read_csv(
    input_path,
    parse_dates=['ds'],
    index_col='ds',
)
df.tail()

Unnamed: 0_level_0,target,close,open,high,low,volume,daily_return,return_lag_1,return_lag_2,return_lag_3,...,momentum_21,momentum_63,sma_21,ema_50,rsi_14,atr_14,obv,day_of_week,day_of_month,month
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-06-12,0,137800,137127,137931,136175,7120000000.0,0.0049,0.0051,0.0054,-0.003,...,-0.004501,0.11251,137814.619048,134894.97767,56.429439,1746.368879,136513100000.0,3,12,6
2025-06-13,1,137213,137800,137800,136586,8630000000.0,-0.0043,0.0049,0.0051,0.0054,...,-0.015222,0.092138,137713.619048,134985.880507,53.14965,1708.342531,127883100000.0,4,13,6
2025-06-16,0,139256,137212,139988,137212,7620000000.0,0.0149,-0.0043,0.0049,0.0051,...,0.000496,0.079864,137716.904762,135153.336173,61.53024,1784.603778,135503100000.0,0,16,6
2025-06-17,0,138840,139256,139497,138293,8380000000.0,-0.003,0.0149,-0.0043,0.0049,...,-0.005701,0.061192,137679.0,135297.911225,59.20777,1743.13208,127123100000.0,1,17,6
2025-06-18,0,138717,138844,139161,138443,8320000000.0,-0.0009,-0.003,0.0149,-0.0043,...,-0.009942,0.055083,137612.666667,135431.993138,58.50462,1669.90836,118803100000.0,2,18,6


In [13]:
# Replace the categorical target with a numeric one: the following row's daily return.
# NOTE: this cell mutates `df` in place, so re-running it drops one more trailing row each time.
next_day_return = df['daily_return'].shift(-1)

df.drop(columns='target', inplace=True)
df['target'] = next_day_return

# The last row has no following return (NaN after the shift) and is discarded here,
# together with any other row that still contains a missing value.
df.dropna(inplace=True)

## Construção do modelo de regressão

In [14]:
# Separate the predictors (X) from the target (y).
# Every column listed in `remove_features` — including the target itself — is kept out of X.
remove_features = ['open', 'high', 'low', 'close', 'volume', 'target', 'ema_50', 'obv']

y = df['target']
X = df.drop(remove_features, axis='columns')

In [15]:
# Chronological train/test split: the last N_TEST_DAYS trading sessions are held out
# (30 sessions, as required) and nothing is shuffled.
N_TEST_DAYS = 30

# With shuffle=False the split is purely positional, so it is only chronological when the
# date index is already in ascending order. Fail loudly here rather than silently training
# on rows that come after the test window.
assert X.index.is_monotonic_increasing, "X must be sorted by date before a positional split"

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=N_TEST_DAYS, shuffle=False
)

# Sanity check: the hold-out window has exactly the requested number of sessions.
assert len(X_test) == N_TEST_DAYS

print(f"Tamanho do treino: {len(X_train)} amostras")
print(f"Tamanho do teste: {len(X_test)} amostras")

Tamanho do treino: 2451 amostras
Tamanho do teste: 30 amostras


In [16]:
# Candidate regression models, keyed by the display name used in the logs and result table.
seed = 42

# SVR is the only candidate wrapped in a Pipeline, so its inputs are standardised first.
scaled_svr = Pipeline(steps=[('scaler', StandardScaler()), ('model', SVR())])

models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=seed),
    'Random Forest Regressor': RandomForestRegressor(
        n_estimators=100, random_state=seed, n_jobs=-1
    ),
    'XGBoost Regressor': xgb.XGBRegressor(random_state=seed, n_jobs=-1),
    'SVR': scaled_svr,
}

In [17]:
# Fit every candidate on the training window, announcing each one before it starts.
for name in models:
    model = models[name]
    print(f"Treinando {name}...")
    model.fit(X_train, y_train)

print("Todos os modelos foram treinados!")

Treinando Linear Regression...
Treinando Decision Tree Regressor...
Treinando Random Forest Regressor...
Treinando XGBoost Regressor...
Treinando SVR...
Todos os modelos foram treinados!


In [18]:
# Score every fitted model on the held-out window and rank them by directional accuracy.
resultados_regressao = {}

for nome in models:
    modelo = models[nome]
    y_pred = modelo.predict(X_test)

    # Directional accuracy: percentage of test rows where the sign (+/-) of the
    # prediction equals the sign of the realised value.
    mesmo_sinal = np.sign(y_pred) == np.sign(y_test)
    acuracia_direcional = np.mean(mesmo_sinal) * 100

    # One row of results per model: the regression metrics plus the directional accuracy.
    resultados_regressao[nome] = {
        'MAE': mean_absolute_error(y_test, y_pred),
        'MSE': mean_squared_error(y_test, y_pred),
        'R2 Score': r2_score(y_test, y_pred),
        'Acurácia Direcional (%)': acuracia_direcional,
    }


df_resultados = (
    pd.DataFrame
    .from_dict(resultados_regressao, orient='index')
    .sort_values(by='Acurácia Direcional (%)', ascending=False)
)
df_resultados

Unnamed: 0,MAE,MSE,R2 Score,Acurácia Direcional (%)
Linear Regression,0.005607,5.6e-05,0.036219,56.666667
Decision Tree Regressor,0.013817,0.000255,-3.355114,53.333333
Random Forest Regressor,0.006096,7e-05,-0.198436,53.333333
XGBoost Regressor,0.006709,8.7e-05,-0.483063,53.333333
SVR,0.016164,0.000317,-4.414751,50.0


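**Conclusão:** resultado desanimador. Apenas a Regressão Linear obteve R² positivo (≈ 0,04), e sua acurácia direcional de 56,7% corresponde a 17 acertos em 30 pregões — com uma amostra tão pequena, isso não se distingue do acaso. Os demais modelos tiveram R² negativo, ou seja, saíram-se pior do que simplesmente prever a média dos retornos do período de teste.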