# In [None]:

# Machine Learning aplicado à Taxa de Juros do Banco Central

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load the raw dataset (semicolon-separated, '.' as the decimal mark).
df = pd.read_csv('src/datasets/taxaJuros.csv', sep=';', decimal='.')

# Parse the reference date; values that cannot be parsed become NaT instead of
# raising. Then order the rows by that date.
df['DataReferencia'] = pd.to_datetime(df['DataReferencia'], errors='coerce')
df = df.sort_values('DataReferencia')

# Convert object columns that hold numbers written with a decimal comma.
# The columns listed here are always kept as text.
colunas_texto = ['InstituicaoFinanceira', 'ModalidadeCredito', 'TipoPessoa']
for col in df.columns:
    if df[col].dtype != 'object' or col in colunas_texto:
        continue
    # Build the converted values on the side so that a failed conversion
    # cannot leave the column half-modified (NaN turned into the string
    # 'nan', commas inside free text replaced by dots).
    candidato = df[col].astype(str).str.replace(',', '.', regex=False)
    try:
        df[col] = candidato.astype(float)
    except (ValueError, TypeError):
        # Not a numeric column: keep the original values untouched.
        continue

# Keep only the numeric columns (float64 / int64) for the correlation analysis.
df_num = df.select_dtypes(include=['float64', 'int64'])

# Pairwise correlation matrix. Only the target column is drawn, ordered from
# the highest to the lowest correlation value.
correlacao = df_num.corr()
corr_alvo = correlacao[['TaxaJurosAoAno']]
corr_alvo = corr_alvo.sort_values(by='TaxaJurosAoAno', ascending=False)

plt.figure(figsize=(10, 6))
sns.heatmap(corr_alvo, annot=True, cmap='coolwarm')
plt.title('Correlação com TaxaJurosAoAno')
plt.tight_layout()
plt.show()

# Feature selection: rank every numeric column by random-forest importance
# and keep the eight highest-ranked ones.
# Rows with missing values are removed first, since scikit-learn rejects NaN
# in the target.
dados_selecao = df_num.dropna()
X_import = dados_selecao.drop(columns=['TaxaJurosAoAno'])
y_import = dados_selecao['TaxaJurosAoAno']

# Rank on the earliest 70% of the rows only. The frame is ordered by date and
# the final model is evaluated on the last 30% of the rows, so fitting the
# ranking model on every row would let the evaluation period influence which
# features are chosen.
n_selecao = max(1, int(len(dados_selecao) * 0.7))

modelo_temp = RandomForestRegressor(n_estimators=200, random_state=42)
modelo_temp.fit(X_import.iloc[:n_selecao], y_import.iloc[:n_selecao])

importancias = pd.Series(modelo_temp.feature_importances_, index=X_import.columns)
melhores_variaveis = importancias.sort_values(ascending=False).head(8).index.tolist()

# Lag features: the target value of the previous row and of the row before it.
# NOTE(review): shift() follows row order only. If the data holds several rows
# per date (e.g. one per institution or credit type), a lag may come from an
# unrelated series — confirm against the dataset.
df['lag1'] = df['TaxaJurosAoAno'].shift(1)
df['lag2'] = df['TaxaJurosAoAno'].shift(2)

# Drop only the rows that lack a value the model or the plot actually uses.
# A plain dropna() would also discard rows because of gaps in columns that are
# never read. The date stays in the list so rows with an unparseable date
# remain excluded.
colunas_modelo = melhores_variaveis + ['lag1', 'lag2']
df_model = df.dropna(subset=colunas_modelo + ['TaxaJurosAoAno', 'DataReferencia'])
X = df_model[colunas_modelo]
y = df_model['TaxaJurosAoAno']

# Hold-out split without shuffling: the row order is preserved and the last
# 30% of the rows form the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=False
)

# Hyper-parameters of the final random forest.
parametros_rf = {
    'n_estimators': 100,
    'max_depth': 4,
    'min_samples_split': 3,
    'min_samples_leaf': 2,
    'max_features': 'log2',
    'random_state': 42,
}
modelo = RandomForestRegressor(**parametros_rf)
modelo.fit(X_train, y_train)

# Root mean squared error on the held-out rows.
y_pred = modelo.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"RMSE: {rmse:,.2f}")

# Real vs predicted series: in-sample predictions for the training rows
# followed by the predictions already computed for the test rows.
prev_treino = modelo.predict(X_train)
y_full = np.concatenate((prev_treino, y_pred))

# Dates of the last len(y_full) rows of the modelling table, used as the
# x-axis for the predictions.
datas_modelo = df_model['DataReferencia']
datas_full = datas_modelo.iloc[-len(y_full):]

plt.figure(figsize=(12, 6))
plt.plot(datas_modelo, df_model['TaxaJurosAoAno'], label='Taxa Real', color='blue')
plt.plot(datas_full, y_full, label='Previsão', color='green', linestyle='--')
plt.xlabel('Data')
plt.ylabel('Taxa de Juros (%)')
plt.title('Previsão da Taxa de Juros ao Ano')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
