# Starting the analysis to choose the algorithm

## Importing the Initial Libraries

In [None]:
# Data-handling and plotting libraries.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# NOTE(review): the `pc` alias is not referenced in the cells visible here; the
# model-selection cell imports from pycaret.regression directly.
import pycaret as pc
# Apply seaborn's default theme to the figures drawn below.
sns.set_theme()

## Reading the Data

In [None]:
# Load the df_mensal workbook; the path is relative to the current working directory.
df_mensal = pd.read_excel(io='../data/df_mensal.xlsx', engine='openpyxl')

In [None]:
# Load the taxas_de_juros workbook; the path is relative to the current working directory.
taxas_de_juros = pd.read_excel(io='../data/taxas_de_juros.xlsx', engine='openpyxl')

In [None]:
# Load the USD/BRL historical-data workbook; the path is relative to the current working directory.
usd_brl = pd.read_excel(io='../data/USD_BRL Dados Históricos.xlsx', engine='openpyxl')

## Preprocessing the Data

### df_mensal

In [None]:
# Display the first five rows to inspect the columns that were loaded.
df_mensal.head(n=5)

In [None]:
# Parse the DateTime column and make it the index so the frame can be resampled.
# NOTE: this mutates df_mensal in place, so the cell cannot be re-run without
# reloading the workbook first.
df_mensal['DateTime'] = pd.to_datetime(df_mensal['DateTime'])
df_mensal.set_index('DateTime', inplace=True)

# Upsample to one row per calendar day; each new day repeats the most recent
# earlier observation (forward fill). DateTime then goes back to a regular column.
df_diario = df_mensal.resample('D').ffill().reset_index()

In [None]:
# Display the first five rows of the daily (forward-filled) frame.
df_diario.head(n=5)

### taxas_de_juros

In [None]:
# Parse DateTime into datetime64 so it can serve as the join key for the merges below.
taxas_de_juros['DateTime'] = taxas_de_juros['DateTime'].pipe(pd.to_datetime)

### usd_brl

In [None]:
# Parse DateTime into datetime64 so it can serve as the join key for the merges below.
usd_brl['DateTime'] = usd_brl['DateTime'].pipe(pd.to_datetime)

### Merging the DataFrames

In [None]:
# Inner-join the daily frame with taxas_de_juros on DateTime; only dates that
# appear in both frames are kept.
mensal_and_juros = df_diario.merge(taxas_de_juros, on='DateTime', how='inner')

In [None]:
# Display the first five rows of the first merge result.
mensal_and_juros.head(n=5)

In [None]:
# Show (rows, columns) of the merged frame, i.e. how many dates survived the inner join.
mensal_and_juros.shape

In [None]:
# Inner-join the previous result with usd_brl on DateTime to build the modelling frame.
df = mensal_and_juros.merge(usd_brl, on='DateTime', how='inner')

In [None]:
# Display the first five rows of the fully merged frame.
df.head(n=5)

## Developing the Model

### Finding the best model

In [None]:
from pycaret.regression import compare_models, setup

# Initialise the PyCaret regression experiment on the merged frame with 'Selic' as target.
setup(data=df, target='Selic')

# Train and rank PyCaret's candidate regressors; keep the top-ranked one.
best_model = compare_models()

### Extra Trees Regressor

In [None]:
# Remove the DateTime column before the frame is handed to the scikit-learn
# estimators below. errors='ignore' keeps the cell re-runnable: without it a
# second execution raises KeyError because the column is already gone.
df.drop(columns=['DateTime'], inplace=True, errors='ignore')
df.head()

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Rows without a Selic value cannot supervise the model; drop them instead of
# training and scoring on a mean-imputed target.
df_labeled = df.dropna(subset=['Selic'])

X = df_labeled.drop(columns='Selic')
y = df_labeled['Selic']

# Hyper-parameter grid explored by the exhaustive search below.
param_grid = {
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
}

# NOTE(review): train_test_split shuffles by default. The daily rows were built
# by forward-filling (resample('D').ffill()), so near-identical neighbouring
# days can land on both sides of the split. Consider a chronological split —
# confirm with the modelling owner before changing it.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.45, random_state=42)

# Learn the imputation means from the training split only and reuse them on the
# test split, so no test-set statistics leak into training.
imputer = SimpleImputer(strategy='mean')
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns, index=X_test.index)

et = ExtraTreesRegressor(random_state=42)

# 3-fold cross-validated exhaustive search using every available core.
grid_search = GridSearchCV(estimator=et, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

print("Melhores parâmetros: ", best_params)

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import make_pipeline

# Hyper-parameters pinned by hand so this cell does not require re-running the
# grid search.
# NOTE(review): this overwrites the `best_params` produced by the grid-search
# cell — confirm these are the values the search actually reported.
best_params = {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}

etr = ExtraTreesRegressor(**best_params, random_state=42)

# Keep only the features the selector retains (importance-based selection).
selector = SelectFromModel(estimator=etr).fit(X_train, y_train)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

etr.fit(X_train_selected, y_train)

# Cross-validate selection and model together: running CV on features that were
# selected using the whole training set would let every validation fold
# influence the selection and inflate the score.
cv_pipeline = make_pipeline(
    SelectFromModel(estimator=ExtraTreesRegressor(**best_params, random_state=42)),
    ExtraTreesRegressor(**best_params, random_state=42),
)
scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5)
# A regressor's default score is R², not accuracy, so label it as such.
print(f"R² da validação cruzada: {scores.mean()} (+/- {scores.std() * 2})")

y_pred = etr.predict(X_test_selected)

test_score = etr.score(X_test_selected, y_test)
print(f"R² nos dados de teste: {test_score}")

# NOTE(review): the split is shuffled, so the last test row is not necessarily
# the most recent date — confirm this is the row the decision should use.
final_decision = etr.predict(X_test_selected[-1:])
print("Decisão final: ", final_decision)

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error

# Fit an Extra Trees regressor with fixed hyper-parameters on the full feature set.
et_model = ExtraTreesRegressor(
    n_estimators=200,
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=2,
    bootstrap=False,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out split with mean squared error.
predictions = et_model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=predictions)

print("Erro Quadrático Médio (MSE): ", mse)

In [None]:
# Refit Extra Trees with 200 trees (other settings left at their defaults);
# this replaces the et_model built in the previous cell.
et_model = ExtraTreesRegressor(n_estimators=200, random_state=42).fit(X_train, y_train)

# Plot the ten largest feature importances as horizontal bars.
feat_importances = pd.Series(data=et_model.feature_importances_, index=X_train.columns)
feat_importances.nlargest(n=10).plot.barh()
plt.show()

In [None]:
predictions = et_model.predict(X_test)

# Side-by-side table of the true targets and the model's predictions for the test split.
comparison_df = y_test.to_frame(name='Real Values').assign(Predictions=predictions)

comparison_df

In [None]:
# List the distinct target values present in the training split.
y_train.unique()

In [None]:
# Predict for the last row of the test split. The row is passed as a DataFrame
# slice, not `.values`: et_model was fitted on a DataFrame, and a bare ndarray
# makes scikit-learn warn that X has no valid feature names.
# NOTE(review): the split is shuffled, so this row is not necessarily the most
# recent date — confirm this is the row the decision should use.
final_decision = et_model.predict(X_test.iloc[-1:])

print("Decisão final: ", final_decision)

### Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV, train_test_split

# Rows without a Selic value cannot supervise the model; drop them instead of
# training and scoring on a mean-imputed target.
df_labeled = df.dropna(subset=['Selic'])

X = df_labeled.drop(columns='Selic')
y = df_labeled['Selic']

# Hyper-parameter grid explored by the exhaustive search below.
param_grid = {
    'n_estimators': [50, 100, 200, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False],
}

# NOTE(review): train_test_split shuffles by default. The daily rows were built
# by forward-filling (resample('D').ffill()), so near-identical neighbouring
# days can land on both sides of the split. Consider a chronological split —
# confirm with the modelling owner before changing it.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.45, random_state=42)

# Learn the imputation means from the training split only and reuse them on the
# test split, so no test-set statistics leak into training.
imputer = SimpleImputer(strategy='mean')
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns, index=X_test.index)

rf = RandomForestRegressor(random_state=42)

# 3-fold cross-validated exhaustive search using every available core.
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
grid_search_rf.fit(X_train, y_train)

best_params_rf = grid_search_rf.best_params_

print("Melhores parâmetros para RandomForestRegressor: ", best_params_rf)

In [None]:
# Refit a Random Forest with the hyper-parameters found by the grid search.
rf_model = RandomForestRegressor(**best_params_rf, random_state=42)

rf_model.fit(X_train, y_train)

# Predict for the last row of the test split. The row is passed as a DataFrame
# slice, not `.values`: rf_model was fitted on a DataFrame, and a bare ndarray
# makes scikit-learn warn that X has no valid feature names.
# NOTE(review): the split is shuffled, so this row is not necessarily the most
# recent date — confirm this is the row the decision should use.
final_decision_rf = rf_model.predict(X_test.iloc[-1:])

print("Decisão final para RandomForestRegressor: ", final_decision_rf)

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

# Hyper-parameters pinned by hand so this cell does not require re-running the
# grid search.
# NOTE(review): this overwrites the grid-search result and is identical to the
# dictionary hard-coded for the Extra Trees model — confirm these values really
# come from the Random Forest search.
best_params_rf = {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}

rf = RandomForestRegressor(**best_params_rf, random_state=42)

# Keep only the features the selector retains (importance-based selection).
selector = SelectFromModel(estimator=rf).fit(X_train, y_train)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

rf.fit(X_train_selected, y_train)

# Cross-validate selection and model together: running CV on features that were
# selected using the whole training set would let every validation fold
# influence the selection and inflate the score.
cv_pipeline = make_pipeline(
    SelectFromModel(estimator=RandomForestRegressor(**best_params_rf, random_state=42)),
    RandomForestRegressor(**best_params_rf, random_state=42),
)
scores = cross_val_score(cv_pipeline, X_train, y_train, cv=5)
# A regressor's default score is R², not accuracy, so label it as such.
print(f"R² da validação cruzada: {scores.mean()} (+/- {scores.std() * 2})")

y_pred = rf.predict(X_test_selected)

test_score = rf.score(X_test_selected, y_test)
print(f"R² nos dados de teste: {test_score}")

# NOTE(review): the split is shuffled, so the last test row is not necessarily
# the most recent date — confirm this is the row the decision should use.
final_decision_rf = rf.predict(X_test_selected[-1:])
print("Decisão final para RandomForestRegressor: ", final_decision_rf)


In [None]:
# Refit a Random Forest with 200 trees (other settings left at their defaults);
# this replaces the rf_model built earlier.
rf_model = RandomForestRegressor(n_estimators=200, random_state=42).fit(X_train, y_train)

# Rank the features by importance and keep the ten largest.
feat_importances = pd.Series(data=rf_model.feature_importances_, index=X_train.columns)
top_feat_importances = feat_importances.nlargest(n=10)

# Horizontal bar chart of the selected importances.
top_feat_importances.plot.barh()
plt.xlabel('Importância das Características')
plt.title('Top 10 Características Mais Importantes')
plt.show()