HOLD OUT

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

def remove_outliers(df, column_names):
    for column in column_names:
        Q1 = df[column].quantile(0.20)
        Q3 = df[column].quantile(0.80)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df

file_paths = ['C:/Users/milen/OneDrive/Documentos/TCC/Bases/euro.csv', 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/dolar.csv', 
              'C:/Users/milen/OneDrive/Documentos/TCC/Bases/base_completa.csv', 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/taxa_br.csv', 
              'C:/Users/milen/OneDrive/Documentos/TCC/Bases/bolsa_eua.csv']

for file_path in file_paths:
    data_dolar = pd.read_csv(file_path)

    data_dolar['Data'] = pd.to_datetime(data_dolar['Data'])
    data_dolar = data_dolar.iloc[1:]

    numeric_columns = data_dolar.select_dtypes(include=[np.number]).columns.tolist()
    data_dolar = remove_outliers(data_dolar, numeric_columns)

    train_data = data_dolar[data_dolar['Data'] < pd.to_datetime("2020-01-01")]
    test_data = data_dolar[data_dolar['Data'] >= pd.to_datetime("2020-01-01")]

    y_train = train_data['Dif_Preco_Dolar']
    X_train = train_data.drop(columns=['Data', 'Preco_Dolar', 'Preco_Real', 'Dif_Preco_Real','Dif_Preco_Dolar'])

    y_test = test_data['Dif_Preco_Dolar']
    X_test = test_data.drop(columns=['Data', 'Preco_Dolar', 'Preco_Real', 'Dif_Preco_Real','Dif_Preco_Dolar'])

    mean_values_train = X_train.mean()
    X_train.fillna(mean_values_train, inplace=True)

    mean_values_test = X_test.mean()
    X_test.fillna(mean_values_test, inplace=True)

    non_numeric_columns = X_train.select_dtypes(include=['object']).columns
    for col in non_numeric_columns:
        X_train[col] = pd.to_numeric(X_train[col].str.replace('.', '').str.replace(',', '.'), errors='coerce')
        X_test[col] = pd.to_numeric(X_test[col].str.replace('.', '').str.replace(',', '.'), errors='coerce')

    model = model = RandomForestRegressor()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"Resultados para {file_path}:")
    print("MSE:", mse)
    print("MAE:", mae)
    print("R^2:", r2)
    print()

    resultados = pd.DataFrame()
    resultados['Data'] = test_data['Data'].reset_index(drop=True)
    resultados['Valor_Real'] = y_test.reset_index(drop=True)
    resultados['Valor_Previsto'] = y_pred

    data_especifica = pd.to_datetime("2023-09-27")
    previsao_especifica = resultados[resultados['Data'] == data_especifica]

    if not previsao_especifica.empty:
        print(previsao_especifica)
    else:
        print(f"Não há dados disponíveis para a data {data_especifica.date()}")


Resultados para C:/Users/milen/OneDrive/Documentos/TCC/Bases/euro.csv:
MSE: 3.0022353692334494
MAE: 1.368704181184669
R^2: -0.08273746292349271

          Data  Valor_Real  Valor_Previsto
273 2023-09-27       -1.16         -0.4515
Resultados para C:/Users/milen/OneDrive/Documentos/TCC/Bases/dolar.csv:
MSE: 3.9454186550276256
MAE: 1.6546530386740337
R^2: -0.43096876929721306

          Data  Valor_Real  Valor_Previsto
348 2023-09-27       -1.16           1.243
Resultados para C:/Users/milen/OneDrive/Documentos/TCC/Bases/base_completa.csv:
MSE: 3.252425906583868
MAE: 1.4658182692307693
R^2: -0.24634724833791588

          Data  Valor_Real  Valor_Previsto
224 2023-09-27       -1.16          0.9359
Resultados para C:/Users/milen/OneDrive/Documentos/TCC/Bases/taxa_br.csv:
MSE: 3.1039473928016372
MAE: 1.4255237098255276
R^2: -0.10595426531727292

          Data  Valor_Real  Valor_Previsto
349 2023-09-27       -1.16        0.047383
Resultados para C:/Users/milen/OneDrive/Documentos/TCC/Bases/

JANELA DESLIZANTE DE 1 DIA

In [5]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np

def remove_outliers(df, column_names):
    for column in column_names:
        Q1 = df[column].quantile(0.20)
        Q3 = df[column].quantile(0.80)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df

def create_sliding_windows(data, window_size):
    windows = []
    for i in range(len(data) - window_size + 1):
        window = data[i:i + window_size]
        windows.append(window)
    return np.array(windows)

def normalize_data(train, test):
    scaler = StandardScaler()
    scaler.fit(train)
    return scaler.transform(train), scaler.transform(test)

def add_moving_average(data, window_size=1):
    data[f'Moving_Average_{window_size}'] = data['Dif_Preco_Dolar'].rolling(window=window_size).mean().shift(1)
    return data

file_paths = ['C:/Users/milen/OneDrive/Documentos/TCC/Bases/euro.csv', 
              'C:/Users/milen/OneDrive/Documentos/TCC/Bases/dolar.csv', 
              'C:/Users/milen/OneDrive/Documentos/TCC/Bases/base_completa.csv', 
              'C:/Users/milen/OneDrive/Documentos/TCC/Bases/taxa_br.csv', 
              'C:/Users/milen/OneDrive/Documentos/TCC/Bases/bolsa_eua.csv']

for file_path in file_paths:
    data = pd.read_csv(file_path)

    data['Data'] = pd.to_datetime(data['Data'])
    data = data.iloc[1:]

    numeric_columns = data.select_dtypes(include=[np.number]).columns.tolist()
    data = remove_outliers(data, numeric_columns)

    data = add_moving_average(data)

    train_data = data[data['Data'] < pd.to_datetime("2020-01-01")]
    test_data = data[data['Data'] >= pd.to_datetime("2020-01-01")]

    y_train = train_data['Dif_Preco_Dolar']
    X_train = train_data.drop(columns=['Data', 'Preco_Dolar', 'Preco_Real', 'Dif_Preco_Real', 'Dif_Preco_Dolar', f'Moving_Average_{1}'])

    y_test = test_data['Dif_Preco_Dolar']
    X_test = test_data.drop(columns=['Data', 'Preco_Dolar', 'Preco_Real', 'Dif_Preco_Real', 'Dif_Preco_Dolar', f'Moving_Average_{1}'])

    mean_values_train = X_train.mean()
    X_train.fillna(mean_values_train, inplace=True)

    mean_values_test = X_test.mean()
    X_test.fillna(mean_values_test, inplace=True)

    non_numeric_columns = X_train.select_dtypes(include=['object']).columns
    for col in non_numeric_columns:
        X_train[col] = pd.to_numeric(X_train[col].str.replace('.', '').str.replace(',', '.'), errors='coerce')
        X_test[col] = pd.to_numeric(X_test[col].str.replace('.', '').str.replace(',', '.'), errors='coerce')

    X_train, X_test = normalize_data(X_train, X_test)

    model = RandomForestRegressor()
    model.fit(X_train, y_train)

    window_size = 1 

    test_windows = create_sliding_windows(X_test, window_size)

    predictions = []
    for window in test_windows:
        pred = model.predict(window)
        predictions.append(pred[-1]) 

    y_test_adjusted = y_test[window_size - 1:]

    mse = mean_squared_error(y_test_adjusted, predictions)
    mae = mean_absolute_error(y_test_adjusted, predictions)
    r2 = r2_score(y_test_adjusted, predictions)

    print(f"Resultados para {file_path}:")
    print("MSE:", mse)
    print("MAE:", mae)
    print("R^2:", r2)
    print()

    resultados = pd.DataFrame()
    resultados['Data'] = test_data['Data'].iloc[window_size - 1:].reset_index(drop=True)
    resultados['Valor_Real'] = y_test_adjusted.reset_index(drop=True) 
    resultados['Valor_Previsto'] = predictions

    # Filtrar para a data específica
    data_especifica = pd.to_datetime("2023-09-27")
    previsao_especifica = resultados[resultados['Data'] == data_especifica]

    if not previsao_especifica.empty:
        print(f"Previsão para {data_especifica.date()}:")
        print(previsao_especifica)
    else:
        print(f"Não há dados disponíveis para a data {data_especifica.date()}")

    print(f"Resultados para {file_path}:")
    print(resultados.tail(30))
    print()


Resultados para C:/Users/milen/OneDrive/Documentos/TCC/Bases/euro.csv:
MSE: 2.9554556172473876
MAE: 1.3585648083623696
R^2: -0.06586663710462592

Previsão para 2023-09-27:
          Data  Valor_Real  Valor_Previsto
273 2023-09-27       -1.16          -0.412
Resultados para C:/Users/milen/OneDrive/Documentos/TCC/Bases/euro.csv:
          Data  Valor_Real  Valor_Previsto
257 2023-08-28        1.29          -0.412
258 2023-08-29       -0.37          -0.412
259 2023-08-30        0.87          -0.412
260 2023-08-31       -3.22          -0.412
261 2023-09-01        0.72          -0.412
262 2023-09-05       -2.52          -0.412
263 2023-09-06        0.25          -0.412
264 2023-09-11        2.33          -0.412
265 2023-09-12        0.66          -0.412
266 2023-09-13       -0.28          -0.412
267 2023-09-14        1.62          -0.412
268 2023-09-18        1.83          -0.412
269 2023-09-19        0.83          -0.412
270 2023-09-21       -3.23          -0.412
271 2023-09-25       -1.64

JANELA DESLIZANTE DE 30 DIAS

In [6]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np

def remove_outliers(df, column_names):
    for column in column_names:
        Q1 = df[column].quantile(0.20)
        Q3 = df[column].quantile(0.80)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df

def create_sliding_windows(data, window_size):
    windows = []
    for i in range(len(data) - window_size + 1):
        window = data[i:i + window_size]
        windows.append(window)
    return np.array(windows)

def normalize_data(train, test):
    scaler = StandardScaler()
    scaler.fit(train)
    return scaler.transform(train), scaler.transform(test)

def add_moving_average(data, window_size=30):
    data[f'Moving_Average_{window_size}'] = data['Dif_Preco_Dolar'].rolling(window=window_size).mean().shift(1)
    return data

file_paths = ['C:/Users/milen/OneDrive/Documentos/TCC/Bases/euro.csv', 
              'C:/Users/milen/OneDrive/Documentos/TCC/Bases/dolar.csv', 
              'C:/Users/milen/OneDrive/Documentos/TCC/Bases/base_completa.csv', 
              'C:/Users/milen/OneDrive/Documentos/TCC/Bases/taxa_br.csv', 
              'C:/Users/milen/OneDrive/Documentos/TCC/Bases/bolsa_eua.csv']

for file_path in file_paths:
    data = pd.read_csv(file_path)

    data['Data'] = pd.to_datetime(data['Data'])
    data = data.iloc[1:]

    numeric_columns = data.select_dtypes(include=[np.number]).columns.tolist()
    data = remove_outliers(data, numeric_columns)

    data = add_moving_average(data)

    train_data = data[data['Data'] < pd.to_datetime("2020-01-01")]
    test_data = data[data['Data'] >= pd.to_datetime("2020-01-01")]

    y_train = train_data['Dif_Preco_Dolar']
    X_train = train_data.drop(columns=['Data', 'Preco_Dolar', 'Preco_Real', 'Dif_Preco_Real', 'Dif_Preco_Dolar', f'Moving_Average_{30}'])

    y_test = test_data['Dif_Preco_Dolar']
    X_test = test_data.drop(columns=['Data', 'Preco_Dolar', 'Preco_Real', 'Dif_Preco_Real', 'Dif_Preco_Dolar', f'Moving_Average_{30}'])

    mean_values_train = X_train.mean()
    X_train.fillna(mean_values_train, inplace=True)

    mean_values_test = X_test.mean()
    X_test.fillna(mean_values_test, inplace=True)

    non_numeric_columns = X_train.select_dtypes(include=['object']).columns
    for col in non_numeric_columns:
        X_train[col] = pd.to_numeric(X_train[col].str.replace('.', '').str.replace(',', '.'), errors='coerce')
        X_test[col] = pd.to_numeric(X_test[col].str.replace('.', '').str.replace(',', '.'), errors='coerce')

    X_train, X_test = normalize_data(X_train, X_test)

    model = RandomForestRegressor()
    model.fit(X_train, y_train)

    window_size = 30 

    test_windows = create_sliding_windows(X_test, window_size)

    predictions = []
    for window in test_windows:
        pred = model.predict(window)
        predictions.append(pred[-1]) 

    y_test_adjusted = y_test[window_size - 1:]

    mse = mean_squared_error(y_test_adjusted, predictions)
    mae = mean_absolute_error(y_test_adjusted, predictions)
    r2 = r2_score(y_test_adjusted, predictions)

    print(f"Resultados para {file_path}:")
    print("MSE:", mse)
    print("MAE:", mae)
    print("R^2:", r2)
    print()

    resultados = pd.DataFrame()
    resultados['Data'] = test_data['Data'].iloc[window_size - 1:].reset_index(drop=True)  
    resultados['Valor_Real'] = y_test_adjusted.reset_index(drop=True) 
    resultados['Valor_Previsto'] = predictions 

    data_especifica = pd.to_datetime("2023-09-27")
    previsao_especifica = resultados[resultados['Data'] == data_especifica]

    if not previsao_especifica.empty:
        print(f"Previsão para {data_especifica.date()}:")
        print(previsao_especifica)
    else:
        print(f"Não há dados disponíveis para a data {data_especifica.date()}")

    print(f"Resultados para {file_path}:")
    print(resultados.tail(30))
    print()




Resultados para C:/Users/milen/OneDrive/Documentos/TCC/Bases/euro.csv:
MSE: 3.2043197708527136
MAE: 1.4196232558139537
R^2: -0.10522980102518975

Previsão para 2023-09-27:
          Data  Valor_Real  Valor_Previsto
244 2023-09-27       -1.16         -0.4694
Resultados para C:/Users/milen/OneDrive/Documentos/TCC/Bases/euro.csv:
          Data  Valor_Real  Valor_Previsto
228 2023-08-28        1.29         -0.4694
229 2023-08-29       -0.37         -0.4694
230 2023-08-30        0.87         -0.4694
231 2023-08-31       -3.22         -0.4694
232 2023-09-01        0.72         -0.4694
233 2023-09-05       -2.52         -0.4694
234 2023-09-06        0.25         -0.4694
235 2023-09-11        2.33         -0.4694
236 2023-09-12        0.66         -0.4694
237 2023-09-13       -0.28         -0.4694
238 2023-09-14        1.62         -0.4694
239 2023-09-18        1.83         -0.4694
240 2023-09-19        0.83         -0.4694
241 2023-09-21       -3.23         -0.4694
242 2023-09-25       -1.64