In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
import numpy as np

# Função para preparar os dados e calcular o MSE
def prepare_and_evaluate(df):
    # Assegurando que os dados estão ordenados por data
    df.sort_values('Date', inplace=True)

    # Usando o preço real do dia anterior como feature para prever o preço do dia atual
    df['Preco_Real_Anterior'] = df['Preco_Real'].shift(1)

    # Removendo a primeira linha que agora contém um NaN
    df.dropna(inplace=True)

    # Separando features (X) e target (y)
    X = df[['Preco_Real_Anterior']]
    y = df['Preco_Real']

    # Dividindo os dados em conjuntos de treinamento e teste
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Criando e treinando o modelo de regressão linear
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Fazendo previsões no conjunto de teste
    y_pred = model.predict(X_test)

    # Calculando o MSE e outras métricas
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mse)

    return y_test, y_pred, mse, mae, rmse

# Carregando e preparando os dados do arquivo CSV
df = pd.read_csv('C:/Users/milen/OneDrive/Documentos/TCC/Bases/base_completa.csv')

# Preparando e avaliando os dados
y_test, y_pred, mse, mae, rmse = prepare_and_evaluate(df)

# Preparando o DataFrame de resultados
results = pd.DataFrame({
    'Data': y_test.index,
    'Valor Real': y_test.values,
    'Valor Previsto': y_pred,
    'Erro Absoluto': abs(y_test.values - y_pred)
})

# Adicionando as métricas ao final do DataFrame
results = results.append({
    'Data': 'Métricas',
    'Valor Real': 'N/A',
    'Valor Previsto': 'N/A',
    'Erro Absoluto': 'N/A',
    'MSE': mse,
    'MAE': mae,
    'RMSE': rmse
}, ignore_index=True)

# Exibindo os resultados
print(results)


         Data Valor Real Valor Previsto Erro Absoluto        MSE       MAE  \
0        2391     498.33     495.100141      3.229859        NaN       NaN   
1        2068     470.81     482.858766     12.048766        NaN       NaN   
2         528     267.94     268.905635      0.965635        NaN       NaN   
3         440     259.82     262.353489      2.533489        NaN       NaN   
4        3625     826.92     830.694402      3.774402        NaN       NaN   
..        ...        ...            ...           ...        ...       ...   
748      1657     285.72     284.107013      1.612987        NaN       NaN   
749       613     269.11     269.597975      0.487975        NaN       NaN   
750      1172     479.47     477.761538      1.708462        NaN       NaN   
751      1776     285.86     288.471766      2.611766        NaN       NaN   
752  Métricas        N/A            N/A           N/A  47.582422  4.434611   

         RMSE  
0         NaN  
1         NaN  
2         NaN  

  results = results.append({


In [13]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the training and test data files
train_data_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/bc_train.csv'
test_data_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/bc_test.csv'


train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

# Combine training and test data
combined_data = pd.concat([train_data, test_data]).reset_index(drop=True)

# Calculate the difference in 'ultimo' value for the next day
combined_data['next_day_diff'] = combined_data['ultimo'].diff(periods=-1) * -1

# Drop the last row as it does not have a next day value
combined_data = combined_data[:-1]

# Split back into train and test data
train_data = combined_data.iloc[:len(train_data)]
test_data = combined_data.iloc[len(train_data):]

# Prepare the data for model
X_train = train_data['ultimo'].values.reshape(-1, 1)
y_train = train_data['next_day_diff'].values

X_test = test_data['ultimo'].values.reshape(-1, 1)
y_test = test_data['next_day_diff'].values

# Create and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict for test data
y_pred = model.predict(X_test)

# Add predictions to test data for comparison
test_data['predicted_diff'] = y_pred

# Update the real difference for 2023-01-10 as per user's input
test_data.loc[test_data['Date'] == '2023-01-10', 'next_day_diff'] = -34.48

# Calculate the mean squared error and R^2 score
mse_updated = mean_squared_error(test_data['next_day_diff'], test_data['predicted_diff'])
r2_updated = r2_score(test_data['next_day_diff'], test_data['predicted_diff'])

# Function to display predicted and real values for a specific date, including MSE
def display_predictions_for_date(date, data):
    specific_day_data = data[data['Date'] == date]
    if not specific_day_data.empty:
        print("\nData:", date)
        print("Valor Real:", specific_day_data['next_day_diff'].iloc[0])
        print("Valor Previsto:", specific_day_data['predicted_diff'].iloc[0])
        print("MSE:", mse_updated)
    else:
        print("Data não encontrada no conjunto de dados.")

# Display predictions for 2023-01-10
display_predictions_for_date('2023-05-10', test_data)

print("\nMSE Geral:", mse_updated)
print("R^2:", r2_updated)



Data: 2023-05-10
Valor Real: -0.010000000000000675
Valor Previsto: 0.0004997505491416065
MSE: 5.858147237562954

MSE Geral: 5.858147237562954
R^2: -0.005016826817992426


In [23]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Load and process data
train_data_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/bc_train.csv'
test_data_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/bc_test.csv'
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
combined_data = pd.concat([train_data, test_data]).reset_index(drop=True)
combined_data['next_day_diff'] = combined_data['ultimo'].diff(periods=-1) * -1
combined_data = combined_data[:-1]  # Drop the last row
train_data = combined_data.iloc[:len(train_data)]
test_data = combined_data.iloc[len(train_data):]

# Split data
X_train, X_val, y_train, y_val = train_test_split(
    train_data[['ultimo']],
    train_data['next_day_diff'],
    test_size=0.2,
    random_state=42
)
X_test = test_data[['ultimo']]
y_test = test_data['next_day_diff']

# Train models
model_xgb = XGBRegressor(random_state=42)
model_rf = RandomForestRegressor(random_state=42)
model_xgb.fit(X_train, y_train)
model_rf.fit(X_train, y_train)

# Make predictions
val_pred_xgb = model_xgb.predict(X_val)
val_pred_rf = model_rf.predict(X_val)
test_pred_xgb = model_xgb.predict(X_test)
test_pred_rf = model_rf.predict(X_test)

# Update test data and evaluate models
test_data['predicted_diff_xgb'] = test_pred_xgb
test_data['predicted_diff_rf'] = test_pred_rf
test_data.loc[test_data['Date'] == '2023-01-10', 'next_day_diff'] = 0.78
test_data.loc[test_data['Date'] == '2023-01-10', 'next_day_diff'] = -34.48

mse_xgb = mean_squared_error(y_test, test_pred_xgb)
r2_xgb = r2_score(y_test, test_pred_xgb)
mse_rf = mean_squared_error(y_test, test_pred_rf)
r2_rf = r2_score(y_test, test_pred_rf)

# Display predictions function
def display_predictions_for_date(date, data):
    specific_day_data = data[data['Date'] == date]
    if not specific_day_data.empty:
        print("\nData:", date)
        print("Valor Real:", specific_day_data['next_day_diff'].iloc[0])
        print("Valor Previsto XGB:", specific_day_data['predicted_diff_xgb'].iloc[0])
        print("Valor Previsto RF:", specific_day_data['predicted_diff_rf'].iloc[0])
    else:
        print("Data não encontrada no conjunto de dados.")

# Display predictions for 2023-05-10 and 2023-01-10
display_predictions_for_date('2023-05-10', test_data)
display_predictions_for_date('2023-01-10', test_data)

# Print general MSE and R^2 for both models
print("\nMSE Geral XGB:", mse_xgb)
print("R^2 XGB:", r2_xgb)
print("MSE Geral RF:", mse_rf)
print("R^2 RF:", r2_rf)



Data: 2023-05-10
Valor Real: -0.010000000000000675
Valor Previsto XGB: 0.0032010628
Valor Previsto RF: -0.015475714285714095

Data: 2023-01-10
Valor Real: -34.48
Valor Previsto XGB: -0.0077595427
Valor Previsto RF: -0.009108155921306193

MSE Geral XGB: 5.855547455217057
R^2 XGB: -0.00457081122675973
MSE Geral RF: 5.85600384396457
R^2 RF: -0.004649108741693997


In [17]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Load the training and test data files
train_data_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/bc_train.csv'
test_data_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/bc_test.csv'

train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

# Combine training and test data
combined_data = pd.concat([train_data, test_data]).reset_index(drop=True)

# Calculate the difference in 'ultimo' value for the next day
combined_data['next_day_diff'] = combined_data['ultimo'].diff(periods=-1) * -1

# Drop the last row as it does not have a next day value
combined_data = combined_data[:-1]

# Split back into train and test data
train_data = combined_data.iloc[:len(train_data)]
test_data = combined_data.iloc[len(train_data):]

# Split the training data into training and validation sets
train_X, val_X, train_y, val_y = train_test_split(
    train_data[['ultimo']],
    train_data['next_day_diff'],
    test_size=0.2,  # 20% for validation
    random_state=42
)

# Prepare the data for model
X_test = test_data[['ultimo']]
y_test = test_data['next_day_diff']

# Create and train the model
model = RandomForestRegressor(random_state=42)
model.fit(train_X, train_y)

# Validate the model
val_pred = model.predict(val_X)
val_mse = mean_squared_error(val_y, val_pred)
val_r2 = r2_score(val_y, val_pred)
print("Validation MSE:", val_mse)
print("Validation R^2:", val_r2)

# Predict for test data
y_pred = model.predict(X_test)

# Add predictions to test data for comparison
test_data['predicted_diff'] = y_pred

# Update the real difference for 2023-01-10 as per user's input
test_data.loc[test_data['Date'] == '2023-01-10', 'next_day_diff'] = -34.48

# Calculate the mean squared error and R^2 score
mse_updated = mean_squared_error(test_data['next_day_diff'], test_data['predicted_diff'])
r2_updated = r2_score(test_data['next_day_diff'], test_data['predicted_diff'])

# Function to display predicted and real values for a specific date, including MSE
def display_predictions_for_date(date, data):
    specific_day_data = data[data['Date'] == date]
    if not specific_day_data.empty:
        print("\nData:", date)
        print("Valor Real:", specific_day_data['next_day_diff'].iloc[0])
        print("Valor Previsto:", specific_day_data['predicted_diff'].iloc[0])
        print("MSE:", mse_updated)
    else:
        print("Data não encontrada no conjunto de dados.")

# Display predictions for 2023-01-10
display_predictions_for_date('2023-05-10', test_data)

print("\nMSE Geral:", mse_updated)
print("R^2:", r2_updated)


Validation MSE: 0.0015702384904894484
Validation R^2: -0.06934369509191352

Data: 2023-05-10
Valor Real: -0.010000000000000675
Valor Previsto: -0.015475714285714095
MSE: 5.85600384396457

MSE Geral: 5.85600384396457
R^2: -0.004649108741693997


In [19]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Load the training and test data files
train_data_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/bc_train.csv'
test_data_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/bc_test.csv'

train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

# Combine training and test data
combined_data = pd.concat([train_data, test_data]).reset_index(drop=True)

# Calculate the difference in 'ultimo' value for the next day
combined_data['next_day_diff'] = combined_data['ultimo'].diff(periods=-1) * -1

# Drop the last row as it does not have a next day value
combined_data = combined_data[:-1]

# Split back into train and test data
train_data = combined_data.iloc[:len(train_data)]
test_data = combined_data.iloc[len(train_data):]

# Split the training data into training and validation sets
train_X, val_X, train_y, val_y = train_test_split(
    train_data[['ultimo']],
    train_data['next_day_diff'],
    test_size=0.2,  # 20% for validation
    random_state=42
)

# Prepare the data for model
X_test = test_data[['ultimo']]
y_test = test_data['next_day_diff']

# Create and train the XGBoost model
model = XGBRegressor(random_state=42)
model.fit(train_X, train_y)

# Validate the model
val_pred = model.predict(val_X)
val_mse = mean_squared_error(val_y, val_pred)
val_r2 = r2_score(val_y, val_pred)
print("Validation MSE:", val_mse)
print("Validation R^2:", val_r2)

# Predict for test data
y_pred = model.predict(X_test)

# Add predictions to test data for comparison
test_data['predicted_diff'] = y_pred

# Update the real difference for 2023-01-10 as per user's input
test_data.loc[test_data['Date'] == '2023-01-10', 'next_day_diff'] = 0.78

# Calculate the mean squared error and R^2 score
mse_updated = mean_squared_error(test_data['next_day_diff'], test_data['predicted_diff'])
r2_updated = r2_score(test_data['next_day_diff'], test_data['predicted_diff'])

# Function to display predicted and real values for a specific date, including MSE
def display_predictions_for_date(date, data):
    specific_day_data = data[data['Date'] == date]
    if not specific_day_data.empty:
        print("\nData:", date)
        print("Valor Real:", specific_day_data['next_day_diff'].iloc[0])
        print("Valor Previsto:", specific_day_data['predicted_diff'].iloc[0])
        print("MSE:", mse_updated)
    else:
        print("Data não encontrada no conjunto de dados.")

# Display predictions for 2023-05-10
display_predictions_for_date('2023-05-10', test_data)

print("\nMSE Geral:", mse_updated)
print("R^2:", r2_updated)


Validation MSE: 0.0014867284677561707
Validation R^2: -0.012472769542905215

Data: 2023-05-10
Valor Real: -0.010000000000000675
Valor Previsto: 0.0032010628
MSE: 0.0047356470845260875

MSE Geral: 0.0047356470845260875
R^2: -0.05899442427349855


In [20]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Load the training and test data files
train_data_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/bc_train.csv'
test_data_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/bc_test.csv'

train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

# Combine training and test data
combined_data = pd.concat([train_data, test_data]).reset_index(drop=True)

# Calculate the difference in 'ultimo' value for the next day
combined_data['next_day_diff'] = combined_data['ultimo'].diff(periods=-1) * -1

# Drop the last row as it does not have a next day value
combined_data = combined_data[:-1]

# Split back into train and test data
train_data = combined_data.iloc[:len(train_data)]
test_data = combined_data.iloc[len(train_data):]

# Split the training data into training and validation sets
train_X, val_X, train_y, val_y = train_test_split(
    train_data[['ultimo']],
    train_data['next_day_diff'],
    test_size=0.2,  # 20% for validation
    random_state=42
)

# Prepare the data for model
X_test = test_data[['ultimo']]
y_test = test_data['next_day_diff']

# Create and train the model
model = LinearRegression()
model.fit(train_X, train_y)

# Validate the model
val_pred = model.predict(val_X)
val_mse = mean_squared_error(val_y, val_pred)
val_r2 = r2_score(val_y, val_pred)
print("Validation MSE:", val_mse)
print("Validation R^2:", val_r2)

# Predict for test data
y_pred = model.predict(X_test)

# Add predictions to test data for comparison
test_data['predicted_diff'] = y_pred

# Update the real difference for 2023-01-10 as per user's input
test_data.loc[test_data['Date'] == '2023-01-10', 'next_day_diff'] = 0.78

# Calculate the mean squared error and R^2 score
mse_updated = mean_squared_error(test_data['next_day_diff'], test_data['predicted_diff'])
r2_updated = r2_score(test_data['next_day_diff'], test_data['predicted_diff'])

# Function to display predicted and real values for a specific date, including MSE
def display_predictions_for_date(date, data):
    specific_day_data = data[data['Date'] == date]
    if not specific_day_data.empty:
        print("\nData:", date)
        print("Valor Real:", specific_day_data['next_day_diff'].iloc[0])
        print("Valor Previsto:", specific_day_data['predicted_diff'].iloc[0])
        print("MSE:", mse_updated)
    else:
        print("Data não encontrada no conjunto de dados.")

# Display predictions for 2023-05-10
display_predictions_for_date('2023-05-10', test_data)

print("\nMSE Geral:", mse_updated)
print("R^2:", r2_updated)


Validation MSE: 0.0014685781598101646
Validation R^2: -0.00011227941125113716

Data: 2023-05-10
Valor Real: -0.010000000000000675
Valor Previsto: 0.0004258820654241004
MSE: 0.004479295751091413

MSE Geral: 0.004479295751091413
R^2: -0.0016686506428051917


In [15]:
import pandas as pd

# Load the training and test data files
train_data_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/bc_train.csv'
test_data_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/bc_test.csv'

train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

# Combine training and test data
combined_data = pd.concat([train_data, test_data]).reset_index(drop=True)

# Calculate the difference in 'ultimo' value for the next day
combined_data['next_day_diff'] = combined_data['ultimo'].diff(periods=-1) * -1

# Drop the last row as it does not have a next day value
combined_data = combined_data[:-1]

# Use the rolling mean of the last 5 days as the prediction
combined_data['predicted_diff'] = combined_data['ultimo'].rolling(window=5).mean().shift(-1)

# Fill NaN values with zero or some other logic
combined_data['predicted_diff'].fillna(0, inplace=True)

# Split back into train and test data
train_data = combined_data.iloc[:len(train_data)]
test_data = combined_data.iloc[len(train_data):]

# Update the real difference for 2023-01-10 as per user's input
test_data.loc[test_data['Date'] == '2023-01-10', 'next_day_diff'] = -34.48

# Calculate the mean squared error and R^2 score
mse_updated = mean_squared_error(test_data['next_day_diff'], test_data['predicted_diff'])
r2_updated = r2_score(test_data['next_day_diff'], test_data['predicted_diff'])

# Function to display predicted and real values for a specific date, including MSE
def display_predictions_for_date(date, data):
    specific_day_data = data[data['Date'] == date]
    if not specific_day_data.empty:
        print("\nData:", date)
        print("Valor Real:", specific_day_data['next_day_diff'].iloc[0])
        print("Valor Previsto:", specific_day_data['predicted_diff'].iloc[0])
        print("MSE:", mse_updated)
    else:
        print("Data não encontrada no conjunto de dados.")

# Display predictions for 2023-05-10
display_predictions_for_date('2023-05-10', test_data)

print("\nMSE Geral:", mse_updated)
print("R^2:", r2_updated)



Data: 2023-05-10
Valor Real: -0.010000000000000675
Valor Previsto: 4.9639999999999995
MSE: 32.59931284729063

MSE Geral: 32.59931284729063
R^2: -4.592699641305143


In [22]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Load the training and test data files
train_data_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/bc_train.csv'
test_data_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/bc_test.csv'

train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

# Combine training and test data
combined_data = pd.concat([train_data, test_data]).reset_index(drop=True)

# Calculate the difference in 'ultimo' value for the next day
combined_data['next_day_diff'] = combined_data['ultimo'].diff(periods=-1) * -1

# Drop the last row as it does not have a next day value
combined_data = combined_data[:-1]

# Split back into train and test data
train_data = combined_data.iloc[:len(train_data)]
test_data = combined_data.iloc[len(train_data):]

# Split the training data into training and validation sets
train_X, val_X, train_y, val_y = train_test_split(
    train_data[['ultimo']],
    train_data['next_day_diff'],
    test_size=0.2,  # 20% for validation
    random_state=42
)

# Prepare the data for model
X_test = test_data[['ultimo']]
y_test = test_data['next_day_diff']

# Create and train the model
# You can adjust the alpha parameter as needed
model = Ridge(alpha=1.0)
model.fit(train_X, train_y)

# Validate the model
val_pred = model.predict(val_X)
val_mse = mean_squared_error(val_y, val_pred)
val_r2 = r2_score(val_y, val_pred)
print("Validation MSE:", val_mse)
print("Validation R^2:", val_r2)

# Predict for test data
y_pred = model.predict(X_test)

# Add predictions to test data for comparison
test_data['predicted_diff'] = y_pred

# Update the real difference for 2023-01-10 as per user's input
test_data.loc[test_data['Date'] == '2023-01-10', 'next_day_diff'] = -34.48

# Calculate the mean squared error and R^2 score
mse_updated = mean_squared_error(test_data['next_day_diff'], test_data['predicted_diff'])
r2_updated = r2_score(test_data['next_day_diff'], test_data['predicted_diff'])

# Function to display predicted and real values for a specific date, including MSE
def display_predictions_for_date(date, data):
    specific_day_data = data[data['Date'] == date]
    if not specific_day_data.empty:
        print("\nData:", date)
        print("Valor Real:", specific_day_data['next_day_diff'].iloc[0])
        print("Valor Previsto:", specific_day_data['predicted_diff'].iloc[0])
        print("MSE:", mse_updated)
    else:
        print("Data não encontrada no conjunto de dados.")

# Display predictions for 2023-05-10
display_predictions_for_date('2023-05-10', test_data)

print("\nMSE Geral:", mse_updated)
print("R^2:", r2_updated)


Validation MSE: 0.0014685781638859197
Validation R^2: -0.00011228218686953895

Data: 2023-05-10
Valor Real: -0.010000000000000675
Valor Previsto: 0.0004259282635012846
MSE: 5.858122843255394

MSE Geral: 5.858122843255394
R^2: -0.005012641759375436


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from statsmodels.tsa.arima.model import ARIMA
import itertools
import warnings
from sklearn.metrics import r2_score

# Carregar dados
data = pd.read_csv('C:/Users/milen/OneDrive/Documentos/TCC/Bases/base_completa.csv')
preco_real = data['Preco_Real'].values.reshape(-1, 1)

# Normalizar dados
scaler = MinMaxScaler(feature_range=(0, 1))
preco_real_normalized = scaler.fit_transform(preco_real)

# Diferença diária normalizada
diff_preco_real_normalized = preco_real_normalized[1:] - preco_real_normalized[:-1]

# Dividindo os dados em treino e teste
n_test = 30  # Número de observações para o conjunto de teste
train, test = diff_preco_real_normalized[:-n_test], diff_preco_real_normalized[-n_test:]

# Seleção de parâmetros para ARIMA
p = d = q = range(0, 3)
pdq_combinations = list(itertools.product(p, d, q))
best_aic = float('inf')
best_pdq = None
best_model = None

# Ignorar avisos
warnings.filterwarnings("ignore")

# Testar diferentes combinações
print("Iniciando a seleção de parâmetros para o modelo ARIMA...")
for pdq in pdq_combinations:
    try:
        model = ARIMA(train, order=pdq)
        model_fit = model.fit()
        print(f'Testando PDQ: {pdq}, AIC: {model_fit.aic}')
        if model_fit.aic < best_aic:
            best_aic = model_fit.aic
            best_pdq = pdq
            best_model = model_fit
    except:
        continue

# Ajustar modelo no conjunto de treino
model = ARIMA(train, order=best_pdq)
model_fit = model.fit()

# Fazer previsões no conjunto de teste
predictions = model_fit.forecast(steps=n_test)

# Calcular e imprimir R^2
r2_arima = r2_score(test, predictions)
print(f'R^2 (ARIMA): {r2_arima}')


Iniciando a seleção de parâmetros para o modelo ARIMA...
Testando PDQ: (0, 0, 0), AIC: -30232.286693658072
Testando PDQ: (0, 0, 1), AIC: -30232.421831591695
Testando PDQ: (0, 0, 2), AIC: -30233.3551980154
Testando PDQ: (0, 1, 0), AIC: -27426.08387208671
Testando PDQ: (0, 1, 1), AIC: -30222.52290146103
Testando PDQ: (0, 1, 2), AIC: -30005.00371520884
Testando PDQ: (0, 2, 0), AIC: -22815.299698541465
Testando PDQ: (0, 2, 1), AIC: -27403.160328027792
Testando PDQ: (0, 2, 2), AIC: -29584.518055291555
Testando PDQ: (1, 0, 0), AIC: -30232.54597191583
Testando PDQ: (1, 0, 1), AIC: -30228.288928675072
Testando PDQ: (1, 0, 2), AIC: -30242.70313588049
Testando PDQ: (1, 1, 0), AIC: -28644.957340011762
Testando PDQ: (1, 1, 1), AIC: -30221.512713977703
Testando PDQ: (1, 1, 2), AIC: -29992.222403753647
Testando PDQ: (1, 2, 0), AIC: -25245.52709160508
Testando PDQ: (1, 2, 1), AIC: -28619.784964373735
Testando PDQ: (1, 2, 2), AIC: -28643.99067629912
Testando PDQ: (2, 0, 0), AIC: -30234.061605667444
Te

In [6]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import LSTM, Dense
from sklearn.metrics import r2_score

# Carregar e preparar dados
data = pd.read_csv('C:/Users/milen/OneDrive/Documentos/TCC/Bases/base_completa.csv')
preco_real = data['Preco_Real'].values.reshape(-1, 1)

# Normalizar dados
scaler = MinMaxScaler(feature_range=(0, 1))
preco_real_normalized = scaler.fit_transform(preco_real)

# Preparar dados para LSTM
def create_dataset(dataset, look_back=1):
    X, Y = [], []
    for i in range(len(dataset)-look_back-1):
        a = dataset[i:(i+look_back), 0]
        X.append(a)
        Y.append(dataset[i + look_back, 0])
    return np.array(X), np.array(Y)

look_back = 1
X, Y = create_dataset(preco_real_normalized, look_back)

# Reshape [samples, time steps, features]
X = np.reshape(X, (X.shape[0], 1, X.shape[1]))

# Dividir dados em treino e teste
train_size = int(len(X) * 0.7)
test_size = len(X) - train_size
trainX, testX = X[0:train_size], X[train_size:len(X)]
trainY, testY = Y[0:train_size], Y[train_size:len(Y)]

# Construir modelo LSTM
model = Sequential()
model.add(LSTM(50, input_shape=(1, look_back)))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mean_squared_error')

# Treinar modelo
print("Iniciando treinamento do modelo LSTM...")
model.fit(trainX, trainY, epochs=100, batch_size=1, verbose=2)
print("Treinamento concluído.")

# Fazer previsões
trainPredict = model.predict(trainX)
testPredict = model.predict(testX)

# Inverter previsões
trainPredict = scaler.inverse_transform(trainPredict)
trainY = scaler.inverse_transform([trainY])
testPredict = scaler.inverse_transform(testPredict)
testY = scaler.inverse_transform([testY])

# Calcular e imprimir R^2
r2_lstm = r2_score(testY[0], testPredict[:,0])
print(f'R^2 (LSTM): {r2_lstm}')

# Assuming you have the date for which you want to make a prediction
target_date = '2023-10-11'
target_index = data[data['Date'] == target_date].index[0]

# Check if target_index is within the range
if target_index >= train_size:
    # Prepare the input for prediction
    input_data = X[target_index - train_size].reshape(1, 1, look_back)
    
    # Make the prediction
    predicted_value = model.predict(input_data)

    # Invert the normalization
    predicted_value = scaler.inverse_transform(predicted_value)
    print(f'Predicted value for {target_date}: {predicted_value[0][0]}')
else:
    print(f"Target date {target_date} is not within the prediction range.")


Iniciando treinamento do modelo LSTM...
Epoch 1/100
2949/2949 - 3s - loss: 4.4898e-04 - 3s/epoch - 1ms/step
Epoch 2/100
2949/2949 - 3s - loss: 3.0099e-05 - 3s/epoch - 897us/step
Epoch 3/100
2949/2949 - 2s - loss: 3.0197e-05 - 2s/epoch - 830us/step
Epoch 4/100
2949/2949 - 2s - loss: 3.1645e-05 - 2s/epoch - 832us/step
Epoch 5/100
2949/2949 - 2s - loss: 2.9369e-05 - 2s/epoch - 840us/step
Epoch 6/100
2949/2949 - 3s - loss: 2.8244e-05 - 3s/epoch - 855us/step
Epoch 7/100
2949/2949 - 2s - loss: 2.8313e-05 - 2s/epoch - 833us/step
Epoch 8/100
2949/2949 - 2s - loss: 2.8277e-05 - 2s/epoch - 834us/step
Epoch 9/100
2949/2949 - 3s - loss: 2.7586e-05 - 3s/epoch - 928us/step
Epoch 10/100
2949/2949 - 3s - loss: 2.7155e-05 - 3s/epoch - 971us/step
Epoch 11/100
2949/2949 - 4s - loss: 2.7053e-05 - 4s/epoch - 1ms/step
Epoch 12/100
2949/2949 - 3s - loss: 2.6649e-05 - 3s/epoch - 921us/step
Epoch 13/100
2949/2949 - 3s - loss: 2.6072e-05 - 3s/epoch - 851us/step
Epoch 14/100
2949/2949 - 3s - loss: 2.7316e-05 - 3

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Carregando os dados de treino e teste
file_path_train = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/bc_train.csv'
file_path_test = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/bc_test.csv'


df_train = pd.read_csv(file_path_train)
df_test = pd.read_csv(file_path_test)

# Combinando os conjuntos de dados de treino e teste
df_combined = pd.concat([df_train, df_test])

# Função para remover outliers
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Removendo outliers da coluna 'Preco_Real'
df_combined = remove_outliers(df_combined, 'Preco_Real')

# Calculando a diferença diária dos pontos
df_combined.sort_values('Date', inplace=True)
df_combined['Diferenca_Pontos'] = df_combined['Preco_Real'].diff()

# Criando coluna para o "valor previsto" como a diferença do dia anterior
df_combined['Valor_Previsto'] = df_combined['Diferenca_Pontos'].shift(-1)

# Removendo as linhas com valores NaN resultantes dos cálculos
df_combined.dropna(inplace=True)

# Exibindo os resultados
print(df_combined[['Date', 'Preco_Real', 'Diferenca_Pontos', 'Valor_Previsto']].tail())

# Calculando as métricas
y_real = df_combined['Diferenca_Pontos']
y_previsto = df_combined['Valor_Previsto']
mse = mean_squared_error(y_real, y_previsto)
mae = mean_absolute_error(y_real, y_previsto)
rmse = mse ** 0.5
r2 = r2_score(y_real, y_previsto)

# Exibindo as métricas
print(f"MSE: {mse}, MAE: {mae}, RMSE: {rmse}, R^2: {r2}")

           Date  Preco_Real  Diferenca_Pontos  Valor_Previsto
154  2023-10-10      801.77             -3.07           -1.30
155  2023-10-11      800.47             -1.30           19.00
156  2023-10-13      819.47             19.00            5.03
157  2023-10-16      824.50              5.03            2.39
158  2023-10-17      826.89              2.39            5.97
MSE: 73.9302806302635, MAE: 6.1377940776962765, RMSE: 8.598271956053932, R^2: -1.004415282858817


In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load the datasets
train_file_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/base_completa.csv' # Update this path
test_file_path = 'path_to_testing_dataset.csv'    # Update this path

data_train = pd.read_csv(train_file_path)
data_test = pd.read_csv(test_file_path)

# Convert 'Date' to datetime for both datasets
data_train['Date'] = pd.to_datetime(data_train['Date'])
data_test['Date'] = pd.to_datetime(data_test['Date'])

# Calculate day-to-day price difference for training data
data_train['Price_Diff'] = data_train['Preco_Real'].diff()
data_train = data_train.dropna()

# Lists for metrics and predictions
predictions_df = pd.DataFrame(columns=['Date', 'Actual', 'Predicted'])

# Implementing the Baseline Model: Use last known value as prediction
for index, row in data_test.iterrows():
    current_date = row['Date']
    test_price = row['Preco_Real']

    # Find the closest previous date in the training data
    closest_date = data_train[data_train['Date'] < current_date].iloc[-1]['Date']
    last_known_price = data_train[data_train['Date'] == closest_date]['Preco_Real'].values[0]

    predictions_df = predictions_df.append({
        'Date': current_date,
        'Actual': test_price - last_known_price,
        'Predicted': last_known_price - data_train[data_train['Date'] < closest_date].iloc[-1]['Preco_Real']
    }, ignore_index=True)

# Calculate overall metrics
mse = mean_squared_error(predictions_df['Actual'], predictions_df['Predicted'])
rmse = np.sqrt(mse)
mae = mean_absolute_error(predictions_df['Actual'], predictions_df['Predicted'])
r2 = r2_score(predictions_df['Actual'], predictions_df['Predicted'])

print("MSE:", mse)
print("RMSE:", rmse)
print("MAE:", mae)
print("R2:", r2)
print(predictions_df)


  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictio

MSE: 77175.31569285714
RMSE: 277.8044558549361
MAE: 247.6977380952381
R2: -3.877985105805526
          Date  Actual  Predicted
0   2023-03-01  -26.44      19.51
1   2023-03-02  -23.99      19.51
2   2023-03-03  -46.60      19.51
3   2023-03-06  -37.07      19.51
4   2023-03-07  -26.97      19.51
..         ...     ...        ...
163 2023-10-24 -285.01      19.51
164 2023-10-25 -310.07      19.51
165 2023-10-26 -313.14      19.51
166 2023-10-27 -319.33      19.51
167 2023-10-30 -324.00      19.51

[168 rows x 3 columns]


  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({


In [6]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Carrega o dataset
dolar_file_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/base_completa.csv'
data_real = pd.read_csv(dolar_file_path)

# Converte 'Date' para datetime
data_real['Date'] = pd.to_datetime(data_real['Date'])

# Calcula a diferença diária de preço
data_real['Price_Diff'] = data_real['Preco_Real'].diff()

# Remove a primeira linha, pois ela terá um valor NaN para 'Price_Diff'
data_real = data_real.dropna()

# Configuração do período de teste
start_test_date = pd.to_datetime('2022-01-01')
end_test_date = pd.to_datetime('2023-08-01')

# Listas para métricas e previsões
metrics = []
predictions_df = pd.DataFrame(columns=['Date', 'Actual', 'Predicted'])

# Implementando o Modelo de Linha de Base: Usa o último valor conhecido como previsão
for current_date in pd.date_range(start=start_test_date, end=end_test_date, freq='1D'):
    test_window = data_real[data_real['Date'] == current_date]

    if test_window.empty:
        continue

    last_known_price = data_real[data_real['Date'] < current_date].iloc[-1]['Preco_Real']
    predictions_df = predictions_df.append({
        'Date': current_date,
        'Actual': test_window['Price_Diff'].values[0],
        'Predicted': last_known_price - data_real[data_real['Date'] < current_date].iloc[-2]['Preco_Real']
    }, ignore_index=True)

# Calcula as métricas gerais
mse = mean_squared_error(predictions_df['Actual'], predictions_df['Predicted'])
rmse = np.sqrt(mse)
mae = mean_absolute_error(predictions_df['Actual'], predictions_df['Predicted'])
r2 = r2_score(predictions_df['Actual'], predictions_df['Predicted'])

# Mostra as previsões para um período específico
desired_period = predictions_df[
    (predictions_df['Date'] >= '2023-07-01') &
    (predictions_df['Date'] <= '2023-07-09')
]

print("MSE:", mse)
print("RMSE:", rmse)
print("MAE:", mae)
print("R2:", r2)
print(desired_period)


  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictio

MSE: 650.2591000000006
RMSE: 25.500178430748296
MAE: 19.119141414141424
R2: -0.9806884307243899
          Date  Actual  Predicted
374 2023-07-03   -1.11      -4.54
375 2023-07-04    0.22      -1.11
376 2023-07-05   -2.45       0.22
377 2023-07-06    7.16      -2.45
378 2023-07-07   -0.22       7.16


  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictions_df = predictions_df.append({
  predictio