1 D

In [22]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Load and process data
train_data_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/clima_train.csv'
test_data_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/clima_test.csv'
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
combined_data = pd.concat([train_data, test_data]).reset_index(drop=True)
combined_data['next_day_diff'] = combined_data['Preco_Real'].diff(periods=-1) * -1
combined_data = combined_data[:-1]  # Drop the last row
train_data = combined_data.iloc[:len(train_data)]
test_data = combined_data.iloc[len(train_data):]

# Split data
X_train, X_val, y_train, y_val = train_test_split(
    train_data[['Preco_Real']],
    train_data['next_day_diff'],
    test_size=0.2,
    random_state=42
)
X_test = test_data[['Preco_Real']]
y_test = test_data['next_day_diff']

# Train models
model_lr = LinearRegression()
model_ridge = Ridge(alpha=1.0)
model_xgb = XGBRegressor(random_state=42)
model_rf = RandomForestRegressor(random_state=42)

model_lr.fit(X_train, y_train)
model_ridge.fit(X_train, y_train)
model_xgb.fit(X_train, y_train)
model_rf.fit(X_train, y_train)

# Make predictions
val_pred_lr = model_lr.predict(X_val)
val_pred_ridge = model_ridge.predict(X_val)
val_pred_xgb = model_xgb.predict(X_val)
val_pred_rf = model_rf.predict(X_val)

test_pred_lr = model_lr.predict(X_test)
test_pred_ridge = model_ridge.predict(X_test)
test_pred_xgb = model_xgb.predict(X_test)
test_pred_rf = model_rf.predict(X_test)

# Update test data and evaluate models
test_data['predicted_diff_lr'] = test_pred_lr
test_data['predicted_diff_ridge'] = test_pred_ridge
test_data['predicted_diff_xgb'] = test_pred_xgb
test_data['predicted_diff_rf'] = test_pred_rf

test_data.loc[test_data['Date'] == '2023-05-10', 'next_day_diff'] = 0.78  # Update as per your requirement

mse_lr = mean_squared_error(y_test, test_pred_lr)
r2_lr = r2_score(y_test, test_pred_lr)
mse_ridge = mean_squared_error(y_test, test_pred_ridge)
r2_ridge = r2_score(y_test, test_pred_ridge)
mse_xgb = mean_squared_error(y_test, test_pred_xgb)
r2_xgb = r2_score(y_test, test_pred_xgb)
mse_rf = mean_squared_error(y_test, test_pred_rf)
r2_rf = r2_score(y_test, test_pred_rf)

# Display predictions function
def display_predictions_for_date(date, data):
    specific_day_data = data[data['Date'] == date]
    if not specific_day_data.empty:
        print("\nData:", date)
        print("Valor Real:", specific_day_data['next_day_diff'].iloc[0])
        print("Valor Previsto LR:", specific_day_data['predicted_diff_lr'].iloc[0])
        print("Valor Previsto Ridge:", specific_day_data['predicted_diff_ridge'].iloc[0])
        print("Valor Previsto XGB:", specific_day_data['predicted_diff_xgb'].iloc[0])
        print("Valor Previsto RF:", specific_day_data['predicted_diff_rf'].iloc[0])
    else:
        print("Data não encontrada no conjunto de dados.")

# # Display predictions for 2023-05-10 and 2023-01-10
display_predictions_for_date('2023-05-10', test_data)

# Print general MSE and R^2 for all models
print("\nMSE Geral LR:", mse_lr)
print("R^2 LR:", r2_lr)
print("MSE Geral Ridge:", mse_ridge)
print("R^2 Ridge:", r2_ridge)
print("MSE Geral XGB:", mse_xgb)
print("R^2 XGB:", r2_xgb)
print("MSE Geral RF:", mse_rf)
print("R^2 RF:", r2_rf)



Data: 2023-05-10
Valor Real: 0.78
Valor Previsto LR: 0.2953449804226712
Valor Previsto Ridge: 0.29534498005706955
Valor Previsto XGB: -1.3776718
Valor Previsto RF: -8.779000000000087

MSE Geral LR: 160.69990437999883
R^2 LR: -0.024772522586222978
MSE Geral Ridge: 160.69990437866147
R^2 Ridge: -0.02477252257769469
MSE Geral XGB: 190.0672813675326
R^2 XGB: -0.21204631788414496
MSE Geral RF: 312.07361564538934
R^2 RF: -0.9900725365791365


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_diff_lr'] = test_pred_lr
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_diff_ridge'] = test_pred_ridge
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_diff_xgb'] = test_pred_xgb
A value is trying to be set on a copy of a slice from a Dat

In [29]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Carregar o dataset
dolar_file_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/base_completa.csv'
data_real = pd.read_csv(dolar_file_path)

# Converter 'Date' para datetime
data_real['Date'] = pd.to_datetime(data_real['Date'])

# Calcular a diferença diária de preço
data_real['Daily_Price_Diff'] = data_real['Preco_Real'].diff()

# Calcular a diferença acumulada de 7 dias
data_real['7_Day_Price_Diff'] = data_real['Daily_Price_Diff'].rolling(window=7).sum()

# Remover linhas com NaN (primeiros 6 dias não terão uma diferença de 7 dias)
data_real = data_real.dropna(subset=['7_Day_Price_Diff'])

# Configuração do período de teste
start_test_date = pd.to_datetime('2022-01-01')
end_test_date = pd.to_datetime('2023-08-01')

# Listas para métricas e previsões
metrics = []
predictions_df = pd.DataFrame(columns=['Date', 'Actual', 'Predicted'])

# Implementar o Modelo de Linha de Base para 7 dias
for current_date in pd.date_range(start=start_test_date, end=end_test_date, freq='7D'):
    test_window = data_real[data_real['Date'] == current_date]

    if test_window.empty:
        continue

    # Usa a diferença acumulada dos últimos 7 dias como previsão
    predicted_diff = data_real[data_real['Date'] < current_date].iloc[-7:]['Daily_Price_Diff'].sum()
    predictions_df = predictions_df.append({
        'Date': current_date,
        'Actual': test_window['7_Day_Price_Diff'].values[0],
        'Predicted': predicted_diff
    }, ignore_index=True)

# Calcular as métricas gerais
mse = mean_squared_error(predictions_df['Actual'], predictions_df['Predicted'])
rmse = np.sqrt(mse)
mae = mean_absolute_error(predictions_df['Actual'], predictions_df['Predicted'])
r2 = r2_score(predictions_df['Actual'], predictions_df['Predicted'])

# Mostrar as previsões para um período específico
desired_period = predictions_df[
    (predictions_df['Date'] >= '2023-03-10') &
    (predictions_df['Date'] <= '2023-05-10')
]

print("MSE:", mse)
print("RMSE:", rmse)
print("MAE:", mae)
print("R2:", r2)
print(desired_period)


ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.

In [23]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Carregar e processar os dados
train_data_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/clima_train.csv'
test_data_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/clima_test.csv'
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
combined_data = pd.concat([train_data, test_data]).reset_index(drop=True)

# Calcular a diferença do próximo dia (alvo a ser previsto)
combined_data['next_day_diff'] = combined_data['Preco_Real'].diff(periods=-1) * -1

# Calcular média móvel de 30 dias (como característica)
combined_data['30_day_moving_avg'] = combined_data['Preco_Real'].rolling(window=30).mean()

# Remover linhas com NaN
combined_data.dropna(subset=['30_day_moving_avg', 'next_day_diff'], inplace=True)

# Atualizar manualmente o valor para 2023-05-10 conforme necessário
combined_data.loc[combined_data['Date'] == '2023-05-10', 'next_day_diff'] = 0.78

# Dividir novamente em dados de treino e teste
train_len = len(train_data) - 29  # Ajuste para as linhas NaN removidas
train_data = combined_data.iloc[:train_len]
test_data = combined_data.iloc[train_len:]

# Dividir os dados
X_train, X_val, y_train, y_val = train_test_split(
    train_data[['30_day_moving_avg']],
    train_data['next_day_diff'],
    test_size=0.2,
    random_state=42
)
X_test = test_data[['30_day_moving_avg']]
y_test = test_data['next_day_diff']

# Treinar modelos
model_lr = LinearRegression()
model_ridge = Ridge(alpha=1.0)
model_xgb = XGBRegressor(random_state=42)
model_rf = RandomForestRegressor(random_state=42)

model_lr.fit(X_train, y_train)
model_ridge.fit(X_train, y_train)
model_xgb.fit(X_train, y_train)
model_rf.fit(X_train, y_train)

# Fazer previsões
test_pred_lr = model_lr.predict(X_test)
test_pred_ridge = model_ridge.predict(X_test)
test_pred_xgb = model_xgb.predict(X_test)
test_pred_rf = model_rf.predict(X_test)

# Atualizar dados de teste e avaliar modelos
test_data['predicted_diff_lr'] = test_pred_lr
test_data['predicted_diff_ridge'] = test_pred_ridge
test_data['predicted_diff_xgb'] = test_pred_xgb
test_data['predicted_diff_rf'] = test_pred_rf

# Função para exibir previsões para uma data específica
def display_predictions_for_date(date, data):
    specific_day_data = data[data['Date'] == date]
    if not specific_day_data.empty:
        print("\nData:", date)
        print("Valor Real:", specific_day_data['next_day_diff'].iloc[0])
        print("Valor Previsto LR:", specific_day_data['predicted_diff_lr'].iloc[0])
        print("Valor Previsto Ridge:", specific_day_data['predicted_diff_ridge'].iloc[0])
        print("Valor Previsto XGB:", specific_day_data['predicted_diff_xgb'].iloc[0])
        print("Valor Previsto RF:", specific_day_data['predicted_diff_rf'].iloc[0])
    else:
        print("Data não encontrada no conjunto de dados.")

# Exibir previsões para uma data específica
display_predictions_for_date('2023-05-10', test_data)

# Calcular e imprimir o MSE e R^2 para todos os modelos
mse_lr = mean_squared_error(y_test, test_pred_lr)
r2_lr = r2_score(y_test, test_pred_lr)
mse_ridge = mean_squared_error(y_test, test_pred_ridge)
r2_ridge = r2_score(y_test, test_pred_ridge)
mse_xgb = mean_squared_error(y_test, test_pred_xgb)
r2_xgb = r2_score(y_test, test_pred_xgb)
mse_rf = mean_squared_error(y_test, test_pred_rf)
r2_rf = r2_score(y_test, test_pred_rf)

print("\nMSE Geral LR:", mse_lr)
print("R^2 LR:", r2_lr)
print("MSE Geral Ridge:", mse_ridge)
print("R^2 Ridge:", r2_ridge)
print("MSE Geral XGB:", mse_xgb)
print("R^2 XGB:", r2_xgb)
print("MSE Geral RF:", mse_rf)
print("R^2 RF:", r2_rf)



Data: 2023-05-10
Valor Real: 0.78
Valor Previsto LR: 0.012232945821221919
Valor Previsto Ridge: 0.01223294628195179
Valor Previsto XGB: 5.2427754
Valor Previsto RF: -0.11229999999994561

MSE Geral LR: 159.7063500925031
R^2 LR: -0.01843669346764565
MSE Geral Ridge: 159.70635009399254
R^2 Ridge: -0.01843669347714383
MSE Geral XGB: 197.63191770113642
R^2 XGB: -0.2602854969175299
MSE Geral RF: 245.16989148838323
R^2 RF: -0.5634319704922703


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_diff_lr'] = test_pred_lr
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_diff_ridge'] = test_pred_ridge
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_diff_xgb'] = test_pred_xgb
A value is trying to be set on a copy of a slice from a Dat

In [24]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Carregar e processar os dados
train_data_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/clima_train.csv'
test_data_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/clima_test.csv'
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
combined_data = pd.concat([train_data, test_data]).reset_index(drop=True)

# Calcular a diferença do próximo dia (alvo a ser previsto)
combined_data['next_day_diff'] = combined_data['Preco_Real'].diff(periods=-1) * -1

# Calcular média móvel de 30 dias (como característica)
combined_data['30_day_moving_avg'] = combined_data['Preco_Real'].rolling(window=1).mean()

# Remover linhas com NaN
combined_data.dropna(subset=['30_day_moving_avg', 'next_day_diff'], inplace=True)

# Atualizar manualmente o valor para 2023-05-10 conforme necessário
combined_data.loc[combined_data['Date'] == '2023-05-10', 'next_day_diff'] = 0.78

# Dividir novamente em dados de treino e teste
train_len = len(train_data) - 1  # Ajuste para as linhas NaN removidas
train_data = combined_data.iloc[:train_len]
test_data = combined_data.iloc[train_len:]

# Dividir os dados
X_train, X_val, y_train, y_val = train_test_split(
    train_data[['30_day_moving_avg']],
    train_data['next_day_diff'],
    test_size=0.2,
    random_state=42
)
X_test = test_data[['30_day_moving_avg']]
y_test = test_data['next_day_diff']

# Treinar modelos
model_lr = LinearRegression()
model_ridge = Ridge(alpha=1.0)
model_xgb = XGBRegressor(random_state=42)
model_rf = RandomForestRegressor(random_state=42)

model_lr.fit(X_train, y_train)
model_ridge.fit(X_train, y_train)
model_xgb.fit(X_train, y_train)
model_rf.fit(X_train, y_train)

# Fazer previsões
test_pred_lr = model_lr.predict(X_test)
test_pred_ridge = model_ridge.predict(X_test)
test_pred_xgb = model_xgb.predict(X_test)
test_pred_rf = model_rf.predict(X_test)

# Atualizar dados de teste e avaliar modelos
test_data['predicted_diff_lr'] = test_pred_lr
test_data['predicted_diff_ridge'] = test_pred_ridge
test_data['predicted_diff_xgb'] = test_pred_xgb
test_data['predicted_diff_rf'] = test_pred_rf

# Função para exibir previsões para uma data específica
def display_predictions_for_date(date, data):
    specific_day_data = data[data['Date'] == date]
    if not specific_day_data.empty:
        print("\nData:", date)
        print("Valor Real:", specific_day_data['next_day_diff'].iloc[0])
        print("Valor Previsto LR:", specific_day_data['predicted_diff_lr'].iloc[0])
        print("Valor Previsto Ridge:", specific_day_data['predicted_diff_ridge'].iloc[0])
        print("Valor Previsto XGB:", specific_day_data['predicted_diff_xgb'].iloc[0])
        print("Valor Previsto RF:", specific_day_data['predicted_diff_rf'].iloc[0])
    else:
        print("Data não encontrada no conjunto de dados.")

# Exibir previsões para uma data específica
display_predictions_for_date('2023-05-10', test_data)

# Calcular e imprimir o MSE e R^2 para todos os modelos
mse_lr = mean_squared_error(y_test, test_pred_lr)
r2_lr = r2_score(y_test, test_pred_lr)
mse_ridge = mean_squared_error(y_test, test_pred_ridge)
r2_ridge = r2_score(y_test, test_pred_ridge)
mse_xgb = mean_squared_error(y_test, test_pred_xgb)
r2_xgb = r2_score(y_test, test_pred_xgb)
mse_rf = mean_squared_error(y_test, test_pred_rf)
r2_rf = r2_score(y_test, test_pred_rf)

print("\nMSE Geral LR:", mse_lr)
print("R^2 LR:", r2_lr)
print("MSE Geral Ridge:", mse_ridge)
print("R^2 Ridge:", r2_ridge)
print("MSE Geral XGB:", mse_xgb)
print("R^2 XGB:", r2_xgb)
print("MSE Geral RF:", mse_rf)
print("R^2 RF:", r2_rf)



Data: 2023-05-10
Valor Real: 0.78
Valor Previsto LR: -0.29913911440239715
Valor Previsto Ridge: -0.29913911258913495
Valor Previsto XGB: -2.1938832
Valor Previsto RF: 8.935100000000013

MSE Geral LR: 161.8512311570938
R^2 LR: -0.014685946245272419
MSE Geral Ridge: 161.85123116290248
R^2 Ridge: -0.0146859462816884
MSE Geral XGB: 180.61558992944825
R^2 XGB: -0.13232441584784183
MSE Geral RF: 296.78086209886897
R^2 RF: -0.8605936311599065


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_diff_lr'] = test_pred_lr
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_diff_ridge'] = test_pred_ridge
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_diff_xgb'] = test_pred_xgb
A value is trying to be set on a copy of a slice from a Dat

7 DIAS

In [25]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Load and process data
train_data_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/clima_train.csv'
test_data_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/clima_test.csv'
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
combined_data = pd.concat([train_data, test_data]).reset_index(drop=True)
combined_data['daily_diff'] = combined_data['Preco_Real'].diff(-1)

# Calculate 7-day accumulated difference
combined_data['7_day_diff'] = combined_data['daily_diff'].rolling(min_periods=1, window=7).sum()

# Ensure there are no NaN values in '7_day_diff' column
combined_data = combined_data[~combined_data['7_day_diff'].isna()]

# Split back into train and test data
train_data = combined_data.iloc[:len(train_data)]
test_data = combined_data.iloc[len(train_data):]

# Split data for training and testing
X_train, X_val, y_train, y_val = train_test_split(
    train_data[['Preco_Real']],
    train_data['7_day_diff'],
    test_size=0.2,
    random_state=42
)
X_test = test_data[['Preco_Real']]
y_test = test_data['7_day_diff']

# Train models
model_lr = LinearRegression()
model_ridge = Ridge(alpha=1.0)
model_xgb = XGBRegressor(random_state=42)
model_rf = RandomForestRegressor(random_state=42)

model_lr.fit(X_train, y_train)
model_ridge.fit(X_train, y_train)
model_xgb.fit(X_train, y_train)
model_rf.fit(X_train, y_train)

# Make predictions
test_pred_lr = model_lr.predict(X_test)
test_pred_ridge = model_ridge.predict(X_test)
test_pred_xgb = model_xgb.predict(X_test)
test_pred_rf = model_rf.predict(X_test)

# Update test data with predicted values
test_data['predicted_7_day_diff_lr'] = test_pred_lr
test_data['predicted_7_day_diff_ridge'] = test_pred_ridge
test_data['predicted_7_day_diff_xgb'] = test_pred_xgb
test_data['predicted_7_day_diff_rf'] = test_pred_rf

# Function to display predictions for a specific date
def display_7_day_diff_predictions_for_date(date, data):
    specific_day_data = data[data['Date'] == date]
    if not specific_day_data.empty:
        print("\nData:", date)
        print("Diferença Acumulada Real de 7 Dias:", specific_day_data['7_day_diff'].iloc[0])
        print("Diferença Acumulada Prevista de 7 Dias LR:", specific_day_data['predicted_7_day_diff_lr'].iloc[0])
        print("Diferença Acumulada Prevista de 7 Dias Ridge:", specific_day_data['predicted_7_day_diff_ridge'].iloc[0])
        print("Diferença Acumulada Prevista de 7 Dias XGB:", specific_day_data['predicted_7_day_diff_xgb'].iloc[0])
        print("Diferença Acumulada Prevista de 7 Dias RF:", specific_day_data['predicted_7_day_diff_rf'].iloc[0])
    else:
        print("Data não encontrada no conjunto de dados.")

# Display predictions for 2023-05-10
display_7_day_diff_predictions_for_date('2023-05-10', test_data)

# Evaluate models
mse_lr = mean_squared_error(y_test, test_pred_lr)
r2_lr = r2_score(y_test, test_pred_lr)
mse_ridge = mean_squared_error(y_test, test_pred_ridge)
r2_ridge = r2_score(y_test, test_pred_ridge)
mse_xgb = mean_squared_error(y_test, test_pred_xgb)
r2_xgb = r2_score(y_test, test_pred_xgb)
mse_rf = mean_squared_error(y_test, test_pred_rf)
r2_rf = r2_score(y_test, test_pred_rf)

# Print general MSE



Data: 2023-05-10
Diferença Acumulada Real de 7 Dias: 36.569999999999936
Diferença Acumulada Prevista de 7 Dias LR: -5.527717834536817
Diferença Acumulada Prevista de 7 Dias Ridge: -5.527717818866343
Diferença Acumulada Prevista de 7 Dias XGB: -30.83013
Diferença Acumulada Prevista de 7 Dias RF: -78.44679999999997


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_7_day_diff_lr'] = test_pred_lr
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_7_day_diff_ridge'] = test_pred_ridge
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_7_day_diff_xgb'] = test_pred_xgb
A value is trying to be set on a copy of 

In [26]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Carregar e processar os dados
train_data_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/clima_train.csv'
test_data_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/clima_test.csv'
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
combined_data = pd.concat([train_data, test_data]).reset_index(drop=True)

# Calcular a diferença diária e a diferença acumulada de 7 dias
combined_data['daily_diff'] = combined_data['Preco_Real'].diff(-1)
combined_data['7_day_diff'] = combined_data['daily_diff'].rolling(min_periods=1, window=7).sum()

# Calcular a média móvel de 30 dias
combined_data['30_day_moving_avg'] = combined_data['Preco_Real'].rolling(window=30).mean()

# Remover linhas com NaN
combined_data.dropna(subset=['30_day_moving_avg', '7_day_diff'], inplace=True)

# Dividir novamente em dados de treino e teste
train_data = combined_data.iloc[:len(train_data) - 29] # Ajuste para as linhas NaN removidas
test_data = combined_data.iloc[len(train_data) - 29:]

# Dividir os dados para treino e teste
X_train, X_val, y_train, y_val = train_test_split(
    train_data[['30_day_moving_avg']], # Usando a média móvel de 30 dias como característica
    train_data['7_day_diff'], # Previsão da diferença acumulada de 7 dias
    test_size=0.2,
    random_state=42
)
X_test = test_data[['30_day_moving_avg']]
y_test = test_data['7_day_diff']

# Treinar modelos
model_lr = LinearRegression()
model_ridge = Ridge(alpha=1.0)
model_xgb = XGBRegressor(random_state=42)
model_rf = RandomForestRegressor(random_state=42)

model_lr.fit(X_train, y_train)
model_ridge.fit(X_train, y_train)
model_xgb.fit(X_train, y_train)
model_rf.fit(X_train, y_train)

# Fazer previsões
test_pred_lr = model_lr.predict(X_test)
test_pred_ridge = model_ridge.predict(X_test)
test_pred_xgb = model_xgb.predict(X_test)
test_pred_rf = model_rf.predict(X_test)

# Atualizar dados de teste com valores previstos
test_data['predicted_7_day_diff_lr'] = test_pred_lr
test_data['predicted_7_day_diff_ridge'] = test_pred_ridge
test_data['predicted_7_day_diff_xgb'] = test_pred_xgb
test_data['predicted_7_day_diff_rf'] = test_pred_rf

# Função para exibir previsões para uma data específica
def display_7_day_diff_predictions_for_date(date, data):
    specific_day_data = data[data['Date'] == date]
    if not specific_day_data.empty:
        print("\nData:", date)
        print("Diferença Acumulada Real de 7 Dias:", specific_day_data['7_day_diff'].iloc[0])
        print("Diferença Acumulada Prevista de 7 Dias LR:", specific_day_data['predicted_7_day_diff_lr'].iloc[0])
        print("Diferença Acumulada Prevista de 7 Dias Ridge:", specific_day_data['predicted_7_day_diff_ridge'].iloc[0])
        print("Diferença Acumulada Prevista de 7 Dias XGB:", specific_day_data['predicted_7_day_diff_xgb'].iloc[0])
        print("Diferença Acumulada Prevista de 7 Dias RF:", specific_day_data['predicted_7_day_diff_rf'].iloc[0])
    else:
        print("Data não encontrada no conjunto de dados.")

# Exibir previsões para 2023-05-10
display_7_day_diff_predictions_for_date('2023-05-10', test_data)

# Avaliar modelos
mse_lr = mean_squared_error(y_test, test_pred_lr)
r2_lr = r2_score(y_test, test_pred_lr)
mse_ridge = mean_squared_error(y_test, test_pred_ridge)
r2_ridge = r2_score(y_test, test_pred_ridge)
mse_xgb = mean_squared_error(y_test, test_pred_xgb)
r2_xgb = r2_score(y_test, test_pred_xgb)
mse_rf = mean_squared_error(y_test, test_pred_rf)
r2_rf = r2_score(y_test, test_pred_rf)

# Imprimir MSE geral e R² para todos os modelos
print("\nMSE Geral LR:", mse_lr)
print("R^2 LR:", r2_lr)
print("MSE Geral Ridge:", mse_ridge)
print("R^2 Ridge:", r2_ridge)
print("MSE Geral XGB:", mse_xgb)
print("R^2 XGB:", r2_xgb)
print("MSE Geral RF:", mse_rf)
print("R^2 RF:", r2_rf)



Data: 2023-05-10
Diferença Acumulada Real de 7 Dias: 36.569999999999936
Diferença Acumulada Prevista de 7 Dias LR: -1.7788310664052984
Diferença Acumulada Prevista de 7 Dias Ridge: -1.7788310653872819
Diferença Acumulada Prevista de 7 Dias XGB: -41.51678
Diferença Acumulada Prevista de 7 Dias RF: -46.906200000000005

MSE Geral LR: 1657.8142228790343
R^2 LR: -0.01947842135102995
MSE Geral Ridge: 1657.814222867798
R^2 Ridge: -0.019478421344120145
MSE Geral XGB: 2922.135945958735
R^2 XGB: -0.7969773090651562
MSE Geral RF: 4122.150500534161
R^2 RF: -1.534930287639693


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_7_day_diff_lr'] = test_pred_lr
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_7_day_diff_ridge'] = test_pred_ridge
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_7_day_diff_xgb'] = test_pred_xgb
A value is trying to be set on a copy of 

In [27]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Carregar e processar os dados
train_data_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/clima_train.csv'
test_data_path = 'C:/Users/milen/OneDrive/Documentos/TCC/Bases/clima_test.csv'
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
combined_data = pd.concat([train_data, test_data]).reset_index(drop=True)

# Calcular a diferença diária e a diferença acumulada de 7 dias
combined_data['daily_diff'] = combined_data['Preco_Real'].diff(-1)
combined_data['7_day_diff'] = combined_data['daily_diff'].rolling(min_periods=1, window=7).sum()

# Calcular a média móvel de 30 dias
combined_data['30_day_moving_avg'] = combined_data['Preco_Real'].rolling(window=1).mean()

# Remover linhas com NaN
combined_data.dropna(subset=['30_day_moving_avg', '7_day_diff'], inplace=True)

# Dividir novamente em dados de treino e teste
train_data = combined_data.iloc[:len(train_data) - 1] # Ajuste para as linhas NaN removidas
test_data = combined_data.iloc[len(train_data) - 1:]

# Dividir os dados para treino e teste
X_train, X_val, y_train, y_val = train_test_split(
    train_data[['30_day_moving_avg']], # Usando a média móvel de 30 dias como característica
    train_data['7_day_diff'], # Previsão da diferença acumulada de 7 dias
    test_size=0.2,
    random_state=42
)
X_test = test_data[['30_day_moving_avg']]
y_test = test_data['7_day_diff']

# Treinar modelos
model_lr = LinearRegression()
model_ridge = Ridge(alpha=1.0)
model_xgb = XGBRegressor(random_state=42)
model_rf = RandomForestRegressor(random_state=42)

model_lr.fit(X_train, y_train)
model_ridge.fit(X_train, y_train)
model_xgb.fit(X_train, y_train)
model_rf.fit(X_train, y_train)

# Fazer previsões
test_pred_lr = model_lr.predict(X_test)
test_pred_ridge = model_ridge.predict(X_test)
test_pred_xgb = model_xgb.predict(X_test)
test_pred_rf = model_rf.predict(X_test)

# Atualizar dados de teste com valores previstos
test_data['predicted_7_day_diff_lr'] = test_pred_lr
test_data['predicted_7_day_diff_ridge'] = test_pred_ridge
test_data['predicted_7_day_diff_xgb'] = test_pred_xgb
test_data['predicted_7_day_diff_rf'] = test_pred_rf

# Função para exibir previsões para uma data específica
def display_7_day_diff_predictions_for_date(date, data):
    specific_day_data = data[data['Date'] == date]
    if not specific_day_data.empty:
        print("\nData:", date)
        print("Diferença Acumulada Real de 7 Dias:", specific_day_data['7_day_diff'].iloc[0])
        print("Diferença Acumulada Prevista de 7 Dias LR:", specific_day_data['predicted_7_day_diff_lr'].iloc[0])
        print("Diferença Acumulada Prevista de 7 Dias Ridge:", specific_day_data['predicted_7_day_diff_ridge'].iloc[0])
        print("Diferença Acumulada Prevista de 7 Dias XGB:", specific_day_data['predicted_7_day_diff_xgb'].iloc[0])
        print("Diferença Acumulada Prevista de 7 Dias RF:", specific_day_data['predicted_7_day_diff_rf'].iloc[0])
    else:
        print("Data não encontrada no conjunto de dados.")

# Exibir previsões para 2023-05-10
display_7_day_diff_predictions_for_date('2023-05-10', test_data)

# Avaliar modelos
mse_lr = mean_squared_error(y_test, test_pred_lr)
r2_lr = r2_score(y_test, test_pred_lr)
mse_ridge = mean_squared_error(y_test, test_pred_ridge)
r2_ridge = r2_score(y_test, test_pred_ridge)
mse_xgb = mean_squared_error(y_test, test_pred_xgb)
r2_xgb = r2_score(y_test, test_pred_xgb)
mse_rf = mean_squared_error(y_test, test_pred_rf)
r2_rf = r2_score(y_test, test_pred_rf)

# Imprimir MSE geral e R² para todos os modelos
print("\nMSE Geral LR:", mse_lr)
print("R^2 LR:", r2_lr)
print("MSE Geral Ridge:", mse_ridge)
print("R^2 Ridge:", r2_ridge)
print("MSE Geral XGB:", mse_xgb)
print("R^2 XGB:", r2_xgb)
print("MSE Geral RF:", mse_rf)
print("R^2 RF:", r2_rf)



Data: 2023-05-10
Diferença Acumulada Real de 7 Dias: 36.569999999999936
Diferença Acumulada Prevista de 7 Dias LR: -4.368421004825163
Diferença Acumulada Prevista de 7 Dias Ridge: -4.368420993022686
Diferença Acumulada Prevista de 7 Dias XGB: -23.940489
Diferença Acumulada Prevista de 7 Dias RF: -70.35059999999987

MSE Geral LR: 1481.4952449755085
R^2 LR: -0.2098504978232516
MSE Geral Ridge: 1481.4952446627601
R^2 Ridge: -0.20985049756784835
MSE Geral XGB: 1778.9165967261401
R^2 XGB: -0.45273711639266523
MSE Geral RF: 2968.411807555942
R^2 RF: -1.4241282685826948


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_7_day_diff_lr'] = test_pred_lr
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_7_day_diff_ridge'] = test_pred_ridge
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['predicted_7_day_diff_xgb'] = test_pred_xgb
A value is trying to be set on a copy of 