In [348]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from scipy.stats import spearmanr
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from xgboost import XGBRegressor
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import optuna as ot

In [349]:
data = pd.read_csv('aggregated_data.csv')

# INFLATION ADJUSTMENT
base_cpi = data['CPI_US'].iloc[-1]
data['AVG_PRICE_AUCTION'] = data['AVG_PRICE_AUCTION'] * (base_cpi/data['CPI_US'])
data['CORN_FUTURES_TYPICAL_PRICE'] = data['CORN_FUTURES_TYPICAL_PRICE'] * (base_cpi/data['CPI_US'])
data['FEEDER_CATTLE_FUTURES_TYPICAL_PRICE'] = data['FEEDER_CATTLE_FUTURES_TYPICAL_PRICE'] * (base_cpi/data['CPI_US'])
data['US_MXN_RATES'] = data['US_MXN_RATES'] * (base_cpi/data['CPI_US'])
data['DIESEL_RETAIL_PRICE'] = data['DIESEL_RETAIL_PRICE'] * (base_cpi/data['CPI_US'])
data['DATE'] = pd.to_datetime(data['DATE'])

# ['avg_price_steers_ML1', 'avg_price_steers_ML1_2', 'avg_price_heifers_ML1', 'avg_price_heifers_ML1_2', 'CPI_US', 'NATIONAL_AUCTION_SALES','NATIONAL_TOTAL_SALES', 'TEXAS_AUCTION_SALES']
data = data.sort_values('DATE', ignore_index=True).set_index('DATE').drop(['avg_price_steers_ML1', 'avg_price_steers_ML1_2', 'avg_price_heifers_ML1', 'avg_price_heifers_ML1_2', 'NATIONAL_AUCTION_SALES','NATIONAL_TOTAL_SALES', 'TEXAS_AUCTION_SALES'], axis=1)
# print(data.head())


## RANDOM FOREST MODEL

In [350]:
# CREATE RANDOM FOREST MODEL
X = data.drop(['AVG_PRICE_AUCTION'], axis=1)
y = data['AVG_PRICE_AUCTION']

# SPLIT DATA INTO 80/20 TRAINING TESTING SETS
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# CREATE AND TRAIN RF REGRESSOR
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)

# MAKE PREDICTIONS ON TEST SET
rf_pred = rf_regressor.predict(X_test)

# EVALUATING MODEL PERFORMANCE
rf_r2 = r2_score(y_test, rf_pred) # R²
print(f'R² Score: {rf_r2}')

n = X_test.shape[0]
k = X_test.shape[1]
rf_adj_r2 = 1-((1-rf_r2)*(n-1)/(n-k-1)) # Adjusted-R²
print(f'Adjusted-R² Score: {rf_adj_r2}')

rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred)) # RMSE
print(f'Root Mean Squared Error (RMSE): {rf_rmse}')

rf_mae = mean_absolute_error(y_test, rf_pred) # MAE
print(f'Mean Absolute Error (MAE): {rf_mae}')

R² Score: 0.9692757482880006
Adjusted-R² Score: 0.9665243227615529
Root Mean Squared Error (RMSE): 5.965637679690656
Mean Absolute Error (MAE): 4.596643251439692


## XGBOOST MODEL

In [352]:
# CREATE AND TRAIN XGBOOST REGRESSOR
xgb_reg = XGBRegressor()
xgb_reg.fit(X_train, y_train)
xgb_pred = xgb_reg.predict(X_test)

# EVALUATING MODEL PERFORMANCE
rf_r2 = r2_score(y_test, rf_pred) # R²
print(f'R² Score: {rf_r2}')

n = X_test.shape[0]
k = X_test.shape[1]
rf_adj_r2 = 1-((1-rf_r2)*(n-1)/(n-k-1)) # Adjusted-R²
print(f'Adjusted-R² Score: {rf_adj_r2}')

rf_rmse = np.sqrt(mean_squared_error(y_test, rf_pred)) # RMSE
print(f'Root Mean Squared Error (RMSE): {rf_rmse}')

rf_mae = mean_absolute_error(y_test, rf_pred) # MAE
print(f'Mean Absolute Error (MAE): {rf_mae}')

R² Score: 0.9692757482880006
Adjusted-R² Score: 0.9665243227615529
Root Mean Squared Error (RMSE): 5.965637679690656
Mean Absolute Error (MAE): 4.596643251439692


## LINEAR REGRESSION MODEL

In [351]:
# CREATE AND TRAIN LINEAR REGRESSION MODEL
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
lrg_pred = lin_reg.predict(X_test)

# EVALUATING MODEL PERFORMANCE
lrg_r2 = r2_score(y_test, lrg_pred) # R²
print(f'R² Score: {rf_r2}')

n = X_test.shape[0]
k = X_test.shape[1]
lrg_adj_r2 = 1-((1-rf_r2)*(n-1)/(n-k-1)) # Adjusted-R²
print(f'Adjusted-R² Score: {lrg_adj_r2}')

lrg_rmse = np.sqrt(mean_squared_error(y_test, lrg_pred)) # RMSE
print(f'Root Mean Squared Error (RMSE): {lrg_rmse}')

lrg_mae = mean_absolute_error(y_test, lrg_pred) # MAE
print(f'Mean Absolute Error (MAE): {lrg_mae}')

R² Score: 0.9692757482880006
Adjusted-R² Score: 0.9665243227615529
Root Mean Squared Error (RMSE): 8.50286814543951
Mean Absolute Error (MAE): 6.68824434657122
