In [365]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from scipy.stats import spearmanr
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima.model import ARIMA
from xgboost import XGBRegressor
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import optuna as ot

In [366]:
# LOAD DATA
# Parse dates and sort chronologically *before* anything position-based is read
# from the frame, so `.iloc[-1]` below is the most recent observation by DATE
# even if the CSV rows are not stored in date order.
data = pd.read_csv('aggregated_data.csv')
data['DATE'] = pd.to_datetime(data['DATE'])
data = data.sort_values('DATE', ignore_index=True)

# INFLATION ADJUSTMENT
# Rescale nominal series to the price level of the last (latest) row:
# adjusted = nominal * (base CPI / CPI of that row).
base_cpi = data['CPI_US'].iloc[-1]
cpi_factor = base_cpi / data['CPI_US']

# NOTE(review): US_MXN_RATES looks like an exchange rate rather than a USD
# price — confirm that deflating it by US CPI is intended.
inflation_adjusted_cols = [
    'AVG_PRICE_AUCTION',
    'CORN_FUTURES_TYPICAL_PRICE',
    'FEEDER_CATTLE_FUTURES_TYPICAL_PRICE',
    'US_MXN_RATES',
    'DIESEL_RETAIL_PRICE',
]
for col in inflation_adjusted_cols:
    data[col] = data[col] * cpi_factor

# DROP UNUSED COLUMNS AND INDEX BY DATE
# NOTE(review): an earlier note at this spot listed 'CPI_US' among the columns
# to drop; it is currently kept, so it remains in the frame after this cell.
dropped_cols = [
    'avg_price_steers_ML1',
    'avg_price_steers_ML1_2',
    'avg_price_heifers_ML1',
    'avg_price_heifers_ML1_2',
    'NATIONAL_AUCTION_SALES',
    'NATIONAL_TOTAL_SALES',
    'TEXAS_AUCTION_SALES',
]
data = data.set_index('DATE').drop(dropped_cols, axis=1)


In [367]:
def print_metrics(X_test, y_test, y_pred): # Helper function to print model metrics
    """Print regression quality metrics for a set of predictions.

    Prints, in order: R², adjusted R², Pearson correlation, Spearman rank
    correlation, RMSE and MAE.

    Parameters
    ----------
    X_test : 2-D array-like with a ``.shape`` attribute
        Feature matrix the predictions were made from. Only its shape is used:
        ``shape[0]`` is the number of observations (n) and ``shape[1]`` the
        number of predictors (k) for the adjusted R².
    y_test : 1-D array-like
        Observed target values.
    y_pred : 1-D array-like
        Predicted target values, in the same order as ``y_test``.

    Returns
    -------
    None
        The metrics are only printed.
    """
    r2 = r2_score(y_test, y_pred) # R²
    print(f'R² Score: {r2}')

    n = X_test.shape[0]
    k = X_test.shape[1]
    residual_dof = n - k - 1
    if residual_dof > 0:
        adj_r2 = 1-((1-r2)*(n-1)/residual_dof) # Adjusted-R²
    else:
        # Adjusted R² is undefined without positive residual degrees of
        # freedom: n == k + 1 would divide by zero and n < k + 1 would flip the
        # sign of the penalty. Report NaN instead of crashing or misleading.
        adj_r2 = float('nan')
    print(f'Adjusted-R² Score: {adj_r2}')

    prcorr = np.corrcoef(y_test, y_pred)[0,1] # Pearson Correlation Coefficient
    print("Pearson correlation:", prcorr)

    srcorr, _ = spearmanr(y_test, y_pred) # Spearman’s Rank Correlation (p-value not reported)
    print("Spearman correlation:", srcorr)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred)) # RMSE
    print(f'Root Mean Squared Error (RMSE): {rmse}')

    mae = mean_absolute_error(y_test, y_pred) # MAE
    print(f'Mean Absolute Error (MAE): {mae}')

## RANDOM FOREST MODEL

In [368]:
# CHRONOLOGICAL 80/20 TRAIN/TEST SPLIT
# The first 80% of rows (by position) are used for training and the remaining
# 20% for testing; rows are not shuffled. The resulting X_/y_ frames are module
# level so that later model cells can reuse the same split.
train_size = int(len(data) * 0.8)
train_data, test_data = data.iloc[:train_size], data.iloc[train_size:]

X_train, y_train = train_data.drop(columns='AVG_PRICE_AUCTION'), train_data['AVG_PRICE_AUCTION']
X_test, y_test = test_data.drop(columns='AVG_PRICE_AUCTION'), test_data['AVG_PRICE_AUCTION']

# FIT RANDOM FOREST (100 trees, fixed seed) AND PREDICT ON THE TEST SET
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
rf_pred = rf_regressor.predict(X_test)

# REPORT TEST-SET METRICS
print_metrics(X_test, y_test, rf_pred)

R² Score: -4.065129840627757
Adjusted-R² Score: -4.5187235576989
Pearson correlation: -0.5421383430544126
Spearman correlation: -0.21288191859158784
Root Mean Squared Error (RMSE): 33.247231240164425
Mean Absolute Error (MAE): 25.119362570084594


## XGBOOST MODEL

In [369]:
# FIT XGBOOST (library defaults) ON THE TRAINING SPLIT AND PREDICT ON THE TEST SPLIT
xgb_reg = XGBRegressor().fit(X_train, y_train)
xgb_pred = xgb_reg.predict(X_test)

# REPORT TEST-SET METRICS
print_metrics(X_test, y_test, xgb_pred)

R² Score: -6.037797673434233
Adjusted-R² Score: -6.668048211353716
Pearson correlation: -0.4322863122348723
Spearman correlation: -0.2508497169330155
Root Mean Squared Error (RMSE): 39.19029940112609
Mean Absolute Error (MAE): 30.649350070335164


## LINEAR REGRESSION MODEL

In [370]:
# FIT ORDINARY LEAST SQUARES ON THE TRAINING SPLIT AND PREDICT ON THE TEST SPLIT
lin_reg = LinearRegression().fit(X_train, y_train)
lrg_pred = lin_reg.predict(X_test)

# REPORT TEST-SET METRICS
print_metrics(X_test, y_test, lrg_pred)

R² Score: -1.0747181053000974
Adjusted-R² Score: -1.2605137565210014
Pearson correlation: 0.6415227153267419
Spearman correlation: 0.6232802665679377
Root Mean Squared Error (RMSE): 21.278437846996813
Mean Absolute Error (MAE): 18.041762354218662
