In [1]:
import pandas as pd
prices = pd.read_csv("all_stocks.csv")
prices.rename(columns={"Date":"ds", "Ticker":"unique_id", "Close":"y"}, inplace=True)
prices["ds"] = prices["ds"].str[:10] # remove time
prices["ds"] = pd.to_datetime(prices["ds"])
prices = prices[["ds", "y", "unique_id","12-day EMA", "26-day EMA", "MACD", "Signal Line", "MACD Histogram", "RSI", "OBV"]]
prices.dropna(inplace=True)

In [2]:
# Train / Test split
train_size = 0.8  # 80% train, 20% test
def train_test_split(group):
    split_index = int(len(group) * train_size)
    return group.iloc[:split_index], group.iloc[split_index:]

# Apply the split to each unique stock (grouped by 'unique_id')
train_data, test_data = zip(*prices.groupby("unique_id").apply(train_test_split))

# Convert tuple results to DataFrames
df_train = pd.concat(train_data).reset_index(drop=True)
df_test = pd.concat(test_data).reset_index(drop=True)

  train_data, test_data = zip(*prices.groupby("unique_id").apply(train_test_split))


In [3]:
import pandas as pd
import numpy as np
from pmdarima import auto_arima
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tqdm import tqdm

# Define historical window sizes
input_sizes = [10, 15, 30, 60, 90]  

# Store ARIMA results
arima_results = []
model_orders = []

# Define the features (technical indicators) to be used
features = ["12-day EMA", "26-day EMA", "MACD", "Signal Line", "MACD Histogram", "RSI", "OBV"]

for interval in input_sizes:
    print(f"Training Auto-ARIMA for interval: {interval}")

    # Store per-stock results
    all_forecasts = []

    # Loop through each stock (unique_id) in training data
    for stock in tqdm(df_train["unique_id"].unique(), desc=f"Interval {interval}"):
        stock_data = df_train[df_train["unique_id"] == stock].sort_values("ds")  # Ensure sorted time series
        test_data = df_test[df_test["unique_id"] == stock].sort_values("ds")  # Get test data for this stock

        if len(stock_data) < interval + 1:
            print(f"Skipping {stock} due to insufficient data.")
            continue  # Skip stocks with insufficient data
        
        # Extract the last 'interval' days as training input
        train_series = stock_data["y"].iloc[-interval:]
        train_features = stock_data[features].iloc[-interval:]

        # Automatically select best (p,d,q) using auto_arima
        try:
            auto_model = auto_arima(train_series, X=train_features, seasonal=False, stepwise=True, trace=False,
                                    suppress_warnings=True, error_action="ignore", max_p=5, max_q=5)

            # Extract optimal order
            best_p, best_d, best_q = auto_model.order
            print(f"Best ARIMA order for {stock}: ({best_p}, {best_d}, {best_q})")

            # Store the selected order
            model_orders.append({"unique_id": stock, "Interval": interval, "p": best_p, "d": best_d, "q": best_q})

            # Fit ARIMA model with best (p,d,q) and technical indicators
            model = ARIMA(train_series, order=(best_p, best_d, best_q), exog=train_features)
            model_fit = model.fit()

            # Forecast next 'h' steps (matching test data length)
            h = len(test_data)
            test_features = test_data[features].iloc[:h]  # Ensure feature alignment
            forecast = model_fit.forecast(steps=h, exog=test_features)

            # Store results
            forecast_df = pd.DataFrame({
                "ds": test_data["ds"].values,
                "unique_id": stock,
                "ARIMA": forecast.values,
                "y": test_data["y"].values  # Actual values
            })

            # Calculate RMSE & MAE for evaluation
            mae = mean_absolute_error(forecast_df["y"], forecast_df["ARIMA"])
            rmse = np.sqrt(mean_squared_error(forecast_df["y"], forecast_df["ARIMA"]))
            forecast_df["Interval"] = interval
            forecast_df["MAE"] = mae
            forecast_df["RMSE"] = rmse

            all_forecasts.append(forecast_df)

        except Exception as e:
            print(f"Error training ARIMA for {stock}: {e}")
            continue
    
    # Merge all results and save
    if all_forecasts:
        final_df = pd.concat(all_forecasts)
        final_df.to_csv(f"res_arima_{interval}.csv", index=False)
        arima_results.append(final_df)


Training Auto-ARIMA for interval: 10


  return np.roots(self.polynomial_reduced_ar)**-1
  return np.roots(self.polynomial_reduced_ma)**-1
  return np.roots(self.polynomial_reduced_ar)**-1
  return np.roots(self.polynomial_reduced_ma)**-1
  return np.roots(self.polynomial_reduced_ar)**-1
  return np.roots(self.polynomial_reduced_ma)**-1
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


Best ARIMA order for AAPL: (1, 0, 0)


  return np.roots(self.polynomial_reduced_ma)**-1
  return np.roots(self.polynomial_reduced_ar)**-1
  return np.roots(self.polynomial_reduced_ma)**-1
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


Best ARIMA order for JPM: (0, 0, 1)


  return np.roots(self.polynomial_reduced_ma)**-1
  return np.roots(self.polynomial_reduced_ar)**-1
  return np.roots(self.polynomial_reduced_ma)**-1
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  return np.roots(self.polynomial_reduced_ma)**-1


Best ARIMA order for MSFT: (0, 0, 1)


  return np.roots(self.polynomial_reduced_ma)**-1
  return np.roots(self.polynomial_reduced_ma)**-1
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  return np.roots(self.polynomial_reduced_ma)**-1


Best ARIMA order for NVDA: (0, 0, 1)


  return np.roots(self.polynomial_reduced_ma)**-1
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
Interval 10: 100%|██████████| 5/5 [00:02<00:00,  2.14it/s]


Best ARIMA order for TSLA: (0, 0, 1)
Training Auto-ARIMA for interval: 15


  return np.roots(self.polynomial_reduced_ma)**-1
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


Best ARIMA order for AAPL: (1, 0, 0)


  return np.roots(self.polynomial_reduced_ma)**-1
  return np.roots(self.polynomial_reduced_ar)**-1
  return np.roots(self.polynomial_reduced_ma)**-1
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  warn('Non-invertible starting MA parameters found.'
  return get_prediction_index(
  return get_prediction_index(
Interval 15:  40%|████      | 2/5 [00:01<00:01,  1.65it/s]

Best ARIMA order for JPM: (2, 0, 1)


  return np.roots(self.polynomial_reduced_ma)**-1
  return np.roots(self.polynomial_reduced_ar)**-1
  return np.roots(self.polynomial_reduced_ma)**-1
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


Best ARIMA order for MSFT: (1, 0, 0)


  return np.roots(self.polynomial_reduced_ar)**-1
  return np.roots(self.polynomial_reduced_ma)**-1
  return np.roots(self.polynomial_reduced_ar)**-1
  return np.roots(self.polynomial_reduced_ma)**-1
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  return np.roots(self.polynomial_reduced_ma)**-1


Best ARIMA order for NVDA: (0, 0, 1)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
Interval 15: 100%|██████████| 5/5 [00:02<00:00,  1.88it/s]


Best ARIMA order for TSLA: (2, 0, 0)
Training Auto-ARIMA for interval: 30


  return np.roots(self.polynomial_reduced_ar)**-1
  return np.roots(self.polynomial_reduced_ma)**-1
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


Best ARIMA order for AAPL: (2, 0, 0)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


Best ARIMA order for JPM: (0, 0, 1)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


Best ARIMA order for MSFT: (1, 0, 0)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  return np.roots(self.polynomial_reduced_ar)**-1


Best ARIMA order for NVDA: (2, 0, 0)


  return np.roots(self.polynomial_reduced_ma)**-1
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
Interval 30: 100%|██████████| 5/5 [00:02<00:00,  1.68it/s]


Best ARIMA order for TSLA: (0, 0, 2)
Training Auto-ARIMA for interval: 60


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


Best ARIMA order for AAPL: (2, 0, 0)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


Best ARIMA order for JPM: (2, 0, 0)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


Best ARIMA order for MSFT: (0, 0, 1)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


Best ARIMA order for NVDA: (0, 0, 1)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
Interval 60: 100%|██████████| 5/5 [00:02<00:00,  1.68it/s]


Best ARIMA order for TSLA: (0, 0, 1)
Training Auto-ARIMA for interval: 90


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


Best ARIMA order for AAPL: (1, 0, 0)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


Best ARIMA order for JPM: (0, 0, 1)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


Best ARIMA order for MSFT: (0, 1, 1)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


Best ARIMA order for NVDA: (0, 0, 1)


  return np.roots(self.polynomial_reduced_ar)**-1
  return np.roots(self.polynomial_reduced_ma)**-1
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
Interval 90: 100%|██████████| 5/5 [00:03<00:00,  1.46it/s]

Best ARIMA order for TSLA: (1, 0, 0)





In [7]:
import numpy as np
import pandas as pd

# Define function for RMAE calculation
def calculate_rmae(mae, avg_price):
    return (mae / avg_price) * 100 if avg_price != 0 else np.nan

# List to store ARIMA results
arima_comparison_results = []

# Iterate over each interval
for interval in input_sizes:
    try:
        # Load ARIMA results for the given interval
        arima_df = pd.read_csv(f"ARIMA_RESULTS_PER_STOCK/res_arima_{interval}.csv")
        
        # Aggregate RMSE, MAE, and compute the average price for RMAE calculation
        grouped_metrics = arima_df.groupby("unique_id").agg({
            "MAE": "mean",
            "RMSE": "mean",
            "y": "mean"  # Average price in the interval
        }).reset_index()

        # Compute RMAE
        grouped_metrics["RMAE"] = grouped_metrics.apply(lambda row: calculate_rmae(row["MAE"], row["y"]), axis=1)

        # Append interval information
        grouped_metrics["Interval"] = interval

        # Reorder columns to match the LSTM format
        grouped_metrics = grouped_metrics[["unique_id", "Interval", "MAE", "RMSE", "RMAE"]]
        
        # Append to results list
        arima_comparison_results.append(grouped_metrics)
    
    except Exception as e:
        print(f"Error processing interval {interval}: {e}")
        continue

# Combine results into a single DataFrame
final_arima_results = pd.concat(arima_comparison_results)

# Save results in the same format as LSTM results
final_arima_results.to_csv("arima_comparison_results.csv", index=False)
