# ARIMA Model v2 - Optimized Expanding Window

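This notebook implements an optimized ARIMA model with expanding-window validation for predicting stock returns: a separate model is fitted to each stock's return history, using a training window that starts at 8 years and grows by one year per period, followed by a 2-year validation window and a 1-year out-of-sample test window. This version is designed for memory efficiency and performance with large datasets while maintaining proper time-series validation.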


In [1]:
# OPTIMIZED IMPORTS
import datetime
import pandas as pd
import numpy as np
import os
import sys
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import adfuller
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Add src directory to path for data_loader import
sys.path.append(str(Path.cwd().parent / 'src'))
from data_loader import load_data

# Record when the run began, then turn off pandas' chained-assignment warning for this session
print(f"Start time: {datetime.datetime.now()}")
pd.options.mode.chained_assignment = None


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Start time: 2025-09-22 08:01:19.432168


In [2]:
# LOAD RETURNS AND THE PREDICTOR LIST THROUGH THE PROJECT DATA LOADER
print("Loading data...")
raw = load_data(filename="ret_sample.csv", parse_dates=["ret_eom"], low_memory=False)
print(f"Data shape: {raw.shape}")

# Swap 'ret_eom' for a datetime 'date' column: pop removes the old column from
# the frame, and the converted values are appended as the last column.
raw['date'] = pd.to_datetime(raw.pop('ret_eom'))

# Predictor names come from the 'variable' column of the factor list file
stock_vars = list(
    load_data(filename="factor_char_list.csv")["variable"].values
)
print(f"Number of predictor variables: {len(stock_vars)}")

# Keep only rows with an observed return; copy once so later writes do not
# touch a view of the loaded frame.
ret_var = "stock_ret"
raw = raw.loc[raw[ret_var].notna()].copy()
print(f"Data after removing missing returns: {raw.shape}")
print(f"Date range: {raw['date'].min()} to {raw['date'].max()}")


Loading data...
Loading data from: /Users/kevin/Coding Projects/Asset-Management-Hackathon-2025/data/ret_sample.csv
Data shape: (6401414, 159)
Loading data from: /Users/kevin/Coding Projects/Asset-Management-Hackathon-2025/data/factor_char_list.csv
Number of predictor variables: 147
Data after removing missing returns: (6401414, 158)
Date range: 2005-02-28 00:00:00 to 2025-06-30 00:00:00


In [3]:
# Cross-sectional rank transform that preserves 'date':
# within each date, every predictor is median-filled, dense-ranked and rescaled to [-1, 1].
print("Applying rank transformations (vectorized)...")

data = raw.copy()
date_key = data['date']  # the grouping key never changes inside the loop, so look it up once
for var in stock_vars:
    if var not in data.columns:
        continue
    # Fill gaps with that date's cross-sectional median before ranking
    filled = data[var].fillna(data[var].groupby(date_key).transform('median'))

    # Dense ranks start at 1; shift so the smallest value on each date gets rank 0
    ranks = filled.groupby(date_key).rank(method='dense') - 1
    # Scale by the largest RANK on that date. Dividing by the largest raw value
    # (as this cell did before) does not bound the result and zeroes out any
    # variable whose values are all non-positive.
    max_rank = ranks.groupby(date_key).transform('max')
    # Dates with a single distinct value (max rank 0) or no data at all map to 0
    data[var] = np.where(max_rank > 0, (ranks / max_rank) * 2 - 1, 0)

print(f"Processed data shape: {data.shape}")
del raw  # free memory

# Safety check
assert 'date' in data.columns, "date column missing after transform"

Applying rank transformations (vectorized)...
Processed data shape: (6401414, 158)


In [4]:
# OPTIMIZED ARIMA FUNCTIONS
def check_stationarity(series, title=''):
    """Report whether `series` passes the Augmented Dickey-Fuller test at the 5% level.

    Parameters
    ----------
    series : pandas.Series
        Observations to test; missing values are dropped before testing.
    title : str, optional
        Not used by the function; kept so existing calls that pass it still work.

    Returns
    -------
    bool
        True when the ADF p-value is below 0.05. False otherwise, including
        when no observations remain after dropping missing values or when
        the test raises.
    """
    try:
        observed = series.dropna()
        if len(observed) == 0:
            # Nothing to test: answer directly instead of relying on adfuller to raise
            return False
        # adfuller returns a tuple whose second element is the p-value
        return bool(adfuller(observed)[1] < 0.05)
    except Exception:
        # Best-effort: a series that cannot be tested is reported as non-stationary.
        # Catching Exception (not a bare except) lets a kernel interrupt propagate.
        return False

def find_arima_params(series, max_p=2, max_d=1, max_q=2):  # Reduced search space
    """Grid-search ARIMA orders and return the one with the lowest AIC.

    Every (p, d, q) with 0 <= p <= max_p, 0 <= d <= max_d, 0 <= q <= max_q is
    fitted to `series`; orders whose fit raises are skipped.

    Parameters
    ----------
    series : array-like
        Observations passed unchanged to `ARIMA`.
    max_p, max_d, max_q : int
        Inclusive upper bounds of the search grid.

    Returns
    -------
    tuple
        `(best_params, best_aic)`. `best_params` is the winning `(p, d, q)`,
        or None with `best_aic == np.inf` when no order could be fitted
        (or the grid is empty).
    """
    best_aic = np.inf
    best_params = None

    for p in range(max_p + 1):
        for d in range(max_d + 1):
            for q in range(max_q + 1):
                try:
                    aic = ARIMA(series, order=(p, d, q)).fit().aic
                except Exception:
                    # Skip orders that cannot be fitted. Catching Exception
                    # rather than everything keeps the search interruptible.
                    continue
                # Strict '<' keeps the first order found when AICs tie
                if aic < best_aic:
                    best_aic = aic
                    best_params = (p, d, q)
    return best_params, best_aic

print("ARIMA functions defined")


ARIMA functions defined


In [None]:
# Expanding-window ARIMA predictions that build pred_out
# (numpy, pandas and ARIMA come from the import cell at the top of the notebook)

# Safety checks
assert 'data' in globals(), "Missing 'data' DataFrame. Run the data loading cells first."
assert 'ret_var' in globals(), "Missing ret_var. Run the data loading cell."


def _forecast_stock_returns(history, steps, min_history=10):
    """Forecast `steps` future returns for one stock from its past returns.

    Parameters
    ----------
    history : numpy.ndarray
        The stock's training returns, oldest first.
    steps : int
        Number of consecutive forecasts to produce.
    min_history : int
        Stocks with fewer training observations than this get their mean
        return instead of a model forecast.

    Returns
    -------
    numpy.ndarray
        `steps` forecasts. Falls back to the mean training return when the
        history is short, no ARIMA order can be fitted, or fitting/forecasting
        raises. Returns zeros when there is no history at all.
    """
    if len(history) == 0:
        return np.full(steps, 0.0)
    mean_ret = float(np.mean(history))
    if len(history) < min_history:
        return np.full(steps, mean_ret)

    try:
        # Same (p, d, q) grid as before, via the shared helper defined earlier
        best_order, _ = find_arima_params(history)
        if best_order is None:
            return np.full(steps, mean_ret)
        fitted = ARIMA(history, order=best_order).fit()
        forecast = np.asarray(fitted.forecast(steps=steps), dtype=float).ravel()
        if forecast.size == 1:
            forecast = np.full(steps, forecast[0])
        elif forecast.size != steps:
            raise ValueError("unexpected forecast length")
        return forecast
    except Exception:
        # Best-effort per stock: one failed fit must not stop the whole run.
        # Catching Exception (not a bare except) keeps the cell interruptible.
        return np.full(steps, mean_ret)


pred_out = pd.DataFrame()  # defined up front so the name exists even if the run is interrupted
period_frames = []         # one prediction frame per test window, concatenated once at the end

starting = pd.to_datetime("20050101", format="%Y%m%d")
counter = 0

unique_dates = np.sort(data['date'].unique())
if len(unique_dates) == 0:
    raise RuntimeError("No dates in 'data'. Check preprocessing.")

print(f"Available dates: {len(unique_dates)} from {unique_dates[0]} to {unique_dates[-1]}")

while (starting + pd.DateOffset(years=11 + counter)) <= unique_dates[-1]:
    cutoff = [
        starting,
        starting + pd.DateOffset(years=8 + counter),   # train
        starting + pd.DateOffset(years=10 + counter),  # validate
        starting + pd.DateOffset(years=11 + counter),  # test
    ]
    print(f"\nPeriod {counter+1}: {cutoff[0].date()} → {cutoff[3].date()}")

    train = data[(data["date"] >= cutoff[0]) & (data["date"] < cutoff[1])]
    val   = data[(data["date"] >= cutoff[1]) & (data["date"] < cutoff[2])]
    test  = data[(data["date"] >= cutoff[2]) & (data["date"] < cutoff[3])]

    print(f"Train: {len(train)}, Val: {len(val)}, Test: {len(test)}")
    if len(train) == 0 or len(test) == 0:
        print("Skipping (insufficient data).")
        counter += 1
        continue

    # Prepare output frame for this test window
    reg_pred = test[["year", "month", "date", "id", ret_var]].copy()
    preds = np.full(len(test), np.nan)

    # Split the training returns per stock in one pass (oldest first) instead of
    # scanning the whole training frame once for every stock.
    train_returns = train[['id', 'date', ret_var]].sort_values('date', kind='stable')
    history_by_stock = {
        stock_id: returns.to_numpy()
        for stock_id, returns in train_returns.groupby('id', sort=False)[ret_var]
    }

    # After reset_index the index is each row's position in `test`. Sorting by
    # date makes forecast step k land on the stock's k-th test date.
    test_keys = test[['id', 'date']].reset_index(drop=True).sort_values('date', kind='stable')

    # NOTE(review): forecast steps are counted from the end of the training
    # window, but the test window starts only after the validation window, and
    # `val` is not used for fitting or order selection. Step k is therefore
    # assigned to the k-th test date rather than to its true horizon. Confirm
    # whether to fit on train+val or to forecast across the gap.
    for stock_id, rows in test_keys.groupby('id', sort=False):
        positions = rows.index.to_numpy()
        history = history_by_stock.get(stock_id)
        if history is None:
            history = np.empty(0)
        preds[positions] = _forecast_stock_returns(history, len(positions))

    reg_pred["arima"] = preds
    period_frames.append(reg_pred)

    counter += 1

if period_frames:
    pred_out = pd.concat(period_frames, ignore_index=True)

print(f"\nCompleted periods: {counter}")
print(f"pred_out shape: {pred_out.shape}")

Available dates: 245 from 2005-02-28T00:00:00.000000000 to 2025-06-30T00:00:00.000000000

Period 1: 2005-01-01 → 2016-01-01
Train: 2321672, Val: 590345, Test: 300441

Period 2: 2005-01-01 → 2017-01-01
Train: 2615405, Val: 597053, Test: 304279


In [None]:
# Write the collected predictions to ../data and report how many rows carry a forecast
print("Saving results...")
output_path = Path.cwd().parent.joinpath("data", "arima_predictions_v2.csv")
pred_out.to_csv(output_path, index=False)
print(f"Saved predictions to: {output_path}")

print(f"\nPrediction results shape: {pred_out.shape}")
print(f"Non-null predictions: {pred_out['arima'].count()}")


In [None]:
# Score the pooled out-of-sample predictions
yreal = pred_out[ret_var].values
ypred = pred_out["arima"].values

# Keep only rows where both the realised and the predicted return are present
mask = ~np.isnan(yreal) & ~np.isnan(ypred)
yreal_clean = yreal[mask]
ypred_clean = ypred[mask]

if len(yreal_clean) == 0:
    print("ERROR: No valid predictions!")
else:
    # R² here is measured against a zero forecast: the denominator is the raw
    # sum of squared returns, not the demeaned one.
    r2 = 1 - np.sum((yreal_clean - ypred_clean) ** 2) / np.sum(yreal_clean ** 2)
    mse = mean_squared_error(yreal_clean, ypred_clean)
    rmse = np.sqrt(mse)
    mae = np.abs(yreal_clean - ypred_clean).mean()

    print(f"\n=== ARIMA v2 RESULTS ===")
    print(f"R²: {r2:.6f}")
    print(f"RMSE: {rmse:.6f}")
    print(f"MAE: {mae:.6f}")
    print(f"Valid predictions: {len(yreal_clean)}")
    print(f"Actual returns - Mean: {np.mean(yreal_clean):.6f}, Std: {np.std(yreal_clean):.6f}")
    print(f"Predicted returns - Mean: {np.mean(ypred_clean):.6f}, Std: {np.std(ypred_clean):.6f}")

print(f"End time: {datetime.datetime.now()}")


In [None]:
# OPTIMIZED EXPANDING WINDOW PREDICTION LOOP
# NOTE(review): this cell contains only the first half of the loop body. On the
# normal (non-skip) path nothing here changes `counter`, so the `while` condition
# never changes when the cell is run by itself. The statements that fit the
# models, append to `pred_out` and increment `counter` sit in a later cell that
# is indented as a continuation of this loop.
# NOTE(review): the cell also re-initialises `pred_out`, which discards any
# predictions an earlier cell has already stored under that name.
print("Starting ARIMA predictions with expanding window...")

# Initialize the starting date, counter, and output data
starting = pd.to_datetime("20050101", format="%Y%m%d")
counter = 0
pred_out = pd.DataFrame()

# Sorted unique dates; used for the progress message and as the loop's upper bound
unique_dates = sorted(data['date'].unique())
print(f"Available dates: {len(unique_dates)} from {unique_dates[0]} to {unique_dates[-1]}")

# Estimation with expanding window - optimized version
# Each pass needs 11 + counter years of data after `starting`
while (starting + pd.DateOffset(years=11 + counter)) <= unique_dates[-1]:
    cutoff = [
        starting,
        starting + pd.DateOffset(years=8 + counter),  # Use 8 years and expanding as training set
        starting + pd.DateOffset(years=10 + counter),  # Use next 2 years as validation set
        starting + pd.DateOffset(years=11 + counter),  # Use next year as out-of-sample testing set
    ]
    
    print(f"\nProcessing period {counter + 1}: {cutoff[0].strftime('%Y-%m-%d')} to {cutoff[3].strftime('%Y-%m-%d')}")
    
    # Half-open date windows [start, end) so no row falls into two sets
    train_mask = (data["date"] >= cutoff[0]) & (data["date"] < cutoff[1])
    validate_mask = (data["date"] >= cutoff[1]) & (data["date"] < cutoff[2])
    test_mask = (data["date"] >= cutoff[2]) & (data["date"] < cutoff[3])
    
    train_data = data[train_mask]
    validate_data = data[validate_mask]
    test_data = data[test_mask]
    
    print(f"Training set: {len(train_data)} observations")
    print(f"Validation set: {len(validate_data)} observations")
    print(f"Test set: {len(test_data)} observations")
    
    # Skip the period if the training or test set is empty (validation is not checked)
    if len(train_data) == 0 or len(test_data) == 0:
        print(f"Skipping period {counter + 1} - insufficient data")
        counter += 1
        continue
    
    # Prepare output data: identifiers and the realised return for every test row
    reg_pred = test_data[["year", "month", "date", "id", ret_var]].copy()
    
    # Get unique stocks in test set
    test_stocks = test_data['id'].unique()
    print(f"Predicting for {len(test_stocks)} stocks...")
    
    # Pre-allocate predictions array, one slot per test row, NaN until filled
    arima_predictions = np.full(len(test_data), np.nan)


In [None]:
# OPTIONAL: scatter of predicted against realised returns, with a 45-degree reference line
if len(yreal_clean) == 0:
    print("No valid predictions for visualization")
else:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(yreal_clean, ypred_clean, alpha=0.5, s=1)
    # Dashed red diagonal spanning the range of realised returns
    ax.plot([yreal_clean.min(), yreal_clean.max()], [yreal_clean.min(), yreal_clean.max()], 'r--', lw=2)
    ax.set_xlabel('Actual Returns')
    ax.set_ylabel('Predicted Returns')
    ax.set_title('ARIMA v2: Predicted vs Actual Returns')
    ax.grid(True, alpha=0.3)
    plt.show()


In [None]:
    # NOTE(review): everything down to the `break` below is indented as the body of a
    # `while` loop that is opened in an earlier cell; as a cell of its own this is
    # not valid Python (leading indentation, `break` with no enclosing loop).
    # It reads `test_stocks`, `train_data`, `test_data`, `arima_predictions`,
    # `reg_pred`, `pred_out` and `counter` from that earlier cell.

    # Process stocks in batches for memory efficiency
    # NOTE(review): no work is done per batch besides the inner loop, so
    # `batch_size` only changes how `test_stocks` is sliced.
    batch_size = 1000
    processed_stocks = 0
    
    for i in range(0, len(test_stocks), batch_size):
        batch_stocks = test_stocks[i:i+batch_size]
        
        for stock_id in batch_stocks:
            # Get data for this stock: training history oldest first, and the
            # index labels of its rows in the test set
            stock_train = train_data[train_data['id'] == stock_id].sort_values('date')
            stock_test_mask = test_data['id'] == stock_id
            stock_test_indices = test_data[stock_test_mask].index
            
            if len(stock_train) < 10:
                # Too little history for a model: use the mean training return
                # (0 when the stock has no training rows at all)
                mean_ret = stock_train[ret_var].mean() if len(stock_train) > 0 else 0
                # get_indexer turns index labels into positions in `arima_predictions`
                arima_predictions[test_data.index.get_indexer(stock_test_indices)] = mean_ret
            else:
                try:
                    # Prepare time series
                    train_series = stock_train[ret_var].values
                    
                    # Find optimal parameters (simplified): lowest-AIC order from the helper's grid
                    best_params, _ = find_arima_params(pd.Series(train_series))
                    
                    if best_params is not None:
                        # Fit ARIMA model again with the selected order
                        model = ARIMA(train_series, order=best_params)
                        fitted_model = model.fit()
                        
                        # Make predictions: one step per test row of this stock.
                        # NOTE(review): steps are counted from the end of `train_series`;
                        # the validation window between train and test is not used here,
                        # so forecast step k is written to the k-th test row in row order.
                        n_predictions = len(stock_test_indices)
                        arima_pred = fitted_model.forecast(steps=n_predictions)
                        
                        # Handle single prediction case
                        if len(arima_pred) == 1:
                            arima_pred = np.full(n_predictions, arima_pred[0])
                        
                        arima_predictions[test_data.index.get_indexer(stock_test_indices)] = arima_pred
                    else:
                        # Fallback to mean when no order could be fitted
                        mean_ret = stock_train[ret_var].mean()
                        arima_predictions[test_data.index.get_indexer(stock_test_indices)] = mean_ret
                        
                except Exception as e:
                    # Fallback to mean prediction; the caught exception `e` is not logged or used
                    mean_ret = stock_train[ret_var].mean()
                    arima_predictions[test_data.index.get_indexer(stock_test_indices)] = mean_ret
            
            # Progress message every 100 stocks
            processed_stocks += 1
            if processed_stocks % 100 == 0:
                print(f"  Processed {processed_stocks}/{len(test_stocks)} stocks")
    
    # Add ARIMA predictions to output
    reg_pred["arima"] = arima_predictions
    
    # Add to the output data
    pred_out = pd.concat([pred_out, reg_pred], ignore_index=True)
    
    # Go to the next year
    counter += 1
    
    # Optional: Limit periods for testing (remove this for full run)
    if counter >= 3:  # Limit to 3 periods for demonstration
        print("Limiting to 3 periods for demonstration")
        break

print(f"\nCompleted processing {counter} periods")
