# Baseline Models (Optimized)

This notebook implements efficient baseline models (OLS, Exponential Smoothing, XGBoost) using an expanding window. It uses the shared data loader and the hackathon datasets.


In [1]:
import datetime
import pandas as pd
import numpy as np
from pathlib import Path
import sys

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_percentage_error

# Use shared data loader
sys.path.append(str(Path.cwd().parent / 'src'))
from data_loader import load_data

print(f"Start time: {datetime.datetime.now()}")
pd.set_option("mode.chained_assignment", None)


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Start time: 2025-09-22 14:11:15.048345


In [2]:
# Load data using the shared data loader
print("Loading data...")
raw = load_data(filename="ret_sample.csv", parse_dates=["ret_eom"], low_memory=False)
raw['date'] = pd.to_datetime(raw['ret_eom'])

# Load predictor list
stock_vars = list(load_data(filename="factor_char_list.csv")["variable"].values)
ret_var = "stock_ret"

# Keep only valid target rows
raw = raw[raw[ret_var].notna()].copy()
print(f"Data shape after filter: {raw.shape}")
print(f"Date range: {raw['date'].min()} to {raw['date'].max()}")


Loading data...
Loading data from: /Users/kevin/Coding Projects/Asset-Management-Hackathon-2025/data/ret_sample.csv
Loading data from: /Users/kevin/Coding Projects/Asset-Management-Hackathon-2025/data/factor_char_list.csv
Data shape after filter: (6401414, 159)
Date range: 2005-02-28 00:00:00 to 2025-06-30 00:00:00


In [3]:
# Vectorized cross-sectional rank scaling by date (efficient)
print("Applying vectorized cross-sectional rank scaling...")

data = raw.copy()
for var in stock_vars:
    if var not in data.columns:
        continue
    med = data.groupby('date')[var].transform('median')
    data[var] = data[var].fillna(med)

    ranks = data.groupby('date')[var].rank(method='dense') - 1
    maxs = data.groupby('date')[var].transform('max')
    data[var] = np.where(maxs > 0, (ranks / maxs) * 2 - 1, 0)

del raw
print(f"Scaled data shape: {data.shape}")


Applying vectorized cross-sectional rank scaling...
Scaled data shape: (6401414, 159)


In [4]:
from math import sqrt

# Pre-calc unique dates to bound the loop
unique_dates = np.sort(data['date'].unique())
end_bound = unique_dates[-1]
print(f"Available dates: {unique_dates[0]} to {end_bound}")

# Expanding window setup
starting = pd.to_datetime("20050101", format="%Y%m%d")
counter = 0
results_rows = []
feature_rows = []

while (starting + pd.DateOffset(years=11 + counter)) <= end_bound:
    cutoff = [
        starting,
        starting + pd.DateOffset(years=8 + counter),
        starting + pd.DateOffset(years=10 + counter),
        starting + pd.DateOffset(years=11 + counter),
    ]

    train = data[(data["date"] >= cutoff[0]) & (data["date"] < cutoff[1])]
    validate = data[(data["date"] >= cutoff[1]) & (data["date"] < cutoff[2])]
    test = data[(data["date"] >= cutoff[2]) & (data["date"] < cutoff[3])]

    if len(train) == 0 or len(validate) == 0 or len(test) == 0:
        print(f"Skipping period {counter+1}: insufficient data")
        counter += 1
        continue

    # Standardize features
    scaler = StandardScaler().fit(train[stock_vars])
    X_train = scaler.transform(train[stock_vars])
    X_val   = scaler.transform(validate[stock_vars])
    X_test  = scaler.transform(test[stock_vars])

    Y_train = train[ret_var].values
    Y_val   = validate[ret_var].values
    Y_test  = test[ret_var].values

    # Prepare OOF frame for predictions
    fold_df = test[["year", "month", "date", "id", ret_var]].copy()

    # --- LINEAR REGRESSION ---
    lr = LinearRegression()
    lr.fit(X_train, Y_train)
    fold_df["ols"] = lr.predict(X_test)

    # Store top 25 features for OLS
    coef_abs = np.abs(lr.coef_)
    top_idx = np.argsort(coef_abs)[-25:][::-1]  # descending
    for idx in top_idx:
        feature_rows.append({
            "eval_year": cutoff[2].year,
            "model": "ols",
            "feature": stock_vars[idx],
            "importance": lr.coef_[idx]
        })

    # --- EXPONENTIAL SMOOTHING ---
    try:
        ts = train.groupby("date")[ret_var].mean().sort_index()
        model = ExponentialSmoothing(ts, trend="add", seasonal=None).fit()
        dates_test = sorted(test["date"].unique())
        forecast = model.forecast(len(dates_test))
        forecast_map = dict(zip(dates_test, forecast))
        fold_df["exp_smooth"] = test["date"].map(forecast_map)
        # Feature importance not applicable for this model
    except Exception as e:
        print("Exponential Smoothing failed:", e)
        fold_df["exp_smooth"] = np.nan

    # --- XGBOOST ---
    xgb = XGBRegressor(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
    )
    xgb.fit(X_train, Y_train, eval_set=[(X_val, Y_val)], early_stopping_rounds=20, verbose=False)
    fold_df["xgb"] = xgb.predict(X_test)

    # Store top 25 features for XGBoost
    importance_abs = np.abs(xgb.feature_importances_)
    top_idx = np.argsort(importance_abs)[-25:][::-1]
    for idx in top_idx:
        feature_rows.append({
            "eval_year": cutoff[2].year,
            "model": "xgb",
            "feature": stock_vars[idx],
            "importance": xgb.feature_importances_[idx]
        })

    # --- METRICS ---
    for model_name in ["ols", "exp_smooth", "xgb"]:
        y_pred = fold_df[model_name].values
        rmse = sqrt(mean_squared_error(Y_test, y_pred))
        mape = mean_absolute_percentage_error(Y_test, y_pred)
        results_rows.append({
            "eval_year": cutoff[2].year,
            "model": model_name,
            "rmse": float(rmse),
            "mape": float(mape),
        })

    print(f"Finished period {counter+1}: {cutoff[0].date()} -> {cutoff[3].date()}")
    counter += 1

print(f"Total periods processed: {counter}")


Available dates: 2005-02-28T00:00:00.000000000 to 2025-06-30T00:00:00.000000000


  self._init_dates(dates, freq)


Finished period 1: 2005-01-01 -> 2016-01-01


  self._init_dates(dates, freq)


Finished period 2: 2005-01-01 -> 2017-01-01


  self._init_dates(dates, freq)


Finished period 3: 2005-01-01 -> 2018-01-01


  self._init_dates(dates, freq)


Finished period 4: 2005-01-01 -> 2019-01-01


  self._init_dates(dates, freq)


Finished period 5: 2005-01-01 -> 2020-01-01


  self._init_dates(dates, freq)


Finished period 6: 2005-01-01 -> 2021-01-01


  self._init_dates(dates, freq)


Finished period 7: 2005-01-01 -> 2022-01-01


  self._init_dates(dates, freq)


Finished period 8: 2005-01-01 -> 2023-01-01


  self._init_dates(dates, freq)


Finished period 9: 2005-01-01 -> 2024-01-01


  self._init_dates(dates, freq)


Finished period 10: 2005-01-01 -> 2025-01-01
Total periods processed: 10


In [5]:
# Save results
results_df = pd.DataFrame(results_rows)
output_path = Path.cwd().parent / "data" / "results_baselines.csv"
results_df.to_csv(output_path, index=False)
print(f"Saved results to: {output_path}")

# Save feature importances
feature_df = pd.DataFrame(feature_rows)
feature_df.sort_values(["eval_year", "model", "importance"], ascending=[True, True, False], inplace=True)
output_features_path = Path.cwd().parent / "data" / "feature_importances.csv"
feature_df.to_csv(output_features_path, index=False)
print(f"Saved top features to: {output_features_path}")
print(datetime.datetime.now())

Saved results to: /Users/kevin/Coding Projects/Asset-Management-Hackathon-2025/data/results_baselines.csv
Saved top features to: /Users/kevin/Coding Projects/Asset-Management-Hackathon-2025/data/feature_importances.csv
2025-09-22 14:58:16.522173


In [6]:
results_df

Unnamed: 0,eval_year,model,rmse,mape
0,2015,ols,10.008084,254881300000.0
1,2015,exp_smooth,10.00788,103698500000.0
2,2015,xgb,10.007238,786181700000.0
3,2016,ols,0.23661,244624000000.0
4,2016,exp_smooth,0.228054,126957800000.0
5,2016,xgb,0.295283,594766800000.0
6,2017,ols,17.019633,213617900000.0
7,2017,exp_smooth,17.0196,70752130000.0
8,2017,xgb,17.020221,169776800000.0
9,2018,ols,1.468385,277672000000.0


In [7]:
feature_df

Unnamed: 0,eval_year,model,feature,importance
1,2015,ols,ope_be,0.015429
5,2015,ols,ret_9_1,0.011652
6,2015,ols,ni_ivol,0.011297
9,2015,ols,seas_1_1an,0.009422
10,2015,ols,taccruals_at,0.009355
...,...,...,...,...
495,2024,xgb,betadown_252d,0.000877
496,2024,xgb,ni_me,0.000544
497,2024,xgb,eqpo_me,0.000319
498,2024,xgb,ope_be,0.000269
