# Baseline Models (Optimized US_CAN)

This notebook implements efficient baseline models (OLS, Exponential Smoothing, XGBoost) using an expanding window. It uses the shared data loader and the hackathon datasets.


In [1]:
import datetime
import pandas as pd
import numpy as np
from pathlib import Path
import sys

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_percentage_error
from lightgbm import LGBMRegressor
from lightgbm import early_stopping, log_evaluation

from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from math import sqrt
from scipy.stats import spearmanr

# Use shared data loader
sys.path.append(str(Path.cwd().parent / 'src'))
from data_loader import load_data

print(f"Start time: {datetime.datetime.now()}")
pd.set_option("mode.chained_assignment", None)


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Start time: 2025-09-24 17:39:33.975993


In [2]:
# Load data using the shared data loader
print("Loading data...")
raw = load_data(filename="usa_can_filtered_data.csv", parse_dates=["ret_eom"], low_memory=False)
raw['date'] = pd.to_datetime(raw['ret_eom'])

# Load predictor list
stock_vars = list(load_data(filename="factor_char_list.csv")["variable"].values)
ret_var = "stock_ret"

# Keep only valid target rows
raw = raw[raw[ret_var].notna()].copy()
print(f"Data shape after filter: {raw.shape}")
print(f"Date range: {raw['date'].min()} to {raw['date'].max()}")


Loading data...
Loading data from: /Users/kevin/Coding Projects/Asset-Management-Hackathon-2025/data/usa_can_filtered_data.csv
Loading data from: /Users/kevin/Coding Projects/Asset-Management-Hackathon-2025/data/factor_char_list.csv
Data shape after filter: (1398807, 159)
Date range: 2005-02-28 00:00:00 to 2025-06-30 00:00:00


In [3]:
# Vectorized cross-sectional rank scaling by date (efficient)
print("Applying vectorized cross-sectional rank scaling...")

data = raw.copy()
for var in stock_vars:
    if var not in data.columns:
        continue
    med = data.groupby('date')[var].transform('median')
    data[var] = data[var].fillna(med)

    ranks = data.groupby('date')[var].rank(method='dense') - 1
    maxs = data.groupby('date')[var].transform('max')
    data[var] = np.where(maxs > 0, (ranks / maxs) * 2 - 1, 0)

del raw
print(f"Scaled data shape: {data.shape}")


Applying vectorized cross-sectional rank scaling...
Scaled data shape: (1398807, 159)


In [4]:
# Pre-calc unique dates to bound the loop
unique_dates = np.sort(data['date'].unique())
end_bound = unique_dates[-1]
print(f"Available dates: {unique_dates[0]} to {end_bound}")

# Expanding window setup
starting = pd.to_datetime("20050101", format="%Y%m%d")
counter = 0
results_rows = []
feature_rows = []

while (starting + pd.DateOffset(years=11 + counter)) <= end_bound:
    cutoff = [
        starting,
        starting + pd.DateOffset(years=8 + counter),
        starting + pd.DateOffset(years=10 + counter),
        starting + pd.DateOffset(years=11 + counter),
    ]

    train = data[(data["date"] >= cutoff[0]) & (data["date"] < cutoff[1])]
    validate = data[(data["date"] >= cutoff[1]) & (data["date"] < cutoff[2])]
    test = data[(data["date"] >= cutoff[2]) & (data["date"] < cutoff[3])]

    if len(train) == 0 or len(validate) == 0 or len(test) == 0:
        print(f"Skipping period {counter+1}: insufficient data")
        counter += 1
        continue

    # Standardize features
    scaler = StandardScaler().fit(train[stock_vars])
    X_train = scaler.transform(train[stock_vars])
    X_val   = scaler.transform(validate[stock_vars])
    X_test  = scaler.transform(test[stock_vars])

    Y_train = train[ret_var].values
    Y_val   = validate[ret_var].values
    Y_test  = test[ret_var].values

    # Prepare OOF frame for predictions
    fold_df = test[["year", "month", "date", "id", ret_var]].copy()

    # --- LINEAR REGRESSION ---
    lr = LinearRegression()
    lr.fit(X_train, Y_train)
    fold_df["ols"] = lr.predict(X_test)

    # Store top 25 features for OLS
    coef_abs = np.abs(lr.coef_)
    top_idx = np.argsort(coef_abs)[-25:][::-1]  # descending
    for idx in top_idx:
        feature_rows.append({
            "eval_year": cutoff[2].year,
            "model": "ols",
            "feature": stock_vars[idx],
            "importance": lr.coef_[idx]
        })

    # --- EXPONENTIAL SMOOTHING ---
    try:
        ts = train.groupby("date")[ret_var].mean().sort_index()
        model = ExponentialSmoothing(ts, trend="add", seasonal=None).fit()
        dates_test = sorted(test["date"].unique())
        forecast = model.forecast(len(dates_test))
        forecast_map = dict(zip(dates_test, forecast))
        fold_df["exp_smooth"] = test["date"].map(forecast_map)
        # Feature importance not applicable for this model
    except Exception as e:
        print("Exponential Smoothing failed:", e)
        fold_df["exp_smooth"] = np.nan

    # --- XGBOOST ---
    xgb = XGBRegressor(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
    )
    xgb.fit(X_train, Y_train, eval_set=[(X_val, Y_val)], early_stopping_rounds=20, verbose=False)
    fold_df["xgb"] = xgb.predict(X_test)

    # Store top 25 features for XGBoost
    importance_abs = np.abs(xgb.feature_importances_)
    top_idx = np.argsort(importance_abs)[-25:][::-1]
    for idx in top_idx:
        feature_rows.append({
            "eval_year": cutoff[2].year,
            "model": "xgb",
            "feature": stock_vars[idx],
            "importance": xgb.feature_importances_[idx]
        })

    # --- LIGHTGBM ---
    lgbm = LGBMRegressor(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=-1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        verbose=-1   # silence completely
    )
    lgbm.fit(
        X_train, Y_train,
        eval_set=[(X_val, Y_val)],
        eval_metric="rmse",
        callbacks=[
            early_stopping(stopping_rounds=20),
            log_evaluation(-1)  # disable logging
        ]
    )
    fold_df["lgbm"] = lgbm.predict(X_test)

    # Store top 25 features for LightGBM
    importance_abs = np.abs(lgbm.feature_importances_)
    top_idx = np.argsort(importance_abs)[-25:][::-1]
    for idx in top_idx:
        feature_rows.append({
            "eval_year": cutoff[2].year,
            "model": "lgbm",
            "feature": stock_vars[idx],
            "importance": lgbm.feature_importances_[idx]
        })

        # --- METRICS ---
    for model_name in ["ols", "exp_smooth", "xgb", "lgbm"]:
        y_pred = fold_df[model_name].values
        rmse = sqrt(mean_squared_error(Y_test, y_pred))
        mae = mean_absolute_error(Y_test, y_pred)
        r2 = r2_score(Y_test, y_pred)

        # Spearman correlation (Information Coefficient, IC)
        try:
            ic, _ = spearmanr(Y_test, y_pred)
        except Exception:
            ic = np.nan

        # --- Portfolio backtest (equal-weight long/short 125 each side) ---
        port_return = np.nan
        if fold_df.shape[0] >= 250:
            temp = fold_df.copy()
            temp["pred"] = y_pred

            longs = temp.nlargest(125, "pred")
            shorts = temp.nsmallest(125, "pred")

            # Equal weight: mean of realized returns
            long_ret = longs[ret_var].mean()
            short_ret = shorts[ret_var].mean()
            port_return = long_ret - short_ret

        results_rows.append({
            "eval_year": cutoff[2].year,
            "model": model_name,
            "rmse": float(rmse),
            "mae": float(mae),
            "r2": float(r2),
            "ic": float(ic) if ic is not None else np.nan,
            "portfolio_return": float(port_return) if port_return is not None else np.nan,
        })

    print(f"Finished period {counter+1}: {cutoff[0].date()} -> {cutoff[3].date()}")
    counter += 1

print(f"Total periods processed: {counter}")


Available dates: 2005-02-28T00:00:00.000000000 to 2025-06-30T00:00:00.000000000


  self._init_dates(dates, freq)


Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[5]	valid_0's rmse: 0.182909	valid_0's l2: 0.0334559
Finished period 1: 2005-01-01 -> 2016-01-01


  self._init_dates(dates, freq)


Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[21]	valid_0's rmse: 15.1068	valid_0's l2: 228.217
Finished period 2: 2005-01-01 -> 2017-01-01


  self._init_dates(dates, freq)


Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[43]	valid_0's rmse: 15.2389	valid_0's l2: 232.225
Finished period 3: 2005-01-01 -> 2018-01-01


  self._init_dates(dates, freq)


Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 26.8074	valid_0's l2: 718.636
Finished period 4: 2005-01-01 -> 2019-01-01


  self._init_dates(dates, freq)


Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[2]	valid_0's rmse: 27.0653	valid_0's l2: 732.531
Finished period 5: 2005-01-01 -> 2020-01-01


  self._init_dates(dates, freq)


Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 1.24481	valid_0's l2: 1.54954
Finished period 6: 2005-01-01 -> 2021-01-01


  self._init_dates(dates, freq)


Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 0.307975	valid_0's l2: 0.0948484
Finished period 7: 2005-01-01 -> 2022-01-01


  self._init_dates(dates, freq)


Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 0.398054	valid_0's l2: 0.158447
Finished period 8: 2005-01-01 -> 2023-01-01


  self._init_dates(dates, freq)


Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 0.355366	valid_0's l2: 0.126285
Finished period 9: 2005-01-01 -> 2024-01-01


  self._init_dates(dates, freq)


Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[1]	valid_0's rmse: 0.351247	valid_0's l2: 0.123374
Finished period 10: 2005-01-01 -> 2025-01-01
Total periods processed: 10




In [5]:
# Save results
results_df = pd.DataFrame(results_rows)
output_path = Path.cwd().parent / "data" / "results_can_usa_baselines.csv"
results_df.to_csv(output_path, index=False)
print(f"Saved results to: {output_path}")

# Save feature importances
feature_df = pd.DataFrame(feature_rows)
feature_df.sort_values(["eval_year", "model", "importance"], ascending=[True, True, False], inplace=True)
output_features_path = Path.cwd().parent / "data" / "feature_can_usa_importances.csv"
feature_df.to_csv(output_features_path, index=False)
print(f"Saved top features to: {output_features_path}")
print(datetime.datetime.now())

Saved results to: /Users/kevin/Coding Projects/Asset-Management-Hackathon-2025/data/results_can_usa_baselines.csv
Saved top features to: /Users/kevin/Coding Projects/Asset-Management-Hackathon-2025/data/feature_can_usa_importances.csv
2025-09-24 17:44:30.241551


In [6]:
results_df

Unnamed: 0,eval_year,model,rmse,mae,r2,ic,portfolio_return
0,2015,ols,21.370326,0.203618,-5e-06,-0.008591,-0.023363
1,2015,exp_smooth,21.370389,0.193131,-1.1e-05,0.066059,0.015276
2,2015,xgb,21.369251,0.247527,9.5e-05,0.052647,44.841345
3,2015,lgbm,21.36859,0.194507,0.000157,0.024719,44.126969
4,2016,ols,0.232696,0.129316,-0.042111,0.109864,0.101981
5,2016,exp_smooth,0.227946,0.113641,3e-06,0.046604,0.083197
6,2016,xgb,1.580551,0.129007,-47.078673,0.105857,0.249065
7,2016,lgbm,0.227266,0.112985,0.005955,0.081062,0.29827
8,2017,ols,38.167533,0.266772,-1.5e-05,-0.000165,-0.066963
9,2017,exp_smooth,38.167635,0.246089,-2e-05,-0.022715,0.033806


In [7]:
feature_df

Unnamed: 0,eval_year,model,feature,importance
50,2015,lgbm,dolvol_126d,11.000000
51,2015,lgbm,prc,11.000000
52,2015,lgbm,market_equity,9.000000
53,2015,lgbm,ebitda_mev,8.000000
54,2015,lgbm,bev_mev,7.000000
...,...,...,...,...
720,2024,xgb,ni_me,0.000629
721,2024,xgb,aliq_mat,0.000603
722,2024,xgb,niq_su,0.000524
723,2024,xgb,at_me,0.000434


In [8]:
feature_counts = (
    feature_df.groupby(["model", "feature"])
    .size()
    .reset_index(name="count")
    .sort_values(["model", "count"], ascending=[True, False])
)

In [9]:
(feature_counts[feature_counts["model"] == "xgb"]).head(15)

Unnamed: 0,model,feature,count
143,xgb,age,10
145,xgb,aliq_mat,10
190,xgb,niq_be,9
144,xgb,aliq_at,8
155,xgb,bidaskhl_21d,8
159,xgb,cop_at,8
164,xgb,dolvol_126d,8
213,xgb,turnover_var_126d,8
161,xgb,coskew_21d,7
181,xgb,lti_gr1a,7
