In [1]:
!pip install -q scikit-learn xgboost pyarrow

import os, numpy as np, pandas as pd

# >>> EDIT these two lines if your file path differs
BASE_DIR  = "/content/drive/MyDrive/Projects/JPM Short Interest/outputs"
DATA_FILE = "merged_with_nolbert.parquet"

from google.colab import drive
drive.mount('/content/drive')

path = os.path.join(BASE_DIR, DATA_FILE)
df = pd.read_parquet(path) if DATA_FILE.endswith(".parquet") else pd.read_csv(path, low_memory=False)

# basic sanity
print("shape:", df.shape)
print("sample cols:", df.columns.tolist()[:25])
assert "interval" in df.columns, "interval missing"
if "currentShortPositionQuantity" not in df.columns:
    raise ValueError("currentShortPositionQuantity missing (needed for target/baselines)")


Mounted at /content/drive
shape: (21, 25)
sample cols: ['interval', 'event_dt', 'transcriptid', 'full_text', 'accountingYearMonthNumber', 'symbolCode', 'issueName', 'issuerServicesGroupExchangeCode', 'marketClassCode', 'currentShortPositionQuantity', 'previousShortPositionQuantity', 'stockSplitFlag', 'averageDailyVolumeQuantity', 'daysToCoverQuantity', 'revisionFlag', 'changePercent', 'changePreviousNumber', 'settlementDate', 'doc_id', 'NEUTRAL', 'POSITIVE', 'NEGATIVE', 'sent_polarity', 'sent_entropy', 'target_next']


In [10]:
import os, pandas as pd, numpy as np, re
from google.colab import drive
drive.mount('/content/drive')

BASE_DIR  = "/content/drive/MyDrive/Projects/JPM Short Interest/outputs"  # <- adjust if needed
DATA_FILE = "merged_with_nolbert.parquet"  # the file you saved after NoLBERT scoring

print("Folder listing:")
print(os.listdir(BASE_DIR))

path = os.path.join(BASE_DIR, DATA_FILE)
df = pd.read_parquet(path) if path.endswith(".parquet") else pd.read_csv(path, low_memory=False)
print("Loaded:", path, "shape:", df.shape)
print("Columns:", df.columns.tolist())
print(df.head(3))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Folder listing:
['apple_by_bin_20250904_175544.csv', 'apple_by_bin_20250904_175544.parquet', 'finra_by_bin_20250904_175544.csv', 'finra_by_bin_20250904_175544.parquet', 'merged_20250904_175544.csv', 'merged_20250904_175544.parquet', 'tx_filt_20250904_175544.csv', 'tx_filt_20250904_175544.parquet', 'apple_by_bin_20250911_072828.csv', 'apple_by_bin_20250911_072828.parquet', 'finra_by_bin_20250911_072828.csv', 'finra_by_bin_20250911_072828.parquet', 'merged_20250911_072828.csv', 'merged_20250911_072828.parquet', 'tx_filt_20250911_072829.csv', 'tx_filt_20250911_072829.parquet', 'merged_docs.parquet', 'merged_chunks_nltk.parquet', 'merged_with_nolbert.parquet']
Loaded: /content/drive/MyDrive/Projects/JPM Short Interest/outputs/merged_with_nolbert.parquet shape: (21, 25)
Columns: ['interval', 'event_dt', 'transcriptid', 'full_text', 'accountingYearMonthNumber', 'sy

In [11]:
# ---- Coerce dtypes: run RIGHT AFTER loading df ----
import re

def to_num(s):
    if pd.isna(s): return np.nan
    if isinstance(s,(int,float,np.number)): return float(s)
    s = str(s).strip()
    s = re.sub(r'[,%$()]', '', s)
    try: return float(s)
    except: return np.nan

num_cols = [
    "currentShortPositionQuantity","previousShortPositionQuantity",
    "averageDailyVolumeQuantity","daysToCoverQuantity",
    "changePercent","changePreviousNumber","si_ratio_to_adv",
    "NEUTRAL","POSITIVE","NEGATIVE","sent_polarity","sent_entropy",
    "si_pct_change"  # if already present
]
num_cols = [c for c in num_cols if c in df.columns]

for c in num_cols:
    df[c] = df[c].map(to_num).astype(float)

for c in ["settlementDate","event_dt"]:
    if c in df.columns:
        df[c] = pd.to_datetime(df[c], errors="coerce")

if "interval" in df.columns:
    df["interval"] = pd.to_numeric(df["interval"], errors="coerce").astype("Int64")

print(df[num_cols].describe().T)
print("Null ratios (selected):")
print(df[["interval"] + num_cols].isna().mean().sort_values(ascending=False).head(20))

                               count          mean           std  \
currentShortPositionQuantity    21.0  1.150037e+08  1.493192e+07   
previousShortPositionQuantity   21.0  1.204908e+08  1.875792e+07   
averageDailyVolumeQuantity      21.0  5.575819e+07  1.258956e+07   
daysToCoverQuantity             21.0  2.169524e+00  6.161613e-01   
changePercent                   21.0 -3.938571e+00  7.794049e+00   
changePreviousNumber            21.0 -5.487059e+06  9.871524e+06   
NEUTRAL                         21.0  4.970162e-01  1.720610e-01   
POSITIVE                        21.0  4.204832e-01  1.650656e-01   
NEGATIVE                        21.0  8.250055e-02  5.889358e-02   
sent_polarity                   21.0  3.379827e-01  1.783962e-01   
sent_entropy                    21.0  8.327758e-01  1.445628e-01   

                                        min           25%           50%  \
currentShortPositionQuantity   8.885235e+07  1.051693e+08  1.123331e+08   
previousShortPositionQuantity  9.

In [4]:
def add_lags(g, cols, lags=(1,2,3)):
    for c in cols:
        for L in lags:
            g[f"{c}_lag{L}"] = g[c].shift(L)
    return g

def add_rolls(g, cols, win=3):
    for c in cols:
        g[f"{c}_rollmean{win}"] = g[c].rolling(win).mean()
        g[f"{c}_rollstd{win}"]  = g[c].rolling(win).std()
    return g

def _fe_symbol(g, base_cols, interval_col="interval"):
    g = g.sort_values(interval_col)
    g = add_lags(g, base_cols, lags=(1,2,3))
    g = add_rolls(g, ["currentShortPositionQuantity","averageDailyVolumeQuantity","daysToCoverQuantity"], win=3)
    # lag sentiment once to test predictive power
    if "sent_polarity" in g.columns:
        g["sent_polarity_lag1"] = g["sent_polarity"].shift(1)
    return g

def build_features(df, symbol_col="symbolCode", interval_col="interval"):
    # base fields expected; create si_pct_change if missing
    if "si_pct_change" not in df.columns:
        df["si_pct_change"] = (
            df["currentShortPositionQuantity"] - df["previousShortPositionQuantity"]
        ) / df["previousShortPositionQuantity"].replace(0, np.nan)

    base_cols = [
        "currentShortPositionQuantity","averageDailyVolumeQuantity","daysToCoverQuantity",
        "si_ratio_to_adv","si_pct_change","sent_polarity","sent_entropy",
        "POSITIVE","NEGATIVE","NEUTRAL"
    ]
    for c in base_cols:
        if c not in df.columns:
            df[c] = np.nan

    if symbol_col in df.columns:
        df = df.groupby(symbol_col, group_keys=False).apply(lambda g: _fe_symbol(g, base_cols, interval_col))
    else:
        df = _fe_symbol(df, base_cols, interval_col)

    # simple interactions
    df["polxratio"] = df["sent_polarity"] * df["si_ratio_to_adv"]
    df["negxdtc"]   = df["NEGATIVE"] * df["daysToCoverQuantity"]
    return df

# build features
df = build_features(df)

# target: next interval SI level (create if missing)
if "target_next" not in df.columns:
    if "symbolCode" in df.columns:
        df["target_next"] = df.groupby("symbolCode")["currentShortPositionQuantity"].shift(-1)
    else:
        df["target_next"] = df["currentShortPositionQuantity"].shift(-1)

# drop rows with NA from lags/target
model_df = df.dropna().sort_values(["symbolCode","interval"] if "symbolCode" in df.columns else ["interval"]).reset_index(drop=True)

print("model_df shape:", model_df.shape)

model_df shape: (0, 65)


  df = df.groupby(symbol_col, group_keys=False).apply(lambda g: _fe_symbol(g, base_cols, interval_col))


In [12]:
# Build si_pct_change if missing
if "si_pct_change" not in df.columns and \
   {"currentShortPositionQuantity","previousShortPositionQuantity"}.issubset(df.columns):
    prev = df["previousShortPositionQuantity"].replace(0, np.nan)
    df["si_pct_change"] = (df["currentShortPositionQuantity"] - df["previousShortPositionQuantity"]) / prev

# Target: next short interest level
if "target_next" not in df.columns:
    if "symbolCode" in df.columns:
        df["target_next"] = df.groupby("symbolCode")["currentShortPositionQuantity"].shift(-1)
    else:
        df["target_next"] = df["currentShortPositionQuantity"].shift(-1)

# Keep only rows necessary for modeling (ESSENTIALS ONLY)
must_have = ["target_next","currentShortPositionQuantity"]
for c in ["averageDailyVolumeQuantity","daysToCoverQuantity","sent_polarity","NEGATIVE","POSITIVE"]:
    if c in df.columns: must_have.append(c)

model_df = (df.dropna(subset=must_have)
              .sort_values(["symbolCode","interval"] if "symbolCode" in df.columns else ["interval"])
              .reset_index(drop=True))

print("Rows after minimal dropna:", len(model_df))
print("Intervals span:", model_df["interval"].min() if "interval" in model_df else None,
      "→", model_df["interval"].max() if "interval" in model_df else None)


Rows after minimal dropna: 21
Intervals span: 0 → 46


In [13]:
keep = [c for c in ["interval","symbolCode","settlementDate","currentShortPositionQuantity",
                    "previousShortPositionQuantity","averageDailyVolumeQuantity","daysToCoverQuantity",
                    "NEUTRAL","POSITIVE","NEGATIVE","sent_polarity","sent_entropy","target_next"] if c in df.columns]
print(df[keep].head(10))
print(df[keep].tail(10))

   interval symbolCode settlementDate  currentShortPositionQuantity  \
0         0       AAPL     2023-08-15                    88852352.0   
1         6       AAPL     2023-10-31                    98190963.0   
2         7       AAPL     2023-11-15                   105837123.0   
3        10       AAPL     2023-12-29                   108220157.0   
4        13       AAPL     2024-02-15                    97665956.0   
5        16       AAPL     2024-03-28                   108782648.0   
6        19       AAPL     2024-04-30                    94308265.0   
7        20       AAPL     2024-05-15                    99287450.0   
8        23       AAPL     2024-06-28                   132235437.0   
9        25       AAPL     2024-07-31                   117696224.0   

   previousShortPositionQuantity  averageDailyVolumeQuantity  \
0                    105460144.0                  62061606.0   
1                     93026130.0                  55904604.0   
2                     9819

In [14]:
def add_lags(g, cols, lags=(1,)):
    for c in cols:
        for L in lags:
            g[f"{c}_lag{L}"] = g[c].shift(L)
    return g

def add_rolls(g, cols, win=2):
    for c in cols:
        g[f"{c}_rollmean{win}"] = g[c].rolling(win).mean()
        g[f"{c}_rollstd{win}"]  = g[c].rolling(win).std()
    return g

base_cols = [c for c in [
    "currentShortPositionQuantity","averageDailyVolumeQuantity","daysToCoverQuantity",
    "si_ratio_to_adv","si_pct_change","sent_polarity","sent_entropy","POSITIVE","NEGATIVE","NEUTRAL"
] if c in model_df.columns]

def _fe_symbol(g):
    g = g.sort_values("interval" if "interval" in g.columns else g.index)
    g = add_lags(g, base_cols, lags=(1,))
    g = add_rolls(g, [c for c in ["currentShortPositionQuantity","averageDailyVolumeQuantity","daysToCoverQuantity"] if c in g.columns], win=2)
    if "sent_polarity" in g.columns:
        g["sent_polarity_lag1"] = g["sent_polarity"].shift(1)
    return g

if "symbolCode" in model_df.columns:
    model_df = model_df.groupby("symbolCode", group_keys=False).apply(_fe_symbol)
else:
    model_df = _fe_symbol(model_df)

# Drop rows that became NA due to lags/rolls (only now)
model_df = model_df.dropna(subset=["target_next","currentShortPositionQuantity"]).reset_index(drop=True)
print("Rows after light FE:", len(model_df))

Rows after light FE: 21


  model_df = model_df.groupby("symbolCode", group_keys=False).apply(_fe_symbol)


In [20]:
exclude = {"target_next","full_text","settlementDate","event_dt","interval","transcriptid","doc_id"}
if "symbolCode" in model_df.columns:
    exclude.add("symbolCode")

candidates = [c for c in model_df.columns if c not in exclude]
X = model_df[candidates].select_dtypes(include=["number"]).copy()

# Fallback core features if nothing left:
if X.shape[1] == 0:
    core = [c for c in ["currentShortPositionQuantity","averageDailyVolumeQuantity","daysToCoverQuantity",
                        "si_ratio_to_adv","si_pct_change","sent_polarity","NEGATIVE","POSITIVE"] if c in model_df.columns]
    X = model_df[core].astype(float)

y = model_df["target_next"].astype(float)
y_curr = model_df["currentShortPositionQuantity"].astype(float).to_numpy()

# Clean X: drop all-NA columns, replace infs with NaN (will be imputed)
X = X.replace([np.inf, -np.inf], np.nan)
X = X.loc[:, X.notna().any(axis=0)]  # drop columns entirely NaN
print("After cleaning, X shape:", X.shape)

print("Final X shape:", X.shape, "y len:", len(y))

After cleaning, X shape: (21, 27)
Final X shape: (21, 27) y len: 21


In [16]:
from sklearn.model_selection import TimeSeriesSplit
n_samples = len(X)
if n_samples < 3:
    raise ValueError(f"Not enough samples ({n_samples}). Expand date range or reduce FE.")
n_splits = max(2, min(5, n_samples - 1))
tscv = TimeSeriesSplit(n_splits=n_splits)
print("Using n_splits =", n_splits)

Using n_splits = 5


In [22]:
# === Add BELOW your "Using n_splits = ..." print ===
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet
from xgboost import XGBRegressor
import time
from sklearn.impute import SimpleImputer

def metrics(y_true, y_pred, y_curr):
    # RMSE compatible with older sklearn (no 'squared' kwarg)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae  = mean_absolute_error(y_true, y_pred)
    r2   = r2_score(y_true, y_pred)
    # directional accuracy on change
    dy_true = np.sign(y_true - y_curr)
    dy_pred = np.sign(y_pred - y_curr)
    dir_acc = float((dy_true == dy_pred).mean())
    return {"RMSE": rmse, "MAE": mae, "R2": r2, "DirAcc": dir_acc}


# Baseline: persistence (predict current SI level)
baseline = metrics(y.to_numpy(), y_curr, y_curr)
print("Baseline (persistence):", baseline)

# Models
models = [
    ("ElasticNet", Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler",  StandardScaler(with_mean=False)),
        ("model",   ElasticNet(alpha=1e-3, l1_ratio=0.3, max_iter=3000, random_state=42))
    ])),
    ("XGBoost", Pipeline([
        ("imputer", SimpleImputer(strategy="median")),  # XGB can handle NaN, but median impute is fine too
        ("model",   XGBRegressor(
            n_estimators=800, max_depth=5, subsample=0.8, colsample_bytree=0.8,
            learning_rate=0.03, reg_lambda=1.0, objective="reg:squarederror", random_state=42
        ))
    ]))
]

# Time-based CV
cv_rows = []
splits = list(tscv.split(X))
for name, mdl in models:
    for fold, (tr, te) in enumerate(splits):
        X_tr, X_te = X.iloc[tr], X.iloc[te]
        y_tr, y_te = y.iloc[tr], y.iloc[te]
        y_curr_te  = y_curr[te]
        mdl.fit(X_tr, y_tr)
        y_hat = mdl.predict(X_te)
        sc = metrics(y_te.to_numpy(), y_hat, y_curr_te)
        sc["model"], sc["fold"] = name, fold
        cv_rows.append(sc)

cv = pd.DataFrame(cv_rows)
print("\nCV means by model:\n", cv.groupby("model")[["RMSE","MAE","R2","DirAcc"]].mean())

# Holdout = last split, use XGB if present (else ElasticNet)
best_name, best_model = models[-1]  # XGBoost
last_tr, last_te = splits[-1]
best_model.fit(X.iloc[last_tr], y.iloc[last_tr])
y_hat = best_model.predict(X.iloc[last_te])
hold = metrics(y.iloc[last_te].to_numpy(), y_hat, y_curr[last_te])
print("\nHoldout:", hold)

# Optional: feature importance (XGBoost)
if hasattr(best_model, "feature_importances_"):
    imp = pd.Series(best_model.feature_importances_, index=X.columns).sort_values(ascending=False)
    print("\nTop 20 features:\n", imp.head(20))

# Optional: save artifacts next to your data file
ts = time.strftime("%Y%m%d_%H%M%S")
OUT_DIR = os.path.join(BASE_DIR, "models_nb", ts)
os.makedirs(OUT_DIR, exist_ok=True)

cv.to_csv(os.path.join(OUT_DIR, "cv_metrics.csv"), index=False)
pd.DataFrame([baseline]).to_csv(os.path.join(OUT_DIR, "baseline_metrics.csv"), index=False)
pd.DataFrame([hold]).to_csv(os.path.join(OUT_DIR, "holdout_metrics.csv"), index=False)

preds = model_df.iloc[last_te][["interval"] + (["symbolCode"] if "symbolCode" in model_df.columns else [])].copy()
preds["y_true"] = y.iloc[last_te].to_numpy()
preds["y_pred"] = y_hat
preds.to_parquet(os.path.join(OUT_DIR, "holdout_predictions.parquet"), index=False)

if 'imp' in locals():
    imp.to_csv(os.path.join(OUT_DIR, "feature_importance.csv"), header=["importance"])

print("\nArtifacts saved to:", OUT_DIR)

Baseline (persistence): {'RMSE': np.float64(11369894.054869568), 'MAE': 8821081.285714285, 'R2': 0.35631260313263513, 'DirAcc': 0.0}


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(



CV means by model:
                     RMSE           MAE         R2    DirAcc
model                                                      
ElasticNet  2.780727e+07  2.426438e+07 -20.817708  0.466667
XGBoost     1.655340e+07  1.456186e+07  -7.235218  0.600000

Holdout: {'RMSE': np.float64(13656591.363147711), 'MAE': 8182638.666666667, 'R2': -3.7631603023577096, 'DirAcc': 0.6666666666666666}

Artifacts saved to: /content/drive/MyDrive/Projects/JPM Short Interest/outputs/models_nb/20250912_044454
