```-
File:           logistic_regression_baseline.ipynb
Description:    Logistic Regression for baseline on raw data features. 
Author:         Morgan Cooper
Created:        2025-09-01
Updated:        2025-09-09

Notes:
Use this as a baseline for comparison with the resnet classifier model. 

```

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# Load data
df = pd.read_csv("../../data/ohlc_images/window=180/meta.csv")

X = df.drop(columns=["path", "label"])
y = df["label"]

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ---- Sequential splits ----
n = len(df)
train_end = int(0.7 * n)
val_end   = int(0.85 * n)

X_train, y_train = X_scaled[:train_end], y[:train_end]
X_val,   y_val   = X_scaled[train_end:val_end], y[train_end:val_end]
X_test,  y_test  = X_scaled[val_end:], y[val_end:]

# ---- Train logistic regression ----
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# ---- Evaluate on test ----
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.47      0.45      0.46       566
           1       0.54      0.57      0.56       654

    accuracy                           0.51      1220
   macro avg       0.51      0.51      0.51      1220
weighted avg       0.51      0.51      0.51      1220



In [2]:
proba = model.predict_proba(X_test)[:, 1]
preds = model.predict(X_test)
y_test = y_test.reset_index(drop=True)

results = pd.DataFrame({
    "proba": proba,
    "pred": preds,
    "label": y_test
})

results["bin"] = pd.qcut(results["proba"], q=20, labels=False)

bin_stats = results.groupby("bin").agg(
    avg_proba=("proba", "mean"),
    accuracy=("pred", lambda x: (x == results.loc[x.index, "label"]).mean()),
    win_rate=("label", "mean"),
    count=("label", "count")
).reset_index()

bin_stats["num_won"] = (bin_stats["win_rate"] * bin_stats["count"]).astype(int)
bin_stats = bin_stats.rename(columns={"count": "num_total"})

bin_stats_sorted = bin_stats.sort_values("avg_proba", ascending=False).reset_index(drop=True)
bin_stats_sorted["bin_pctile"] = bin_stats_sorted.index.map(lambda i: f"{(20 - i) * 5}%")
final = bin_stats_sorted[["bin_pctile", "avg_proba", "accuracy", "num_total", "num_won"]]

print("\nTEST BINNING STATS (sorted by confidence):")
print(final)



TEST BINNING STATS (sorted by confidence):
   bin_pctile  avg_proba  accuracy  num_total  num_won
0        100%   0.707896  0.475410         61       29
1         95%   0.621833  0.573770         61       35
2         90%   0.596748  0.540984         61       33
3         85%   0.578408  0.655738         61       40
4         80%   0.562541  0.475410         61       29
5         75%   0.551194  0.540984         61       33
6         70%   0.541677  0.557377         61       34
7         65%   0.533391  0.459016         61       28
8         60%   0.523076  0.639344         61       39
9         55%   0.514067  0.557377         61       34
10        50%   0.506228  0.491803         61       30
11        45%   0.497628  0.409836         61       39
12        40%   0.487158  0.540984         61       28
13        35%   0.477051  0.459016         61       33
14        30%   0.465191  0.491803         61       31
15        25%   0.451894  0.459016         61       33
16        20%   0.436