<a href="https://colab.research.google.com/github/Maung-Thura/next-day-directional-signal/blob/main/next_day_directional_signal.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Next‑day directional signal across 8–12 assets (30 days of daily data)

### Imports

In [1]:
import os, warnings, numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, precision_recall_curve, auc
from matplotlib.backends.backend_pdf import PdfPages
# Silence library warnings so cell output stays readable. Note that this also
# hides deprecation notices, so comment it out when debugging.
warnings.filterwarnings('ignore')
# Plot defaults: seaborn white-grid style, and no warning when many figures are open.
sns.set_style("whitegrid")
plt.rcParams['figure.max_open_warning'] = 0

### Load Data

In [10]:
# Daily OHLCV bars per asset, read straight from the project repository.
CSV_PATH = "https://raw.githubusercontent.com/Maung-Thura/next-day-directional-signal/refs/heads/main/data/8_assets_30_days_data.csv"
df = pd.read_csv(CSV_PATH)

# Quick sanity check of what was loaded before any processing.
print("File loaded. Shape:", df.shape)
print("Columns:", list(df.columns))
df.head()

File loaded. Shape: (240, 7)
Columns: ['Asset', 'Date', 'Open', 'High', 'Low', 'Close', 'Volume']


Unnamed: 0,Asset,Date,Open,High,Low,Close,Volume
0,AAPL,9/29/25,254.56,255.0,253.01,254.43,40127687
1,AAPL,9/30/25,254.855,255.919,253.11,254.63,37704259
2,AAPL,10/1/25,255.04,258.79,254.93,255.45,48713940
3,AAPL,10/2/25,256.575,258.18,254.15,257.13,42630239
4,AAPL,10/3/25,254.665,259.24,253.95,258.02,49155614


### Preprocess and Feature Engineering

In [3]:
# Normalize column names (strip stray whitespace from the CSV header)
df.columns = [c.strip() for c in df.columns]
# Required columns: Asset and Date identify a row, Close drives the price
# features and labels, Volume drives the relative-volume feature.
required_cols = ['Asset', 'Date', 'Close', 'Volume']
missing_cols = [c for c in required_cols if c not in df.columns]
assert not missing_cols, f"CSV is missing required columns: {missing_cols}"

# Parse dates once and order rows per asset chronologically so that the
# rolling/shift-based features below see prices in time order.
df['date'] = pd.to_datetime(df['Date'])
df = df.sort_values(['Asset','date']).reset_index(drop=True)

def add_features(g):
    """Return a copy of one asset's rows with the model features appended.

    Expects ``g`` to hold a single asset's bars in chronological order with
    ``Close`` and ``Volume`` columns. The input frame is not modified.

    Added columns (in this order):
      ret_1      - 1-row percentage change of Close
      ret_3      - 3-row percentage change of Close
      ma_3, ma_5 - 3- and 5-row rolling means of Close
      ma_diff    - ma_3 minus ma_5
      momentum_3 - Close relative to ma_3, minus 1
      rv_3       - Volume relative to its 3-row rolling mean, minus 1
      vol_5      - 5-row rolling standard deviation of ret_1

    Leading rows without a full window are NaN.
    """
    out = g.copy()
    close = out['Close']
    volume = out['Volume']

    # Returns over one and three rows.
    out['ret_1'] = close.pct_change(1)
    out['ret_3'] = close.pct_change(3)

    # Short/long moving averages and their spread.
    short_ma = close.rolling(3).mean()
    long_ma = close.rolling(5).mean()
    out['ma_3'] = short_ma
    out['ma_5'] = long_ma
    out['ma_diff'] = short_ma - long_ma

    # Price and volume relative to their own short-term averages.
    out['momentum_3'] = close / short_ma - 1
    out['rv_3'] = volume / volume.rolling(3).mean() - 1

    # Realised volatility of daily returns.
    out['vol_5'] = out['ret_1'].rolling(5).std()
    return out

# Build per-asset features by iterating the groups explicitly. Iteration always
# yields the full group frame (including 'Asset'); groupby.apply on a frame that
# still contains the grouping column is deprecated since pandas 2.2 and newer
# releases exclude that column, which would break the groupby('Asset') below.
# Groups come out sorted by asset, matching the row order apply() produced.
asset_frames = [add_features(g) for _, g in df.groupby('Asset')]
if asset_frames:
    df = pd.concat(asset_frames, ignore_index=True)
else:
    # No rows at all: still add the (empty) feature columns.
    df = add_features(df).reset_index(drop=True)

# Label: did the asset close higher on its next available row?
df['next_close'] = df.groupby('Asset')['Close'].shift(-1)
df['next_return'] = (df['next_close'] - df['Close']) / df['Close']
# Rows with no next close get y_dir = 0 here but are removed by the dropna below.
df['y_dir'] = (df['next_return'] > 0).astype(int)

# Drop rows missing features/labels
df_clean = df.dropna(subset=['ret_1','ret_3','ma_diff','momentum_3','rv_3','vol_5','next_return']).copy().reset_index(drop=True)
print("Clean rows:", df_clean.shape)

Clean rows: (192, 19)


### Rolling (Walk-Forward) Splits

In [4]:
def rolling_splits(df, n_splits=6, min_train_days=12):
    """Build expanding-window (walk-forward) train/test splits by date.

    The distinct values of ``df['date']`` are sorted, the first
    ``min_train_days`` of them are reserved for training only, and the last
    ``n_splits`` of the remaining dates each become one test day. For a test
    day, the training set is every row dated strictly before it.

    Parameters
    ----------
    df : DataFrame with a ``date`` column.
    n_splits : maximum number of test dates; values <= 0 yield no splits.
    min_train_days : number of earliest distinct dates never used for testing.

    Returns
    -------
    list of ``(train_idx, test_idx)`` tuples holding index *labels* of ``df``
    (suitable for ``df.loc``). Splits with an empty side are omitted.
    """
    if n_splits <= 0:
        # Guard needed because candidate[-0:] would select every candidate date
        # rather than none, and a negative value would slice from the front.
        return []
    dates = sorted(df['date'].unique())
    candidate = dates[min_train_days:]
    test_dates = candidate[-n_splits:]
    splits = []
    for td in test_dates:
        train_idx = df[df['date'] < td].index.values
        test_idx = df[df['date'] == td].index.values
        if train_idx.size and test_idx.size:
            splits.append((train_idx, test_idx))
    return splits

# Model inputs: the engineered columns used by every classifier below.
feature_cols = [
    'ret_1', 'ret_3', 'ma_diff',
    'momentum_3', 'rv_3', 'vol_5',
]

# Walk-forward evaluation: up to 6 test dates, skipping the first 12 distinct dates.
splits = rolling_splits(df_clean, n_splits=6, min_train_days=12)
print("Generated", len(splits), "rolling splits.")

Generated 6 rolling splits.


### Evaluation Helper and Models

In [5]:
def evaluate(df, feature_cols, splits, model_pipeline, threshold=0.5):
    """Fit and score ``model_pipeline`` on each walk-forward split.

    Parameters
    ----------
    df : DataFrame holding ``feature_cols`` plus ``y_dir``, ``next_return``
        and ``date`` columns.
    feature_cols : list of column names used as model inputs.
    splits : iterable of ``(train_idx, test_idx)`` index-label pairs for ``df.loc``.
    model_pipeline : estimator exposing ``fit`` and ``predict_proba``; it is
        refit in place on every split.
    threshold : minimum class-1 probability for a row to count as a signal.

    Returns
    -------
    DataFrame with one row per split: test_date, precision, recall, f1,
    n_signals and mean_return_per_signal (0.0 when no signal fired).
    """
    records = []
    for fit_idx, eval_idx in splits:
        fit_part = df.loc[fit_idx]
        eval_part = df.loc[eval_idx]

        model_pipeline.fit(fit_part[feature_cols].to_numpy(), fit_part['y_dir'].to_numpy())

        y_true = eval_part['y_dir'].to_numpy()
        up_prob = model_pipeline.predict_proba(eval_part[feature_cols].to_numpy())[:, 1]
        signal = (up_prob >= threshold).astype(int)
        signal_count = int(signal.sum())

        # Average realised next-day return over the rows that were flagged.
        if signal_count > 0:
            signal_return = eval_part.loc[signal == 1, 'next_return'].mean()
        else:
            signal_return = 0.0

        records.append({
            'test_date': eval_part['date'].iloc[0],
            'precision': precision_score(y_true, signal, zero_division=0),
            'recall': recall_score(y_true, signal, zero_division=0),
            'f1': f1_score(y_true, signal, zero_division=0),
            'n_signals': signal_count,
            'mean_return_per_signal': signal_return,
        })
    return pd.DataFrame(records)

# Both models share the same scale-then-classify pipeline shape.
pipe_log = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=500)),
])
pipe_rf = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier(n_estimators=300, random_state=42)),
])

# Per-split metrics for each model over the same walk-forward splits.
metrics_log = evaluate(df_clean, feature_cols, splits, pipe_log)
metrics_rf = evaluate(df_clean, feature_cols, splits, pipe_rf)

display(metrics_log)
display(metrics_rf)

Unnamed: 0,test_date,precision,recall,f1,n_signals,mean_return_per_signal
0,2025-10-30,0.285714,0.666667,0.4,7,0.010594
1,2025-10-31,0.25,0.25,0.25,4,-0.003542
2,2025-11-03,0.142857,1.0,0.25,7,-0.019899
3,2025-11-04,0.75,0.5,0.6,4,0.003964
4,2025-11-05,0.2,1.0,0.333333,5,-0.011499
5,2025-11-06,0.4,0.5,0.444444,5,-0.003055


Unnamed: 0,test_date,precision,recall,f1,n_signals,mean_return_per_signal
0,2025-10-30,0.5,0.666667,0.571429,4,0.028582
1,2025-10-31,0.666667,0.5,0.571429,3,0.021469
2,2025-11-03,0.0,0.0,0.0,4,-0.029832
3,2025-11-04,0.75,0.5,0.6,4,0.011834
4,2025-11-05,0.333333,1.0,0.5,3,-0.000378
5,2025-11-06,0.2,0.25,0.222222,5,-0.011617


### Summary CSVs and PnL simulation

In [7]:
def summarize(metrics_df):
    """Collapse per-split metrics into a dict of column means.

    ``metrics_df`` must have the columns precision, recall, f1,
    mean_return_per_signal and n_signals. Each output key maps to the plain
    mean of one of those columns across all rows.
    """
    source_column = {
        'precision_mean': 'precision',
        'recall_mean': 'recall',
        'f1_mean': 'f1',
        'avg_return_per_signal': 'mean_return_per_signal',
        'avg_signals_per_testday': 'n_signals',
    }
    return {key: metrics_df[col].mean() for key, col in source_column.items()}

# Average each model's per-split metrics and store them side by side.
summary_log = summarize(metrics_log)
summary_rf = summarize(metrics_rf)
summary_df = pd.DataFrame([
    dict(model='LogisticRegression', **summary_log),
    dict(model='RandomForest', **summary_rf),
])
summary_df.to_csv('/content/model_summary.csv', index=False)
print("Saved /content/model_summary.csv")

# Simulate a daily equal-weight portfolio driven by model signals (train on all prior days)
def simulate_pnl(df, feature_cols, model_pipeline, threshold=0.5, min_train_rows=12):
    """Walk forward date by date, refit on all earlier rows, and build a long-only P&L.

    For each distinct date (ascending) the model is refit on every row dated
    strictly earlier and used to score that date's rows. Rows whose class-1
    probability is at least ``threshold`` are signals; the day's portfolio
    return is the mean ``next_return`` of its signals, or 0.0 with no signals.

    Parameters
    ----------
    df : DataFrame with ``feature_cols`` plus ``date``, ``y_dir`` and
        ``next_return`` columns. It is copied, not modified.
    feature_cols : list of column names used as model inputs.
    model_pipeline : estimator exposing ``fit`` and ``predict_proba``; refit
        in place for every date.
    threshold : minimum class-1 probability for a signal.
    min_train_rows : minimum number of training ROWS (not distinct dates)
        required before a date is scored. Dates with fewer prior rows are
        skipped and keep ``pred`` = 0 and ``pred_prob`` = NaN.

    Returns
    -------
    (predictions, daily) where ``predictions`` is the copy of ``df`` with
    ``pred_prob`` and ``pred`` columns, and ``daily`` has one row per date
    with ``date``, ``portfolio_return``, ``n`` (signal count) and the
    compounded ``cum_return``.
    """
    preds_df = df.copy()
    preds_df['pred_prob'] = np.nan
    preds_df['pred'] = 0
    for date in sorted(preds_df['date'].unique()):
        train_mask = preds_df['date'] < date
        test_mask = preds_df['date'] == date
        # Warm-up guard: not enough history yet, leave this date unscored.
        if train_mask.sum() < min_train_rows:
            continue
        X_train = preds_df.loc[train_mask, feature_cols].values
        y_train = preds_df.loc[train_mask, 'y_dir'].values
        model_pipeline.fit(X_train, y_train)
        probs = model_pipeline.predict_proba(preds_df.loc[test_mask, feature_cols].values)[:,1]
        preds_df.loc[test_mask, 'pred_prob'] = probs
        preds_df.loc[test_mask, 'pred'] = (probs >= threshold).astype(int)

    # Equal-weight the signalled assets each day; stay flat when nothing is signalled.
    daily = []
    for date, g in preds_df.groupby('date'):
        n_signals = int(g['pred'].sum())
        if n_signals == 0:
            day_return = 0.0
        else:
            day_return = g.loc[g['pred']==1,'next_return'].mean()
        daily.append({'date':date,'portfolio_return':day_return,'n':n_signals})
    daily_df = pd.DataFrame(daily).sort_values('date').reset_index(drop=True)
    # Compound the daily returns into a cumulative return series.
    daily_df['cum_return'] = (1 + daily_df['portfolio_return']).cumprod() - 1
    return preds_df, daily_df

# Walk-forward P&L for the random-forest pipeline, saved for later inspection.
df_preds_rf, daily_pnl_rf = simulate_pnl(df_clean, feature_cols, pipe_rf, threshold=0.5)
daily_pnl_rf.to_csv('/content/daily_pnl_rf.csv', index=False)
print("Saved /content/daily_pnl_rf.csv")

# Feature importance (train RF on all clean rows)
pipe_rf.fit(df_clean[feature_cols].to_numpy(), df_clean['y_dir'].to_numpy())
fi = pipe_rf.named_steps['clf'].feature_importances_
fi_df = pd.DataFrame({'feature': feature_cols, 'importance': fi})
fi_df = fi_df.sort_values(by='importance', ascending=False)
fi_df.to_csv('/content/feature_importance.csv', index=False)
print("Saved /content/feature_importance.csv")

Saved /content/model_summary.csv
Saved /content/daily_pnl_rf.csv
Saved /content/feature_importance.csv


### Plots and PDF summary

In [9]:
os.makedirs('/content/plots', exist_ok=True)

# Rolling metrics plots: one figure per metric, both models on the same axes.
# Each entry: (metric column, logistic label, RF label, title, output file).
rolling_plot_specs = [
    ('precision', 'Logistic Precision', 'RF Precision', 'Rolling Precision', '/content/plots/rolling_precision.png'),
    ('recall', 'Logistic Recall', 'RF Recall', 'Rolling Recall', '/content/plots/rolling_recall.png'),
    ('f1', 'Logistic F1', 'RF F1', 'Rolling F1', '/content/plots/rolling_f1.png'),
]
for metric_col, log_label, rf_label, plot_title, out_path in rolling_plot_specs:
    plt.figure(figsize=(10,4))
    plt.plot(metrics_log['test_date'], metrics_log[metric_col], marker='o', label=log_label)
    plt.plot(metrics_rf['test_date'], metrics_rf[metric_col], marker='o', label=rf_label)
    plt.legend()
    plt.title(plot_title)
    plt.xticks(rotation=30)
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()

# Cumulative P&L
plt.figure(figsize=(10,4))
plt.plot(daily_pnl_rf['date'], daily_pnl_rf['cum_return'], marker='o')
plt.title('Cumulative Return (RF signals)')
plt.xticks(rotation=30)
plt.tight_layout()
plt.savefig('/content/plots/cumulative_pnl_rf.png')
plt.close()

# Confusion + PR on last split if exists
if splits:
    train_idx, test_idx = splits[-1]
    pipe_rf.fit(df_clean.loc[train_idx, feature_cols].values, df_clean.loc[train_idx, 'y_dir'].values)
    y_test = df_clean.loc[test_idx, 'y_dir'].values
    preds = pipe_rf.predict(df_clean.loc[test_idx, feature_cols].values)
    # Pin both classes: a single test day can contain only one class among the
    # labels and predictions, which would otherwise shrink the matrix to 1x1.
    cm = confusion_matrix(y_test, preds, labels=[0, 1])
    plt.figure(figsize=(4,3))
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=[0, 1], yticklabels=[0, 1])
    plt.xlabel('Predicted'); plt.ylabel('Actual')
    plt.title('Confusion - last split'); plt.tight_layout()
    plt.savefig('/content/plots/confusion_last_split.png'); plt.close()

    # Precision-recall trade-off over all thresholds of the class-1 probability.
    probs = pipe_rf.predict_proba(df_clean.loc[test_idx, feature_cols].values)[:,1]
    precision, recall, _ = precision_recall_curve(y_test, probs)
    plt.figure(figsize=(5,4)); plt.plot(recall, precision)
    plt.xlabel('Recall'); plt.ylabel('Precision')
    plt.title('PR Curve - last split'); plt.tight_layout()
    plt.savefig('/content/plots/pr_curve_last_split.png'); plt.close()

# PDF summary: page 1 = text summary, then one page per saved plot
with PdfPages('/content/summary_report.pdf') as pdf:
    fig = plt.figure(figsize=(8.5,11)); plt.axis('off')
    # Describe the dataset from the loaded frame instead of hard-coding its
    # size, so the report stays correct if the input CSV changes.
    n_assets = df['Asset'].nunique()
    n_days = df['date'].nunique()
    n_rows = len(df)
    summary_text = f"""TokenMetrics - Take-Home Summary

Dataset: {n_assets} assets × {n_days} days ({n_rows} rows)
Models: Logistic Regression, Random Forest
Rolling splits: {len(splits)} test dates (expanding window)

Random Forest averages:
 - precision = {summary_rf['precision_mean']:.3f}
 - recall    = {summary_rf['recall_mean']:.3f}
 - f1        = {summary_rf['f1_mean']:.3f}
 - avg return per signal = {summary_rf['avg_return_per_signal']:.4f}
 - avg signals per test day = {summary_rf['avg_signals_per_testday']:.2f}

Caveats:
 - 30 days is a small sample; results are noisy.
 - No transaction costs or slippage modeled.
 - This is a long-only signal proof-of-concept.

Next steps: extend history, add cost model & position sizing, calibrate thresholds, more features/ensembles.

Files saved: model_summary.csv, daily_pnl_rf.csv, feature_importance.csv, plots/
"""
    plt.text(0.02,0.98, summary_text, va='top', wrap=True, fontsize=10)
    pdf.savefig(fig); plt.close(fig)
    # Append the saved plot images, one per page; missing files are skipped.
    for p in ['rolling_precision.png','rolling_recall.png','rolling_f1.png','cumulative_pnl_rf.png']:
        ppath = '/content/plots/' + p
        if os.path.exists(ppath):
            fig = plt.figure(figsize=(8.5,11)); img = plt.imread(ppath); plt.imshow(img); plt.axis('off')
            pdf.savefig(fig); plt.close(fig)
print("Saved /content/summary_report.pdf and plots/")

print("Files in /content/:")
print(" - /content/model_summary.csv")
print(" - /content/daily_pnl_rf.csv")
print(" - /content/feature_importance.csv")
print(" - /content/summary_report.pdf")
print(" - /content/plots/*")

Saved /content/summary_report.pdf and plots/
Files in /content/:
 - /content/model_summary.csv
 - /content/daily_pnl_rf.csv
 - /content/feature_importance.csv
 - /content/summary_report.pdf
 - /content/plots/*
