In [42]:
import pandas as pd
import numpy as np

# For modeling
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)

# Example: Load your CSV (update file name as needed)
df = pd.read_csv(r"C:\Users\Hrulekh Nandan\Documents\institutional_buy_sell_prediction\notebooks\bse_clean_data.csv", parse_dates=["Date"])

# Sort by date if not sorted
df = df.sort_values("Date").reset_index(drop=True)

# Feature Engineering
df["Return_1"] = df["close"].pct_change().shift(1)
df["Return_3"] = df["close"].pct_change(3).shift(1)
df["MA_5"] = df["close"].rolling(5).mean().shift(1)
df["MA_10"] = df["close"].rolling(10).mean().shift(1)
df["Price_MA30"] = df["close"] - df["close"].rolling(30).mean().shift(1)
df["Volatility_10"] = df["close"].rolling(10).std().shift(1)
df["Volume_Change"] = df["VOLUME"].pct_change().shift(1)

# RSI Example (14-period)
def compute_rsi(series, period=14):
    delta = series.diff()
    gain = (delta.where(delta > 0, 0)).rolling(period).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(period).mean()
    rs = gain / loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

df["RSI_14"] = compute_rsi(df["close"]).shift(1)

# Time Features
df["DayOfWeek"] = df["Date"].dt.dayofweek
df["Month"] = df["Date"].dt.month

# Target variable: next day movement
df["Target"] = (df["close"].shift(-1) > df["close"]).astype(int)

# Drop rows with NaNs (due to rolling calculations)
df = df.dropna().reset_index(drop=True)

# Select Features and Target
feature_cols = [
    "Return_1",
    "Return_3",
    "Price_MA30",
    "Volatility_10",
    "Volume_Change",
    "RSI_14",
    "DayOfWeek",
    "Month",
]
X = df[feature_cols]
y = df["Target"]

# Standardize numeric features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train/Test Split for hold-out evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, stratify=y, random_state=42
)

# 1️⃣ Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("=== Random Forest Results ===")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("ROC AUC:", roc_auc_score(y_test, rf.predict_proba(X_test)[:, 1]))

# 2️⃣ Logistic Regression
lr = LogisticRegression(max_iter=500, random_state=42)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

print("\n=== Logistic Regression Results ===")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Classification Report:\n", classification_report(y_test, y_pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("ROC AUC:", roc_auc_score(y_test, lr.predict_proba(X_test)[:, 1]))

# 3️⃣ Cross-Validation Scores
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rf_cv_scores = cross_val_score(rf, X_scaled, y, cv=cv, scoring="accuracy")
lr_cv_scores = cross_val_score(lr, X_scaled, y, cv=cv, scoring="accuracy")

print("\n=== Cross-Validation Accuracy ===")
print("Random Forest Mean CV Accuracy: {:.3f}".format(rf_cv_scores.mean()))
print("Logistic Regression Mean CV Accuracy: {:.3f}".format(lr_cv_scores.mean()))


ValueError: Found array with 0 sample(s) (shape=(0, 8)) while a minimum of 1 is required by StandardScaler.

In [50]:
print("X shape:", X.shape)
print("X.head():", X.head())
print("Any NaNs in X?", X.isna().sum())
print("y shape:", y.shape)
print("y value counts:", y.value_counts())


X shape: (3, 4)
X.head():    OPEN  HIGH  LOW  VOLUME
0     1     2    0     100
1     2     3    1     200
2     3     4    2     300
Any NaNs in X? OPEN      0
HIGH      0
LOW       0
VOLUME    0
dtype: int64
y shape: (3,)
y value counts: Tomorrow_Return
1    2
0    1
Name: count, dtype: int64


In [49]:
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Example DataFrame
df = pd.DataFrame({
    "OPEN": [1,2,3],
    "HIGH": [2,3,4],
    "LOW": [0,1,2],
    "VOLUME": [100,200,300],
    "Tomorrow_Return": [1,0,1]
})

feature_cols = ["OPEN", "HIGH", "LOW", "VOLUME"]
X = df[feature_cols]
y = df["Tomorrow_Return"]

print("X shape:", X.shape)
print("y shape:", y.shape)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


X shape: (3, 4)
y shape: (3,)
