# 🧠 Backend Pipeline for Options Prediction

This notebook performs the complete backend logic for building an AI-based directional prediction system for NIFTY and BANKNIFTY options trading. It includes:

- Data preprocessing and labeling
- Feature engineering (technical indicators)
- Model training for multiple time intervals
- Model saving for real-time prediction use in `app.py`

This pipeline automates the creation of frame-wise signals used in the Streamlit UI.


In [1]:
import yfinance as yf
import pandas as pd
import ta
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
import joblib


In [2]:
tickers = {
    "NIFTY": "^NSEI",
    "BANKNIFTY": "^NSEBANK"
}

intervals = {
    "1m": "7d",
    "5m": "60d",
    "15m": "60d",
    "30m": "60d",
    "60m": "60d"
}


In [3]:
os.makedirs("nifty_banknifty_raw", exist_ok=True)

def fetch_raw_data(ticker_symbol, interval, period):
    df = yf.download(ticker_symbol, interval=interval, period=period)
    df.dropna(inplace=True)
    return df

for name, ticker in tickers.items():
    for interval, period in intervals.items():
        df = fetch_raw_data(ticker, interval, period)
        path = f"nifty_banknifty_raw/{name}_{interval}_raw.csv"
        df.to_csv(path)
        print(f"✅ Saved: {path}")


  df = yf.download(ticker_symbol, interval=interval, period=period)


[*********************100%***********************]  1 of 1 completed


  df = yf.download(ticker_symbol, interval=interval, period=period)


✅ Saved: nifty_banknifty_raw/NIFTY_1m_raw.csv


[*********************100%***********************]  1 of 1 completed


  df = yf.download(ticker_symbol, interval=interval, period=period)


✅ Saved: nifty_banknifty_raw/NIFTY_5m_raw.csv


[*********************100%***********************]  1 of 1 completed


  df = yf.download(ticker_symbol, interval=interval, period=period)


✅ Saved: nifty_banknifty_raw/NIFTY_15m_raw.csv


[*********************100%***********************]  1 of 1 completed


  df = yf.download(ticker_symbol, interval=interval, period=period)


✅ Saved: nifty_banknifty_raw/NIFTY_30m_raw.csv


[*********************100%***********************]  1 of 1 completed


  df = yf.download(ticker_symbol, interval=interval, period=period)


✅ Saved: nifty_banknifty_raw/NIFTY_60m_raw.csv


[*********************100%***********************]  1 of 1 completed


  df = yf.download(ticker_symbol, interval=interval, period=period)


✅ Saved: nifty_banknifty_raw/BANKNIFTY_1m_raw.csv


[*********************100%***********************]  1 of 1 completed


  df = yf.download(ticker_symbol, interval=interval, period=period)


✅ Saved: nifty_banknifty_raw/BANKNIFTY_5m_raw.csv


[*********************100%***********************]  1 of 1 completed


  df = yf.download(ticker_symbol, interval=interval, period=period)


✅ Saved: nifty_banknifty_raw/BANKNIFTY_15m_raw.csv


[*********************100%***********************]  1 of 1 completed


  df = yf.download(ticker_symbol, interval=interval, period=period)


✅ Saved: nifty_banknifty_raw/BANKNIFTY_30m_raw.csv


[*********************100%***********************]  1 of 1 completed

✅ Saved: nifty_banknifty_raw/BANKNIFTY_60m_raw.csv





## 🔧 Feature Engineering: Turning Raw Data into Predictive Signals

In this section, we generate technical features that can help predict the direction of the market. These include:

- Moving averages (SMA, EMA)
- Volatility indicators (ATR)
- Momentum indicators (RSI, MACD)
- Rate of return and lag-based shifts

These features help capture market patterns useful for model learning.


In [4]:
def apply_feature_engineering(df):
    df = df.copy()

    # Ensure numeric format
    for col in ['Open', 'High', 'Low', 'Close', 'Volume']:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    df.dropna(subset=['Open', 'High', 'Low', 'Close', 'Volume'], inplace=True)

    # Technical Indicators
    df['rsi'] = ta.momentum.RSIIndicator(close=df['Close']).rsi()
    macd = ta.trend.MACD(close=df['Close'])
    df['macd'] = macd.macd()
    df['macd_signal'] = macd.macd_signal()
    df['ema_20'] = ta.trend.EMAIndicator(close=df['Close'], window=20).ema_indicator()
    df['ema_50'] = ta.trend.EMAIndicator(close=df['Close'], window=50).ema_indicator()
    df['atr'] = ta.volatility.AverageTrueRange(high=df['High'], low=df['Low'], close=df['Close']).average_true_range()

    df.dropna(inplace=True)
    return df


In [5]:
os.makedirs("nifty_banknifty_processed", exist_ok=True)

for name in tickers:
    for interval in intervals:
        path = f"nifty_banknifty_raw/{name}_{interval}_raw.csv"
        df = pd.read_csv(path, index_col=0, parse_dates=True)
        df = apply_feature_engineering(df)
        df.to_csv(f"nifty_banknifty_processed/{name}_{interval}_processed.csv")
        print(f"✅ Processed and saved: {name}_{interval}_processed.csv")


  df = pd.read_csv(path, index_col=0, parse_dates=True)


  df = pd.read_csv(path, index_col=0, parse_dates=True)


✅ Processed and saved: NIFTY_1m_processed.csv
✅ Processed and saved: NIFTY_5m_processed.csv


  df = pd.read_csv(path, index_col=0, parse_dates=True)
  df = pd.read_csv(path, index_col=0, parse_dates=True)
  df = pd.read_csv(path, index_col=0, parse_dates=True)


✅ Processed and saved: NIFTY_15m_processed.csv
✅ Processed and saved: NIFTY_30m_processed.csv
✅ Processed and saved: NIFTY_60m_processed.csv


  df = pd.read_csv(path, index_col=0, parse_dates=True)
  df = pd.read_csv(path, index_col=0, parse_dates=True)


✅ Processed and saved: BANKNIFTY_1m_processed.csv


✅ Processed and saved: BANKNIFTY_5m_processed.csv
✅ Processed and saved: BANKNIFTY_15m_processed.csv
✅ Processed and saved: BANKNIFTY_30m_processed.csv


  df = pd.read_csv(path, index_col=0, parse_dates=True)
  df = pd.read_csv(path, index_col=0, parse_dates=True)
  df = pd.read_csv(path, index_col=0, parse_dates=True)


✅ Processed and saved: BANKNIFTY_60m_processed.csv


In [6]:
def add_target_labels(df, close_col, shift_period=1, threshold_pct=0.00085):
    df = df.copy()
    df['future_close'] = df[close_col].shift(-shift_period)
    df['return_pct'] = (df['future_close'] - df[close_col]) / df[close_col]
    df['target'] = 1
    df.loc[df['return_pct'] > threshold_pct, 'target'] = 2
    df.loc[df['return_pct'] < -threshold_pct, 'target'] = 0
    df.dropna(inplace=True)
    return df


In [7]:
os.makedirs("nifty_banknifty_labeled", exist_ok=True)
for name in tickers:
    for interval in intervals:
        file_path = f"nifty_banknifty_processed/{name}_{interval}_processed.csv"
        df = pd.read_csv(file_path, index_col=0, parse_dates=True)
        df = add_target_labels(df, close_col='Close')
        df.to_csv(f"nifty_banknifty_labeled/{name}_{interval}_labeled.csv")
        print(f"✅ Labeled: {name}_{interval}_labeled.csv")
        print(df['target'].value_counts(normalize=True))



✅ Labeled: NIFTY_1m_labeled.csv
target
1    0.989891
2    0.005443
0    0.004666
Name: proportion, dtype: float64


✅ Labeled: NIFTY_5m_labeled.csv
target
1    0.864944
0    0.067640
2    0.067416
Name: proportion, dtype: float64
✅ Labeled: NIFTY_15m_labeled.csv
target
1    0.686207
0    0.157931
2    0.155862
Name: proportion, dtype: float64


✅ Labeled: NIFTY_30m_labeled.csv
target
1    0.573134
0    0.231343
2    0.195522
Name: proportion, dtype: float64
✅ Labeled: NIFTY_60m_labeled.csv
target
1    0.467568
2    0.267568
0    0.264865
Name: proportion, dtype: float64
✅ Labeled: BANKNIFTY_1m_labeled.csv
target
1    0.992224
0    0.004277
2    0.003499
Name: proportion, dtype: float64


✅ Labeled: BANKNIFTY_5m_labeled.csv
target
1    0.83573
2    0.08427
0    0.08000
Name: proportion, dtype: float64
✅ Labeled: BANKNIFTY_15m_labeled.csv
target
1    0.643448
0    0.181379
2    0.175172
Name: proportion, dtype: float64
✅ Labeled: BANKNIFTY_30m_labeled.csv
target
1    0.501493
2    0.250746
0    0.247761
Name: proportion, dtype: float64


✅ Labeled: BANKNIFTY_60m_labeled.csv
target
1    0.435135
0    0.289189
2    0.275676
Name: proportion, dtype: float64


## 🤖 Model Training for Direction Prediction

We train a classification model for each symbol (NIFTY/BANKNIFTY) across four timeframes: 5m, 15m, 30m, and 60m.

Each model predicts one of the following signals:
- 0 = 📉 PUT (Bearish signal)
- 1 = ⏸️ NO ACTION
- 2 = 📈 CALL (Bullish signal)

The training set is built from labeled data based on future return thresholds. We use the RandomForestClassifier for its performance on tabular data and robustness to feature scaling.


In [8]:
import os
import pandas as pd
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
import joblib

# Target intervals and symbols
intervals = ["5m", "15m", "30m", "60m"]
symbols = ["NIFTY", "BANKNIFTY"]

# Ensure model output directory exists
os.makedirs("final_models", exist_ok=True)

for symbol in symbols:
    for interval in intervals:
        print(f"\n📘 Training {symbol} - {interval} model...")

        # Load labeled dataset
        path = f"nifty_banknifty_labeled/{symbol}_{interval}_labeled.csv"
        if not os.path.exists(path):
            print(f"❌ File not found: {path}")
            continue
        df = pd.read_csv(path, index_col=0, parse_dates=True)

        # Drop helper columns if present
        df.drop(columns=[col for col in ['future_close', 'return_pct'] if col in df.columns], inplace=True)

        # Split features and target
        X = df.drop(columns=['target'])
        y = df['target']

        # Sanitize data
        for col in X.columns:
            if X[col].dtype == 'object':
                X[col] = pd.to_numeric(X[col], errors='coerce')
        X = X.select_dtypes(include=['number']).fillna(0)

        # Check class count
        if y.nunique() < 2:
            print(f"❌ Skipping {symbol}-{interval}: Only one class in data.")
            continue
        elif y.nunique() < 3:
            print(f"⚠️ Warning: Only classes {y.unique().tolist()} found in {symbol}-{interval}")

        # Stratified train-test split
        try:
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, stratify=y, random_state=42
            )
        except ValueError as e:
            print(f"⚠️ Skipping due to stratify error: {e}")
            continue

        # Train model
        model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
        model.fit(X_train, y_train)

        # Evaluate
        y_pred = model.predict(X_test)
        print("📊 Classification Report:")
        print(classification_report(y_test, y_pred))
        print("🔍 Confusion Matrix:")
        print(confusion_matrix(y_test, y_pred))

        # Save model
        model_path = f"final_models/{symbol}_model_{interval}.pkl"
        joblib.dump(model, model_path)
        print(f"✅ Model saved to {model_path}")



📘 Training NIFTY - 5m model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


📊 Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        60
           1       0.87      0.98      0.92       770
           2       0.31      0.07      0.11        60

    accuracy                           0.86       890
   macro avg       0.39      0.35      0.34       890
weighted avg       0.78      0.86      0.81       890

🔍 Confusion Matrix:
[[  0  57   3]
 [  7 757   6]
 [  2  54   4]]
✅ Model saved to final_models/NIFTY_model_5m.pkl

📘 Training NIFTY - 15m model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


📊 Classification Report:
              precision    recall  f1-score   support

           0       0.14      0.07      0.09        46
           1       0.68      0.85      0.76       199
           2       0.21      0.09      0.12        45

    accuracy                           0.61       290
   macro avg       0.34      0.34      0.32       290
weighted avg       0.52      0.61      0.55       290

🔍 Confusion Matrix:
[[  3  41   2]
 [ 16 170  13]
 [  3  38   4]]
✅ Model saved to final_models/NIFTY_model_15m.pkl

📘 Training NIFTY - 30m model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


📊 Classification Report:
              precision    recall  f1-score   support

           0       0.25      0.16      0.20        31
           1       0.60      0.74      0.66        77
           2       0.16      0.12      0.13        26

    accuracy                           0.49       134
   macro avg       0.34      0.34      0.33       134
weighted avg       0.43      0.49      0.45       134

🔍 Confusion Matrix:
[[ 5 21  5]
 [ 9 57 11]
 [ 6 17  3]]
✅ Model saved to final_models/NIFTY_model_30m.pkl

📘 Training NIFTY - 60m model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


📊 Classification Report:
              precision    recall  f1-score   support

           0       0.25      0.10      0.14        20
           1       0.46      0.76      0.57        34
           2       0.44      0.20      0.28        20

    accuracy                           0.43        74
   macro avg       0.38      0.35      0.33        74
weighted avg       0.40      0.43      0.38        74

🔍 Confusion Matrix:
[[ 2 16  2]
 [ 5 26  3]
 [ 1 15  4]]
✅ Model saved to final_models/NIFTY_model_60m.pkl

📘 Training BANKNIFTY - 5m model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


📊 Classification Report:
              precision    recall  f1-score   support

           0       0.07      0.01      0.02        71
           1       0.84      0.98      0.91       744
           2       0.25      0.03      0.05        75

    accuracy                           0.83       890
   macro avg       0.39      0.34      0.33       890
weighted avg       0.73      0.83      0.77       890

🔍 Confusion Matrix:
[[  1  68   2]
 [  8 732   4]
 [  5  68   2]]
✅ Model saved to final_models/BANKNIFTY_model_5m.pkl

📘 Training BANKNIFTY - 15m model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


📊 Classification Report:
              precision    recall  f1-score   support

           0       0.16      0.08      0.10        52
           1       0.65      0.81      0.72       187
           2       0.31      0.20      0.24        51

    accuracy                           0.57       290
   macro avg       0.37      0.36      0.36       290
weighted avg       0.50      0.57      0.53       290

🔍 Confusion Matrix:
[[  4  45   3]
 [ 16 152  19]
 [  5  36  10]]
✅ Model saved to final_models/BANKNIFTY_model_15m.pkl

📘 Training BANKNIFTY - 30m model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


📊 Classification Report:
              precision    recall  f1-score   support

           0       0.31      0.15      0.20        33
           1       0.60      0.79      0.68        67
           2       0.31      0.26      0.29        34

    accuracy                           0.50       134
   macro avg       0.41      0.40      0.39       134
weighted avg       0.45      0.50      0.46       134

🔍 Confusion Matrix:
[[ 5 16 12]
 [ 6 53  8]
 [ 5 20  9]]
✅ Model saved to final_models/BANKNIFTY_model_30m.pkl

📘 Training BANKNIFTY - 60m model...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


📊 Classification Report:
              precision    recall  f1-score   support

           0       0.26      0.23      0.24        22
           1       0.44      0.66      0.53        32
           2       0.43      0.15      0.22        20

    accuracy                           0.39        74
   macro avg       0.38      0.34      0.33        74
weighted avg       0.38      0.39      0.36        74

🔍 Confusion Matrix:
[[ 5 15  2]
 [ 9 21  2]
 [ 5 12  3]]
✅ Model saved to final_models/BANKNIFTY_model_60m.pkl


In [9]:
import pandas as pd
import joblib
from collections import Counter

# Configuration
intervals = ["5m", "15m", "30m", "60m"]
symbols = ["NIFTY", "BANKNIFTY"]
model_dir = "final_models"

# Load the latest feature set (replace this with your actual input features)
# For testing, load last row from any labeled dataset
def load_latest_features(symbol, interval):
    path = f"nifty_banknifty_labeled/{symbol}_{interval}_labeled.csv"
    df = pd.read_csv(path, index_col=0, parse_dates=True)
    df.drop(columns=[col for col in ['target', 'future_close', 'return_pct'] if col in df.columns], inplace=True)
    df = df.select_dtypes(include='number').fillna(0)
    return df.iloc[[-1]]  # last row as latest input

# Voting prediction function
def get_final_signal(symbol):
    votes = []
    for interval in intervals:
        model_path = f"{model_dir}/{symbol}_model_{interval}.pkl"
        if not os.path.exists(model_path):
            print(f"❌ Missing model: {model_path}")
            continue
        
        model = joblib.load(model_path)
        latest_features = load_latest_features(symbol, interval)
        pred = model.predict(latest_features)[0]
        votes.append(pred)

    # Majority vote
    if not votes:
        return "❌ No predictions available"
    
    vote_count = Counter(votes)
    final_decision = vote_count.most_common(1)[0][0]
    
    signal_map = {0: "📉 Suggest PUT (PE)", 1: "⏸️ Suggest NO ACTION", 2: "📈 Suggest CALL (CE)"}
    return signal_map[final_decision]

# Example usage
print("📊 Final Decision for NIFTY:", get_final_signal("NIFTY"))
print("📊 Final Decision for BANKNIFTY:", get_final_signal("BANKNIFTY"))


📊 Final Decision for NIFTY: ⏸️ Suggest NO ACTION


📊 Final Decision for BANKNIFTY: ⏸️ Suggest NO ACTION


In [10]:
# Frame-wise prediction logic
def get_framewise_signal(symbol):
    decisions = {}
    for interval in intervals:
        model_path = f"{model_dir}/{symbol}_model_{interval}.pkl"
        if not os.path.exists(model_path):
            decisions[interval] = "❌ No model"
            continue

        model = joblib.load(model_path)
        latest_features = load_latest_features(symbol, interval)
        pred = model.predict(latest_features)[0]

        signal_map = {0: "📉 PUT", 1: "⏸️ NO ACTION", 2: "📈 CALL"}
        decisions[interval] = signal_map.get(pred, "❓ Unknown")
    
    return decisions

# Usage
nifty_signals = get_framewise_signal("NIFTY")
banknifty_signals = get_framewise_signal("BANKNIFTY")

print("\n🧭 Frame-wise Decision for NIFTY:")
for interval, signal in nifty_signals.items():
    print(f"  ⏱️ {interval}: {signal}")

print("\n🧭 Frame-wise Decision for BANKNIFTY:")
for interval, signal in banknifty_signals.items():
    print(f"  ⏱️ {interval}: {signal}")
# Save final decisions to CSV for UI to read
import os
os.makedirs("final_predictions", exist_ok=True)

for symbol in ["NIFTY", "BANKNIFTY"]:
    frame_signals = get_framewise_signal(symbol)
    df = pd.DataFrame.from_dict(frame_signals, orient="index", columns=["signal"])
    df.to_csv(f"final_predictions/{symbol}_framewise_predictions.csv")



🧭 Frame-wise Decision for NIFTY:
  ⏱️ 5m: ⏸️ NO ACTION
  ⏱️ 15m: ⏸️ NO ACTION
  ⏱️ 30m: ⏸️ NO ACTION
  ⏱️ 60m: ⏸️ NO ACTION

🧭 Frame-wise Decision for BANKNIFTY:
  ⏱️ 5m: ⏸️ NO ACTION
  ⏱️ 15m: ⏸️ NO ACTION
  ⏱️ 30m: ⏸️ NO ACTION
  ⏱️ 60m: ⏸️ NO ACTION


# Conclusion: 
The prediction suggest trader which trades will be non-profitable or may end in losing money, it also help the trader for the hold time for what time the trade can be executed.