# In [1]:
import pandas as pd
import numpy as np
import joblib

# Load saved model and preprocessing artifacts once at module start
# NOTE(review): these are absolute, machine-specific Windows paths; importing
# this module anywhere else fails at load time — consider making them
# configurable.
# NOTE(review): joblib.load unpickles arbitrary objects, so these files must
# come from a trusted source.
# `model` is used below via .predict() / .predict_proba().
model = joblib.load(r"C:\My stuff\Coding\ML project\KiranveerSingh_projectfinal\Models\XGBoost.pkl")
# `scaler` is used below via .transform() and, when present, .feature_names_in_.
scaler = joblib.load(r"C:\My stuff\Coding\ML project\KiranveerSingh_projectfinal\Models\scaler.pkl")
# Column labels used to align the one-hot encoded categorical columns at
# prediction time (presumably the dummy columns seen in training — confirm).
categorical_columns = joblib.load(r"C:\My stuff\Coding\ML project\KiranveerSingh_projectfinal\Models\X_train_cat_enc_columns.pkl")
# Feature engineering helper
def add_features(df):
    """Add per-symbol technical-indicator columns and return the cleaned frame.

    The input must contain the columns ``Symbol``, ``Open``, ``High``,
    ``Low``, ``Close`` and ``Volume``.  Rolling windows, lags and returns are
    computed per ``Symbol`` in the order the rows appear; the function does
    not sort, so rows are assumed to be in chronological order within each
    symbol (TODO confirm against the training code).

    Columns added: ``SMA_5/10/20``, ``Return_1D``, ``Volatility_10``,
    ``High_Low_Ratio``, ``Open_Close_Ratio``, ``Volume_SMA_10``,
    ``Volume_Ratio``, ``Close_Lag_1..3``, ``Volume_Lag_1..3`` and ``RSI_14``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows.  The new columns are added to this frame in place.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the rows without any missing value (in any
        column, not just the engineered ones), with a fresh 0..n-1 index.
        The longest window is 20 rows, so each symbol loses at least its
        first 19 rows.
    """
    # --- Moving Averages ---
    for window in [5, 10, 20]:
        # Bind the window as a default so the lambda does not depend on the
        # loop variable's later value.
        df[f"SMA_{window}"] = df.groupby("Symbol")["Close"].transform(
            lambda x, w=window: x.rolling(w).mean()
        )

    # --- Daily Returns ---
    df['Return_1D'] = df.groupby('Symbol')['Close'].pct_change()

    # --- Rolling Volatility ---
    df['Volatility_10'] = df.groupby('Symbol')['Return_1D'].transform(lambda x: x.rolling(10).std())

    # --- Price Ratios ---
    df['High_Low_Ratio'] = df['High'] / df['Low']
    df['Open_Close_Ratio'] = df['Open'] / df['Close']

    # --- Volume Features ---
    df['Volume_SMA_10'] = df.groupby('Symbol')['Volume'].transform(lambda x: x.rolling(10).mean())
    df['Volume_Ratio'] = df['Volume'] / df['Volume_SMA_10']

    # --- Lagged Features (prices and volume) ---
    for lag in [1, 2, 3]:
        df[f'Close_Lag_{lag}'] = df.groupby('Symbol')['Close'].shift(lag)
        df[f'Volume_Lag_{lag}'] = df.groupby('Symbol')['Volume'].shift(lag)

    # --- RSI Calculation (14-day window) ---
    def calc_rsi(series, period=14):
        """Return the RSI of *series* from exponentially smoothed gains and losses."""
        delta = series.diff()
        up = delta.clip(lower=0)
        down = -1 * delta.clip(upper=0)
        ema_up = up.ewm(com=period-1, adjust=False).mean()
        ema_down = down.ewm(com=period-1, adjust=False).mean()
        rs = ema_up / ema_down
        rsi = 100 - (100 / (1 + rs))
        return rsi

    df['RSI_14'] = df.groupby("Symbol")["Close"].transform(calc_rsi)

    # --- Remove rows with any missing values (due to rolling/lags) ---
    # dropna() builds a new frame, so it must be returned: without the return
    # the caller receives None and the cleaned rows are lost.
    return df.dropna().reset_index(drop=True)

# The main pipeline function
def predict_pipeline(input_df):
    """Engineer features, preprocess them and predict with the saved model.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Raw rows carrying the columns ``add_features`` reads.  The frame is
        copied, so the caller's data is not modified.  ``Date``, ``Symbol``
        and ``Will_Grow`` are never used as model inputs.

    Returns
    -------
    pandas.DataFrame
        The engineered rows plus a ``Prediction`` column (``"Fall"`` for
        class 0, ``"Grow"`` otherwise) and, when the model exposes
        ``predict_proba``, a ``Probability`` column taken from the second
        column of its output.

    Raises
    ------
    ValueError
        If no rows survive feature engineering, or if a column the scaler
        expects is absent from the prepared features.
    """
    df = input_df.copy()
    engineered = add_features(df)
    # add_features adds its columns to `df` in place.  If it hands nothing
    # back, continue from the mutated copy and drop the incomplete rows here
    # instead of crashing on a None frame.
    if engineered is None:
        engineered = df.dropna().reset_index(drop=True)
    df = engineered

    if df.empty:
        raise ValueError(
            "No rows left after feature engineering; each symbol needs enough "
            "history to fill the rolling windows and lags."
        )

    # Select features as during training
    exclude_cols = {'Date', 'Symbol', 'Will_Grow'}
    all_features = [col for col in df.columns if col not in exclude_cols]
    numeric_features = df[all_features].select_dtypes(include=[np.number]).columns.tolist()
    # Keep the frame's column order rather than relying on set ordering.
    numeric_lookup = set(numeric_features)
    categorical_features = [col for col in all_features if col not in numeric_lookup]

    # Process categories as in training
    X_cat_enc = pd.get_dummies(df[categorical_features], drop_first=True)
    # Align dummies
    X_cat_enc = X_cat_enc.reindex(columns=categorical_columns, fill_value=0)

    # Combine
    features = pd.concat(
        [df[numeric_features].reset_index(drop=True), X_cat_enc.reset_index(drop=True)],
        axis=1,
    )

    # The scaler is applied to a subset of the columns, so only that subset is
    # aligned to the scaler's feature names (when it recorded any).
    # Reindexing the whole matrix to those names would silently discard the
    # one-hot columns.
    scaler_cols = getattr(scaler, 'feature_names_in_', None)
    scaled_cols = numeric_features if scaler_cols is None else list(scaler_cols)
    missing = [col for col in scaled_cols if col not in features.columns]
    if missing:
        raise ValueError(f"Input is missing columns expected by the scaler: {missing}")

    # Scale numerics
    scaled = pd.DataFrame(
        np.asarray(scaler.transform(features[scaled_cols])),
        columns=scaled_cols,
    )
    scaled_lookup = set(scaled_cols)
    passthrough = [col for col in X_cat_enc.columns if col not in scaled_lookup]
    # Scaled block first, then the untouched one-hot columns.
    # NOTE(review): presumably the layout the model was trained on — confirm.
    X = pd.concat([scaled, features[passthrough]], axis=1)

    # Predict
    pred = model.predict(X)
    prob = model.predict_proba(X)[:, 1] if hasattr(model, "predict_proba") else None

    # Map prediction to class labels
    direction = ["Fall" if p == 0 else "Grow" for p in pred]

    # Build result DataFrame
    res_df = df.copy()
    res_df["Prediction"] = direction
    if prob is not None:
        res_df["Probability"] = prob
    return res_df

# Example usage:
# new_raw_input = pd.read_csv("your_new_data.csv")
# prediction_df = predict_pipeline(new_raw_input)
# print(prediction_df[["Date", "Symbol", "Prediction", "Probability"]])
