In [1]:
import pandas as pd
import numpy as np

# Load the cleaned data set and parse the Date column into datetimes.
df = pd.read_csv(r"C:\My stuff\Coding\ML project\KiranveerSingh_projectfinal\Dataset\cleaned_nifty5_labeled.csv")
df['Date'] = pd.to_datetime(df['Date'])

# Every rolling / shift / pct_change feature below is evaluated in row order
# within each Symbol, so the rows of each symbol must be chronological before
# anything is computed. A stable sort on Date guarantees that (ties keep their
# file order) and keeps the original index labels, which lets the original row
# order be restored once the features are built.
df = df.sort_values('Date', kind='stable')

# --- Moving Averages ---
# Simple moving average of Close per symbol; the first (window - 1) rows of
# each symbol are NaN because the window is not yet full.
for window in [5, 10, 20]:
    df[f"SMA_{window}"] = df.groupby("Symbol")["Close"].transform(lambda x: x.rolling(window).mean())

# --- Daily Returns ---
# Fractional change of Close versus the previous row of the same symbol
# (a "daily" return assuming one row per symbol per trading day).
df['Return_1D'] = df.groupby('Symbol')['Close'].pct_change()

# --- Rolling Volatility ---
# Standard deviation of the last 10 returns of each symbol.
df['Volatility_10'] = df.groupby('Symbol')['Return_1D'].transform(lambda x: x.rolling(10).std())

# --- Price Ratios ---
# Row-wise ratios; these do not depend on row order.
df['High_Low_Ratio'] = df['High'] / df['Low']
df['Open_Close_Ratio'] = df['Open'] / df['Close']

# --- Volume Features ---
# Volume relative to its own 10-row moving average per symbol.
df['Volume_SMA_10'] = df.groupby('Symbol')['Volume'].transform(lambda x: x.rolling(10).mean())
df['Volume_Ratio'] = df['Volume'] / df['Volume_SMA_10']

# --- Lagged Features (prices and volume) ---
# Close and Volume from 1, 2 and 3 rows earlier within the same symbol.
for lag in [1, 2, 3]:
    df[f'Close_Lag_{lag}'] = df.groupby('Symbol')['Close'].shift(lag)
    df[f'Volume_Lag_{lag}'] = df.groupby('Symbol')['Volume'].shift(lag)

# Put the rows back in the order they had in the input file, so the sort above
# only affects how the features are computed, not the layout of the frame.
df = df.sort_index()
# --- RSI Calculation (14-day window) ---
def calc_rsi(series, period=14):
    """Compute the Relative Strength Index (RSI) of a price series.

    Row-to-row changes are split into gains and losses, each side is smoothed
    with a recursive exponentially weighted mean (``com = period - 1``, i.e.
    ``alpha = 1 / period``), and the ratio of the two averages is mapped onto
    the 0-100 RSI scale.

    Args:
        series: pandas Series of prices, in the order they should be differenced.
        period: Smoothing period used for both averages.

    Returns:
        A Series with the same index as ``series``. The first element is NaN
        (it has no previous value to difference against); positions where
        both the average gain and average loss are zero are NaN as well.
    """
    # Same smoothing settings for the gain side and the loss side.
    smoothing = {"com": period - 1, "adjust": False}

    change = series.diff()
    gains = change.clip(lower=0)
    # Losses are stored as positive magnitudes.
    losses = change.clip(upper=0).mul(-1)

    avg_gain = gains.ewm(**smoothing).mean()
    avg_loss = losses.ewm(**smoothing).mean()

    relative_strength = avg_gain / avg_loss
    return 100 - (100 / (1 + relative_strength))

# RSI is order-dependent (differences plus recursive smoothing), so each
# symbol's closes are handed to calc_rsi in chronological order. The sorted
# view keeps the original index labels, so the column assignment aligns the
# results back onto df's rows without reordering df itself.
df['RSI_14'] = (
    df.sort_values('Date', kind='stable')
    .groupby("Symbol")["Close"]
    .transform(lambda x: calc_rsi(x, 14))
)

# --- Remove rows with any missing values (due to rolling/lags) ---
# Drops every row that has a NaN in any column, then renumbers the index 0..n-1.
df = df.dropna().reset_index(drop=True)

# --- Save engineered features for modeling ---
# index=False keeps the renumbered index out of the CSV.
df.to_csv(r"C:\My stuff\Coding\ML project\KiranveerSingh_projectfinal\Dataset\feature_engineered_nifty5.csv", index=False)

print("Feature engineering complete! Data saved as 'feature_engineered_nifty5.csv'")


Feature engineering complete! Data saved as 'feature_engineered_nifty5.csv'
