# ⚙️ TCS Stock - Advanced Feature Engineering

## 🎯 Objective
Create comprehensive technical indicators and features for machine learning models:
- **Technical Indicators**: RSI, MACD, Bollinger Bands, Stochastic
- **Moving Averages**: Simple, Exponential, Weighted
- **Volatility Measures**: ATR, Bollinger Band Width, Historical Volatility
- **Price Features**: Returns, Gaps, Price Channels
- **Volume Features**: Volume indicators, Money Flow
- **Time-based Features**: Seasonality, Cyclical patterns

---

In [None]:
# Core libraries
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import warnings

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Technical Analysis
try:
    import talib
    TALIB_AVAILABLE = True
    print('✅ TA-Lib available for advanced technical indicators')
except ImportError:
    TALIB_AVAILABLE = False
    print('⚠️ TA-Lib not available - using manual calculations')

# Machine Learning
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

# Notebook-wide settings: display every column of a frame and silence all
# Python warnings for the remainder of the session.
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

# Startup banner
for banner_line in ('🎯 TCS STOCK - ADVANCED FEATURE ENGINEERING',
                    '='*60,
                    '✅ All libraries imported successfully!'):
    print(banner_line)

In [None]:
# Read the price history, preferring the cleaned file written by the previous
# notebook and falling back to the raw history when that file does not exist.
print('📁 Loading cleaned TCS stock data...')

try:
    df = pd.read_csv('../data/TCS_stock_cleaned.csv')
    used_fallback = False
except FileNotFoundError:
    print('⚠️ Cleaned data not found, loading original data...')
    df = pd.read_csv('../data/TCS_stock_history.csv')
    used_fallback = True

# Both sources get the same treatment: parse dates, order chronologically,
# and renumber the rows from zero.
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values('Date').reset_index(drop=True)

if used_fallback:
    print(f'✅ Original data loaded: {df.shape}')
else:
    print(f'✅ Data loaded successfully: {df.shape}')

# Quick overview of what was loaded
print(f'📅 Date range: {df["Date"].min().date()} to {df["Date"].max().date()}')
print(f'📊 Columns: {list(df.columns)}')
print(f'🔍 Shape: {df.shape}')

# Last expression of the cell: rendered by the notebook as a preview
df.head()

## 💰 Price-Based Features

In [None]:
print('💰 CREATING PRICE-BASED FEATURES')
print('='*40)

# Every engineered column is added to a copy; the loaded frame `df` is not modified.
df_features = df.copy()
px_open = df_features['Open']
px_high = df_features['High']
px_low = df_features['Low']
px_close = df_features['Close']
prev_close = px_close.shift(1)

# 1. Basic Price Features
print('1. 📊 Basic Price Features:')
df_features['Price_Range'] = px_high - px_low
df_features['Price_Range_Pct'] = (df_features['Price_Range'] / px_close) * 100
df_features['Open_Close_Pct'] = ((px_close - px_open) / px_open) * 100
# Same formula as Price_Range_Pct, stored under its own column name.
df_features['High_Low_Pct'] = ((px_high - px_low) / px_close) * 100
print('   ✅ Price range, Open-Close %, High-Low % created')

# 2. Returns Features (all expressed in percent)
print('2. 📈 Returns Features:')
df_features['Daily_Return'] = px_close.pct_change() * 100
df_features['Log_Return'] = np.log(px_close / prev_close) * 100

# Returns over several look-back lengths (in rows)
for period in (2, 3, 5, 10, 20):
    df_features[f'Return_{period}d'] = px_close.pct_change(period) * 100
print('   ✅ Daily, log, and multi-period returns created')

# 3. Price Position Features: where Close sits between the rolling low (0)
#    and the rolling high (100) of the window.
print('3. 🎯 Price Position Features:')
for window in (10, 20, 50):
    window_low = px_low.rolling(window=window).min()
    window_span = px_high.rolling(window=window).max() - window_low
    df_features[f'Price_Position_{window}d'] = ((px_close - window_low) / window_span) * 100
print('   ✅ Price position within rolling windows created')

# 4. Gap Analysis: today's open versus the previous row's close
print('4. 📊 Gap Analysis:')
df_features['Gap'] = px_open - prev_close
df_features['Gap_Pct'] = (df_features['Gap'] / prev_close) * 100
gap_pct = df_features['Gap_Pct']
# A gap is flagged only when it exceeds half a percent in either direction.
df_features['Gap_Up'] = (gap_pct > 0.5).astype(int)
df_features['Gap_Down'] = (gap_pct < -0.5).astype(int)
print('   ✅ Gap analysis features created')

print(f'\n📊 Current features count: {df_features.shape[1]}')

## 📈 Moving Averages & Trends

In [None]:
print('📈 CREATING MOVING AVERAGES & TREND FEATURES')
print('='*50)

closing = df_features['Close']

# 1. Simple Moving Averages, plus Close relative to each average
print('1. 📊 Simple Moving Averages:')
ma_periods = [5, 10, 20, 50, 100, 200]
for period in ma_periods:
    sma = closing.rolling(window=period).mean()
    df_features[f'SMA_{period}'] = sma
    df_features[f'SMA_{period}_ratio'] = closing / sma
    df_features[f'SMA_{period}_distance'] = ((closing - sma) / sma) * 100
print(f'   ✅ SMA for periods {ma_periods} created')

# 2. Exponential Moving Averages (pandas ewm with its default settings)
print('2. 📊 Exponential Moving Averages:')
ema_periods = [12, 26, 50]
for period in ema_periods:
    ema = closing.ewm(span=period).mean()
    df_features[f'EMA_{period}'] = ema
    df_features[f'EMA_{period}_ratio'] = closing / ema
print(f'   ✅ EMA for periods {ema_periods} created')

# 3. Moving Average Convergence Divergence (MACD)
print('3. 📊 MACD Indicators:')
if not TALIB_AVAILABLE:
    # Fallback: 12/26 EMA difference with a 9-span EMA signal line
    fast_ema = closing.ewm(span=12).mean()
    slow_ema = closing.ewm(span=26).mean()
    df_features['MACD'] = fast_ema - slow_ema
    df_features['MACD_Signal'] = df_features['MACD'].ewm(span=9).mean()
    df_features['MACD_Histogram'] = df_features['MACD'] - df_features['MACD_Signal']
else:
    talib_macd, talib_signal, talib_hist = talib.MACD(closing.values)
    df_features['MACD'] = talib_macd
    df_features['MACD_Signal'] = talib_signal
    df_features['MACD_Histogram'] = talib_hist

# MACD derived features
macd_line = df_features['MACD']
signal_line = df_features['MACD_Signal']
macd_above_signal = macd_line > signal_line
# Compared directly (not by negating the shifted flag) so that rows with a
# missing previous value never count as a cross.
macd_not_above_before = macd_line.shift(1) <= signal_line.shift(1)
df_features['MACD_Bullish'] = macd_above_signal.astype(int)
df_features['MACD_Cross_Up'] = (macd_above_signal & macd_not_above_before).astype(int)
print('   ✅ MACD and derived signals created')

# 4. Moving Average Cross Signals
print('4. 📊 Moving Average Cross Signals:')
sma_5, sma_20, sma_50, sma_200 = (df_features[f'SMA_{length}'] for length in (5, 20, 50, 200))
df_features['SMA_Cross_5_20'] = (sma_5 > sma_20).astype(int)
df_features['SMA_Cross_20_50'] = (sma_20 > sma_50).astype(int)
# Golden/Death cross fire only on the row where SMA_50 moves across SMA_200.
prev_sma_50 = sma_50.shift(1)
prev_sma_200 = sma_200.shift(1)
df_features['Golden_Cross'] = ((sma_50 > sma_200) & (prev_sma_50 <= prev_sma_200)).astype(int)
df_features['Death_Cross'] = ((sma_50 < sma_200) & (prev_sma_50 >= prev_sma_200)).astype(int)
print('   ✅ Moving average cross signals created')

print(f'\n📊 Current features count: {df_features.shape[1]}')

## 🔧 Technical Indicators

In [None]:
# Section banner, followed by the heading for
# 1. Relative Strength Index (RSI)
print('🔧 CREATING TECHNICAL INDICATORS',
      '='*40,
      '1. 📊 RSI Indicators:',
      sep='\n')
def calculate_rsi(prices, window=14):
    """Return the Relative Strength Index of a price series, on a 0-100 scale.

    Average gain and average loss are simple rolling means over ``window``
    price changes (no exponential/Wilder smoothing), so values are not
    expected to match an implementation that smooths differently.

    Args:
        prices: pandas Series of prices in chronological order.
        window: Number of price changes averaged per RSI value.

    Returns:
        pandas Series aligned with ``prices``. The first ``window`` entries
        are NaN because fewer than ``window`` real price changes exist yet.
        A window with no losses yields 100, a window with no gains yields 0,
        and a completely flat window yields the neutral value 50.
    """
    delta = prices.diff()
    # clip() keeps NaN as NaN, so the undefined first change (and any gap in
    # the data) is not silently counted as a zero move.
    gains = delta.clip(lower=0)
    losses = (-delta).clip(lower=0)

    avg_gain = gains.rolling(window=window).mean()
    avg_loss = losses.rolling(window=window).mean()

    # 100 * gain / (gain + loss) equals 100 - 100 / (1 + gain / loss) whenever
    # loss > 0, and gives 100 directly when loss == 0 without going through inf.
    total_move = avg_gain + avg_loss
    rsi = 100 * avg_gain / total_move
    # No movement at all in the window: 0/0 is undefined, report neutral 50.
    return rsi.mask(total_move == 0, 50.0)

# RSI over 14 and 21 bars: TA-Lib when installed, otherwise the local helper.
if TALIB_AVAILABLE:
    df_features['RSI_14'] = talib.RSI(df_features['Close'].values, timeperiod=14)
    df_features['RSI_21'] = talib.RSI(df_features['Close'].values, timeperiod=21)
else:
    df_features['RSI_14'] = calculate_rsi(df_features['Close'], 14)
    df_features['RSI_21'] = calculate_rsi(df_features['Close'], 21)

# RSI derived features (70 / 30 thresholds on the 14-bar RSI)
df_features['RSI_Overbought'] = (df_features['RSI_14'] > 70).astype(int)
df_features['RSI_Oversold'] = (df_features['RSI_14'] < 30).astype(int)
print('   ✅ RSI and derived signals created')

# 2. Bollinger Bands
print('2. 📊 Bollinger Bands:')
bb_period = 20
bb_std = 2

if TALIB_AVAILABLE:
    bb_upper, bb_middle, bb_lower = talib.BBANDS(df_features['Close'].values, 
                                                timeperiod=bb_period, nbdevup=bb_std, nbdevdn=bb_std)
    df_features['BB_Upper'] = bb_upper
    df_features['BB_Middle'] = bb_middle
    df_features['BB_Lower'] = bb_lower
else:
    df_features['BB_Middle'] = df_features['Close'].rolling(window=bb_period).mean()
    # Population standard deviation (ddof=0): pandas defaults to the sample
    # estimate (ddof=1), which would make these bands wider than the ones
    # produced by the TA-Lib branch for the same data.
    bb_std_dev = df_features['Close'].rolling(window=bb_period).std(ddof=0)
    df_features['BB_Upper'] = df_features['BB_Middle'] + (bb_std_dev * bb_std)
    df_features['BB_Lower'] = df_features['BB_Middle'] - (bb_std_dev * bb_std)

# Bollinger Bands derived features
df_features['BB_Width'] = df_features['BB_Upper'] - df_features['BB_Lower']
# 0 = Close on the lower band, 100 = Close on the upper band
df_features['BB_Position'] = ((df_features['Close'] - df_features['BB_Lower']) / 
                             (df_features['BB_Upper'] - df_features['BB_Lower'])) * 100
# Squeeze: band width below its own 20-row average
df_features['BB_Squeeze'] = (df_features['BB_Width'] < df_features['BB_Width'].rolling(window=20).mean()).astype(int)
df_features['BB_Breakout_Up'] = (df_features['Close'] > df_features['BB_Upper']).astype(int)
df_features['BB_Breakout_Down'] = (df_features['Close'] < df_features['BB_Lower']).astype(int)
print('   ✅ Bollinger Bands and derived features created')

# 3. Stochastic Oscillator
print('3. 📊 Stochastic Oscillator:')
def calculate_stochastic(high, low, close, k_period=14, d_period=3):
    """Compute the stochastic oscillator lines %K and %D.

    %K is the position of ``close`` inside the rolling high/low range of the
    last ``k_period`` rows, scaled to 0-100; %D is the rolling mean of %K
    over ``d_period`` rows.

    Args:
        high: pandas Series of high prices.
        low: pandas Series of low prices.
        close: pandas Series of closing prices.
        k_period: Look-back length of the high/low range.
        d_period: Length of the moving average applied to %K.

    Returns:
        Tuple ``(k_percent, d_percent)`` of pandas Series. Rows without a
        full window are NaN.
    """
    range_floor = low.rolling(window=k_period).min()
    range_ceiling = high.rolling(window=k_period).max()
    range_height = range_ceiling - range_floor
    percent_k = ((close - range_floor) / range_height) * 100
    percent_d = percent_k.rolling(window=d_period).mean()
    return percent_k, percent_d

# Stochastic %K / %D. Both branches compute a 14-row fast %K with a 3-row
# simple average as %D, so the columns mean the same thing whether or not
# TA-Lib is installed. (talib.STOCH with its default arguments is a slow
# stochastic with different periods, which would not match the fallback.)
if TALIB_AVAILABLE:
    stoch_k, stoch_d = talib.STOCHF(df_features['High'].values, df_features['Low'].values,
                                    df_features['Close'].values,
                                    fastk_period=14, fastd_period=3, fastd_matype=0)
    df_features['Stoch_K'] = stoch_k
    df_features['Stoch_D'] = stoch_d
else:
    df_features['Stoch_K'], df_features['Stoch_D'] = calculate_stochastic(
        df_features['High'], df_features['Low'], df_features['Close'])

# Stochastic derived features (80 / 20 thresholds on %K)
df_features['Stoch_Overbought'] = (df_features['Stoch_K'] > 80).astype(int)
df_features['Stoch_Oversold'] = (df_features['Stoch_K'] < 20).astype(int)
print('   ✅ Stochastic Oscillator created')

# 4. Average True Range (ATR)
print('4. 📊 Average True Range:')
def calculate_atr(high, low, close, period=14):
    """Compute the Average True Range as a simple rolling mean of true range.

    The true range of a row is the largest of: high minus low, the absolute
    distance from the high to the previous close, and the absolute distance
    from the low to the previous close. On the first row there is no previous
    close, so only high minus low contributes.

    Args:
        high: pandas Series of high prices.
        low: pandas Series of low prices.
        close: pandas Series of closing prices.
        period: Number of rows averaged.

    Returns:
        pandas Series of ATR values; rows without a full window are NaN.
    """
    previous_close = close.shift(1)
    range_candidates = pd.concat(
        [high - low, (high - previous_close).abs(), (low - previous_close).abs()],
        axis=1,
    )
    # Row-wise max skips the NaN candidates of the first row.
    return range_candidates.max(axis=1).rolling(window=period).mean()

# 14-row ATR: local helper when TA-Lib is missing, TA-Lib otherwise.
# NOTE(review): the helper averages true range with a plain rolling mean; if
# TA-Lib smooths differently the two branches will not agree exactly - confirm.
if not TALIB_AVAILABLE:
    df_features['ATR_14'] = calculate_atr(df_features['High'], df_features['Low'], df_features['Close'])
else:
    df_features['ATR_14'] = talib.ATR(
        df_features['High'].values,
        df_features['Low'].values,
        df_features['Close'].values,
        timeperiod=14,
    )

# ATR derived features: ATR expressed as a percentage of the closing price
atr_14 = df_features['ATR_14']
df_features['ATR_Ratio'] = atr_14 / df_features['Close'] * 100
print('   ✅ ATR and ratio created')

print(f'\n📊 Current features count: {df_features.shape[1]}')

## 📊 Volume-Based Features

In [None]:
print('📊 CREATING VOLUME-BASED FEATURES')
print('='*40)

# The whole section is skipped when the data has no Volume column.
if 'Volume' in df_features.columns:
    # 1. Volume Moving Averages and today's volume relative to each average
    print('1. 📊 Volume Moving Averages:')
    volume_periods = [10, 20, 50]
    for period in volume_periods:
        df_features[f'Volume_SMA_{period}'] = df_features['Volume'].rolling(window=period).mean()
        df_features[f'Volume_Ratio_{period}'] = df_features['Volume'] / df_features[f'Volume_SMA_{period}']
    print(f'   ✅ Volume SMA and ratios for periods {volume_periods} created')
    
    # 2. Volume-Price Features
    print('2. 📊 Volume-Price Features:')
    df_features['Volume_Price'] = df_features['Volume'] * df_features['Close']
    # 20-row volume-weighted average of Close
    df_features['Volume_Weighted_Price'] = (df_features['Volume'] * df_features['Close']).rolling(window=20).sum() / df_features['Volume'].rolling(window=20).sum()
    print('   ✅ Volume-weighted features created')
    
    # 3. On-Balance Volume (OBV)
    print('3. 📊 On-Balance Volume:')
    if TALIB_AVAILABLE:
        # TA-Lib only accepts float64 arrays; Volume read from CSV is usually
        # an integer column, so both inputs are converted explicitly.
        df_features['OBV'] = talib.OBV(df_features['Close'].to_numpy(dtype=float),
                                       df_features['Volume'].to_numpy(dtype=float))
    else:
        # Manual OBV: running total that starts at 0, adds the row's volume
        # when Close rose versus the previous row, subtracts it when Close
        # fell, and is unchanged otherwise. sign(diff) is +1 / -1 / 0 for
        # those cases; the undefined first change counts as 0.
        close_direction = np.sign(df_features['Close'].diff()).fillna(0)
        df_features['OBV'] = (close_direction * df_features['Volume']).cumsum()
    
    # OBV derived features
    df_features['OBV_SMA_20'] = df_features['OBV'].rolling(window=20).mean()
    df_features['OBV_Signal'] = (df_features['OBV'] > df_features['OBV_SMA_20']).astype(int)
    print('   ✅ OBV and derived signals created')
    
    # 4. Volume Anomalies: more than two rolling standard deviations away
    #    from the 50-row rolling mean
    print('4. 📊 Volume Anomalies:')
    volume_mean = df_features['Volume'].rolling(window=50).mean()
    volume_std = df_features['Volume'].rolling(window=50).std()
    df_features['Volume_High_Anomaly'] = (df_features['Volume'] > (volume_mean + 2 * volume_std)).astype(int)
    df_features['Volume_Low_Anomaly'] = (df_features['Volume'] < (volume_mean - 2 * volume_std)).astype(int)
    print('   ✅ Volume anomaly detection features created')
else:
    print('⚠️ Volume column not found, skipping volume-based features')

print(f'\n📊 Current features count: {df_features.shape[1]}')

## 📅 Time-Based Features

In [None]:
print('📅 CREATING TIME-BASED FEATURES')
print('='*40)

# Datetime accessor of the Date column, reused for every calendar attribute
trade_dates = df_features['Date'].dt

# 1. Basic Time Features
print('1. 📊 Basic Time Features:')
calendar_parts = (
    ('Year', 'year'),
    ('Month', 'month'),
    ('Quarter', 'quarter'),
    ('DayOfWeek', 'dayofweek'),
    ('DayOfMonth', 'day'),
    ('DayOfYear', 'dayofyear'),
)
for column_name, accessor_attr in calendar_parts:
    df_features[column_name] = getattr(trade_dates, accessor_attr)
print('   ✅ Year, month, quarter, day features created')

# 2. Cyclical Time Features: map each calendar value onto a circle so that
#    the end of a cycle sits next to its start.
print('2. 📊 Cyclical Time Features:')
for column_name, cycle_length in (('Month', 12), ('DayOfWeek', 7), ('DayOfYear', 365.25)):
    angle = 2 * np.pi * df_features[column_name] / cycle_length
    df_features[f'{column_name}_Sin'] = np.sin(angle)
    df_features[f'{column_name}_Cos'] = np.cos(angle)
print('   ✅ Cyclical encodings created')

# 3. Market Timing Features
print('3. 📊 Market Timing Features:')
weekday = df_features['DayOfWeek']
df_features['Is_Monday'] = (weekday == 0).astype(int)
df_features['Is_Friday'] = (weekday == 4).astype(int)
# These flags follow the calendar date of each row.
for flag_name, accessor_attr in (('Is_Month_End', 'is_month_end'),
                                 ('Is_Month_Start', 'is_month_start'),
                                 ('Is_Quarter_End', 'is_quarter_end')):
    df_features[flag_name] = getattr(trade_dates, accessor_attr).astype(int)
print('   ✅ Market timing indicators created')

# 4. Lag Features: earlier values of selected columns, shifted down by `lag` rows
print('4. 📊 Lag Features:')
lag_periods = [1, 2, 3, 5, 10]
if 'Volume' in df_features.columns:
    lag_columns = ['Close', 'Volume', 'Daily_Return']
else:
    lag_columns = ['Close', 'Daily_Return']

for col in lag_columns:
    if col not in df_features.columns:
        continue
    lag_source = df_features[col]
    for lag in lag_periods:
        df_features[f'{col}_Lag_{lag}'] = lag_source.shift(lag)
print(f'   ✅ Lag features for {lag_columns} created')

print(f'\n📊 Current features count: {df_features.shape[1]}')

## 🚀 Advanced Features

In [None]:
print('🚀 CREATING ADVANCED FEATURES')
print('='*40)

last_price = df_features['Close']
bar_high = df_features['High']
bar_low = df_features['Low']

# 1. Volatility Features: rolling std of Daily_Return, also scaled by sqrt(252)
print('1. 📊 Volatility Features:')
volatility_windows = [10, 20, 30]
daily_returns = df_features['Daily_Return']
for window in volatility_windows:
    rolling_sigma = daily_returns.rolling(window=window).std()
    df_features[f'Volatility_{window}d'] = rolling_sigma
    df_features[f'Volatility_{window}d_Annualized'] = rolling_sigma * np.sqrt(252)
print(f'   ✅ Volatility measures for windows {volatility_windows} created')

# 2. Support and Resistance Levels: rolling low / high and the distance of
#    Close from each, as a percentage of Close
print('2. 📊 Support and Resistance Features:')
for window in (20, 50):
    support = bar_low.rolling(window=window).min()
    resistance = bar_high.rolling(window=window).max()
    df_features[f'Support_{window}d'] = support
    df_features[f'Resistance_{window}d'] = resistance
    df_features[f'Support_Distance_{window}d'] = ((last_price - support) / last_price) * 100
    df_features[f'Resistance_Distance_{window}d'] = ((resistance - last_price) / last_price) * 100
print('   ✅ Support and resistance levels created')

# 3. Momentum Features: absolute and percentage change over `period` rows
print('3. 📊 Momentum Features:')
momentum_periods = [5, 10, 20]
for period in momentum_periods:
    earlier_price = last_price.shift(period)
    df_features[f'Momentum_{period}d'] = last_price - earlier_price
    df_features[f'Momentum_{period}d_Pct'] = ((last_price - earlier_price) / earlier_price) * 100
print(f'   ✅ Momentum features for periods {momentum_periods} created')

# 4. Rate of Change (ROC)
print('4. 📊 Rate of Change Features:')
roc_periods = [5, 10, 20]
for period in roc_periods:
    if not TALIB_AVAILABLE:
        # Fallback uses the same expression as Momentum_{period}d_Pct
        earlier_price = last_price.shift(period)
        df_features[f'ROC_{period}d'] = ((last_price - earlier_price) / earlier_price) * 100
    else:
        df_features[f'ROC_{period}d'] = talib.ROC(last_price.values, timeperiod=period)
print(f'   ✅ ROC features for periods {roc_periods} created')

# 5. Price Channels: rolling high/low band and the position of Close inside it
print('5. 📊 Price Channel Features:')
channel_periods = [10, 20]
for period in channel_periods:
    channel_top = bar_high.rolling(window=period).max()
    channel_bottom = bar_low.rolling(window=period).min()
    df_features[f'Upper_Channel_{period}d'] = channel_top
    df_features[f'Lower_Channel_{period}d'] = channel_bottom
    df_features[f'Channel_Position_{period}d'] = ((last_price - channel_bottom) /
                                                 (channel_top - channel_bottom)) * 100
print(f'   ✅ Price channel features for periods {channel_periods} created')

print(f'\n📊 Final features count: {df_features.shape[1]}')

## 🎯 Target Variables for ML

In [None]:
print('🎯 CREATING TARGET VARIABLES FOR MACHINE LEARNING')
print('='*55)

# Targets look `horizon` rows into the future, so the last `horizon` rows of
# every target have no known outcome. Those rows are kept as NaN in ALL target
# columns (including the class labels) so that a later dropna() removes them
# instead of training on fabricated "down" / "flat" / "low volatility" labels.

# 1. Price Prediction Targets
print('1. 📊 Price Prediction Targets:')
prediction_horizons = [1, 3, 5, 10, 20]
for horizon in prediction_horizons:
    future_close = df_features['Close'].shift(-horizon)
    df_features[f'Target_Price_{horizon}d'] = future_close
    df_features[f'Target_Return_{horizon}d'] = ((future_close - df_features['Close']) / df_features['Close']) * 100
print(f'   ✅ Price and return targets for horizons {prediction_horizons} created')

# 2. Classification Targets
print('2. 📊 Classification Targets:')
for horizon in [1, 5, 10]:
    target_return = df_features[f'Target_Return_{horizon}d']
    outcome_known = target_return.notna()

    # Binary classification: 1.0 = up, 0.0 = down or unchanged, NaN = unknown
    df_features[f'Target_Direction_{horizon}d'] = (target_return > 0).astype(float).where(outcome_known)
    
    # Multi-class classification: strong_down, down, flat, up, strong_up
    # (cut points at -2, -0.5, 0.5 and 2 percent)
    conditions = [
        target_return <= -2,
        (target_return > -2) & (target_return <= -0.5),
        (target_return > -0.5) & (target_return < 0.5),
        (target_return >= 0.5) & (target_return < 2),
        target_return >= 2
    ]
    choices = [0, 1, 2, 3, 4]  # strong_down, down, flat, up, strong_up
    class_labels = pd.Series(np.select(conditions, choices, default=2), index=df_features.index)
    # np.select falls back to the default for NaN returns; blank those rows out.
    df_features[f'Target_Class_{horizon}d'] = class_labels.where(outcome_known)
print('   ✅ Binary and multi-class classification targets created')

# 3. Volatility Targets
print('3. 📊 Volatility Prediction Targets:')
for horizon in [5, 10, 20]:
    # Std of Daily_Return over the NEXT `horizon` rows: the rolling window
    # ending `horizon` rows ahead is moved back onto the current row.
    future_volatility = df_features['Daily_Return'].rolling(window=horizon).std().shift(-horizon)
    df_features[f'Target_Volatility_{horizon}d'] = future_volatility
    
    # High/Low volatility classification against the median of the whole column.
    # NOTE(review): the median is taken over the full history, so the threshold
    # itself uses information from later rows - confirm this is acceptable.
    vol_median = future_volatility.median()
    df_features[f'Target_High_Vol_{horizon}d'] = (future_volatility > vol_median).astype(float).where(future_volatility.notna())
print('   ✅ Volatility prediction targets created')

print(f'\n📊 Final dataset shape: {df_features.shape}')
print(f'🎯 Total features created: {df_features.shape[1] - len(df.columns)} new features')

## 📋 Feature Engineering Summary

In [None]:
print('📋 FEATURE ENGINEERING SUMMARY')
print('='*50)

all_columns = list(df_features.columns)


def _count_columns(keywords, ignore_case=True):
    """Count columns whose name contains at least one of `keywords`."""
    matches = 0
    for name in all_columns:
        haystack = name.lower() if ignore_case else name
        if any(keyword in haystack for keyword in keywords):
            matches += 1
    return matches


# Column counts per category. Matching is by substring of the column name, so
# a column can be counted in more than one category.
feature_categories = {
    'Original Features': len(df.columns),
    'Price Features': _count_columns(['price', 'open', 'close', 'high', 'low', 'return', 'gap']),
    'Moving Average Features': _count_columns(['SMA', 'EMA', 'MA_'], ignore_case=False),
    'Technical Indicators': _count_columns(['RSI', 'MACD', 'BB_', 'Stoch', 'ATR'], ignore_case=False),
    'Volume Features': _count_columns(['volume', 'obv']),
    'Time Features': _count_columns(['year', 'month', 'day', 'quarter', 'sin', 'cos', 'is_']),
    'Advanced Features': _count_columns(['volatility', 'momentum', 'roc', 'channel', 'support', 'resistance']),
    'Target Variables': sum(1 for name in all_columns if name.startswith('Target_')),
    'Total Features': df_features.shape[1]
}

print('\n📊 FEATURE BREAKDOWN:')
for category, count in feature_categories.items():
    print(f'   {category}: {count}')

# Data quality check: share of non-missing cells and in-memory size
print('\n🔍 DATA QUALITY CHECK:')
missing_values = df_features.isnull().sum().sum()
total_values = df_features.size
completeness = ((total_values - missing_values) / total_values) * 100

print(f'   Missing Values: {missing_values:,}')
print(f'   Total Values: {total_values:,}')
print(f'   Data Completeness: {completeness:.2f}%')
print(f'   Memory Usage: {df_features.memory_usage(deep=True).sum() / 1024**2:.2f} MB')

# Feature correlation analysis
print('\n📈 FEATURE CORRELATION ANALYSIS:')
numeric_features = df_features.select_dtypes(include=[np.number]).columns
high_corr_features = []

if len(numeric_features) > 1:
    corr_matrix = df_features[numeric_features].corr()
    # Keep only the strict upper triangle so each pair is looked at once
    # and the diagonal is excluded.
    upper_triangle = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    high_corr_pairs = []
    for column in upper_triangle.columns:
        for index in upper_triangle.index:
            pair_corr = upper_triangle.loc[index, column]
            if pd.isna(pair_corr):
                continue
            if abs(pair_corr) > 0.95:
                high_corr_pairs.append((column, index, pair_corr))
    
    print(f'   Highly Correlated Feature Pairs (>0.95): {len(high_corr_pairs)}')
    if high_corr_pairs:
        print('   Top 5 highly correlated pairs:')
        strongest_first = sorted(high_corr_pairs, key=lambda pair: abs(pair[2]), reverse=True)
        for i, (col1, col2, corr) in enumerate(strongest_first[:5]):
            print(f'     {i+1}. {col1} - {col2}: {corr:.3f}')

# Save feature-engineered dataset
print('\n💾 SAVING FEATURE-ENGINEERED DATASET:')
output_file = '../data/TCS_stock_features.csv'
df_features.to_csv(output_file, index=False)
print(f'✅ Dataset saved: {output_file}')
print(f'📊 Shape: {df_features.shape}')

# Display sample of features (only the ones that actually exist in the frame)
print('\n📋 SAMPLE OF ENGINEERED FEATURES:')
sample_features = ['Date', 'Close', 'Daily_Return', 'RSI_14', 'MACD', 'BB_Position', 
                  'Volume_Ratio_20', 'SMA_20_ratio', 'Target_Return_1d', 'Target_Direction_1d']
available_features = [name for name in sample_features if name in df_features.columns]
display(df_features[available_features].tail(10))

print('\n✅ FEATURE ENGINEERING COMPLETED SUCCESSFULLY!')
print('🔄 Next: Model Training (04_model_training.ipynb)')