# Processing Layer - TCS Stock Prediction

This notebook processes and engineers features for each time horizon:
- 1D, 5D, 1M, 6M, 1Y, 5Y

In [3]:
import pandas as pd
import numpy as np
import os
from scipy.stats import zscore

# Make sure the output directory for the engineered-feature CSVs exists
# before the processing loop writes into it; exist_ok avoids an error on re-runs.
os.makedirs('data/processed', exist_ok=True)

## Feature Engineering Function

In [4]:
def engineer_features(df):
    """Clean an OHLCV price history and derive the model feature table.

    The input frame is not modified. Processing steps:

    1. Resolve the timestamp from a 'Date' (preferred) or 'Datetime' column
       and reduce it to a calendar date.
    2. Keep the Open/High/Low/Close/Volume columns, coerce them to numeric,
       linearly interpolate interior gaps and drop rows still missing data.
    3. Drop rows where any OHLCV value lies 3 or more standard deviations
       from its column mean.
    4. Add technical indicators (mid price, MA20/MA50, RSI-14, MACD and its
       signal line, Bollinger bands), Lag_1..Lag_7 of Close, 1- and 5-step
       returns, 5-step volatility, intrabar price change, volume change and
       the 'Tomorrow' target (next row's Close).
    5. Drop every row that contains a missing or non-finite value.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw price data with 'Open', 'High', 'Low', 'Close', 'Volume' and
        either a 'Date' or a 'Datetime' column.

    Returns
    -------
    pandas.DataFrame
        Feature table with 'Date' as the first column, followed by the OHLCV
        columns and the engineered features. The original row labels of the
        surviving rows are preserved.

    Raises
    ------
    ValueError
        If neither a 'Date' nor a 'Datetime' column is present.
    """
    raw = df.copy()

    # Resolve the timestamp column ('Date' wins when both are present).
    if 'Date' in raw.columns:
        timestamp_col = 'Date'
    elif 'Datetime' in raw.columns:
        timestamp_col = 'Datetime'
    else:
        raise ValueError("No 'Date' or 'Datetime' column found in DataFrame.")

    # NOTE: .dt.date keeps only the calendar date, so rows of an intraday
    # series that fall on the same day end up with identical 'Date' values.
    date_series = pd.to_datetime(raw[timestamp_col]).dt.date

    # Essential numeric columns: drop incomplete rows, coerce to numbers,
    # fill interior gaps linearly and drop whatever is still missing.
    ohlcv_cols = ['Open', 'High', 'Low', 'Close', 'Volume']
    prices = (
        raw[ohlcv_cols]
        .dropna()
        .apply(pd.to_numeric, errors='coerce')
        .interpolate(method='linear')
        .dropna()
    )

    # Outlier removal. A zero-variance column yields NaN z-scores; NaN >= 3 is
    # False, so such a column never flags a row (testing `< 3` instead would
    # silently discard every row in that case).
    z_scores = np.abs(zscore(prices.to_numpy(dtype=float)))
    is_outlier = (z_scores >= 3).any(axis=1)
    # Explicit copy so the column assignments below act on an independent
    # frame rather than on a slice of `prices`.
    feats = prices.loc[~is_outlier].copy()

    # Re-attach the dates of the surviving rows (aligned on the row labels).
    feats['Date'] = date_series.loc[feats.index]

    # ──────────────────────────────────────
    # Technical & statistical features
    close = feats['Close']

    feats['Price'] = (feats['High'] + feats['Low']) / 2

    # Moving averages
    feats['MA20'] = close.rolling(20).mean()
    feats['MA50'] = close.rolling(50).mean()

    # RSI over 14 steps, using simple rolling means of gains and losses
    delta = close.diff()
    avg_gain = delta.clip(lower=0).rolling(14).mean()
    avg_loss = -delta.clip(upper=0).rolling(14).mean()
    rs = avg_gain / avg_loss
    feats['RSI'] = 100 - (100 / (1 + rs))

    # MACD & signal line
    ema12 = close.ewm(span=12, adjust=False).mean()
    ema26 = close.ewm(span=26, adjust=False).mean()
    feats['MACD'] = ema12 - ema26
    feats['Signal_Line'] = feats['MACD'].ewm(span=9, adjust=False).mean()

    # Bollinger bands: 20-step mean +/- 2 standard deviations
    rolling_mean = close.rolling(20).mean()
    rolling_std = close.rolling(20).std()
    feats['BB_High'] = rolling_mean + (rolling_std * 2)
    feats['BB_Low'] = rolling_mean - (rolling_std * 2)

    # Lags of the close price
    for lag in range(1, 8):
        feats[f'Lag_{lag}'] = close.shift(lag)

    # Return & volatility
    feats['Return_1d'] = close.pct_change()
    feats['Return_5d'] = close.pct_change(5)
    feats['Volatility_5d'] = close.rolling(5).std()

    # Price change within the bar & volume trend
    feats['Price_Change'] = close - feats['Open']
    feats['Volume_Change'] = feats['Volume'].pct_change()

    # Target: the next row's close
    feats['Tomorrow'] = close.shift(-1)

    # Final cleanup: rows with missing values (indicator warm-up, last row's
    # target) go first; then rows with +/-inf, which pct_change produces when
    # the previous value is 0 and which dropna() does not remove.
    feats = feats.dropna()
    numeric_values = feats.drop(columns='Date').to_numpy(dtype=float)
    feats = feats.loc[np.isfinite(numeric_values).all(axis=1)]

    # Put 'Date' first, keeping the remaining columns in creation order.
    ordered_cols = ['Date'] + [col for col in feats.columns if col != 'Date']
    return feats[ordered_cols]

## Process All Time Horizons

In [5]:
# Raw CSV export for every prediction horizon, keyed by horizon tag.
# Insertion order is the processing order.
files = dict([
    ('1d', 'data/tcs_1d_1min.csv'),
    ('5d', 'data/tcs_5d_30min.csv'),
    ('1m', 'data/tcs_1m_1d.csv'),
    ('6m', 'data/tcs_6m_1d.csv'),
    ('1y', 'data/tcs_1y_1d.csv'),
    ('5y', 'data/tcs_5y_7d.csv'),
])

# Run the feature pipeline once per horizon, write the result into
# data/processed and print a preview of the first rows.
for tag in files:
    path = files[tag]
    print(f"Processing: {tag.upper()} data from {path}")

    df = pd.read_csv(path)
    processed = engineer_features(df)

    output_csv = f'data/processed/tcs_{tag}_features.csv'
    processed.to_csv(output_csv, index=False)
    print(processed.head())

Processing: 1D data from data/tcs_1d_1min.csv
           Open         High          Low        Close  Volume        Date  \
51  3464.300049  3467.399902  3464.100098  3466.800049  3083.0  2025-05-08   
52  3466.800049  3467.000000  3464.500000  3464.500000  1703.0  2025-05-08   
53  3464.300049  3464.800049  3464.199951  3464.800049  2636.0  2025-05-08   
54  3464.800049  3464.899902  3464.500000  3464.500000  1209.0  2025-05-08   
55  3464.300049  3464.399902  3463.199951  3463.300049  3562.0  2025-05-08   

          Price         MA20         MA50        RSI  ...        Lag_4  \
51  3465.750000  3466.499988  3464.417993  48.667036  ...  3466.300049   
52  3465.750000  3466.279993  3464.945991  43.976116  ...  3466.199951   
53  3464.500000  3466.139990  3465.461992  47.205374  ...  3465.000000   
54  3464.699951  3466.214990  3465.873994  40.136845  ...  3464.399902   
55  3463.799927  3466.164990  3466.113994  39.334017  ...  3466.800049   

          Lag_5        Lag_6        Lag_