# Predicting Stock Price Movements — Final Notebook

**File:** `Vikash_DataScienceAssignment_final.ipynb`

This notebook includes data collection, cleaning, feature engineering, baseline models (Logistic, RF, XGB), enhanced features, hyperparameter tuning, LightGBM, and an ensemble (RF+XGB+LGBM).

## 0. Setup — Install required libraries
Run this cell once. If you are running on Colab or Kaggle, some of these packages may already be installed.

In [None]:
!pip install --quiet yfinance pandas numpy matplotlib seaborn scikit-learn xgboost lightgbm ta joblib shap mlxtend

In [None]:
# Imports and plotting style
import warnings
warnings.filterwarnings('ignore')

import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             confusion_matrix, roc_curve, auc, classification_report,
                             matthews_corrcoef)
from scipy.stats import randint, uniform

import ta
import joblib
import shap

# Use a valid matplotlib style
try:
    plt.style.use('seaborn-v0_8')
except Exception:
    plt.style.use('default')

%matplotlib inline
print('Imports done.')

In [None]:
# 1. Data collection
# Downloads two years of daily bars for TICKER and keeps the OHLCV + 'Adj Close'
# columns, sorted chronologically, in the module-level frame `data`.
TICKER = 'HDFCBANK.NS'  # change as needed
print('Ticker:', TICKER)

# auto_adjust=False keeps the raw OHLC columns alongside 'Adj Close'.
data = yf.download(TICKER, period='2y', interval='1d', auto_adjust=False)

# Fail here with a clear message instead of letting an empty frame flow into the
# cleaning/feature cells (e.g. after a bad ticker symbol or a network failure).
if data.empty:
    raise ValueError(f'No price data returned for {TICKER!r}; check the ticker symbol and network access.')

# yfinance sometimes returns single-level and sometimes two-level columns;
# level 0 carries the field names, so keep only that level.
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.get_level_values(0)

# keep relevant columns, and report any that are absent by name
required_cols = ['Open','High','Low','Close','Adj Close','Volume']
missing_cols = [c for c in required_cols if c not in data.columns]
if missing_cols:
    raise KeyError(f'Downloaded data is missing expected columns: {missing_cols}')

# drop incomplete rows and enforce chronological order
data = data[required_cols].dropna().sort_index()

data.head()

In [None]:
# 2. Data cleaning
# Fills gaps, repairs zero-volume rows, caps Volume outliers with the 1.5*IQR rule
# and adds the daily 'Return' column. Rebinds the module-level frame `data`.

# Forward/backfill
data = data.ffill().bfill()

# Handle zero-volume rows FIRST: replace 0 with NaN, then ffill/bfill.
# Doing this after the cap (as before) missed the zeros whenever the lower IQR
# bound was positive, because clip() had already turned them into `lower`.
volume = data['Volume'].astype(float)
if (volume == 0).any():
    volume = volume.replace(0, np.nan).ffill().bfill()
data['Volume'] = volume

# Robust outlier capping for Volume
Q1 = data['Volume'].quantile(0.25)
Q3 = data['Volume'].quantile(0.75)
IQR = Q3 - Q1
if pd.isna(Q1) or pd.isna(Q3) or pd.isna(IQR):
    print('Volume quantiles are NaN; skipping cap.')
else:
    lower = float(Q1 - 1.5 * IQR)
    upper = float(Q3 + 1.5 * IQR)
    if lower >= upper:
        # IQR of zero (or negative) would collapse every value onto one number
        print('Degenerate IQR bounds; skipping cap.')
    else:
        data['Volume'] = data['Volume'].clip(lower=lower, upper=upper)
        print(f'Capped Volume to range [{lower:.2f}, {upper:.2f}]')

# compute daily returns
data['Return'] = data['Adj Close'].pct_change()

# drop initial NaNs (the first row has no previous close, so its Return is NaN)
data = data.dropna()

print('Data shape after cleaning:', data.shape)
data.tail()

In [None]:
# 3. Feature engineering
# Adds trend, momentum, volatility, volume and calendar features as new columns of
# `data`, then drops the warm-up rows where any indicator is still NaN.
adj = data['Adj Close']
daily_ret = adj.pct_change()  # computed once; reused by the rolling statistics below

# Moving averages and momentum indicators
data['SMA_20'] = adj.rolling(20).mean()
data['EMA_20'] = adj.ewm(span=20, adjust=False).mean()

# MACD (12/26 EMA difference) and its 9-period signal line
ema12 = adj.ewm(span=12, adjust=False).mean()
ema26 = adj.ewm(span=26, adjust=False).mean()
data['MACD'] = ema12 - ema26
data['MACD_signal'] = data['MACD'].ewm(span=9, adjust=False).mean()

# RSI
data['RSI_14'] = ta.momentum.rsi(adj, window=14)

# Bollinger Bands
bb = ta.volatility.BollingerBands(close=adj, window=20, window_dev=2)
data['BB_high'] = bb.bollinger_hband()
data['BB_low'] = bb.bollinger_lband()
data['BB_width'] = data['BB_high'] - data['BB_low']

# Extra engineered features
# NOTE: return_lag_k is the k-day percentage change ending today (pct_change(k)),
# not the one-day return shifted back by k days.
for lag in range(1,6):
    data[f'return_lag_{lag}'] = adj.pct_change(lag)

data['mean_ret_5'] = daily_ret.rolling(5).mean()
data['mean_ret_10'] = daily_ret.rolling(10).mean()
data['vol_10'] = daily_ret.rolling(10).std()
data['vol_20'] = daily_ret.rolling(20).std()

data['close_open_ratio'] = data['Close'] / data['Open'] - 1
data['high_low_ratio'] = data['High'] / data['Low'] - 1
data['vol_change'] = data['Volume'].pct_change()

# TA extras
# ADX is built from High/Low/Close of the same bar. High and Low are unadjusted
# (download used auto_adjust=False), so pair them with the unadjusted Close;
# mixing in 'Adj Close' puts the close on a different price scale than High/Low.
data['ADX_14'] = ta.trend.adx(data['High'], data['Low'], data['Close'], window=14)
data['OBV'] = ta.volume.on_balance_volume(adj, data['Volume'])

# Weekday (0 = Monday) taken from the DatetimeIndex
data['dayofweek'] = data.index.dayofweek

# Drop NaNs introduced by indicators
data = data.dropna()
print('Data shape after feature engineering:', data.shape)

data.head()

In [None]:
# 4. Target construction (smoothed 3-day)
# Target_3d = 1 when the mean Adj Close over the NEXT three trading days
# (t+1, t+2, t+3) is above today's Adj Close, else 0.
#
# rolling() looks backwards, so rolling(3).mean() at row t+3 averages rows
# t+1..t+3; shift(-3) then aligns that value with row t. The previous
# shift(-1).rolling(3) form averaged rows t-1, t and t+1 instead, i.e. the label
# was mostly built from prices that are already known at time t.
adj_close = data['Adj Close']
data['future_mean_3d'] = adj_close.rolling(window=3).mean().shift(-3)
data['Target_3d'] = (data['future_mean_3d'] > adj_close).astype(int)
# drop rows with NaN in target (the last three rows have no complete future window)
data = data.dropna()
print('Target distribution (Target_3d):')
print(data['Target_3d'].value_counts(normalize=True))

data.tail()

In [None]:
# 5. Exploratory Data Analysis (quick)
# Four figures: price with moving averages, RSI, Bollinger Bands, and a
# volume / return-distribution pair.

# Price with the 20-day simple and exponential moving averages
price_fig, price_ax = plt.subplots(figsize=(14, 6))
for col in ('Adj Close', 'SMA_20', 'EMA_20'):
    price_ax.plot(data.index, data[col], label=col)
price_ax.set_title(f"{TICKER} Price with Moving Averages")
price_ax.legend()
plt.show()

# RSI with the 70 / 30 reference lines
rsi_fig, rsi_ax = plt.subplots(figsize=(14, 3))
rsi_ax.plot(data.index, data['RSI_14'])
for level in (70, 30):
    rsi_ax.axhline(level, linestyle='--')
rsi_ax.set_title('RSI (14)')
plt.show()

# Price inside its Bollinger envelope
bb_fig, bb_ax = plt.subplots(figsize=(14, 6))
bb_ax.plot(data.index, data['Adj Close'], label='Adj Close')
for band in ('BB_high', 'BB_low'):
    bb_ax.plot(data.index, data[band], alpha=0.6)
bb_ax.fill_between(data.index, data['BB_low'], data['BB_high'], alpha=0.1)
bb_ax.set_title('Bollinger Bands')
plt.show()

# Volume over time next to the histogram of daily returns
fig, axs = plt.subplots(1, 2, figsize=(14, 4))
volume_ax, return_ax = axs
volume_ax.plot(data.index, data['Volume'])
volume_ax.set_title('Volume')
return_ax.hist(data['Return'].dropna(), bins=50)
return_ax.set_title('Daily Return Distribution')
plt.show()

In [None]:
# 6. Prepare features and time-based split
# Builds X / y, standardises the features and splits chronologically 80/20 into
# X_train, X_test, y_train, y_test (no shuffling, so the test set is the most
# recent 20% of rows).
feature_cols = [
    'SMA_20','EMA_20','RSI_14','MACD','MACD_signal','BB_width',
    'return_lag_1','return_lag_2','return_lag_3','return_lag_4','return_lag_5',
    'mean_ret_5','mean_ret_10','vol_10','vol_20','close_open_ratio','high_low_ratio',
    'vol_change','ADX_14','OBV','dayofweek','Volume'
]
# keep only the features that were actually created upstream
feature_cols = [c for c in feature_cols if c in data.columns]
print('Using features:', feature_cols)

X = data[feature_cols].copy()
y = data['Target_3d'].copy()

# time-based split 80/20 (index computed first so the scaler can be fit on train only)
split_idx = int(len(X) * 0.8)

# scale: fit the scaler on the training rows ONLY, then apply it to every row.
# Fitting on the full frame would leak the test period's mean/variance into the
# features the models are trained on.
scaler = StandardScaler()
scaler.fit(X.iloc[:split_idx])
X_scaled = pd.DataFrame(scaler.transform(X), index=X.index, columns=X.columns)

X_train = X_scaled.iloc[:split_idx]
X_test = X_scaled.iloc[split_idx:]
y_train = y.iloc[:split_idx]
y_test = y.iloc[split_idx:]

print('Train size:', X_train.shape, 'Test size:', X_test.shape)

In [None]:
# 7. Baseline models: Logistic, RandomForest, XGBoost
# Fits three untuned classifiers on the training split and reports accuracy,
# precision, recall, F1 and ROC AUC on the test split.
models = {
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'RandomForest': RandomForestClassifier(n_estimators=200, random_state=42),
    # `use_label_encoder` is deprecated in XGBoost and only triggers an
    # "unused parameter" warning on current releases, so it is no longer passed.
    'XGBoost': XGBClassifier(eval_metric='logloss', random_state=42)
}

fitted = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    fitted[name] = model
    print(f'{name} trained')


def _positive_class_scores(model, X):
    """Return one score per row of X for the positive class, for ROC purposes.

    Uses predict_proba when the model has it, otherwise a min-max scaled
    decision_function, otherwise the hard predictions as floats. Unlike a
    blanket ``except Exception``, a genuine failure inside predict_proba is
    not swallowed and replaced by fabricated scores.
    """
    if hasattr(model, 'predict_proba'):
        return model.predict_proba(X)[:, 1]
    if hasattr(model, 'decision_function'):
        margins = np.asarray(model.decision_function(X), dtype=float)
        span = margins.max() - margins.min()
        # identical margins carry no ranking information; avoid a 0/0 division
        if span == 0:
            return np.full(margins.shape, 0.5)
        return (margins - margins.min()) / span
    return np.asarray(model.predict(X), dtype=float)


# Evaluate baselines
results = {}
for name, model in fitted.items():
    y_pred = model.predict(X_test)
    y_proba = _positive_class_scores(model, X_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    cm = confusion_matrix(y_test, y_pred)
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)
    results[name] = {'accuracy':acc,'precision':prec,'recall':rec,'f1':f1,'roc_auc':roc_auc,'confusion_matrix':cm}

# One row per model, scalar metrics only (the confusion matrix stays in `results`)
metric_names = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
pd.DataFrame({k: {m: v[m] for m in metric_names} for k, v in results.items()}).T

In [None]:
# 8. Enhanced hyperparameter tuning (RandomizedSearchCV) for RF and XGB
# Each search samples 20 configurations, scores them by F1 on expanding-window
# time-series folds, and leaves the winning estimator in best_rf / best_xgb.

# TimeSeriesSplit: every validation fold comes after its training fold in time
tscv = TimeSeriesSplit(n_splits=4)

# RandomForest search
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
rf_param_dist = {
    'n_estimators': randint(100, 400),
    'max_depth': randint(3, 20),
    'min_samples_split': randint(2, 12),
    'min_samples_leaf': randint(1, 6),
    'max_features': ['sqrt', 'log2', None]
}
rf_search = RandomizedSearchCV(rf, rf_param_dist, n_iter=20, scoring='f1', cv=tscv, random_state=42, n_jobs=-1)
rf_search.fit(X_train, y_train)
best_rf = rf_search.best_estimator_
print('Best RF params:', rf_search.best_params_)

# XGBoost search
# (`use_label_encoder` is deprecated in XGBoost and is no longer passed.)
xgb = XGBClassifier(eval_metric='logloss', random_state=42, n_jobs=-1)
xgb_param_dist = {
    'n_estimators': randint(80, 400),
    'max_depth': randint(2, 12),
    'learning_rate': uniform(0.01, 0.3),    # scipy uniform(loc, scale): [0.01, 0.31]
    'subsample': uniform(0.5, 0.5),         # [0.5, 1.0]
    'colsample_bytree': uniform(0.5, 0.5)   # [0.5, 1.0]
}
xgb_search = RandomizedSearchCV(xgb, xgb_param_dist, n_iter=20, scoring='f1', cv=tscv, random_state=42, n_jobs=-1)
xgb_search.fit(X_train, y_train)
best_xgb = xgb_search.best_estimator_
print('Best XGB params:', xgb_search.best_params_)

# No extra .fit() here: RandomizedSearchCV is used with its default refit=True,
# so best_estimator_ has already been refit on all of X_train / y_train.

# Save
joblib.dump(best_rf, 'best_rf.pkl')
joblib.dump(best_xgb, 'best_xgb.pkl')
print('Saved best_rf.pkl and best_xgb.pkl')

In [None]:
# 9. LightGBM hyperparameter search and fit
# Samples 30 configurations, scores them by F1 on the same time-series folds
# (`tscv`) used for RF/XGB, and saves the winning model.
from lightgbm import LGBMClassifier

# subsample_freq=1 turns row bagging on at every iteration. With LightGBM's
# default of 0 bagging is disabled, so the `subsample` values sampled below
# would have had no effect on the fitted models.
lgb = LGBMClassifier(objective='binary', random_state=42, n_jobs=-1, subsample_freq=1)
param_dist = {
    'n_estimators': randint(80, 600),
    'num_leaves': randint(15, 255),
    'max_depth': randint(3, 16),
    'learning_rate': uniform(0.01, 0.3),    # scipy uniform(loc, scale): [0.01, 0.31]
    'subsample': uniform(0.5, 0.5),         # [0.5, 1.0]
    'colsample_bytree': uniform(0.5, 0.5),  # [0.5, 1.0]
    'reg_alpha': uniform(0.0, 1.0),
    'reg_lambda': uniform(0.0, 1.0)
}
rand_search = RandomizedSearchCV(lgb, param_dist, n_iter=30, scoring='f1', cv=tscv, random_state=42, n_jobs=-1, verbose=1)
print('Fitting LightGBM randomized search...')
rand_search.fit(X_train, y_train)
best_lgb = rand_search.best_estimator_
print('Best LGB params:', rand_search.best_params_)

# Save. best_estimator_ is already refit on all of X_train (refit=True is the
# RandomizedSearchCV default), so no second .fit() is needed.
joblib.dump(best_lgb, 'best_lgbm_model.pkl')
print('Saved best_lgbm_model.pkl')

In [None]:
# 10. Ensemble RF + XGB + LGBM (probability average)
# Reloads the three tuned models from disk and selects the test columns they score.
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, roc_curve, auc

# Load models (pickles written by the tuning cells of this notebook)
best_rf, best_xgb, best_lgb = (
    joblib.load(path) for path in ('best_rf.pkl', 'best_xgb.pkl', 'best_lgbm_model.pkl')
)
models = dict(RandomForest=best_rf, XGBoost=best_xgb, LightGBM=best_lgb)

# Ensure selected features: default to every training column when no
# `selected_features` list has been defined in the notebook namespace.
if 'selected_features' not in globals():
    selected_features = X_train.columns.tolist()

X_test_sel = X_test[selected_features]

# get probas
def get_proba(model, X):
    """Return a 1-D array of positive-class scores for the rows of X.

    Resolution order:
      1. ``predict_proba`` - column 1 of the returned matrix.
      2. ``decision_function`` - margins min-max scaled into [0, 1].
      3. ``predict`` - hard predictions cast to float.

    When every decision_function margin is identical the min-max scaling would
    divide zero by zero and yield NaN; in that case a neutral 0.5 is returned
    for every row. An empty margin array is returned unchanged.
    """
    if hasattr(model, 'predict_proba'):
        return model.predict_proba(X)[:,1]
    if hasattr(model, 'decision_function'):
        scores = np.asarray(model.decision_function(X), dtype=float)
        if scores.size == 0:
            return scores
        lo, hi = scores.min(), scores.max()
        if hi == lo:
            # no spread -> no ranking information; avoid 0/0
            return np.full(scores.shape, 0.5)
        return (scores - lo) / (hi - lo)
    return np.asarray(model.predict(X)).astype(float)

from sklearn.base import clone
from sklearn.metrics import f1_score

# Test-set probabilities from the fully trained models, averaged across models
probas = {name: get_proba(m, X_test_sel) for name,m in models.items()}
# ensemble avg
proba_matrix = np.vstack(list(probas.values()))
ensemble_proba = proba_matrix.mean(axis=0)

# Threshold tuning on the last part of train.
# The loaded models were fit on ALL of X_train, so scoring the validation tail
# with them would be an in-sample evaluation and bias the chosen threshold.
# Instead, unfitted copies (same hyperparameters) are trained on the rows before
# the tail and used only to produce out-of-sample validation probabilities.
val_frac = 0.2
val_idx = int(len(X_train)*(1-val_frac))
X_fit = X_train.iloc[:val_idx][selected_features]
y_fit = y_train.iloc[:val_idx]
X_val = X_train.iloc[val_idx:]
y_val = y_train.iloc[val_idx:]
val_probas = [
    get_proba(clone(m).fit(X_fit, y_fit), X_val[selected_features])
    for m in models.values()
]
val_ensemble = np.mean(np.vstack(val_probas), axis=0)

# Grid of 41 candidate cut-offs from 0.30 to 0.70; keep the first one with the best F1
thresholds = np.linspace(0.3,0.7,41)
best_t=0.5; best_f1=-1
for t in thresholds:
    f = f1_score(y_val, (val_ensemble>=t).astype(int), zero_division=0)
    if f>best_f1:
        best_f1=f; best_t=t
print('Best threshold on val:', best_t, 'f1:', best_f1)

# Final test predictions: full-train models, threshold chosen out-of-sample
y_pred_ens = (ensemble_proba>=best_t).astype(int)

# Evaluate
# Prints the ensemble's test-set metrics, then writes the per-date probabilities to CSV.
from sklearn.metrics import classification_report, matthews_corrcoef

ens_fpr, ens_tpr = roc_curve(y_test, ensemble_proba)[:2]
scalar_metrics = [
    ('Accuracy', accuracy_score(y_test, y_pred_ens)),
    ('Precision', precision_score(y_test, y_pred_ens, zero_division=0)),
    ('Recall', recall_score(y_test, y_pred_ens, zero_division=0)),
    ('F1', f1_score(y_test, y_pred_ens, zero_division=0)),
    ('MCC', matthews_corrcoef(y_test, y_pred_ens)),
    ('ROC AUC', auc(ens_fpr, ens_tpr)),
]

print('Ensemble Evaluation:')
for label, value in scalar_metrics:
    print(label, value)
print('Confusion matrix:\n', confusion_matrix(y_test, y_pred_ens))
print('\nClassification Report:\n', classification_report(y_test, y_pred_ens, zero_division=0))

# Save ensemble probs: one row per test date with the true label and averaged probability
out = pd.DataFrame(
    {'y_true': y_test.values, 'ensemble_proba': ensemble_proba},
    index=pd.Index(X_test.index, name='date'),
)
out.to_csv('ensemble_probs.csv')
print('Saved ensemble_probs.csv')

# Feature importances averaged
# Each model's importances are rescaled to sum to 1 before averaging so that all
# models contribute equally. Without this, a model that reports raw counts
# (LightGBM's default 'split' importance) would dominate models that report
# normalised fractions.
imps = {}
for name,m in models.items():
    if hasattr(m, 'feature_importances_'):
        raw_imp = pd.Series(np.asarray(m.feature_importances_, dtype=float), index=selected_features)
        total_imp = raw_imp.sum()
        # leave an all-zero vector untouched rather than dividing by zero
        imps[name] = raw_imp / total_imp if total_imp > 0 else raw_imp
if imps:
    df_imps = pd.DataFrame(imps).fillna(0)
    df_imps['mean_importance'] = df_imps.mean(axis=1)
    print('Top combined features:')
    display(df_imps['mean_importance'].sort_values(ascending=False).head(15))

print('Ensemble complete.')

In [None]:
# 11. Save final artifacts (models already saved above)
# Lists the model / CSV / notebook files present in the working directory.
print('Files in working dir:')
import os
ARTIFACT_SUFFIXES = ('.pkl', '.csv', '.ipynb')
for f in os.listdir('.'):
    if f.endswith(ARTIFACT_SUFFIXES):
        print('-', f)

# Notebook copy: this cell does not write the notebook itself, so only confirm
# it when the file is really there. The earlier unconditional nbformat.read()
# raised FileNotFoundError whenever the notebook ran under a different name or
# directory, and its result was never used.
NOTEBOOK_NAME = 'Vikash_DataScienceAssignment_final.ipynb'
if os.path.exists(NOTEBOOK_NAME):
    print('Notebook saved as', NOTEBOOK_NAME)
else:
    print(f'Note: {NOTEBOOK_NAME} not found in the working directory; save the notebook manually.')