Stock Prediction Project

In [None]:
#Import libs
import os
import pandas as pd
import numpy as np
from pkg_resources import non_empty_lines

#indicators
from ta.momentum import RSIIndicator, StochasticOscillator, ROCIndicator, WilliamsRIndicator
from ta.trend import MACD, ADXIndicator, EMAIndicator, CCIIndicator, AroonIndicator
from ta.volatility import BollingerBands, AverageTrueRange
from ta.volume import OnBalanceVolumeIndicator, MFIIndicator

#ML stuffs
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# Visual + Warn
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Finance data
import yfinance as yf
from datetime import datetime

# Streamlit
import streamlit as st

warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

Libraries imported successfully!


In [None]:
# Configuration constants shared by every cell below (values chosen from prior research).

# Symbols to model. Tesla was deliberately left out because its price movement
# is usually driven by the CEO and news rather than by technical indicators.
TICKERS = [
    'SPY', # Index
    'QQQ',
    'AAPL', # Heavy hitter for S&P
    'NVDA',
    'MSFT', # Heavy hitter for S&P
    'JPM',
    'AMZN', # Heavy hitter for S&P
    'XOM',
    'BAC',
    'JNJ'
]

# Download window: ten years of history, passed to download_stock_data().

START_DATE = '2015-01-01'
END_DATE = '2025-01-01'

# Number of rows (trading days) the target looks ahead: the label compares
# Close[t + PREDICTION_HORIZON] with Close[t].
PREDICTION_HORIZON = 5

# Number of folds given to TimeSeriesSplit during cross-validation.
N_SPLITS = 5

# Rows skipped between each training fold and its test fold (TimeSeriesSplit gap).
# NOTE(review): keep this >= PREDICTION_HORIZON; the labels at the end of a
# training fold look PREDICTION_HORIZON rows ahead and would otherwise overlap
# the test fold.
GAP = 5


# Global plotting defaults for seaborn / matplotlib.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

In [None]:
# Helper functions

def download_stock_data(ticker, start, end):
    """Download auto-adjusted price history for one ticker via yfinance.

    :param ticker: symbol passed straight to ``yf.download``
    :param start: start of the download window
    :param end: end of the download window
    :return: the downloaded DataFrame with single-level columns, or ``None``
        when the result is empty or any exception is raised along the way
    """
    try:
        print(f"Downloading {ticker} data from {start} to {end}...")
        # multi_level_index=False asks yfinance for flat column labels, which
        # avoids 2D shape errors in the indicator code downstream.
        prices = yf.download(
            ticker,
            start=start,
            end=end,
            auto_adjust=True,
            progress=False,
            multi_level_index=False,
        )

        if prices.empty:
            print(f"No data available for {ticker}...")
            return None

        # Belt and braces: if a MultiIndex still comes back, keep level 0 only.
        columns_are_nested = isinstance(prices.columns, pd.MultiIndex)
        if columns_are_nested:
            prices.columns = prices.columns.get_level_values(0)
    except Exception as e:
        # Best-effort download: report the problem and let the caller skip the ticker.
        print(f"Error downloading {ticker}: {str(e)}")
        return None

    return prices


def add_tier1_indicators(df):
    """Add the core ("tier 1") technical indicators to ``df`` in place.

    Reads the ``Close``, ``High`` and ``Low`` columns and writes, in this
    order: ``rsi``, ``macd_line``, ``macd_signal``, ``macd_hist``,
    ``bb_width``, ``bb_percent``, ``adx``, ``adx_pos``, ``adx_neg``,
    ``adx_diff`` and ``atr_norm``.

    :param df: price DataFrame; it is mutated and also returned
    :return: the same DataFrame object with the new columns attached
    """
    close = df['Close']
    high = df['High']
    low = df['Low']

    # Relative Strength Index over 14 periods.
    df['rsi'] = RSIIndicator(close=close, window=14).rsi()

    # MACD with the 12/26/9 windows: line, signal line and histogram.
    macd_calc = MACD(close=close, window_slow=26, window_fast=12, window_sign=9)
    df['macd_line'] = macd_calc.macd()
    df['macd_signal'] = macd_calc.macd_signal()
    df['macd_hist'] = macd_calc.macd_diff()

    # Bollinger Bands (20 periods, 2 deviations): band width and percent-B.
    bands = BollingerBands(close=close, window=20, window_dev=2)
    df['bb_width'] = bands.bollinger_wband()
    df['bb_percent'] = bands.bollinger_pband()

    # Average Directional Index plus its positive / negative components.
    directional = ADXIndicator(high=high, low=low, close=close, window=14)
    df['adx'] = directional.adx()
    df['adx_pos'] = directional.adx_pos()
    df['adx_neg'] = directional.adx_neg()
    # Signed difference of the two components acts as the directional feature.
    df['adx_diff'] = df['adx_pos'] - df['adx_neg']

    # Average True Range divided by the close so it is comparable across price levels.
    true_range = AverageTrueRange(high=high, low=low, close=close, window=14)
    df['atr_norm'] = true_range.average_true_range() / close

    return df

def add_tier2_indicators(df):
    """Add the recommended ("tier 2") indicators to ``df`` in place.

    Reads ``Close``, ``High``, ``Low`` and ``Volume`` and writes, in this
    order: ``stoch_k``, ``stoch_d``, ``obv_roc``, ``williams_r``,
    ``ema_ratio``, ``roc_5`` and ``roc_10``.

    :param df: price DataFrame; it is mutated and also returned
    :return: the same DataFrame object with the new columns attached
    """
    # Stochastic Oscillator: %K and its smoothed signal line %D.
    stoch = StochasticOscillator(high=df['High'], low=df['Low'], close=df['Close'], window=14, smooth_window=3)
    df['stoch_k'] = stoch.stoch()
    df['stoch_d'] = stoch.stoch_signal()

    # On-Balance Volume, expressed as a 5-row rate of change rather than the
    # absolute running total.
    obv = OnBalanceVolumeIndicator(close=df['Close'], volume=df['Volume'])
    obv_roc = obv.on_balance_volume().pct_change(5)
    # pct_change divides by the value 5 rows earlier; when that value is 0 the
    # result is +/-inf. dropna() does not remove inf, and scaling the features
    # later would fail on it, so mark those rows as missing instead.
    df['obv_roc'] = obv_roc.replace([np.inf, -np.inf], np.nan)

    # Williams %R over a 14-period lookback.
    df['williams_r'] = WilliamsRIndicator(high=df['High'], low=df['Low'], close=df['Close'], lbp=14).williams_r()

    # Ratio of the fast (12) to the slow (26) exponential moving average.
    ema_12 = EMAIndicator(close=df['Close'], window=12).ema_indicator()
    ema_26 = EMAIndicator(close=df['Close'], window=26).ema_indicator()
    df['ema_ratio'] = ema_12 / ema_26

    # Price rate of change over 5 and 10 periods.
    df['roc_5'] = ROCIndicator(close=df['Close'], window=5).roc()
    df['roc_10'] = ROCIndicator(close=df['Close'], window=10).roc()
    return df

def add_tier3_indicators(df):
    """Add the optional ("tier 3") indicators to ``df`` in place.

    Reads ``High``, ``Low``, ``Close`` and ``Volume`` and writes ``cci``,
    ``mfi`` and ``aroon_indicator``.

    :param df: price DataFrame; it is mutated and also returned
    :return: the same DataFrame object with the new columns attached
    """
    high, low, close = df['High'], df['Low'], df['Close']

    # Commodity Channel Index over 20 periods.
    cci_calc = CCIIndicator(high=high, low=low, close=close, window=20)
    df['cci'] = cci_calc.cci()

    # Money Flow Index over 14 periods (the only tier-3 feature using volume).
    mfi_calc = MFIIndicator(high=high, low=low, close=close, volume=df['Volume'], window=14)
    df['mfi'] = mfi_calc.money_flow_index()

    # Aroon indicator over 25 periods.
    aroon_calc = AroonIndicator(high=high, low=low, window=25)
    df['aroon_indicator'] = aroon_calc.aroon_indicator()

    return df

def add_all_indicators(df, tier = 'tier2'):
    """Attach technical-indicator columns for the requested tier.

    Tiers are cumulative: tier 1 indicators are always added, ``'tier2'`` adds
    the tier 2 set on top, and ``'tier3'`` adds both the tier 2 and tier 3
    sets. Any other value of ``tier`` results in tier 1 indicators only.

    :param df: price DataFrame handed to the per-tier helper functions
    :param tier: ``'tier1'``, ``'tier2'`` or ``'tier3'``
    :return: the DataFrame returned by the last helper that ran
    """
    enriched = add_tier1_indicators(df)

    wants_tier3 = tier == 'tier3'
    if wants_tier3 or tier == 'tier2':
        enriched = add_tier2_indicators(enriched)
    if wants_tier3:
        enriched = add_tier3_indicators(enriched)

    return enriched

def create_target_var(df, horizon=5):
    """Add a binary ``target`` column: 1 if the close is higher ``horizon`` rows ahead.

    ``target`` is 1.0 when ``Close`` at row ``t + horizon`` is strictly greater
    than ``Close`` at row ``t``, and 0.0 otherwise. Rows whose future close is
    not available (the last ``horizon`` rows) or whose own close is missing
    get NaN, so that a later ``dropna()`` removes them instead of training on
    a fabricated "Down" label.

    :param df: DataFrame with a ``Close`` column; it is mutated and also returned
    :param horizon: number of rows to look ahead
    :return: the same DataFrame object with the ``target`` column attached
    """
    current_close = df['Close']
    future_close = current_close.shift(-horizon)

    # A comparison against NaN evaluates to False, which would silently turn
    # "unknown" into "Down"; mask those rows back to NaN.
    outcome_known = future_close.notna() & current_close.notna()
    went_up = (future_close > current_close).astype(float)

    df['target'] = went_up.where(outcome_known)
    return df

def get_feature_columns(tier = 'tier2'):
    """Return the list of model feature column names for a tier.

    Tiers are cumulative. ``'tier1'`` gives the core features, ``'tier2'`` adds
    the tier 2 features, and any other value gives all three tiers.

    :param tier: ``'tier1'``, ``'tier2'`` or anything else for the full set
    :return: a new list of column names, in tier order
    """
    core = ('rsi', 'macd_hist', 'bb_width', 'bb_percent', 'adx', 'adx_diff', 'atr_norm')
    recommended = ('stoch_k', 'obv_roc', 'williams_r', 'ema_ratio', 'roc_5', 'roc_10')
    optional = ('cci', 'mfi', 'aroon_indicator')

    selected = list(core)
    if tier == 'tier1':
        return selected

    selected.extend(recommended)
    if tier == 'tier2':
        return selected

    selected.extend(optional)
    return selected

def check_feature_correlation(df, feature_cols, threshold=0.9):
    """Report feature pairs whose absolute correlation exceeds ``threshold``.

    Highly correlated features (above roughly 0.9 according to the research
    notes) are candidates for removal. This function only reports them; it
    does not drop anything.

    :param df: DataFrame containing the feature columns
    :param feature_cols: names of the columns to compare
    :param threshold: absolute correlation above which a pair is reported
    :return: list of dicts with keys ``feature1``, ``feature2`` and
        ``correlation``, one per offending pair (empty if there are none)
    """
    abs_corr = df[feature_cols].corr().abs()
    names = abs_corr.columns
    n_features = len(names)

    # Walk the upper triangle only so each unordered pair is seen once.
    high_corr_pairs = [
        {
            'feature1': names[row],
            'feature2': names[col],
            'correlation': abs_corr.iloc[row, col],
        }
        for row in range(n_features)
        for col in range(row + 1, n_features)
        if abs_corr.iloc[row, col] > threshold
    ]

    if not high_corr_pairs:
        print(f"\n No highly correlated features (>{threshold})")
        return high_corr_pairs

    print(f"\n High correlation pairs found (>{threshold})")
    for pair in high_corr_pairs:
        print(f" - {pair['feature1']} <-> {pair['feature2']}: {pair['correlation']:.3f}")
    return high_corr_pairs

def train_xgboost_model(X_train, X_test, y_train, y_test, verbose=False):
    """Fit a regularised XGBoost classifier and score it on the test split.

    :param X_train: training feature matrix
    :param X_test: test feature matrix
    :param y_train: training labels
    :param y_test: test labels, used only to compute accuracy
    :param verbose: forwarded to ``XGBClassifier.fit``
    :return: tuple ``(fitted model, predictions for X_test, accuracy)``
    """
    hyperparams = dict(
        max_depth=6,
        min_child_weight=3,
        subsample=0.7,
        colsample_bytree=0.7,
        learning_rate=0.05,
        n_estimators=100,
        reg_lambda=1.0,
        random_state=42,
        eval_metric='logloss',
        use_label_encoder=False,
    )
    classifier = xgb.XGBClassifier(**hyperparams)

    classifier.fit(X_train, y_train, verbose=verbose)
    predictions = classifier.predict(X_test)

    return classifier, predictions, accuracy_score(y_test, predictions)

def train_random_forest_model(X_train, X_test, y_train, y_test):
    """Fit a regularised random forest classifier and score it on the test split.

    :param X_train: training feature matrix
    :param X_test: test feature matrix
    :param y_train: training labels
    :param y_test: test labels, used only to compute accuracy
    :return: tuple ``(fitted model, predictions for X_test, accuracy)``
    """
    hyperparams = dict(
        n_estimators=100,
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=2,
        max_features='sqrt',
        random_state=42,
        n_jobs=-1,
    )
    forest = RandomForestClassifier(**hyperparams)

    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)

    return forest, predictions, accuracy_score(y_test, predictions)

def _build_final_model(model_type):
    """Return an unfitted classifier for the final fit on the full history.

    ``'xgboost'`` selects an ``XGBClassifier``; any other value selects a
    ``RandomForestClassifier``.
    """
    if model_type == 'xgboost':
        return xgb.XGBClassifier(
            max_depth=6,
            min_child_weight=3,
            subsample=0.7,
            colsample_bytree=0.7,
            learning_rate=0.05,
            n_estimators=100,
            reg_lambda=1.0,
            random_state=42,
            eval_metric='logloss',
            use_label_encoder=False
        )
    return RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=2,
        max_features='sqrt',
        random_state=42,
        n_jobs=-1
    )


def train_and_evaluate(df, ticker, tier='tier2', model_type='xgboost'):
    """Cross-validate a classifier for one ticker, then refit it on every row.

    Rows containing any NaN are dropped first. The remaining rows are split
    with ``TimeSeriesSplit(n_splits=N_SPLITS, gap=GAP)``; each fold fits a
    fresh ``StandardScaler`` on its training part only, trains the requested
    model and records accuracy, predictions and feature importances. A final
    model is then fitted on all cleaned rows.

    :param df: DataFrame holding the feature columns for ``tier`` and a
        ``target`` column, indexed by date
    :param ticker: symbol, used for log output and in the result
    :param tier: feature tier passed to ``get_feature_columns``
    :param model_type: ``'xgboost'`` or anything else for random forest
    :return: ``None`` when fewer than 100 clean rows are available, otherwise
        a dict with the final ``model``, the ``scaler`` it was fitted with,
        accuracy statistics, per-fold accuracies, feature names, averaged
        feature importances and the pooled confusion matrix
    """
    feature_cols = get_feature_columns(tier)
    df_clean = df.dropna()

    print(f"\n{'='*70}")
    print(f"Training {model_type.upper()} model for {ticker} (Tier: {tier})")
    print(f"{'='*70}")
    print(f"Total samples: {len(df_clean):,}")
    print(f"Features: {len(feature_cols)}")

    # Guard before touching df_clean.index[0]: an empty frame would otherwise
    # raise IndexError on the date-range line instead of being skipped.
    if len(df_clean) < 100:
        print("Not enough data to train. Skipping.")
        return None

    print(f"Date range: {df_clean.index[0].date()} to {df_clean.index[-1].date()}")

    class_balance = df_clean['target'].value_counts()
    print(f"\nClass balance:")
    try:
        print(f"  Down (0): {class_balance[0]:,} ({class_balance[0]/len(df_clean)*100:.1f}%)")
        print(f"  Up (1): {class_balance[1]:,} ({class_balance[1]/len(df_clean)*100:.1f}%)")
    except KeyError:
        # One of the two classes is absent from the data.
        print(f"  Class balance issue: {class_balance}")

    check_feature_correlation(df_clean, feature_cols)
    tscv = TimeSeriesSplit(n_splits=N_SPLITS, gap=GAP)

    fold_accuracies = []
    all_predictions = []
    all_actual = []  # flat label values pooled over folds, not DataFrames
    feature_importances = []

    print(f"\nCross-validation with {N_SPLITS} folds (gap={GAP} days):")
    print("-" * 70)

    for fold, (train_idx, test_idx) in enumerate(tscv.split(df_clean), 1):
        train_part = df_clean.iloc[train_idx]
        test_part = df_clean.iloc[test_idx]

        X_train = train_part[feature_cols]
        X_test = test_part[feature_cols]
        # 1-D Series rather than one-column DataFrames: estimators expect a
        # flat label vector.
        y_train = train_part['target']
        y_test = test_part['target']

        train_dates = train_part.index
        test_dates = test_part.index

        # Fit the scaler on the training part only so that no statistics from
        # the test window leak into training.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        if model_type == 'xgboost':
            model, y_pred, accuracy = train_xgboost_model(
                X_train_scaled, X_test_scaled, y_train, y_test
            )
        else:
            model, y_pred, accuracy = train_random_forest_model(
                X_train_scaled, X_test_scaled, y_train, y_test
            )

        fold_accuracies.append(accuracy)
        all_predictions.extend(y_pred)
        all_actual.extend(y_test.values)
        feature_importances.append(model.feature_importances_)

        print(f"Fold {fold}: {accuracy:.4f} ({accuracy*100:.2f}%) | "
              f"Train: {train_dates[0].date()} to {train_dates[-1].date()} | "
              f"Test: {test_dates[0].date()} to {test_dates[-1].date()}")

    avg_accuracy = np.mean(fold_accuracies)
    std_accuracy = np.std(fold_accuracies)
    min_accuracy = min(fold_accuracies)
    max_accuracy = max(fold_accuracies)

    print(f"\n{'='*70}")
    print(f"Overall Results for {ticker}")
    print(f"{'='*70}")
    print(f"Average Accuracy: {avg_accuracy:.4f} ({avg_accuracy*100:.2f}%)")

    if avg_accuracy >= 0.60:
        print(f"Target Met: Achieved 60%+ accuracy!")
    else:
        shortfall = (0.60 - avg_accuracy) * 100
        print(f"Target Not Met: {shortfall:.2f}% below 60% target")

    # Pin labels to [0, 1] so the report and the confusion matrix keep a fixed
    # shape even if one class never appears in the pooled test folds; without
    # it, target_names with two entries raises when only one class is present.
    print(f"\nClassification Report:")
    print(classification_report(
        all_actual,
        all_predictions,
        labels=[0, 1],
        target_names=['Down', 'Up'],
        digits=4,
        zero_division=0,
    ))

    cm = confusion_matrix(all_actual, all_predictions, labels=[0, 1])

    # Final model: refit on every cleaned row with its own scaler.
    X_all = df_clean[feature_cols]
    y_all = df_clean['target']
    scaler_final = StandardScaler()
    X_all_scaled = scaler_final.fit_transform(X_all)

    final_model = _build_final_model(model_type)
    final_model.fit(X_all_scaled, y_all)
    avg_feature_importance = np.mean(feature_importances, axis=0)

    return {
        'ticker': ticker,
        'model_type': model_type,
        'tier': tier,
        'model': final_model,
        # The final model was fitted on scaled inputs; callers need this
        # scaler to transform new rows before calling model.predict().
        'scaler': scaler_final,
        'avg_accuracy': avg_accuracy,
        'std_accuracy': std_accuracy,
        'min_accuracy': min_accuracy,
        'max_accuracy': max_accuracy,
        'fold_accuracies': fold_accuracies,
        'feature_cols': feature_cols,
        'feature_importance': dict(zip(feature_cols, avg_feature_importance)),
        'confusion_matrix': cm
    }

In [None]:
def plot_feature_importance(results, save_dir='./plots'):
    """Save a horizontal bar chart of feature importances for one result.

    :param results: result dict providing ``feature_importance``, ``ticker``,
        ``model_type`` and ``avg_accuracy``
    :param save_dir: directory for the PNG; created if it does not exist
    :return: path of the written image file
    """
    os.makedirs(save_dir, exist_ok=True)

    records = [
        {'feature' : name, 'importance' : score}
        for name, score in results['feature_importance'].items()
    ]
    importance_df = pd.DataFrame(records).sort_values('importance', ascending=False)

    fig, ax = plt.subplots(figsize=(12, 8))
    bar_colors = sns.color_palette("viridis", len(importance_df))
    sns.barplot(data=importance_df, y='feature', x='importance', palette=bar_colors, ax=ax)

    heading = (f"Feature Importance - {results['ticker']} ({results['model_type'].upper()})\n"
               f"Accuracy: {results['avg_accuracy']*100:.2f}%")
    ax.set_title(heading, fontsize=14, fontweight='bold')
    ax.set_xlabel('Importance Score', fontsize=12)
    ax.set_ylabel('Feature', fontsize=12)
    fig.tight_layout()

    filename = f"{save_dir}/feature_importance_{results['ticker']}_{results['model_type']}.png"
    fig.savefig(filename, dpi=150, bbox_inches='tight')
    # Release the figure; this function is called once per ticker.
    plt.close(fig)

    return filename

def plot_accuracy_comparison(all_results, save_dir='./plots'):
    """Save a grouped bar chart comparing average accuracy across tickers.

    :param all_results: list of result dicts providing ``ticker``,
        ``avg_accuracy`` and ``model_type``
    :param save_dir: directory for the PNG; created if it does not exist
    :return: path of the written image file
    """
    os.makedirs(save_dir, exist_ok=True)

    df_comp = pd.DataFrame([
        {
            'Ticker': entry['ticker'],
            'Accuracy': entry['avg_accuracy'] * 100,
            'Model': entry['model_type'].upper()
        }
        for entry in all_results
    ])

    fig, ax = plt.subplots(figsize=(14, 8))
    sns.barplot(data=df_comp, x='Ticker', y='Accuracy', hue='Model', palette='Set2', ax=ax)

    # Reference line at the 60% accuracy goal.
    ax.axhline(y=60, color='red', linestyle='--', linewidth=2, label='60% Target')

    # Print the value on top of every bar.
    for bars in ax.containers:
        ax.bar_label(bars, fmt='%.1f%%', padding=3)

    ax.set_title('Model Accuracy Comparison Across Stocks\n5-Day Price Direction Prediction',
                 fontsize=14, fontweight='bold')
    ax.set_xlabel('Stock Ticker', fontsize=12)
    ax.set_ylabel('Accuracy (%)', fontsize=12)
    ax.legend(title='Model Type', fontsize=10)

    # Leave headroom above the tallest bar, but never less than 70%.
    y_top = max(df_comp['Accuracy'].max() + 5, 70)
    ax.set_ylim(40, y_top)
    ax.grid(axis='y', alpha=0.3)
    fig.tight_layout()

    filename = f"{save_dir}/accuracy_comparison.png"
    fig.savefig(filename, dpi=150, bbox_inches='tight')
    plt.close(fig)

    return filename

def save_detailed_results(all_results, filename = 'detailed_results.csv'):
    """Write one CSV row per (result, fold) pair and return the table.

    Accuracy figures are stored as percentages. The per-result summary
    statistics are repeated on every fold row of that result.

    :param all_results: list of result dicts as produced by the training step
    :param filename: destination CSV path
    :return: DataFrame with the rows that were written
    """
    rows = [
        {
            'Ticker': result['ticker'],
            'Model': result['model_type'],
            'Tier': result['tier'],
            'Fold': fold_num,
            'Accuracy': fold_acc * 100,
            'Average_Accuracy': result['avg_accuracy'] * 100,
            'Std_Dev': result['std_accuracy'] * 100,
            'Min_Accuracy': result['min_accuracy'] * 100,
            'Max_Accuracy': result['max_accuracy'] * 100
        }
        for result in all_results
        for fold_num, fold_acc in enumerate(result['fold_accuracies'], 1)
    ]

    df = pd.DataFrame(rows)
    df.to_csv(filename, index=False)
    print(f"\nDetailed results saved to: {filename}")

    return df

def print_final_summary(all_results):
    """Print a per-ticker table plus overall statistics and return the table.

    :param all_results: non-empty list of result dicts providing ``ticker``,
        ``model_type``, ``tier`` and the accuracy statistics
    :return: DataFrame holding the formatted per-ticker summary rows
    """
    banner = "="*70

    print("\n" + banner)
    print("FINAL SUMMARY - ALL STOCKS")
    print(banner)

    summary_df = pd.DataFrame([
        {
            'Ticker': r['ticker'],
            'Model': r['model_type'].upper(),
            'Tier': r['tier'].upper(),
            'Avg Acc': f"{r['avg_accuracy']*100:.2f}%",
            'Std Dev': f"{r['std_accuracy']*100:.2f}%",
            'Min': f"{r['min_accuracy']*100:.2f}%",
            'Max': f"{r['max_accuracy']*100:.2f}%",
            'Target Met': '✓' if r['avg_accuracy'] >= 0.60 else '✗'
        }
        for r in all_results
    ])
    print(summary_df.to_string(index=False))

    averages = [r['avg_accuracy'] for r in all_results]
    overall_avg = np.mean(averages)
    overall_std = np.std(averages)
    stocks_above_60 = sum(1 for value in averages if value >= 0.60)

    # Look up the extremes once; each is reused for both ticker and score.
    best = max(all_results, key=lambda x: x['avg_accuracy'])
    worst = min(all_results, key=lambda x: x['avg_accuracy'])

    print(f"\n{'='*70}")
    print(f"Overall Statistics")
    print(f"{'='*70}")
    print(f"Average Accuracy Across All Stocks: {overall_avg*100:.2f}% (±{overall_std*100:.2f}%)")
    print(f"Stocks Meeting 60% Target: {stocks_above_60}/{len(all_results)}")
    print(f"Best Performing Stock: {best['ticker']} "
          f"({best['avg_accuracy']*100:.2f}%)")
    print(f"Worst Performing Stock: {worst['ticker']} "
          f"({worst['avg_accuracy']*100:.2f}%)")

    if overall_avg >= 0.60:
        print(f"\nSuccess: Overall average EXCEEDS 60% target!")
        return summary_df

    gap = (0.60 - overall_avg) * 100
    print(f"\nOverall average is {gap:.2f}% below 60% target")
    print(f"Recommendations:")
    print(f"   - Try tier3 indicators for more features")
    print(f"   - Test Random Forest model as alternative")
    print(f"   - Consider ensemble methods (combine XGBoost + RF)")

    return summary_df


I want to use Streamlit to create an interactive, dashboard-like page for the charts.

In [None]:
def render_header():
  """Render the dashboard title and the introductory paragraph."""
  intro = """
  This app is designed to predict whether a stock will close **higher** or **lower** in 5 days based on technical indicators such as RSI, MACD, Bollinger Bands, and more."""

  st.title("Machine Learning Stock Predictor Dashboard")
  st.markdown(intro)

def render_sidebar():
  """Render the sidebar inputs and return the user's selections.

  :return: tuple ``(ticker, start_date, end_date)`` where ``ticker`` is the
      entered text with surrounding whitespace removed and upper-cased, and
      the dates are whatever the two date inputs return
  """
  st.sidebar.header("Configuration")
  # Strip before upper-casing: a stray leading/trailing space typed into the
  # box would otherwise become part of the symbol.
  ticker = st.sidebar.text_input("Enter Stock Ticker", value="SPY").strip().upper()
  start_date = st.sidebar.date_input("Start Date", value=pd.to_datetime("2015-01-01"))
  end_date = st.sidebar.date_input("End Date", value=pd.to_datetime("2025-01-01"))
  return ticker, start_date, end_date

def display_metrics(accuracy, last_date):
  """Show accuracy and last data date side by side, plus a target banner.

  :param accuracy: model accuracy as a fraction (0.60 means 60%)
  :param last_date: object exposing ``.date()``; its date is shown as text
  """
  left, right = st.columns(2)
  left.metric("Model Accuracy", f"{accuracy*100:.2f}%")
  right.metric("Last Date Point", str(last_date.date()))

  target_met = accuracy >= 0.60
  if target_met:
    st.success("Target Accuracy has been met! (60%+)")
  else:
    st.warning(f"Target Not Met ({accuracy*100:.2f}%)")

def plot_importance(model, feature_cols):
    """Render a feature-importance bar chart inside the Streamlit page.

    :param model: fitted estimator exposing ``feature_importances_``
    :param feature_cols: feature names, in the same order as the importances
    """
    importance_df = pd.DataFrame({
        'Feature': feature_cols,
        'Importance': model.feature_importances_
    }).sort_values('Importance', ascending=False)

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=importance_df, x='Importance', y='Feature', palette='viridis', ax=ax)
    ax.set_title("Feature Importance")
    st.pyplot(fig)
    # Streamlit re-executes the script on every interaction; without an
    # explicit close each rerun would leave another open matplotlib figure.
    plt.close(fig)



Testing the pipeline with a main run and creating the app!

In [None]:
# End-to-end run: for every ticker, download prices, add indicators, build the
# target, train/evaluate a model and collect the result; then write reports.
all_results = []

print("Starting ML Pipeline Execution...")
print(f"Targeting: {len(TICKERS)} stocks | Horizon: {PREDICTION_HORIZON} days")
print("-" * 50)

for ticker in TICKERS:
    # A failure for one ticker is reported and skipped so the remaining
    # tickers are still processed.
    try:
        # Download
        df = download_stock_data(ticker, START_DATE, END_DATE)

        # Check if data is valid (download returns None on failure or empty data)
        if df is None or len(df) < 200:
            print(f"Skipping {ticker}: Insufficient data.")
            continue

        # Add Indicators
        try:
            df = add_all_indicators(df, tier='tier3')
        except Exception as e:
            print(f"Error adding indicators for {ticker}: {e}")
            continue
        # Create Target
        df = create_target_var(df, horizon=PREDICTION_HORIZON)
        # Train & Evaluate (tier must match the one used for the indicators above)
        result = train_and_evaluate(df, ticker, tier='tier3', model_type='xgboost')

        # train_and_evaluate returns None when there is not enough clean data
        if result:
            all_results.append(result)

            # Plotting is best-effort: a plotting failure must not discard the result
            try:
                plot_feature_importance(result)
            except Exception as e:
                print(f"Could not plot importance for {ticker}: {e}")

    except Exception as e:
        print(f"CRITICAL ERROR processing {ticker}: {str(e)}")
        continue


# Reports are only produced when at least one ticker trained successfully.
if all_results:
    print("\n" + "="*50)
    print("Generating Final Reports...")
    print("="*50)

    try:
        save_detailed_results(all_results)
        plot_accuracy_comparison(all_results)
        print_final_summary(all_results)
        print("\nExecution Complete! Check the './plots' folder for images.")
    except Exception as e:
        print(f"Error generating final reports: {e}")
else:
    print("\nNo results were generated. Please check your data connection and try again.")

Starting ML Pipeline Execution...
Targeting: 10 stocks | Horizon: 5 days
--------------------------------------------------
Downloading SPY data from 2015-01-01 to 2025-01-01...

Training XGBOOST model for SPY (Tier: tier3)
Total samples: 2,483
Features: 16
Date range: 2015-02-20 to 2024-12-31

Class balance:
  Down (0): 961 (38.7%)
  Up (1): 1,522 (61.3%)

 High correlation pairs found (>0.9)
 - rsi <-> bb_percent: 0.906
 - rsi <-> adx_diff: 0.943
 - bb_percent <-> stoch_k: 0.938
 - bb_percent <-> williams_r: 0.938
 - bb_percent <-> cci: 0.985
 - stoch_k <-> williams_r: 1.000
 - stoch_k <-> cci: 0.903
 - williams_r <-> cci: 0.903

Cross-validation with 5 folds (gap=5 days):
----------------------------------------------------------------------
Fold 1: 0.4625 (46.25%) | Train: 2015-02-20 to 2016-10-07 | Test: 2016-10-17 to 2018-06-07
Fold 2: 0.5351 (53.51%) | Train: 2015-02-20 to 2018-05-31 | Test: 2018-06-08 to 2020-01-29
Fold 3: 0.5763 (57.63%) | Train: 2015-02-20 to 2020-01-22 | Tes