# Baseline XGBoost Model Training for Stock Price Movement Prediction

This notebook demonstrates a baseline workflow for training an XGBoost model to predict significant stock price movements.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import sys
import os

# Add src directory to Python path to import custom modules
module_path = os.path.abspath(os.path.join('..')) # Assumes notebook is in 'notebooks' directory
if module_path not in sys.path:
    sys.path.append(module_path)

from src.data_loader import download_stock_data
from src.feature_engineering import add_technical_indicators, add_rolling_lag_features, create_target_variable

## 2. Configuration and Parameters

In [None]:
# --- Data download settings ---
TICKER = 'AAPL'  # example symbol; any ticker accepted by the data loader can be used
INTERVAL = '1d'
# An early start date leaves enough history for the longer rolling/lag windows.
START_DATE = '2019-01-01'
END_DATE = '2023-12-31'

# --- Target definition ---
# Look-ahead horizon (X days) and change threshold (Y%) handed to create_target_variable.
FUTURE_DAYS_TARGET = 5
PERCENT_CHANGE_THRESHOLD = 0.03

# --- Rolling / lag feature settings (all tunable) ---
ROLLING_WINDOWS = [5, 10, 20, 60]
LAG_PERIODS = [1, 2, 3, 5, 10]
# Key indicator columns for which lagged copies are requested.
KEY_LAG_INDICATORS = ['RSI_14', 'MACD', 'ATR_14', 'Stoch_k', 'ADX_14']

# --- Train/test settings ---
TEST_SIZE = 0.2
RANDOM_STATE = 42  # fixed seed, kept for reproducibility

## 3. Download Data

In [None]:
# Fetch price history for the configured ticker, date range and bar interval.
raw_data = download_stock_data([TICKER], START_DATE, END_DATE, INTERVAL)

if raw_data is None:
    # No recovery is attempted here (e.g. exiting or loading sample data);
    # the notebook simply reports the failure and carries on.
    print(f"Failed to download data for {TICKER}.")
else:
    print(f"Downloaded data for {TICKER}:")
    print(raw_data.head())
    print(f"Shape of raw data: {raw_data.shape}")

## 4. Feature Engineering

### 4.1. Add Technical Indicators

In [None]:
if raw_data is None:
    print("Skipping technical indicators as raw_data is None.")
else:
    # The helper receives a copy, so raw_data itself is never modified.
    # fillna=True is forwarded to the helper and matters for the later NaN handling.
    data_with_ti = add_technical_indicators(raw_data.copy(), fillna=True)

    print("\nData with Technical Indicators:")
    print(data_with_ti.head())
    print(f"Shape after adding technical indicators: {data_with_ti.shape}")
    # Tip: data_with_ti.columns.tolist() lists every column now present.

### 4.2. Add Rolling and Lag Features

In [None]:
# Run only when the indicator step actually produced a frame.
if 'data_with_ti' not in locals() or data_with_ti is None:
    print("Skipping rolling/lag features as data_with_ti is not available.")
else:
    # Rolling windows, lag periods and the indicators to lag all come from the
    # configuration cell; the helper works on a copy of the indicator frame.
    data_with_roll_lag = add_rolling_lag_features(
        data_with_ti.copy(),
        lag_indicators=KEY_LAG_INDICATORS,
        lags=LAG_PERIODS,
        windows=ROLLING_WINDOWS,
    )

    print("\nData with Rolling and Lag Features:")
    print(data_with_roll_lag.head())
    print(f"Shape after adding rolling/lag features: {data_with_roll_lag.shape}")
    # Tip: data_with_roll_lag.columns.tolist() lists every column now present.

### 4.3. Create Target Variable

In [None]:
# Run only when the rolling/lag step actually produced a frame.
if 'data_with_roll_lag' not in locals() or data_with_roll_lag is None:
    print("Skipping target variable creation as data_with_roll_lag is not available.")
else:
    # Horizon and threshold come from the configuration cell; the helper gets a
    # copy so data_with_roll_lag is left as it was.
    processed_data = create_target_variable(
        data_with_roll_lag.copy(),
        percent_change_threshold=PERCENT_CHANGE_THRESHOLD,
        future_days=FUTURE_DAYS_TARGET,
    )

    print("\nData with Target Variable:")
    print(processed_data.head())
    print(f"Shape after adding target variable: {processed_data.shape}")
    # Tip: processed_data[processed_data['target'] == 1].head() shows some positive rows.

## 5. Data Preprocessing for Model Training

### 5.1. Select Features (X) and Target (y), Drop NaNs

In [None]:
if 'processed_data' not in locals() or processed_data is None:
    print("Skipping feature/target selection as processed_data is not available.")
else:
    # Model inputs are every column except the raw price/volume columns and the
    # label itself, i.e. the indicator, rolling and lag columns added earlier.
    base_price_volume_cols = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
    feature_columns = [
        col for col in processed_data.columns
        if col not in base_price_volume_cols and col != 'target'
    ]

    y = processed_data['target']
    X = processed_data[feature_columns]

    print(f"\nNumber of features selected: {len(X.columns)}")
    print(f"Original shape before NaN drop: X: {X.shape}, y: {y.shape}")

    # Features and label are joined first so that a single dropna() removes any
    # row with a gap in either; the two are then separated again.
    combined_for_cleaning = X.assign(target=y)
    cleaned_data = combined_for_cleaning.dropna()

    y_cleaned = cleaned_data['target']
    X_cleaned = cleaned_data[feature_columns]

    print(f"Shape after dropping NaNs: X_cleaned: {X_cleaned.shape}, y_cleaned: {y_cleaned.shape}")

    if not (X_cleaned.empty or y_cleaned.empty):
        print("\nFeatures (X_cleaned head) after NaN drop:")
        print(X_cleaned.head())
        print("\nTarget (y_cleaned head) after NaN drop:")
        print(y_cleaned.head())
    else:
        # Nothing survived the cleaning step: list the usual remedies.
        print("\n".join((
            "\nError: No data left after NaN removal. Consider:",
            "- Using a longer date range for data download.",
            "- Reducing window sizes for rolling features or number of lag periods.",
            "- Using fillna(0) or other imputation for some features (though this might impact model quality).",
        )))

### 5.2. Split Data into Training and Testing Sets

In [None]:
if 'X_cleaned' in locals() and not X_cleaned.empty:
    # Chronological split: the leading (1 - TEST_SIZE) share of rows is used for
    # training and the trailing share is held out. shuffle=False keeps time order,
    # which is what prevents lookahead bias in validation. random_state is left
    # out on purpose: scikit-learn ignores it when shuffle is False, so the split
    # is already fully deterministic.
    X_train, X_test, y_train, y_test = train_test_split(
        X_cleaned, y_cleaned,
        test_size=TEST_SIZE,
        shuffle=False,
    )

    # Purge the tail of the training set. Assuming create_target_variable labels a
    # row from the price FUTURE_DAYS_TARGET rows ahead (TODO confirm against
    # src.feature_engineering), the last FUTURE_DAYS_TARGET training labels are
    # built from prices that fall inside the test window, which leaks test-period
    # information into training.
    purge_rows = max(int(FUTURE_DAYS_TARGET), 0)
    if 0 < purge_rows < len(X_train):
        # The explicit > 0 check matters: iloc[:-0] would return an empty frame.
        X_train = X_train.iloc[:-purge_rows]
        y_train = y_train.iloc[:-purge_rows]
        print(f"\nPurged the last {purge_rows} training rows whose labels overlap the test period.")

    print(f"\nTraining set size: X_train: {X_train.shape}, y_train: {y_train.shape}")
    print(f"Testing set size: X_test: {X_test.shape}, y_test: {y_test.shape}")
else:
    print("\nSkipping train-test split as no cleaned data is available.")

## 6. Train XGBoost Model

In [None]:
# Train only when the split cell produced a non-empty training set.
if 'X_train' not in locals() or X_train.empty:
    print("\nSkipping model training as no training data is available.")
else:
    # Baseline binary classifier (significant move vs. no significant move).
    # Tree count, learning rate and depth are untuned starting values.
    model = xgb.XGBClassifier(
        objective='binary:logistic',
        max_depth=3,
        learning_rate=0.1,
        n_estimators=100,
        # Labels are already 0/1 integers; this flag is set to suppress a warning.
        # NOTE(review): how this flag is treated depends on the installed xgboost version - confirm.
        use_label_encoder=False,
        # Metric used for monitoring; 'auc' or 'error' are alternatives.
        eval_metric='logloss',
    )

    model.fit(X_train, y_train)
    print("\nXGBoost model trained.")

## 7. Make Predictions

In [None]:
# Both a fitted model and a non-empty test set are required.
if 'model' not in locals() or 'X_test' not in locals() or X_test.empty:
    print("\nSkipping predictions as model or test data is not available.")
else:
    # Hard 0/1 class labels for the held-out rows.
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba holds the probability of the positive class (class 1).
    class_probabilities = model.predict_proba(X_test)
    y_pred_proba = class_probabilities[:, 1]
    del class_probabilities  # only the positive-class column is kept

    print("\nPredictions made on the test set.")
    print("Sample class predictions (y_pred - first 10):")
    print(y_pred[:10])
    print("\nSample predicted probabilities for class 1 (y_pred_proba - first 10):")
    print(y_pred_proba[:10])

## 8. Evaluate Model

Performance might vary with the new features. Feature selection and hyperparameter tuning would be the next steps to optimize.

In [None]:
if 'y_test' in locals() and 'y_pred' in locals() and 'y_pred_proba' in locals():
    # --- Headline metrics ---
    accuracy = accuracy_score(y_test, y_pred)
    # zero_division=0 avoids warnings when a class is never predicted.
    report = classification_report(y_test, y_pred, zero_division=0)

    print(f"\nAccuracy with new features: {accuracy:.4f}")
    print("\nClassification Report with new features:")
    print(report)

    # Compare the true class balance with what the model predicts; a model that
    # predicts almost only the majority class shows up here immediately.
    print("\nTest Set Class Distribution (y_test):")
    print(y_test.value_counts(normalize=True))
    print("\nPredicted Class Distribution (y_pred from model with new features):")
    print(pd.Series(y_pred).value_counts(normalize=True))

    # --- ROC Curve and AUC Score ---
    # The ROC curve needs both classes in y_test; with a single class the
    # true/false positive rates are undefined, so the section is skipped.
    if y_test.nunique() < 2:
        print("\nSkipping ROC/AUC: y_test contains a single class, so the ROC curve is undefined.")
    else:
        fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
        roc_auc = auc(fpr, tpr)

        plt.figure(figsize=(8, 6))
        plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
        # Dashed diagonal = random-guess reference line.
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic (ROC) Curve with New Features')
        plt.legend(loc="lower right")
        plt.grid(alpha=0.3)
        plt.show()

        print(f"AUC Score with new features: {roc_auc:.4f}")

    # --- Confusion Matrix ---
    # labels=[0, 1] pins the matrix to 2x2 even when one class is missing from
    # both y_test and y_pred; without it the matrix can collapse to 1x1 and the
    # TN/FP/FN/TP indexing below raises IndexError.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])

    fig, ax = plt.subplots(figsize=(7, 7))
    disp.plot(ax=ax, cmap=plt.cm.Blues)
    ax.set_title('Confusion Matrix with New Features')
    plt.show()

    # Rows are true classes, columns are predicted classes.
    print("\nConfusion Matrix values (with new features):")
    print(f"True Negatives (TN): {cm[0, 0]}")
    print(f"False Positives (FP): {cm[0, 1]}")
    print(f"False Negatives (FN): {cm[1, 0]}")
    print(f"True Positives (TP): {cm[1, 1]}")
else:
    print("\nSkipping model evaluation as test results are not available.")

### Interpreting the ROC Curve and AUC Score

The **Receiver Operating Characteristic (ROC) Curve** visualizes the trade-off between the True Positive Rate (TPR, also known as sensitivity or recall) and the False Positive Rate (FPR, also known as 1-specificity) at various classification thresholds.
- **True Positive Rate (TPR)**: Proportion of actual positive cases (significant price moves) that are correctly identified by the model.
- **False Positive Rate (FPR)**: Proportion of actual negative cases (no significant price moves) that are incorrectly identified as positive by the model.

**AUC (Area Under the Curve)**:
- An AUC of **1.0** represents a perfect classifier.
- An AUC of **0.5** represents a classifier with no discriminative ability (equivalent to random guessing), depicted by the dashed diagonal line.
- An AUC **between 0.5 and 1.0** indicates the model has some ability to distinguish between the positive and negative classes. The closer to 1.0, the better the model's performance.
- An AUC **less than 0.5** would suggest the model is performing worse than random (e.g., consistently misclassifying).

A higher AUC generally indicates a better model. The shape of the curve also matters: a curve that bows further towards the top-left corner is better.

### Interpreting the Confusion Matrix

The **Confusion Matrix** provides a detailed breakdown of the model's predictions versus the actual classes.

For our binary classification problem (predicting significant price movement vs. no significant price movement):
- **True Negatives (TN)** (Top-Left): The number of instances where the model correctly predicted **no significant price move**, and there was indeed no significant price move.
- **False Positives (FP)** (Top-Right): The number of instances where the model incorrectly predicted a **significant price move**, but there was actually no significant price move. This is a **Type I error**.
  - *Implication*: Acting on a predicted move that never materializes can lead to unprofitable trades, wasted transaction costs, or unnecessary caution.
- **False Negatives (FN)** (Bottom-Left): The number of instances where the model incorrectly predicted **no significant price move**, but there was actually a significant price move. This is a **Type II error**.
  - *Implication*: Potentially more costly, as the model fails to identify a significant event.
- **True Positives (TP)** (Bottom-Right): The number of instances where the model correctly predicted a **significant price move**, and there was indeed a significant price move.

**Key metrics derived from the Confusion Matrix (often in the Classification Report):**
- **Accuracy**: `(TP + TN) / (TP + TN + FP + FN)` - Overall correctness.
- **Precision (for class 1 - significant move)**: `TP / (TP + FP)` - Of all predictions of a significant move, how many were correct? High precision means few false positives among the predicted significant moves.
- **Recall (Sensitivity, TPR, for class 1 - significant move)**: `TP / (TP + FN)` - Of all actual significant moves, how many did the model identify? High recall means low FN rate for that class.
- **F1-Score (for class 1)**: `2 * (Precision * Recall) / (Precision + Recall)` - The harmonic mean of Precision and Recall, useful for imbalanced classes.

Understanding these components helps in diagnosing the model's strengths and weaknesses, especially in identifying where it makes errors.

## 9. Feature Importance (Optional)

Feature importance will likely change with the new set of features.

In [None]:
# Needs a fitted model plus the cleaned feature frame (for the column names).
if 'model' not in locals() or 'X_cleaned' not in locals() or X_cleaned.empty:
    print("\nSkipping feature importance calculation.")
else:
    # Pair every importance score with its column name, highest score first.
    feature_importances = pd.Series(
        model.feature_importances_,
        index=X_cleaned.columns,
    ).sort_values(ascending=False)

    print("\nFeature Importances (with new features):")
    print(feature_importances.head(20))  # 20 highest-ranked features

    # Only the 30 highest-ranked features are drawn so the bar labels stay readable;
    # the figure is large because there may be many features.
    plt.figure(figsize=(12, 10))
    feature_importances.head(30).plot(kind='bar')
    plt.title('Top 30 Feature Importances for XGBoost Model (with New Features)')
    plt.ylabel('Importance')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

## 10. Backtesting with `simple_backtest`

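The backtesting results will also be affected by the new features influencing the model's predictions. Note that the trade direction (long/short) in the cell below is chosen by looking at the actual closing price `FUTURE_DAYS_TARGET` rows ahead, so this is a "perfect foresight" backtest: it shows an optimistic upper bound, not performance that could be achieved in live trading.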

In [None]:
from src.backtesting import simple_backtest

# Every prerequisite from the earlier cells must exist before the backtest runs.
# The multi-line condition is parenthesized so it needs no backslash
# continuations (a backslash followed by whitespace is a SyntaxError).
if (
    'model' in locals()
    and 'X_test' in locals() and not X_test.empty
    and 'processed_data' in locals() and processed_data is not None
    and 'y_pred' in locals()
    and 'FUTURE_DAYS_TARGET' in locals()
):
    # Re-attach the test index to the raw prediction array.
    y_pred_series_on_test_index = pd.Series(y_pred, index=X_test.index)

    # Close prices restricted to the test rows, so prices line up with the
    # features the predictions were made from.
    close_prices = processed_data['Close']
    backtest_price_data = close_prices.loc[X_test.index].copy()

    # 0 = stay flat; overwritten with +1 (long) / -1 (short) below.
    signals_for_backtest = pd.Series(0, index=X_test.index, dtype=int)
    n_rows = len(processed_data.index)

    # WARNING: "perfect foresight" signals. The model only says *whether* a move
    # is expected; the *direction* is taken from the actual close
    # FUTURE_DAYS_TARGET rows later, i.e. from future data. The resulting equity
    # curve is an optimistic upper bound, not a tradable strategy.
    for date_index, model_prediction_signal in y_pred_series_on_test_index.items():
        if model_prediction_signal != 1:
            continue
        try:
            # Both lookups sit inside the try so a missing index is reported
            # instead of aborting the whole cell.
            current_price_for_signal_eval = close_prices.loc[date_index]
            future_row = processed_data.index.get_loc(date_index) + FUTURE_DAYS_TARGET

            # Predictions too close to the end of the data have no future
            # price to compare against and stay flat.
            if future_row < n_rows:
                future_price_for_signal_eval = close_prices.loc[processed_data.index[future_row]]

                if future_price_for_signal_eval > current_price_for_signal_eval:
                    signals_for_backtest.loc[date_index] = 1
                elif future_price_for_signal_eval < current_price_for_signal_eval:
                    signals_for_backtest.loc[date_index] = -1
        except KeyError:
            print(f"Warning: Index {date_index} from X_test not found in processed_data.index during signal generation.")
        except Exception as e:
            # Best effort: report and continue with the next prediction
            # (e.g. a non-unique index makes get_loc return a non-integer).
            print(f"Error during signal generation for index {date_index}: {e}")

    print("\nValue counts for generated 'perfect foresight' signals (with new features):")
    print(signals_for_backtest.value_counts())

    initial_capital_backtest = 10000.0
    leverage_backtest = 1.0

    print(f"\nRunning backtest with initial capital: ${initial_capital_backtest:,.2f}, leverage: {leverage_backtest}")

    # Sanity checks: the backtest runs only on aligned, NaN-free, non-empty inputs.
    if not backtest_price_data.index.equals(signals_for_backtest.index):
        print("CRITICAL ERROR: Price data and signal indices for backtest do not match!")
    elif backtest_price_data.isnull().any() or signals_for_backtest.isnull().any():
        print("Warning: NaNs found in backtest price data or signals!")
    elif backtest_price_data.empty or signals_for_backtest.empty:
        print("Error: Backtest price data or signals are empty.")
    else:
        equity_curve, performance_metrics = simple_backtest(
            price_data=backtest_price_data,
            signals=signals_for_backtest,
            initial_capital=initial_capital_backtest,
            leverage=leverage_backtest
        )

        print("\nBacktest Performance Metrics (with new features):")
        if performance_metrics:
            for metric, value in performance_metrics.items():
                try:
                    print(f"{metric}: {value:.2f}")
                except (TypeError, ValueError):
                    # Non-numeric metric values (e.g. None or text) are printed as-is.
                    print(f"{metric}: {value}")

            print("\nPlotting Equity Curve (with new features)...")
            plt.figure(figsize=(12, 7))
            equity_curve.plot()
            plt.title(f'Equity Curve for {TICKER} (Perfect Foresight Signals, New Features)')
            plt.xlabel('Date')
            plt.ylabel('Portfolio Value ($)')
            plt.grid(True)
            plt.show()
        else:
            print("Backtest performance metrics dictionary is empty.")
else:
    print("\nSkipping backtesting. Essential variables not available.")