# Baseline XGBoost Model Training for Stock Price Movement Prediction

This notebook demonstrates a baseline workflow for training an XGBoost model to predict significant stock price movements, and for backtesting the resulting trading signals with Backtrader.

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import sys
import os

# Make the parent directory importable so that the `src` package resolves.
# This assumes the notebook is executed from the 'notebooks' directory.
module_path = os.path.abspath('..')
already_on_path = module_path in sys.path
if not already_on_path:
    sys.path.append(module_path)

from src.data_management import download_stock_data # Corrected import from data_management
from src.feature_engineering import add_technical_indicators, add_rolling_lag_features, create_target_variable
from src.backtesting import run_backtrader_backtest # Corrected import for backtrader

## 2. Configuration and Parameters

In [None]:
# --- Data download ---------------------------------------------------------
TICKER = 'AAPL'            # example ticker
START_DATE = '2019-01-01'  # early start so rolling/lag features have enough history
END_DATE = '2023-12-31'
INTERVAL = '1d'

# --- Target definition -----------------------------------------------------
FUTURE_DAYS_TARGET = 5           # X: number of days ahead used by the target
PERCENT_CHANGE_THRESHOLD = 0.03  # Y: change threshold used by the target (0.03 = 3%)

# --- Rolling / lag feature settings ----------------------------------------
ROLLING_WINDOWS = [5, 10, 20, 60]
LAG_PERIODS = [1, 2, 3, 5, 10]
KEY_LAG_INDICATORS = [
    'RSI_14',
    'MACD',
    'ATR_14',
    'Stoch_k',
    'ADX_14',
]

# --- Train/test split ------------------------------------------------------
TEST_SIZE = 0.2    # fraction of rows held out for testing
RANDOM_STATE = 42

## 3. Download Data

In [None]:
# Download price history for the single configured ticker and flatten the
# columns to plain price fields (Open/High/Low/Close/Volume), which is what the
# feature-engineering and backtesting steps below index by name.
raw_data_download = download_stock_data([TICKER], START_DATE, END_DATE, INTERVAL)
if raw_data_download is None:
    raise SystemExit(f"Failed to download data for {TICKER}. Halting.")

print(f"Downloaded data for {TICKER}:")
download_columns = raw_data_download.columns
if isinstance(download_columns, pd.MultiIndex):
    # The ticker level is located by content rather than by position: depending
    # on how the download groups its columns, the ticker may sit at level 0 or
    # at level 1, and blindly dropping level 0 would discard the price-field
    # names in the latter case.
    ticker_levels = [
        level
        for level in range(download_columns.nlevels)
        if TICKER in download_columns.get_level_values(level)
    ]
    if ticker_levels:
        # xs() selects this ticker's columns and drops the ticker level.
        raw_data = raw_data_download.xs(TICKER, axis=1, level=ticker_levels[0])
    else:
        # Ticker label not found in any level: keep the previous behaviour.
        raw_data = raw_data_download.droplevel(0, axis=1)
else:
    # Already flat: columns are the price fields themselves.
    raw_data = raw_data_download

print(raw_data.head())
print(f"Shape of raw data: {raw_data.shape}")

## 4. Feature Engineering

### 4.1. Add Technical Indicators

In [None]:
# Work on a copy so that raw_data itself is not modified by the indicator step.
ti_input = raw_data.copy()
data_with_ti = add_technical_indicators(ti_input, fillna=True)
print(f"Shape after TIs: {data_with_ti.shape}")

### 4.2. Add Rolling and Lag Features

In [None]:
# Rolling-window and lag settings come from the configuration cell.
roll_lag_settings = {
    'windows': ROLLING_WINDOWS,
    'lags': LAG_PERIODS,
    'lag_indicators': KEY_LAG_INDICATORS,
}
# A copy is passed so data_with_ti is left as it was after the indicator step.
data_with_roll_lag = add_rolling_lag_features(data_with_ti.copy(), **roll_lag_settings)
print(f"Shape after rolling/lag: {data_with_roll_lag.shape}")

### 4.3. Create Target Variable

In [None]:
# Attach the target to a copy of the feature frame; the horizon and threshold
# come from the configuration cell.
target_settings = {
    'future_days': FUTURE_DAYS_TARGET,
    'percent_change_threshold': PERCENT_CHANGE_THRESHOLD,
}
processed_data_full = create_target_variable(data_with_roll_lag.copy(), **target_settings)
print(f"Shape after target: {processed_data_full.shape}")

## 5. Data Preprocessing for Model Training

### 5.1. Select Features (X) and Target (y), Handle NaNs, Prepare OHLCV for Backtrader

In [None]:
# Raw price/volume columns and the target are excluded from the feature matrix;
# every remaining column is treated as a model feature.
base_price_volume_cols = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
non_feature_cols = set(base_price_volume_cols) | {'target'}
feature_columns = [col for col in processed_data_full.columns if col not in non_feature_cols]

X_uncleaned = processed_data_full.loc[:, feature_columns]
y_uncleaned = processed_data_full.loc[:, 'target']
# OHLCV feed for Backtrader, taken from the same frame so it can be aligned
# row-for-row with the model inputs below.
ohlcv_uncleaned = processed_data_full.loc[:, ['Open', 'High', 'Low', 'Close', 'Volume']].copy()

# Keep only rows where every feature AND the target are present, then apply
# that same row selection to X, y and the OHLCV frame so all three stay aligned.
model_input_df = X_uncleaned.assign(target=y_uncleaned)
rows_without_nan = model_input_df.notna().all(axis=1)
cleaned_indices = model_input_df.index[rows_without_nan]

X_cleaned, y_cleaned, ohlcv_cleaned = (
    frame.loc[cleaned_indices]
    for frame in (X_uncleaned, y_uncleaned, ohlcv_uncleaned)
)

print(f"Shape after NaN drop: X: {X_cleaned.shape}, y: {y_cleaned.shape}, OHLCV: {ohlcv_cleaned.shape}")
if X_cleaned.empty:
    raise SystemExit("No data left after NaN removal for model training.")

### 5.2. Split Data into Training and Testing Sets

In [None]:
# shuffle=False keeps the rows in their existing order, so the test set is the
# final TEST_SIZE fraction of the data (the most recent period, assuming the
# index is in date order). random_state has no effect without shuffling; it is
# passed only for explicitness.
split_outputs = train_test_split(
    X_cleaned,
    y_cleaned,
    ohlcv_cleaned,
    test_size=TEST_SIZE,
    shuffle=False,
    random_state=RANDOM_STATE,
)
X_train, X_test, y_train, y_test, ohlcv_train, ohlcv_test = split_outputs
print(f"Test set shape: X_test: {X_test.shape}, ohlcv_test: {ohlcv_test.shape}")

## 6. Train XGBoost Model

In [None]:
# Baseline binary classifier with a small, shallow ensemble.
# NOTE(review): `use_label_encoder` is reported as deprecated by recent XGBoost
# releases - confirm against the installed version before removing it.
xgb_params = {
    'objective': 'binary:logistic',
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 3,
    'use_label_encoder': False,
    'eval_metric': 'logloss',
}
model = xgb.XGBClassifier(**xgb_params)
model.fit(X_train, y_train)
print("XGBoost model trained.")

## 7. Make Predictions

In [None]:
# Hard class predictions for the test rows.
y_pred_test = model.predict(X_test)
# Per-class probabilities; column 1 is kept as the positive-class probability.
test_class_probabilities = model.predict_proba(X_test)
y_pred_proba_test = test_class_probabilities[:, 1]

## 8. Evaluate Model (accuracy and classification report only)

In [None]:
# Overall accuracy on the held-out test rows.
test_accuracy = accuracy_score(y_test, y_pred_test)
print(f"Accuracy: {test_accuracy:.4f}")

# Per-class precision/recall/F1; zero_division=0 reports 0 instead of warning
# when a class has no predicted samples.
test_report = classification_report(y_test, y_pred_test, zero_division=0)
print(test_report)

## 9. Feature Importance (omitted in this version)

In [None]:
# Placeholder cell: this version of the notebook produces no feature-importance output.
pass # Feature importance plotting omitted for this refactoring focus

## 10. Backtesting with `run_backtrader_backtest`

In [None]:
# Build the trading signals for Backtrader: an int Series named 'signal',
# indexed like X_test, with +1 (long), -1 (short) or 0 (no trade).
#
# WARNING - look-ahead bias: this is a deliberate "perfect foresight" signal.
# The model only decides WHEN to trade (prediction == 1); the trade DIRECTION
# is taken from the realised close FUTURE_DAYS_TARGET rows later, which is not
# knowable at decision time. Backtest results built on these signals are an
# upper bound, not an estimate of achievable performance.
#
# Assumes processed_data_full has a unique index (one row per date).
y_pred_series_test = pd.Series(y_pred_test, index=X_test.index)

close_prices = processed_data_full['Close']
# Close FUTURE_DAYS_TARGET rows ahead; NaN for the last rows with no future bar.
future_close = close_prices.shift(-FUTURE_DAYS_TARGET)
# +1 / -1 / 0 for up / down / unchanged; NaN where either price is missing.
realised_direction = np.sign(future_close - close_prices)

signals_for_bt = (
    realised_direction.reindex(X_test.index)  # dates absent from the full frame become NaN
    .where(y_pred_series_test == 1, 0)        # no trade unless the model predicts a move
    .fillna(0)                                # no trade when the direction is unknown
    .astype(int)
    .rename('signal')
)

print("\nValue counts for 'perfect foresight' signals for Backtrader:")
print(signals_for_bt.value_counts())

# Backtrader is fed a frame that includes a 'Volume' column. Use a copy of the
# test slice when it has one; otherwise add a constant placeholder volume.
if 'Volume' in ohlcv_test.columns:
    ohlcv_test_bt = ohlcv_test.copy()
else:
    print("Warning: 'Volume' not in ohlcv_test. Adding dummy volume for Backtrader.")
    ohlcv_test_bt = ohlcv_test.assign(Volume=100_000)  # dummy volume

# Starting cash shared by every backtest scenario below.
initial_capital_bt = 100000.0

def _run_and_report_bt(banner, *, leverage, commission_bps, slippage_bps):
    """Run one backtest scenario on the test data and print its metrics.

    All scenarios share the same OHLCV frame, signals and starting capital;
    only leverage and trading costs vary. Returns the ``(metrics, cerebro)``
    pair produced by ``run_backtrader_backtest``.
    """
    print(banner)
    scenario_metrics, scenario_cerebro = run_backtrader_backtest(
        data_df=ohlcv_test_bt,
        signals_df=signals_for_bt,
        initial_capital=initial_capital_bt,
        leverage=leverage,
        commission_bps=commission_bps,
        slippage_bps=slippage_bps,
    )
    for metric_name, metric_value in scenario_metrics.items():
        print(f"  {metric_name}: {metric_value}")
    return scenario_metrics, scenario_cerebro


# Scenario 1: reference run.
metrics1_bt, cerebro1_bt = _run_and_report_bt(
    "\n--- Backtrader Scenario 1: Baseline (Leverage 1x, Comm 2bps, Slip 1bps) ---",
    leverage=1.0,
    commission_bps=2.0,
    slippage_bps=1.0,
)

# Scenario 2: same costs as the baseline, doubled leverage.
metrics2_bt, cerebro2_bt = _run_and_report_bt(
    "\n--- Backtrader Scenario 2: Higher Leverage (2x) ---",
    leverage=2.0,
    commission_bps=2.0,
    slippage_bps=1.0,
)

# Scenario 3: baseline leverage, higher commission and slippage.
metrics3_bt, cerebro3_bt = _run_and_report_bt(
    "\n--- Backtrader Scenario 3: Higher Costs (Comm 5bps, Slip 3bps) ---",
    leverage=1.0,
    commission_bps=5.0,
    slippage_bps=3.0,
)

print("\nNote: Knock-out parameters are placeholders in run_backtrader_backtest and not yet implemented.")

# Optional: plot the Scenario 1 (baseline) backtest.
# Plotting is off by default because whether a figure renders depends on the
# notebook / matplotlib environment. Set PLOT_BACKTEST = True to attempt it.
PLOT_BACKTEST = False

if PLOT_BACKTEST:
    print("\nAttempting to plot results for Scenario 1 (Baseline)... NOTE: Plotting may not render in all remote environments.")
    try:
        cerebro1_bt.plot(style='candlestick', barup='green', bardown='red')
    except Exception as e:
        # Best-effort: a plotting failure must not abort the rest of the notebook.
        print(f"Cerebro plotting failed: {e}")
    else:
        print("Plot command executed (actual display depends on environment).")
else:
    # State plainly that nothing was plotted instead of claiming the plot command ran.
    print("\nPlotting skipped (PLOT_BACKTEST is False).")