# Advanced Modeling: LSTM for Stock Price Movement Prediction

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import sys
import os

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import classification_report, accuracy_score, roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay

import matplotlib.pyplot as plt

# Put the parent of the current working directory on the import path so the
# `src` package imported below can be resolved when running from this folder.
module_path = os.path.abspath('..')
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

from src.data_loader import download_stock_data
from src.feature_engineering import add_technical_indicators, add_rolling_lag_features, create_target_variable

## 2. Configuration and Parameters

In [None]:
# --- Data selection: forwarded to download_stock_data in section 3.1 ---
# Ticker symbol; it is wrapped in a single-element list at the call site.
TICKER = 'AAPL'
START_DATE = '2018-01-01' # Needs enough data for sequence creation and TIs
END_DATE = '2023-12-31'
# Bar interval string; presumably daily bars -- confirm against download_stock_data.
INTERVAL = '1d'

# --- Target definition: forwarded to create_target_variable in section 3.4 ---
# Passed as `future_days`; presumably the look-ahead horizon of the label -- confirm.
FUTURE_DAYS_TARGET = 5
# Passed as `percent_change_threshold`; presumably a fraction (0.03 == 3%) -- confirm.
PERCENT_CHANGE_THRESHOLD = 0.03

# --- Rolling / lag features: forwarded to add_rolling_lag_features in section 3.3 ---
# Passed as `windows`.
ROLLING_WINDOWS = [5, 10, 20, 60]
# Passed as `lags`.
LAG_PERIODS = [1, 2, 3, 5, 10]
# Passed as `lag_indicators`; presumably names of indicator columns produced by
# add_technical_indicators -- confirm they match that function's output.
KEY_LAG_INDICATORS = ['RSI_14', 'MACD', 'ATR_14', 'Stoch_k', 'ADX_14']

# Window length handed to create_sequences and used as the first dimension of
# the LSTM input_shape.
SEQUENCE_LENGTH = 20 # Number of past days' data to use for predicting next target
# Number of folds; only the last fold is used as the train/test split.
N_SPLITS_TIMESERIES = 5 # For TimeSeriesSplit

# LSTM Model Parameters
# Units of the LSTM layer; the intermediate Dense layer uses LSTM_UNITS // 2.
LSTM_UNITS = 50
# Rate of the first Dropout layer; the second Dropout layer uses DROPOUT_RATE / 2.
DROPOUT_RATE = 0.2
# Upper bound on epochs passed to model.fit; EarlyStopping may end training sooner.
EPOCHS = 50 # Increased epochs, with early stopping
# Batch size for model.fit, and for the Optuna objective's fit as well.
BATCH_SIZE = 32

# Relative path handed to model.save; its parent directory is created if missing.
MODEL_SAVE_PATH = '../models/lstm_stock_predictor.h5' # Path to save the trained model

## 3. Load and Engineer Features

In [None]:
# 3.1 Download Data
# Fail fast: every later step needs the raw price frame.
raw_data = download_stock_data([TICKER], START_DATE, END_DATE, INTERVAL)
if raw_data is None:
    raise ValueError(f"Failed to download data for {TICKER}. Halting execution.")
print(f"Raw data shape: {raw_data.shape}")

# 3.2 Add Technical Indicators
# Each stage receives a copy, so the previous stage's frame is never mutated.
data_with_ti = add_technical_indicators(raw_data.copy(), fillna=True)
print(f"Shape after TIs: {data_with_ti.shape}")

# 3.3 Add Rolling and Lag Features
rolling_lag_options = {
    'windows': ROLLING_WINDOWS,
    'lags': LAG_PERIODS,
    'lag_indicators': KEY_LAG_INDICATORS,
}
data_with_roll_lag = add_rolling_lag_features(data_with_ti.copy(), **rolling_lag_options)
print(f"Shape after rolling/lag features: {data_with_roll_lag.shape}")

# 3.4 Create Target Variable
processed_data = create_target_variable(
    data_with_roll_lag.copy(),
    future_days=FUTURE_DAYS_TARGET,
    percent_change_threshold=PERCENT_CHANGE_THRESHOLD,
)
print(f"Shape after target creation: {processed_data.shape}")

# 3.5 Define Features (X) and Target (y) and Drop NaNs
# Every column except the raw price/volume columns and the label is a feature.
base_price_volume_cols = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
non_feature_cols = set(base_price_volume_cols) | {'target'}
feature_columns = [col for col in processed_data.columns if col not in non_feature_cols]

X_raw_full = processed_data[feature_columns]
y_raw_full = processed_data['target']

print(f"Number of features: {len(X_raw_full.columns)}")
print(f"Original shape before NaN drop: X: {X_raw_full.shape}, y: {y_raw_full.shape}")

# Drop rows with a NaN in any feature or in the target in one pass, so X and y
# stay row-aligned.
combined_for_cleaning = processed_data[feature_columns + ['target']]
cleaned_data = combined_for_cleaning.dropna(axis=0, how='any')

X_cleaned_full = cleaned_data.drop(columns='target')
y_cleaned_full = cleaned_data['target']

print(f"Shape after NaN drop: X_cleaned: {X_cleaned_full.shape}, y_cleaned: {y_cleaned_full.shape}")

if any(part.empty for part in (X_cleaned_full, y_cleaned_full)):
    raise ValueError("No data left after NaN removal. Adjust parameters or data range.")

## 4. Prepare Data for LSTM

### 4.1. Split Data using TimeSeriesSplit
We use TimeSeriesSplit to get train/test indices. For this example, we'll use the last split as our primary train/test set.

In [None]:
# The target is built with future_days=FUTURE_DAYS_TARGET, so (assuming
# create_target_variable labels each row from the price that many rows ahead --
# confirm) the last training labels would be computed from prices inside the
# test window. Leaving a gap of that many rows between train and test removes
# the overlap.
tscv = TimeSeriesSplit(n_splits=N_SPLITS_TIMESERIES, gap=FUTURE_DAYS_TARGET)

# Materialize the folds once: report each one, then keep the last fold as the
# train/test split.
splits = list(tscv.split(X_cleaned_full))
for split_number, (train_idx, test_idx) in enumerate(splits, start=1):
    print(f"Split {split_number}: Train indices: {len(train_idx)}, Test indices: {len(test_idx)}")

train_index, test_index = splits[-1]  # Use the last split for train/test

X_train_raw, X_test_raw = X_cleaned_full.iloc[train_index], X_cleaned_full.iloc[test_index]
y_train_raw, y_test_raw = y_cleaned_full.iloc[train_index], y_cleaned_full.iloc[test_index]

print(f"\nUsing last split: X_train_raw shape: {X_train_raw.shape}, X_test_raw shape: {X_test_raw.shape}")

### 4.2. Scale Features
Fit scaler ONLY on training data, then transform both train and test sets.

In [None]:
# Fit the scaler on the training rows only; the test rows are transformed with
# those training-derived parameters.
scaler = MinMaxScaler().fit(X_train_raw)
X_train_scaled, X_test_scaled = (
    scaler.transform(partition) for partition in (X_train_raw, X_test_raw)
)

print("Data scaled. X_train_scaled shape:", X_train_scaled.shape)
print("X_test_scaled shape:", X_test_scaled.shape)

### 4.3. Create Sequences

In [None]:
def create_sequences(X_data, y_data, sequence_length):
    """Build overlapping fixed-length windows of X paired with the next row's target.

    Window ``i`` holds rows ``i .. i + sequence_length - 1`` of ``X_data`` and is
    paired with ``y_data[i + sequence_length]``, i.e. the target of the row
    immediately after the window.

    Args:
        X_data: Array-like of shape ``(n_rows, ...)`` (ndarray, DataFrame, list).
        y_data: Array-like of length ``n_rows`` (Series, ndarray, or list).
        sequence_length: Number of consecutive rows per window; must be >= 1.

    Returns:
        Tuple ``(X_sequences, y_sequences)`` of NumPy arrays with shapes
        ``(n_rows - sequence_length, sequence_length, ...)`` and
        ``(n_rows - sequence_length,)``. When there are too few rows for a
        single window, both arrays are empty but X keeps its full rank, so
        downstream ``shape[1]`` / ``shape[2]`` lookups still work.

    Raises:
        ValueError: If ``sequence_length`` < 1 or X and y differ in length.
    """
    if sequence_length < 1:
        raise ValueError(f"sequence_length must be >= 1, got {sequence_length}.")

    # Work on plain arrays so positional indexing is valid for any input type
    # (the previous `.iloc` access only accepted pandas objects).
    X_arr = np.asarray(X_data)
    y_arr = np.asarray(y_data)
    if len(X_arr) != len(y_arr):
        raise ValueError(
            f"X_data and y_data must have the same length, got {len(X_arr)} and {len(y_arr)}."
        )

    n_sequences = len(X_arr) - sequence_length
    if n_sequences <= 0:
        empty_X = np.empty((0, sequence_length) + X_arr.shape[1:], dtype=X_arr.dtype)
        return empty_X, np.empty((0,), dtype=y_arr.dtype)

    # Row index matrix: entry [i, j] == i + j selects row j of window i.
    window_rows = np.arange(n_sequences)[:, None] + np.arange(sequence_length)[None, :]
    X_sequences = X_arr[window_rows]
    y_sequences = y_arr[sequence_length:]  # Target is for the day after the sequence
    return X_sequences, y_sequences

# Window the train and test partitions separately, so no sequence mixes rows
# from both sides of the split.
X_train_seq, y_train_seq = create_sequences(
    X_train_scaled, y_train_raw, sequence_length=SEQUENCE_LENGTH
)
X_test_seq, y_test_seq = create_sequences(
    X_test_scaled, y_test_raw, sequence_length=SEQUENCE_LENGTH
)

print(f"X_train_seq shape: {X_train_seq.shape}, y_train_seq shape: {y_train_seq.shape}")
print(f"X_test_seq shape: {X_test_seq.shape}, y_test_seq shape: {y_test_seq.shape}")

# Either partition producing zero windows makes training or evaluation impossible.
if min(X_train_seq.shape[0], X_test_seq.shape[0]) == 0:
    raise ValueError("Not enough data to create sequences after train/test split. Increase data range or decrease SEQUENCE_LENGTH.")

## 5. Define and Train LSTM Model

In [None]:
# Architecture: LSTM -> Dropout -> Dense(relu) -> Dropout -> Dense(sigmoid).
n_features = X_train_seq.shape[2]

model = Sequential()
# Single recurrent layer; set return_sequences=True only when stacking LSTMs.
model.add(LSTM(LSTM_UNITS, input_shape=(SEQUENCE_LENGTH, n_features), return_sequences=False))
model.add(Dropout(DROPOUT_RATE))
# Optional intermediate dense layer at half the LSTM width.
model.add(Dense(LSTM_UNITS // 2, activation='relu'))
model.add(Dropout(DROPOUT_RATE / 2))
# One sigmoid unit: output layer for binary classification.
model.add(Dense(1, activation='sigmoid'))

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')],
)
model.summary()

# Callbacks: both watch validation loss. The learning rate is reduced after 5
# epochs without improvement, and training stops after 10.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.00001)
training_callbacks = [early_stopping, reduce_lr]

history = model.fit(
    x=X_train_seq,
    y=y_train_seq,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
    callbacks=training_callbacks,
    validation_split=0.1,  # Hold out a portion of the training data for validation
)

### Plot Training History

In [None]:
# Side-by-side panels: loss on the left, accuracy (and AUC, if recorded) on the right.
history_log = history.history
fig, (loss_ax, metric_ax) = plt.subplots(1, 2, figsize=(12, 6))

for key, label in (('loss', 'Train Loss'), ('val_loss', 'Validation Loss')):
    loss_ax.plot(history_log[key], label=label)
loss_ax.set_title('Loss Over Epochs')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.legend()

metric_curves = [('accuracy', 'Train Accuracy'), ('val_accuracy', 'Validation Accuracy')]
# AUC curves are drawn only when both the train and validation series exist.
if {'auc', 'val_auc'} <= history_log.keys():
    metric_curves += [('auc', 'Train AUC'), ('val_auc', 'Validation AUC')]
for key, label in metric_curves:
    metric_ax.plot(history_log[key], label=label)
metric_ax.set_title('Accuracy/AUC Over Epochs')
metric_ax.set_xlabel('Epoch')
metric_ax.set_ylabel('Metric')
metric_ax.legend()

fig.tight_layout()
plt.show()

## 6. Evaluate LSTM Model

In [None]:
# Predicted probabilities from the sigmoid output, thresholded at 0.5 into
# hard 0/1 class predictions.
y_pred_proba_lstm = model.predict(X_test_seq)
y_pred_lstm = (y_pred_proba_lstm > 0.5).astype(int) # Apply threshold

print("\nLSTM Model Evaluation:")
accuracy_lstm = accuracy_score(y_test_seq, y_pred_lstm)
print(f"Accuracy: {accuracy_lstm:.4f}")

print("\nClassification Report:")
print(classification_report(y_test_seq, y_pred_lstm, zero_division=0))

# ROC Curve and AUC
fpr_lstm, tpr_lstm, _ = roc_curve(y_test_seq, y_pred_proba_lstm)
roc_auc_lstm = auc(fpr_lstm, tpr_lstm)

plt.figure(figsize=(8,6))
plt.plot(fpr_lstm, tpr_lstm, color='darkorange', lw=2, label=f'LSTM ROC curve (area = {roc_auc_lstm:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('LSTM Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.grid(alpha=0.3)
plt.show()
print(f"AUC Score: {roc_auc_lstm:.4f}")

# Confusion Matrix
# Pin the label order to [0, 1] so the matrix is always 2x2. Without it, a test
# window where only one class appears in both truth and predictions yields a
# 1x1 matrix, and the TN/FP/FN/TP lookups below raise IndexError.
# NOTE(review): assumes the target is encoded as 0/1 -- confirm against
# create_target_variable.
cm_lstm = confusion_matrix(y_test_seq, y_pred_lstm, labels=[0, 1])
disp_lstm = ConfusionMatrixDisplay(confusion_matrix=cm_lstm)
fig, ax = plt.subplots(figsize=(7,7))
disp_lstm.plot(ax=ax, cmap=plt.cm.Blues)
ax.set_title('LSTM Confusion Matrix')
plt.show()

# Row-major unpack of the 2x2 matrix: rows are true labels, columns are predictions.
tn_lstm, fp_lstm, fn_lstm, tp_lstm = cm_lstm.ravel()
print("\nConfusion Matrix values:")
print(f"True Negatives (TN): {tn_lstm}")
print(f"False Positives (FP): {fp_lstm}")
print(f"False Negatives (FN): {fn_lstm}")
print(f"True Positives (TP): {tp_lstm}")

## 7. Save Trained Model

In [None]:
# Create the target directory if needed. dirname() is '' when MODEL_SAVE_PATH is
# a bare filename; os.makedirs('') raises, so that case is skipped.
model_dir = os.path.dirname(MODEL_SAVE_PATH)
if model_dir and not os.path.exists(model_dir):
    # exist_ok=True tolerates the directory appearing between the check and the call.
    os.makedirs(model_dir, exist_ok=True)
    print(f"Created directory: {model_dir}")

model.save(MODEL_SAVE_PATH)
print(f"\nLSTM model saved to {MODEL_SAVE_PATH}")

## 8. (Placeholder) Optuna Hyperparameter Tuning
This section defines the Optuna setup for finding optimal hyperparameters for the LSTM model. Due to execution time constraints in this environment, the Optuna study itself is not run here; only the model-building and objective functions are defined, followed by a usage sketch.

In [None]:
import optuna

def create_lstm_model_optuna(trial, X_train_shape):
    """Build and compile an LSTM classifier from hyperparameters sampled by ``trial``.

    Args:
        trial: Object exposing ``suggest_int`` / ``suggest_float`` (an Optuna trial).
        X_train_shape: Shape of the training sequences; indices 1 and 2 are used
            as the LSTM ``input_shape``.

    Returns:
        The compiled Keras ``Sequential`` model (binary cross-entropy loss,
        accuracy and AUC metrics).
    """
    # Search space -- sampled in a fixed order: LSTM width, dropout, dense width, LR.
    n_lstm_units = trial.suggest_int('lstm_units', 32, 128, step=16)
    drop_prob = trial.suggest_float('dropout_rate', 0.1, 0.5, step=0.1)
    n_dense_units = trial.suggest_int('dense_units', 16, 64, step=16)
    lr = trial.suggest_float('learning_rate', 1e-4, 1e-2, log=True)

    sequence_input_shape = (X_train_shape[1], X_train_shape[2])

    network = Sequential()
    network.add(LSTM(n_lstm_units, input_shape=sequence_input_shape, return_sequences=False))
    network.add(Dropout(drop_prob))
    network.add(Dense(n_dense_units, activation='relu'))
    # The second dropout uses half the sampled rate.
    network.add(Dropout(drop_prob / 2))
    network.add(Dense(1, activation='sigmoid'))

    network.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.AUC(name='auc')],
    )
    return network

def objective_lstm(trial, X_train_s, y_train_s, X_val_s, y_val_s):
    """Optuna objective: train a trial-configured LSTM and return its validation AUC.

    Args:
        trial: Optuna trial used to sample the model hyperparameters.
        X_train_s: Training sequences; the shape is forwarded to the model builder.
        y_train_s: Training targets.
        X_val_s: Validation sequences.
        y_val_s: Validation targets.

    Returns:
        Validation AUC (float) of the model as it stands after training. The
        study should be created with ``direction='maximize'``.
    """
    model = create_lstm_model_optuna(trial, X_train_s.shape)

    # Train the model (fewer epochs than the main run for faster Optuna trials)
    model.fit(
        X_train_s, y_train_s,
        epochs=10, # Reduced epochs for Optuna trials
        batch_size=BATCH_SIZE,
        validation_data=(X_val_s, y_val_s),
        callbacks=[EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)],
        verbose=0 # Suppress verbose output during Optuna trials
    )

    # Score the model that training actually leaves behind. With
    # restore_best_weights=True the final weights may come from an earlier
    # epoch, so the last entry of history['val_auc'] can describe weights that
    # were discarded. Evaluating explicitly keeps the score and the model in sync.
    val_metrics = model.evaluate(
        X_val_s, y_val_s, batch_size=BATCH_SIZE, verbose=0, return_dict=True
    )
    return float(val_metrics['auc']) # Optuna tries to maximize this

# Emit the usage hint as one newline-joined print; the output is identical to
# printing each line separately.
print("\n".join((
    "\nOptuna setup for LSTM hyperparameter tuning (not run in this script due to time constraints).",
    "To run Optuna, you would typically do something like:",
    "# 1. Prepare validation data (X_val_seq, y_val_seq) from a split of the training set.",
    "# study = optuna.create_study(direction='maximize')",
    "# study.optimize(lambda trial: objective_lstm(trial, X_train_seq, y_train_seq, X_val_seq, y_val_seq), n_trials=10) # e.g., 10 trials",
    "# print('Best trial:', study.best_trial.params)",
)))