In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
!pip install -q yfinance ta

print("✓ Packages installed successfully!")

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for ta (setup.py) ... [?25l[?25hdone
✓ Packages installed successfully!


In [3]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Data fetching and technical analysis
import yfinance as yf
from ta import add_all_ta_features

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

# Machine learning preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    accuracy_score, 
    classification_report, 
    confusion_matrix,
    roc_auc_score,
    roc_curve
)
from sklearn.inspection import permutation_importance

# Deep learning
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

print("✓ All libraries imported successfully!")
print(f"TensorFlow version: {tf.__version__}")

2025-10-25 02:39:17.502685: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761359957.759024      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761359957.831844      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


✓ All libraries imported successfully!
TensorFlow version: 2.18.0


In [4]:
# Using Apple (AAPL) as our example stock
ticker = 'AAPL'
period = '2y'  # Using 2 years for more training data

print(f"Downloading {ticker} data for the last {period}...")

data = yf.download(ticker, period=period, interval='1d', progress=False)

# FIX: yfinance sometimes returns MultiIndex columns, flatten them
if isinstance(data.columns, pd.MultiIndex):
    data.columns = data.columns.get_level_values(0)

# Ensure correct column names dynamically
expected_cols = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
actual_cols = list(data.columns)

# If only 5 columns exist (no 'Adj Close')
if len(actual_cols) == 5:
    data.columns = ['Open', 'High', 'Low', 'Close', 'Volume']
else:
    data.columns = expected_cols[:len(actual_cols)]

print(f"✓ Downloaded {len(data)} days of data")
print(f"Date range: {data.index[0].date()} to {data.index[-1].date()}")
print("\nFirst few rows:")
print(data.head())


Downloading AAPL data for the last 2y...
✓ Downloaded 502 days of data
Date range: 2023-10-25 to 2025-10-24

First few rows:
                  Open        High         Low       Close    Volume
Date                                                                
2023-10-25  169.440369  171.381349  168.994721  170.212802  57157000
2023-10-26  165.271164  169.717616  164.062997  168.717404  70625300
2023-10-27  166.588287  167.321115  165.211771  165.290997  58499100
2023-10-30  168.638184  169.509652  167.231959  167.380513  51131000
2023-10-31  169.113541  169.242269  166.271369  167.707316  44846000


In [5]:
print("Adding technical indicators...")

# Flatten any 2D columns into 1D
for col in ['Open', 'High', 'Low', 'Close', 'Volume']:
    if isinstance(data[col].iloc[0], (np.ndarray, list)):
        data[col] = data[col].apply(lambda x: x[0])

# Add technical indicators
data = add_all_ta_features(
    data, 
    open="Open", 
    high="High", 
    low="Low", 
    close="Close", 
    volume="Volume",
    fillna=True
)

# Drop any remaining NaN values
data.dropna(inplace=True)

print(f"✓ Technical indicators added! Now we have {data.shape[1]} features")
print(f"Remaining data points: {len(data)}")


Adding technical indicators...
✓ Technical indicators added! Now we have 91 features
Remaining data points: 502


In [6]:
#Feature Engineering - Returns and Volatality
print("Engineering custom features...")

# Daily returns (percentage change)
data['Return'] = data['Close'].pct_change()

# Rolling volatility (standard deviation of returns)
data['Volatility_3d'] = data['Return'].rolling(window=3).std()
data['Volatility_5d'] = data['Return'].rolling(window=5).std()
data['Volatility_7d'] = data['Return'].rolling(window=7).std()

# Volume changes
data['Volume_Change'] = data['Volume'].pct_change()

# Price momentum over different periods
data['Momentum_3d'] = data['Close'] / data['Close'].shift(3) - 1
data['Momentum_5d'] = data['Close'] / data['Close'].shift(5) - 1

print("✓ Returns and volatility features added")

Engineering custom features...
✓ Returns and volatility features added


In [7]:
#Feature Engineering - Lag Features (Temporal Memory)
print("Adding lag features for temporal context...")

# We'll add lags for the most important indicators
lag_features = ['Return', 'momentum_rsi', 'trend_macd', 'Volume']
lag_periods = [1, 2, 3, 5]  # Yesterday, 2 days ago, 3 days ago, 5 days ago

for feature in lag_features:
    if feature in data.columns:
        for lag in lag_periods:
            data[f'{feature}_Lag{lag}'] = data[feature].shift(lag)

print(f"✓ Added lag features for {len(lag_features)} indicators")
print(f"Total lag features created: {len(lag_features) * len(lag_periods)}")

Adding lag features for temporal context...
✓ Added lag features for 4 indicators
Total lag features created: 16


In [8]:
#Feature Engineering - Relative Price Positions
print("Adding relative price position features...")

# Price relative to moving averages (trend indicators)
data['Price_vs_SMA20'] = data['Close'] / data['trend_sma_fast']
data['Price_vs_SMA50'] = data['Close'] / data['trend_sma_slow']

# Distance from Bollinger Bands (volatility indicators)
if 'volatility_bbh' in data.columns and 'volatility_bbl' in data.columns:
    bb_range = data['volatility_bbh'] - data['volatility_bbl']
    data['BB_Position'] = (data['Close'] - data['volatility_bbl']) / bb_range
    
# RSI overbought/oversold signals
if 'momentum_rsi' in data.columns:
    data['RSI_Overbought'] = (data['momentum_rsi'] > 70).astype(int)
    data['RSI_Oversold'] = (data['momentum_rsi'] < 30).astype(int)

print("✓ Relative position features added")

# Clean up any NaN values created by our feature engineering
data.dropna(inplace=True)
print(f"Final dataset size: {len(data)} days")


Adding relative price position features...
✓ Relative position features added
Final dataset size: 495 days


In [9]:
#Target Variable
print("Creating target variable...")

# Shift close price by -1 to get tomorrow's price
data['Target'] = (data['Close'].shift(-1) > data['Close']).astype(int)

# Drop the last row (we don't have tomorrow's price for it)
data.dropna(inplace=True)

# Check class distribution
class_distribution = data['Target'].value_counts()
print("\nTarget variable distribution:")
print(class_distribution)
print(f"Class balance: {class_distribution[1]/len(data)*100:.1f}% UP days, "
      f"{class_distribution[0]/len(data)*100:.1f}% DOWN days")


Creating target variable...

Target variable distribution:
Target
1    265
0    230
Name: count, dtype: int64
Class balance: 53.5% UP days, 46.5% DOWN days


In [10]:
#Prepare Features and Target
print("Preparing feature matrix...")

# Columns to exclude from features
exclude_cols = ['Target', 'Adj Close']

# Get all feature columns
feature_columns = [col for col in data.columns if col not in exclude_cols]

X = data[feature_columns]
y = data['Target']

print(f"✓ Feature matrix shape: {X.shape}")
print(f"Number of features: {X.shape[1]}")
print(f"Number of samples: {X.shape[0]}")

Preparing feature matrix...
✓ Feature matrix shape: (495, 119)
Number of features: 119
Number of samples: 495


In [11]:
#Feature Scaling
print("Scaling features...")

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("✓ Features scaled using StandardScaler")
print(f"Scaled data shape: {X_scaled.shape}")

Scaling features...
✓ Features scaled using StandardScaler
Scaled data shape: (495, 119)


In [12]:
#Train-Test Split (Temporal)
print("Splitting data into train and test sets...")

# Use 80% for training, 20% for testing
train_size = int(len(X_scaled) * 0.8)

X_train = X_scaled[:train_size]
X_test = X_scaled[train_size:]
y_train = y[:train_size]
y_test = y[train_size:]

print(f"✓ Training set: {len(X_train)} samples")
print(f"✓ Test set: {len(X_test)} samples")
print(f"\nTrain period: {data.index[0].date()} to {data.index[train_size-1].date()}")
print(f"Test period: {data.index[train_size].date()} to {data.index[-1].date()}")

Splitting data into train and test sets...
✓ Training set: 396 samples
✓ Test set: 99 samples

Train period: 2023-11-03 to 2025-06-04
Test period: 2025-06-05 to 2025-10-24


In [13]:
#Calculate Class Weights
print("Calculating class weights to handle imbalance...")

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)

class_weights_dict = {i: weight for i, weight in enumerate(class_weights)}

print("Class weights:")
for class_label, weight in class_weights_dict.items():
    class_name = "DOWN (0)" if class_label == 0 else "UP (1)"
    print(f"  {class_name}: {weight:.3f}")

Calculating class weights to handle imbalance...
Class weights:
  DOWN (0): 1.048
  UP (1): 0.957


In [14]:
#Build Neural Network Model
print("Building neural network model...")

model = Sequential([
    # First hidden layer
    Dense(128, input_dim=X_train.shape[1], activation='relu', name='dense_1'),
    BatchNormalization(name='bn_1'),
    Dropout(0.3, name='dropout_1'),
    
    # Second hidden layer
    Dense(64, activation='relu', name='dense_2'),
    BatchNormalization(name='bn_2'),
    Dropout(0.3, name='dropout_2'),
    
    # Third hidden layer
    Dense(32, activation='relu', name='dense_3'),
    Dropout(0.2, name='dropout_3'),
    
    # Output layer
    Dense(1, activation='sigmoid', name='output')
])

# Compile with optimizer and loss function
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy']
)

print("\n✓ Model architecture:")
model.summary()


Building neural network model...

✓ Model architecture:


2025-10-25 02:39:32.006546: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [15]:
#Setup Training Callbacks
print("Setting up training callbacks...")

# Early stopping: stop if val_loss doesn't improve for 7 epochs
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=7,
    restore_best_weights=True,
    verbose=1
)

# Learning rate reduction: cut LR in half if val_loss plateaus
lr_reducer = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=3,
    min_lr=1e-6,
    verbose=1
)

print("✓ Callbacks configured")


# ============================================================================
# CELL 15: Train the Model
# ============================================================================
# This is where the magic happens!

print("\nStarting model training...")
print("=" * 60)

history = model.fit(
    X_train, 
    y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.1,  # Use 10% of training data for validation
    class_weight=class_weights_dict,  # Handle class imbalance
    callbacks=[early_stopping, lr_reducer],
    verbose=1
)

print("\n" + "=" * 60)
print("✓ Training complete!")

Setting up training callbacks...
✓ Callbacks configured

Starting model training...
Epoch 1/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 38ms/step - accuracy: 0.5987 - loss: 0.7579 - val_accuracy: 0.4750 - val_loss: 0.7310 - learning_rate: 0.0010
Epoch 2/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5389 - loss: 0.7324 - val_accuracy: 0.5000 - val_loss: 0.7262 - learning_rate: 0.0010
Epoch 3/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5978 - loss: 0.6766 - val_accuracy: 0.5250 - val_loss: 0.7256 - learning_rate: 0.0010
Epoch 4/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6704 - loss: 0.6087 - val_accuracy: 0.5000 - val_loss: 0.7215 - learning_rate: 0.0010
Epoch 5/100
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6800 - loss: 0.5954 - val_accuracy: 0.4500 - val_loss: 0.7235 - learning_rat

In [16]:
#Make Predictions
print("Making predictions on test set...")

# Get probability predictions
y_pred_prob = model.predict(X_test, verbose=0)

# Convert probabilities to binary predictions (threshold = 0.5)
y_pred = (y_pred_prob > 0.5).astype(int).flatten()

print(f"✓ Predictions complete for {len(y_pred)} test samples")

Making predictions on test set...
✓ Predictions complete for 99 test samples


In [17]:
#Evaluate Model Performance
print("\n" + "=" * 60)
print("MODEL PERFORMANCE METRICS")
print("=" * 60)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\n📊 Test Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

# Calculate baseline (always predict majority class)
majority_class = y_train.mode()[0]
baseline_acc = accuracy_score(y_test, [majority_class] * len(y_test))
print(f"📊 Baseline Accuracy (always predict {majority_class}): {baseline_acc:.4f} ({baseline_acc*100:.2f}%)")
print(f"📈 Improvement over baseline: {(accuracy - baseline_acc)*100:.2f} percentage points")

# ROC-AUC Score
try:
    roc_auc = roc_auc_score(y_test, y_pred_prob)
    print(f"📊 ROC-AUC Score: {roc_auc:.4f}")
except:
    print("📊 ROC-AUC Score: Could not calculate")

# Detailed classification report
print("\n" + "=" * 60)
print("DETAILED CLASSIFICATION REPORT")
print("=" * 60)
print(classification_report(y_test, y_pred, target_names=['DOWN (0)', 'UP (1)']))


MODEL PERFORMANCE METRICS

📊 Test Accuracy: 0.5758 (57.58%)
📊 Baseline Accuracy (always predict 1): 0.5859 (58.59%)
📈 Improvement over baseline: -1.01 percentage points
📊 ROC-AUC Score: 0.7157

DETAILED CLASSIFICATION REPORT
              precision    recall  f1-score   support

    DOWN (0)       0.49      0.83      0.62        41
      UP (1)       0.77      0.40      0.52        58

    accuracy                           0.58        99
   macro avg       0.63      0.61      0.57        99
weighted avg       0.65      0.58      0.56        99

