# Stock Trend Prediction - Kaggle Submission

This notebook generates predictions for the Kaggle competition:
- Load test data (ID, Date)
- Load historical stock data for test dates
- Apply same feature engineering as training
- Generate predictions using best model
- Format submission CSV file

In [None]:
# Imports
import sys
import os
import pandas as pd
import numpy as np
from pathlib import Path
import pickle
from datetime import datetime, timedelta

# Add parent directory to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath('__file__'))))

# Import KhayatMiniNN
from KhayatMiniNN.neural_network import NeuralNetwork
from KhayatMiniNN.trainer import Trainer
from KhayatMiniNN.layers import LSTM, GRU, Conv1D, Dense, ReLU, Sigmoid, MaxPooling1D
from KhayatMiniNN.layers.base import Layer
from KhayatMiniNN.regularization import Dropout
from KhayatMiniNN.losses import BinaryCrossEntropy
from KhayatMiniNN.optimizers import Adam

# Import data loader and feature engineering
from load_data import StockDataLoader
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

# Custom Flatten layer (same as in training notebook)
class Flatten(Layer):
    """Collapse all non-batch axes: 3D (batch, seq, features) -> 2D (batch, seq*features)."""

    def __init__(self, name="Flatten"):
        super().__init__(name)

    def forward(self, input_data):
        """Remember the incoming tensor and return it reshaped to (batch, -1)."""
        # Keep a reference so backward() can restore the original shape.
        self.input = input_data
        n_samples = input_data.shape[0]
        flattened = input_data.reshape(n_samples, -1)
        return flattened

    def backward(self, output_grad):
        """Reshape the upstream gradient back to the shape seen in forward()."""
        original_shape = self.input.shape
        return output_grad.reshape(original_shape)

print("✓ Imports complete")

## 1. Load Test Data and Model

In [None]:
# Load test data
# These three directories are relative to the notebook's working directory.
data_dir = Path("../data")
model_dir = Path("../models")
processed_dir = Path("../data/processed")

# Load test.csv (contains ID and Date for predictions)
test_submission_df = pd.read_csv(data_dir / "test.csv")
print(f"Test samples: {len(test_submission_df):,}")
print(f"\nTest data columns: {test_submission_df.columns.tolist()}")
print(f"\nFirst few rows:")
print(test_submission_df.head(10))

# Parse dates (the head() preview above still shows the raw, unparsed values)
test_submission_df['Date'] = pd.to_datetime(test_submission_df['Date'])
print(f"\nDate range: {test_submission_df['Date'].min()} to {test_submission_df['Date'].max()}")

# Load model information
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(model_dir / "model_comparison.pkl", "rb") as f:
    model_info = pickle.load(f)

# These three values drive every later cell: which architecture to rebuild,
# how long each input window is, and which columns feed the model.
best_model_name = model_info['best_model']
sequence_length = model_info['sequence_length']
feature_cols = model_info['feature_cols']

print(f"\n{'='*60}")
print(f"Model Information")
print(f"{'='*60}")
print(f"Best Model: {best_model_name}")
print(f"Sequence Length: {sequence_length}")
print(f"Number of Features: {len(feature_cols)}")

## 2. Load Historical Data for Test Dates

We need historical stock data for each ticker up to (and including) the prediction date to create sequences.

In [None]:
# Load full training data to get historical prices
# We'll use this to get historical data for test predictions
print("Loading training data for historical context...")
loader = StockDataLoader(data_dir="../data", train_file="train.csv")

# Load raw data (we need all historical data)
print("Loading raw training data...")
full_df = loader.load_data()

# Process data (handle missing values, but don't create targets yet)
print("Processing data...")
full_df = loader.handle_missing_values(full_df)

# Ensure Date is datetime so it can be compared with the parsed test dates
full_df['Date'] = pd.to_datetime(full_df['Date'])

print(f"\n✓ Loaded historical data:")
print(f"  Total rows: {len(full_df):,}")
print(f"  Unique tickers: {full_df['Ticker'].nunique()}")
print(f"  Date range: {full_df['Date'].min()} to {full_df['Date'].max()}")

# Extract unique IDs and dates from test
# NOTE: `test_tickers` holds the raw test IDs; later cells assume they equal
# `Ticker` values in the training data.
test_tickers = test_submission_df['ID'].unique()
test_dates = test_submission_df['Date'].unique()

print(f"\nTest data:")
print(f"  Unique IDs: {len(test_tickers)}")
print(f"  Unique dates: {len(test_dates)}")
print(f"  Date range: {test_dates.min()} to {test_dates.max()}")

## 3. Prepare Test Data with Historical Context

For each test sample, we need at least `sequence_length` days of historical data up to and including the prediction date.

In [None]:
# Prepare test data: for each ID and Date, get historical data
print("="*60)
print("Preparing Test Data")
print("="*60)

# NOTE: test IDs are matched directly against the `Ticker` column of the
# training data. If the IDs use a different format (e.g. "ticker_1"), an
# explicit ID -> ticker mapping has to be applied before the lookup below.

# Get all unique tickers from training data
train_tickers = sorted(full_df['Ticker'].unique())
print(f"\nTraining data has {len(train_tickers)} unique tickers")

# Filter full_df to only include dates up to the max test date
max_test_date = test_submission_df['Date'].max()
historical_df = full_df[full_df['Date'] <= max_test_date].copy()

print(f"\nHistorical data up to test date:")
print(f"  Rows: {len(historical_df):,}")
print(f"  Date range: {historical_df['Date'].min()} to {historical_df['Date'].max()}")

# Sort by Ticker and Date
historical_df = historical_df.sort_values(['Ticker', 'Date']).reset_index(drop=True)

# Split the history once per ticker. Filtering the complete frame for every
# test row would make this cell quadratic in the size of the data; each group
# is already ordered by Date thanks to the sort above.
history_by_ticker = {
    ticker: frame
    for ticker, frame in historical_df.groupby('Ticker', sort=False)
}
empty_history = historical_df.iloc[0:0]

# For each row in test_submission_df, collect the last `sequence_length`
# days of history (up to and including the prediction date).
test_data_list = []

for _, row in test_submission_df.iterrows():
    test_id = row['ID']
    test_date = row['Date']

    # Direct match: the test ID is assumed to be the ticker itself
    ticker_history = history_by_ticker.get(test_id, empty_history)
    ticker_data = ticker_history[ticker_history['Date'] <= test_date]

    if len(ticker_data) >= sequence_length:
        # Get the last sequence_length days
        ticker_data = ticker_data.tail(sequence_length).copy()
        ticker_data['TestID'] = test_id
        ticker_data['TestDate'] = test_date
        test_data_list.append(ticker_data)
    else:
        # Not enough historical data - we'll handle this later
        print(f"⚠ Warning: {test_id} on {test_date} has only {len(ticker_data)} days (need {sequence_length})")

if test_data_list:
    test_df = pd.concat(test_data_list, ignore_index=True)
    print(f"\n✓ Prepared test data: {len(test_data_list):,} samples")
    print(f"  Total rows: {len(test_df):,}")
else:
    print("\n⚠ No test data prepared. Checking ID format...")
    print(f"Sample test IDs: {test_submission_df['ID'].head(10).tolist()}")
    print(f"Sample train tickers: {train_tickers[:10]}")

    # No ID matched a ticker; the next cell falls back to another data source.
    test_df = None

In [None]:
# Prefer test features that were already written to disk
test_features_path = processed_dir / "test_features.csv"

if not test_features_path.exists():
    print("No pre-processed test features found.")
    print("We need to create features for test data.")

    # Keep the test_df built in the previous cell when it holds rows;
    # otherwise fall back to the test split produced by the data loader.
    have_test_rows = test_df is not None and len(test_df) > 0
    if not have_test_rows:
        for message in (
            "\n⚠ Could not prepare test data automatically.",
            "Please ensure:",
            "  1. Test IDs match ticker format in training data",
            "  2. Historical data exists for all test dates",
            "  3. At least sequence_length days of history for each test sample",
        ):
            print(message)

        # This assumes test.csv matches the test split created during processing
        print("\nTrying alternative approach: using test split from training...")

        datasets = loader.process(train_ratio=0.7, val_ratio=0.15, test_ratio=0.15)
        test_df = datasets['test']
        print(f"✓ Using test split from training: {len(test_df):,} rows")
else:
    print("Found pre-processed test features. Loading...")
    test_df = pd.read_csv(test_features_path)
    test_df['Date'] = pd.to_datetime(test_df['Date'])
    print(f"✓ Loaded test features: {len(test_df):,} rows")
    print(f"  Columns: {len(test_df.columns)}")
    print(f"  Date range: {test_df['Date'].min()} to {test_df['Date'].max()}")

    # The rows still have to be matched to test_submission_df IDs and turned
    # into sequences; that happens in the later cells.

## 5. Apply Feature Engineering to Test Data

Apply the same feature engineering pipeline used during training.

In [None]:
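# Technical-indicator helpers mirroring the feature engineering notebook.
# NOTE: the full create_features pipeline is NOT recreated here; this cell only
# defines the indicator helpers and checks that test_df already has the features.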

def calculate_rsi(prices, window=14):
    """Return the Relative Strength Index of a price series.

    Gains and losses are averaged with a simple rolling mean over ``window``
    observations. Undefined values (warm-up rows, 0/0) are replaced by 50.
    """
    change = prices.diff()

    # where() also turns the leading NaN produced by diff() into 0.
    up_moves = change.where(change > 0, 0)
    down_moves = -change.where(change < 0, 0)

    avg_gain = up_moves.rolling(window=window).mean()
    avg_loss = down_moves.rolling(window=window).mean()

    relative_strength = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + relative_strength))
    return rsi.fillna(50)

def calculate_macd(prices, fast=12, slow=26, signal=9):
    """Return the MACD line, its signal line and the histogram (MACD - signal).

    All three averages are exponential moving averages with ``adjust=False``.
    """
    def _ema(series, span):
        return series.ewm(span=span, adjust=False).mean()

    macd_line = _ema(prices, fast) - _ema(prices, slow)
    signal_line = _ema(macd_line, signal)
    return macd_line, signal_line, macd_line - signal_line

def calculate_bollinger_bands(prices, window=20, num_std=2):
    """Return the (upper, middle, lower) Bollinger Bands of a price series.

    The middle band is the rolling mean over ``window`` observations; the
    outer bands sit ``num_std`` rolling standard deviations away from it.
    """
    windowed = prices.rolling(window=window)
    middle_band = windowed.mean()
    band_offset = windowed.std() * num_std
    return middle_band + band_offset, middle_band, middle_band - band_offset

# Check that test_df carries EVERY feature column the model expects.
# Checking only a handful of columns lets a partially-featured frame slip
# through and fail later with an opaque KeyError during sequence creation.
missing_features = [col for col in feature_cols if col not in test_df.columns]

if not missing_features:
    print("✓ Test data already has features")
else:
    # This notebook does not implement the full feature-creation pipeline,
    # so stop here with an actionable message instead of continuing silently.
    raise ValueError(
        f"Test data is missing {len(missing_features)} of {len(feature_cols)} "
        f"feature columns (first few: {missing_features[:5]}). "
        "Create the features with the feature engineering pipeline "
        f"(e.g. write {processed_dir / 'test_features.csv'}) and re-run this cell."
    )

# Load scaler
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(processed_dir / "scaler.pkl", "rb") as f:
    scaler = pickle.load(f)

# NOTE(review): the scaler is loaded but never applied to test_df in this
# notebook. Confirm that the test features are already scaled the same way as
# the training features; otherwise apply the scaler to feature_cols before predicting.
print(f"✓ Loaded scaler")
print(f"✓ Test data ready: {len(test_df):,} rows, {len(feature_cols)} features")

## 6. Create Sequences and Generate Predictions

In [None]:
# Create sequences for test data
def create_sequences_per_ticker(df, feature_cols, sequence_length):
    """Build one input sequence per ticker from its most recent rows.

    For every ticker with at least ``sequence_length`` rows, the rows are
    ordered by ``Date`` and the last ``sequence_length`` of them become one
    sequence. Tickers with fewer rows are skipped.

    Args:
        df: DataFrame with ``Ticker`` and ``Date`` columns plus ``feature_cols``.
        feature_cols: Names of the columns used as model features.
        sequence_length: Number of time steps in each sequence.

    Returns:
        A tuple ``(X, metadata)``. ``X`` has shape
        ``(n_sequences, sequence_length, len(feature_cols))`` -- also when no
        ticker qualifies, in which case ``n_sequences`` is 0. ``metadata`` is
        a list of ``{'Ticker': ..., 'Date': ...}`` dicts, one per sequence,
        where ``Date`` is the last date contained in that sequence.
    """
    X_list = []
    metadata_list = []  # Store (ticker, date) for each sequence

    # A single groupby pass replaces re-filtering the whole frame per ticker;
    # sort=False keeps tickers in order of first appearance.
    for ticker, group in df.groupby('Ticker', sort=False):
        ticker_data = group.sort_values('Date')
        features = ticker_data[feature_cols].values

        if len(features) < sequence_length:
            continue

        # For test data we only want the most recent window.
        X_list.append(features[-sequence_length:])
        metadata_list.append({
            'Ticker': ticker,
            'Date': ticker_data['Date'].iloc[-1],
        })

    if not X_list:
        # Keep the rank consistent so downstream shape handling still works.
        return np.empty((0, sequence_length, len(feature_cols))), metadata_list

    return np.array(X_list), metadata_list

print("Creating sequences for test data...")
# One sequence per ticker; test_metadata records the ticker and last date of each.
X_test, test_metadata = create_sequences_per_ticker(test_df, feature_cols, sequence_length)
print(f"✓ Created {len(X_test):,} sequences")
print(f"  Sequence shape: {X_test.shape}")

# Load and build model
print(f"\n{'='*60}")
print(f"Loading {best_model_name} Model")
print(f"{'='*60}")

# Model parameters
# These are read as globals by the build_*_model functions below.
# NOTE(review): presumably they must equal the values used in training so the
# saved parameters fit the rebuilt layers -- confirm against the training notebook.
input_size = len(feature_cols)
lstm_hidden = 64
gru_hidden = 64
conv_filters = 32
# Define model architectures (same as training)
def build_lstm_model():
    """Assemble the LSTM classifier: LSTM -> Dropout -> Dense -> ReLU -> Dense -> Sigmoid.

    Reads the module-level ``input_size`` and ``lstm_hidden`` settings.
    """
    head_units = 32
    model = NeuralNetwork(name="LSTM_Stock_Predictor")
    layer_stack = [
        ("lstm1", LSTM(input_size, lstm_hidden, return_sequences=False)),
        ("dropout1", Dropout(dropout_rate=0.3)),
        ("dense1", Dense(lstm_hidden, head_units)),
        ("relu1", ReLU()),
        ("dense2", Dense(head_units, 1)),
        ("sigmoid1", Sigmoid()),
    ]
    for layer_name, layer in layer_stack:
        model.add_layer(layer, name=layer_name)
    return model

def build_gru_model():
    """Assemble the GRU classifier: GRU -> Dropout -> Dense -> ReLU -> Dense -> Sigmoid.

    Reads the module-level ``input_size`` and ``gru_hidden`` settings.
    """
    head_units = 32
    model = NeuralNetwork(name="GRU_Stock_Predictor")
    layer_stack = [
        ("gru1", GRU(input_size, gru_hidden, return_sequences=False)),
        ("dropout1", Dropout(dropout_rate=0.3)),
        ("dense1", Dense(gru_hidden, head_units)),
        ("relu1", ReLU()),
        ("dense2", Dense(head_units, 1)),
        ("sigmoid1", Sigmoid()),
    ]
    for layer_name, layer in layer_stack:
        model.add_layer(layer, name=layer_name)
    return model

def build_conv1d_model():
    """Assemble the Conv1D classifier: two Conv/ReLU/MaxPool stages, then a dense head.

    Reads the module-level ``input_size``, ``conv_filters`` and
    ``sequence_length`` settings. The size of the first Dense layer is derived
    from the sequence length left after both pooling stages.
    """
    # Each pooling stage (pool_size=2, stride=2) shrinks the time axis.
    pooled_length = sequence_length
    for _ in range(2):
        pooled_length = (pooled_length - 2) // 2 + 1
    wide_filters = conv_filters * 2
    flattened_size = pooled_length * (conv_filters * 2)

    model = NeuralNetwork(name="Conv1D_Stock_Predictor")
    layer_stack = [
        ("conv1", Conv1D(input_size, conv_filters, kernel_size=3, padding='same')),
        ("relu1", ReLU()),
        ("pool1", MaxPooling1D(pool_size=2, stride=2)),
        ("conv2", Conv1D(conv_filters, wide_filters, kernel_size=3, padding='same')),
        ("relu2", ReLU()),
        ("pool2", MaxPooling1D(pool_size=2, stride=2)),
        ("flatten", Flatten()),
        ("dense1", Dense(flattened_size, 64)),
        ("relu3", ReLU()),
        ("dropout1", Dropout(dropout_rate=0.3)),
        ("dense2", Dense(64, 1)),
        ("sigmoid1", Sigmoid()),
    ]
    for layer_name, layer in layer_stack:
        model.add_layer(layer, name=layer_name)
    return model

def build_hybrid_model():
    """Assemble the hybrid classifier: one Conv/ReLU/MaxPool stage feeding an LSTM and a dense head.

    Reads the module-level ``input_size``, ``conv_filters`` and ``lstm_hidden`` settings.
    """
    head_units = 32
    model = NeuralNetwork(name="Hybrid_ConvLSTM_Stock_Predictor")
    layer_stack = [
        ("conv1", Conv1D(input_size, conv_filters, kernel_size=3, padding='same')),
        ("relu1", ReLU()),
        ("pool1", MaxPooling1D(pool_size=2, stride=2)),
        ("lstm1", LSTM(conv_filters, lstm_hidden, return_sequences=False)),
        ("dropout1", Dropout(dropout_rate=0.3)),
        ("dense1", Dense(lstm_hidden, head_units)),
        ("relu2", ReLU()),
        ("dense2", Dense(head_units, 1)),
        ("sigmoid1", Sigmoid()),
    ]
    for layer_name, layer in layer_stack:
        model.add_layer(layer, name=layer_name)
    return model

# Build the best model
# Dispatch table from the stored model name to its builder; a name that is
# not listed here raises KeyError at the lookup below.
model_builders = {
    'LSTM': build_lstm_model,
    'GRU': build_gru_model,
    'Conv1D': build_conv1d_model,
    'Hybrid': build_hybrid_model
}

model = model_builders[best_model_name]()
# The loss and optimizer are only needed to construct the Trainer; no training happens here.
loss_fn = BinaryCrossEntropy(from_logits=False)
model.set_loss(loss_fn)
optimizer = Adam(learning_rate=0.001)
trainer = Trainer(model, optimizer, loss_fn)

# Load best model parameters
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(model_dir / "best_model_params.pkl", "rb") as f:
    best_params = pickle.load(f)

model.set_params(best_params)
print(f"✓ Loaded {best_model_name} model")

# Generate predictions
print(f"\n{'='*60}")
print("Generating Predictions")
print(f"{'='*60}")

# NOTE(review): the architectures contain Dropout layers; presumably
# trainer.predict runs them in inference mode -- confirm in KhayatMiniNN.
predictions = trainer.predict(X_test)
# Threshold the model outputs at 0.5 to get 0/1 labels as a flat array.
predictions_binary = (predictions > 0.5).astype(int).flatten()

print(f"✓ Generated {len(predictions_binary):,} predictions")
print(f"  ↑ (Higher) predictions: {predictions_binary.sum():,} ({predictions_binary.mean()*100:.2f}%)")
print(f"  ↓ (Lower) predictions: {(1-predictions_binary).sum():,} ({(1-predictions_binary).mean()*100:.2f}%)")

In [None]:
# Create submission DataFrame
# The submission format is: ID, Pred (where Pred is 0 or 1).
# Predictions are matched to test rows by (ID, Date), assuming the test ID is
# the ticker itself.

if len(test_metadata) != len(predictions_binary):
    raise ValueError(
        f"Got {len(predictions_binary)} predictions for {len(test_metadata)} sequences"
    )

# Create a mapping from (Ticker, Date) to predictions
prediction_dict = {
    (meta['Ticker'], meta['Date']): pred
    for meta, pred in zip(test_metadata, predictions_binary)
}

submission_list = []
matched_count = 0

# Iterate the two needed columns directly instead of materialising a Series
# per row with iterrows().
for idx, test_id, test_date in zip(
    test_submission_df.index,
    test_submission_df['ID'],
    test_submission_df['Date'],
):
    key = (test_id, test_date)
    if key in prediction_dict:
        pred = prediction_dict[key]
        matched_count += 1
    else:
        # No prediction for this (ID, Date): fall back to the default (0).
        # This should be fixed based on the actual data structure.
        pred = 0
        if matched_count == 0 and idx < 10:  # Only warn for first few
            print(f"⚠ Could not match {test_id} on {test_date}")

    submission_list.append({
        'ID': test_id,
        'Pred': int(pred)
    })

# Explicit columns keep the frame well-formed even when there are no test rows.
submission_df = pd.DataFrame(submission_list, columns=['ID', 'Pred'])

unmatched_count = len(submission_df) - matched_count
if unmatched_count:
    # Make the silent default visible: these rows are NOT model predictions.
    print(f"\n⚠ {unmatched_count:,} rows had no matching prediction and default to 0")

print(f"\n{'='*60}")
print("Submission File Created")
print(f"{'='*60}")
print(f"Total predictions: {len(submission_df):,}")
print(f"Matched predictions: {matched_count:,}")
print(f"↑ (Higher) predictions: {submission_df['Pred'].sum():,} ({submission_df['Pred'].mean()*100:.2f}%)")
print(f"↓ (Lower) predictions: {(1-submission_df['Pred']).sum():,} ({(1-submission_df['Pred']).mean()*100:.2f}%)")

# Verify format
print(f"\nSubmission format check:")
print(f"  Columns: {submission_df.columns.tolist()}")
print(f"  Shape: {submission_df.shape}")
print(f"\nFirst 10 rows:")
print(submission_df.head(10))
print(f"\nLast 10 rows:")
print(submission_df.tail(10))

# Verify all IDs are present.
# Every test row is appended above, so both sets are expected to be empty;
# the checks are kept as a safety net.
expected_ids = set(test_submission_df['ID'].unique())
submission_ids = set(submission_df['ID'].unique())
missing_ids = expected_ids - submission_ids
extra_ids = submission_ids - expected_ids

if missing_ids:
    print(f"\n⚠ Warning: {len(missing_ids)} IDs missing from submission")
    print(f"  Sample missing IDs: {list(missing_ids)[:5]}")
if extra_ids:
    print(f"\n⚠ Warning: {len(extra_ids)} extra IDs in submission")
    print(f"  Sample extra IDs: {list(extra_ids)[:5]}")

# Ensure all test IDs are in submission (fill missing with default)
if missing_ids:
    print(f"\nFilling {len(missing_ids)} missing IDs with default prediction (0)...")
    # One concat for all missing IDs instead of one concat per ID.
    filler_df = pd.DataFrame({'ID': list(missing_ids), 'Pred': 0})
    submission_df = pd.concat([submission_df, filler_df], ignore_index=True)

# Sort by ID to match expected format
submission_df = submission_df.sort_values('ID').reset_index(drop=True)

# Verify predictions are 0 or 1 (explicit raise: assert is stripped under -O)
if not submission_df['Pred'].isin([0, 1]).all():
    raise ValueError("All predictions must be 0 or 1")
print(f"\n✓ All predictions are valid (0 or 1)")

In [None]:
# Save submission file
# Written one directory above the working directory, without the index column.
submission_file = Path("../submission.csv")
submission_df.to_csv(submission_file, index=False)

print(f"{'='*60}")
print("✓ Submission file saved!")
print(f"{'='*60}")
print(f"File: {submission_file.absolute()}")
print(f"Rows: {len(submission_df):,}")
print(f"Columns: {submission_df.columns.tolist()}")
print(f"\nSubmission file is ready for Kaggle upload!")

# Display sample
print(f"\n{'='*60}")
print("Sample Submission (first 20 rows)")
print(f"{'='*60}")
# to_string(index=False) prints the rows without the DataFrame index.
print(submission_df.head(20).to_string(index=False))

## 9. Submission Instructions

1. **Verify the submission file**:
   - Check that `submission.csv` has the correct format (ID, Pred)
   - Ensure all test IDs are included
   - Verify predictions are 0 or 1

2. **Upload to Kaggle**:
   - Go to the competition page
   - Click "Submit Predictions"
   - Upload `submission.csv`
   - Submit and check your score

3. **Note**: If the ID mapping doesn't work correctly, you may need to:
   - Check how test IDs map to tickers in your data
   - Adjust the matching logic in section 7
   - Ensure test data has sufficient historical context

## Summary

✅ Generated predictions using the best trained model  
✅ Created submission file: `submission.csv`  
✅ Ready for Kaggle submission!

**Next Steps:**
- Review submission file format
- Upload to Kaggle competition
- Monitor leaderboard score

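**Note:** The notebook does not map IDs automatically: it assumes each test ID is identical to a `Ticker` value in the training data and matches predictions by (ID, Date). Rows that cannot be matched receive the default prediction 0. If your test IDs use a different format, adjust the mapping logic in section 3 (and the matching logic in section 7) based on your specific data format.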