In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import yfinance as yf
import time
import os
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Conv1D, MaxPooling1D, Flatten, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

# Set random seeds for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Create output directory
model_dir = "stock_models_enhanced"
os.makedirs(model_dir, exist_ok=True)

print("======= ENHANCED STOCK PREDICTION MODEL =======")
print("Step 1: Loading and preprocessing data...")

# Load stock data
try:
    start = dt.datetime(2010, 1, 1)
    end = dt.datetime.today()
    
    # Download data with retry mechanism
    max_attempts = 3
    for attempt in range(max_attempts):
        try:
            df = yf.download(tickers=['^GSPC'], start=start, end=end)
            break
        except Exception as e:
            if attempt < max_attempts - 1:
                print(f"Attempt {attempt+1} failed. Retrying...")
                time.sleep(2)
            else:
                raise Exception(f"Failed to download data after {max_attempts} attempts: {e}")
    
    if df.empty:
        raise Exception("Downloaded dataframe is empty")
    
    print(f"Successfully downloaded {len(df)} days of data")
    
    # Basic preprocessing
    df = df.dropna().reset_index()
    
    # Calculate returns instead of using raw prices
    df['return'] = df['Close'].pct_change()
    df['log_return'] = np.log(df['Close'] / df['Close'].shift(1))
    
    # Calculate volatility (20-day rolling standard deviation)
    df['volatility'] = df['log_return'].rolling(window=20).std()
    
    # Calculate moving averages
    df['sma_10'] = df['Close'].rolling(window=10).mean()
    df['sma_30'] = df['Close'].rolling(window=30).mean()
    
    # Calculate price momentum (rate of change)
    df['momentum_5'] = df['Close'].pct_change(periods=5)
    
    # Drop NaN values
    df = df.dropna().reset_index(drop=True)
    
    print(f"Data shape after preprocessing: {df.shape}")
    
except Exception as e:
    print(f"Error in data loading: {e}")
    exit(1)

print("\nStep 2: Preparing data for time series modeling...")

try:
    # Focus on predicting returns instead of absolute prices
    # This makes the prediction task more realistic and normalizes the target variable
    target_variable = 'return'
    
    # Select a reasonable set of features
    features = ['return', 'log_return', 'volatility', 
                'sma_10', 'sma_30', 'momentum_5',
                'Open', 'High', 'Low', 'Close', 'Volume']
    
    # Separate test set (last 20% of data)
    test_split = int(len(df) * 0.8)
    train_df = df.iloc[:test_split].copy()
    test_df = df.iloc[test_split:].copy()
    
    print(f"Training set: {len(train_df)} samples")
    print(f"Test set: {len(test_df)} samples")
    
    # Function to create sequences - specialized for each prediction horizon
    def create_sequences(data, features, target_col, seq_length, horizon):
        X, y = [], []
        feature_data = data[features].values
        target_data = data[target_col].values
        
        for i in range(seq_length, len(data) - horizon):
            X.append(feature_data[i-seq_length:i])
            # Target is the return value 'horizon' days ahead
            y.append(target_data[i + horizon - 1])
            
        return np.array(X), np.array(y)
    
except Exception as e:
    print(f"Error in data preparation: {e}")
    exit(1)

# Learning rate scheduler
def lr_scheduler(epoch, lr):
    if epoch < 10:
        return float(lr)  # Ensure the learning rate is a float
    else:
        return float(lr * tf.math.exp(-0.1))  # Ensure the learning rate is a float

def build_model_for_horizon(horizon, horizon_name):
    print(f"\n===== Building model for {horizon_name} prediction =====")
    
    try:
        # Parameters
        sequence_length = 20  # Use 20 days of data to predict
        
        # Create scaled datasets - scale each feature independently
        scaler_dict = {}
        scaled_train_data = train_df[features].copy()
        scaled_test_data = test_df[features].copy()
        
        for feature in features:
            scaler = StandardScaler()
            scaled_train_data[feature] = scaler.fit_transform(train_df[feature].values.reshape(-1, 1))
            scaled_test_data[feature] = scaler.transform(test_df[feature].values.reshape(-1, 1))
            scaler_dict[feature] = scaler
            
        # Store target scaler for inverse transformation later
        target_scaler = scaler_dict[target_variable]
            
        # Create sequences
        X_train, y_train = create_sequences(
            scaled_train_data, features, target_variable, sequence_length, horizon
        )
        
        X_test, y_test = create_sequences(
            scaled_test_data, features, target_variable, sequence_length, horizon
        )
        
        print(f"Training sequences: {X_train.shape}")
        print(f"Testing sequences: {X_test.shape}")
        
        # Build an enhanced model
        model = Sequential([
            Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], X_train.shape[2])),
            MaxPooling1D(pool_size=2),
            Bidirectional(LSTM(50, activation='tanh', recurrent_activation='sigmoid', return_sequences=True)),
            Dropout(0.2),
            Bidirectional(LSTM(30, activation='tanh', recurrent_activation='sigmoid')),
            Dropout(0.2),
            Dense(1)
        ])
        
        # Compile with appropriate loss for returns prediction
        initial_learning_rate = 0.001
        optimizer = Adam(learning_rate=initial_learning_rate)
        model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
        
        # Callbacks
        callbacks = [
            EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True),
            ModelCheckpoint(
                filepath=os.path.join(model_dir, f'model_{horizon_name.replace(" ", "_")}.keras'),
                monitor='val_loss', save_best_only=True
            ),
            LearningRateScheduler(lr_scheduler)
        ]
        
        # Train the model
        print(f"Training model for {horizon_name}...")
        history = model.fit(
            X_train, y_train,
            epochs=50,
            batch_size=32,
            validation_split=0.2,
            callbacks=callbacks,
            verbose=1
        )
        
        # Plot training history
        plt.figure(figsize=(10, 5))
        plt.plot(history.history['loss'], label='Training Loss')
        plt.plot(history.history['val_loss'], label='Validation Loss')
        plt.title(f'Training History - {horizon_name} Forecast')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()
        plt.savefig(os.path.join(model_dir, f'training_history_{horizon_name.replace(" ", "_")}.png'))
        
        # Evaluate the model
        print(f"\nEvaluating {horizon_name} forecast model...")
        
        # Make predictions
        y_pred = model.predict(X_test)
        
        # Convert predictions back to original scale
        y_test_orig = target_scaler.inverse_transform(y_test.reshape(-1, 1)).flatten()
        y_pred_orig = target_scaler.inverse_transform(y_pred).flatten()
        
        # Calculate metrics on original scale data
        mse = mean_squared_error(y_test_orig, y_pred_orig)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test_orig, y_pred_orig)
        r2 = r2_score(y_test_orig, y_pred_orig)
        
        # Calculate MAPE carefully to handle zeros or near-zeros
        mape = np.mean(np.abs((y_test_orig - y_pred_orig) / np.maximum(np.abs(y_test_orig), 1e-7))) * 100
        
        print(f"\n🔹 {horizon_name} Forecast Metrics:")
        print(f"   • Mean Squared Error (MSE): {mse:.6f}")
        print(f"   • Root Mean Squared Error (RMSE): {rmse:.6f}")
        print(f"   • Mean Absolute Error (MAE): {mae:.6f}")
        print(f"   • Mean Absolute Percentage Error (MAPE): {mape:.2f}%")
        print(f"   • R² Score: {r2:.4f}")
        
        # Plot predictions vs actual returns
        plt.figure(figsize=(12, 6))
        plt.plot(y_test_orig, label='Actual Returns', color='blue', alpha=0.6)
        plt.plot(y_pred_orig, label='Predicted Returns', color='red', linestyle='--')
        plt.title(f'{horizon_name} Return Prediction')
        plt.xlabel('Trading Days')
        plt.ylabel('Returns')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.savefig(os.path.join(model_dir, f'returns_prediction_{horizon_name.replace(" ", "_")}.png'))
        
        # Convert returns to actual price predictions
        last_prices = []
        predicted_prices = []
        actual_prices = []
        
        # Get price information
        test_prices = test_df['Close'].values
        
        for i in range(len(y_test_orig)):
            idx = i + sequence_length + horizon - 1
            if idx < len(test_prices):
                # For the first point, use the actual price as base
                if i == 0:
                    prev_price = test_prices[sequence_length - 1]
                else:
                    prev_price = test_prices[sequence_length + i - 2]
                
                # Actual price
                actual_price = test_prices[idx]
                
                # Predicted price based on return prediction
                predicted_return = y_pred_orig[i]
                predicted_price = prev_price * (1 + predicted_return)
                
                last_prices.append(prev_price)
                predicted_prices.append(predicted_price)
                actual_prices.append(actual_price)
        
        # Plot price predictions
        plt.figure(figsize=(12, 6))
        plt.plot(actual_prices, label='Actual Price', color='blue')
        plt.plot(predicted_prices, label='Predicted Price', color='red', linestyle='--')
        plt.title(f'{horizon_name} Price Prediction')
        plt.xlabel('Trading Days')
        plt.ylabel('Price ($)')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.savefig(os.path.join(model_dir, f'price_prediction_{horizon_name.replace(" ", "_")}.png'))
        
        # Calculate price prediction metrics
        price_mse = mean_squared_error(actual_prices, predicted_prices)
        price_rmse = np.sqrt(price_mse)
        price_mae = mean_absolute_error(actual_prices, predicted_prices)
        price_mape = np.mean(np.abs((np.array(actual_prices) - np.array(predicted_prices)) / np.array(actual_prices))) * 100
        price_r2 = r2_score(actual_prices, predicted_prices)
        
        print(f"\n🔹 {horizon_name} Price Prediction Metrics:")
        print(f"   • Mean Squared Error (MSE): {price_mse:.4f}")
        print(f"   • Root Mean Squared Error (RMSE): {price_rmse:.4f}")
        print(f"   • Mean Absolute Error (MAE): {price_mae:.4f}")
        print(f"   • Mean Absolute Percentage Error (MAPE): {price_mape:.2f}%")
        print(f"   • R² Score: {price_r2:.4f}")
        
        # Make future prediction
        last_sequence = scaled_test_data[features].values[-sequence_length:]
        last_sequence = last_sequence.reshape(1, sequence_length, len(features))
        predicted_return = model.predict(last_sequence)[0][0]
        
        # Convert to original scale
        predicted_return_orig = target_scaler.inverse_transform([[predicted_return]])[0][0]
        
        # Get the latest price
        latest_price = df['Close'].iloc[-1]
        
        # Calculate predicted price
        predicted_price = latest_price * (1 + predicted_return_orig)
        
        print(f"\n🔹 Future {horizon_name} Prediction:")
        print(f"   • Current Price: ${latest_price:.2f}")
        print(f"   • Predicted {horizon_name} Return: {predicted_return_orig*100:.2f}%")
        print(f"   • Predicted {horizon_name} Price: ${predicted_price:.2f}")
        
        # Return key metrics for comparison
        return {
            'horizon': horizon_name,
            'return_r2': r2,
            'price_r2': price_r2,
            'price_mape': price_mape,
            'model': model
        }
        
    except Exception as e:
        print(f"Error in {horizon_name} model: {e}")
        return None
# Train models for different prediction horizons
horizons = [
    (1, "1-day"),
    (5, "1-week"),
    (20, "1-month")
]

results = []
for horizon_days, horizon_name in horizons:
    result = build_model_for_horizon(horizon_days, horizon_name)
    if result:
        results.append(result)

# Compare model performance across horizons
if results:
    print("\n===== MODEL COMPARISON =====")
    for result in results:
        print(f"🔹 {result['horizon']}:")
        print(f"   • Return Prediction R²: {result['return_r2']:.4f}")
        print(f"   • Price Prediction R²: {result['price_r2']:.4f}")
        print(f"   • Price Prediction MAPE: {result['price_mape']:.2f}%")

print("\nModel training and evaluation complete. Check the 'stock_models_enhanced' directory for outputs.")

Step 1: Loading and preprocessing data...


[*********************100%***********************]  1 of 1 completed

Successfully downloaded 3823 days of data
Data shape after preprocessing: (3794, 12)

Step 2: Preparing data for time series modeling...
Training set: 3035 samples
Test set: 759 samples

===== Building model for 1-day prediction =====
Training sequences: (3014, 20, 11)
Testing sequences: (738, 20, 11)
Training model for 1-day...
Epoch 1/50



  super().__init__(


[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 15ms/step - loss: 0.7087 - mae: 0.5877 - val_loss: 2.0276 - val_mae: 0.8581 - learning_rate: 0.0010
Epoch 2/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - loss: 0.6946 - mae: 0.5809 - val_loss: 2.0272 - val_mae: 0.8588 - learning_rate: 0.0010
Epoch 3/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.6853 - mae: 0.5757 - val_loss: 2.0250 - val_mae: 0.8575 - learning_rate: 0.0010
Epoch 4/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.6813 - mae: 0.5724 - val_loss: 2.0270 - val_mae: 0.8611 - learning_rate: 0.0010
Epoch 5/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.6784 - mae: 0.5718 - val_loss: 2.0255 - val_mae: 0.8570 - learning_rate: 0.0010
Epoch 6/50
[1m76/76[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 0.6813 - mae: 0.5728 - val_loss: 2.0261 - val_ma

KeyboardInterrupt: 

In [19]:

from keras.models import load_model

# Load the model from a saved file
model = load_model(r"C:\Users\Gourish\Desktop\stockpredictor\models\stock_model_multihorizon_keras.keras")

# Print model summary
model.summary()


  trackable.load_own_variables(weights_store.get(inner_path))
