<a href="https://colab.research.google.com/github/NathanDietrich/Artificial-Intelligence-and-Machine-Learning-portfolio/blob/main/DataPipeLineRollingWindow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install yfinance textblob ta

Collecting ta
  Downloading ta-0.11.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ta
  Building wheel for ta (setup.py) ... [?25l[?25hdone
  Created wheel for ta: filename=ta-0.11.0-py3-none-any.whl size=29412 sha256=21cdb0937c20725cb6a7654e1a366c91a18d9b83d474c9939671074810d7b285
  Stored in directory: /root/.cache/pip/wheels/a1/d7/29/7781cc5eb9a3659d032d7d15bdd0f49d07d2b24fec29f44bc4
Successfully built ta
Installing collected packages: ta
Successfully installed ta-0.11.0


In [2]:
!pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [3]:
import os
import tensorflow as tf
from tensorflow.keras import mixed_precision

# ✅ Enable GPU & Force TensorFlow to Use It
gpu_devices = tf.config.list_physical_devices('GPU')
if gpu_devices:
    try:
        tf.config.experimental.set_memory_growth(gpu_devices[0], True)
        print(f"✅ GPU detected: {gpu_devices[0].name} (Memory Growth Enabled)")
    except:
        print("⚠️ GPU found, but could not enable memory growth.")
else:
    print("❌ No GPU detected. Running on CPU.")

# ✅ Enable Mixed Precision for Faster Training (Uses float16 on GPU)
mixed_precision.set_global_policy('mixed_float16')
print("✅ Mixed Precision Enabled (float16) for Faster GPU Training")

# ✅ Check GPU Usage Before Training
!nvidia-smi --query-gpu=memory.used,memory.total --format=csv

# ✅ Function to Monitor GPU Usage Live
def monitor_gpu():
    print("\n🔍 Checking GPU Usage...")
    os.system("nvidia-smi --query-gpu=memory.used,memory.total --format=csv")

monitor_gpu()

✅ GPU detected: /physical_device:GPU:0 (Memory Growth Enabled)
✅ Mixed Precision Enabled (float16) for Faster GPU Training
memory.used [MiB], memory.total [MiB]
2 MiB, 15360 MiB

🔍 Checking GPU Usage...


In [2]:
# %% [code]
import os
import re
import pickle
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import MinMaxScaler
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive', force_remount=True)

# ===== File selection functions =====
def get_valid_files(directory="/content/drive/MyDrive/StockData"):
    """Detects and lists valid raw stock data files in the specified directory."""
    if not os.path.exists(directory):
        print(f"❌ Directory {directory} not found. Check if Google Drive is mounted correctly.")
        return {}
    uploaded_files = os.listdir(directory)
    pattern = re.compile(r"^([A-Za-z0-9]+)_\d{4}-\d{2}-\d{2}_to_\d{4}-\d{2}-\d{2}_raw\.csv$", re.IGNORECASE)
    valid_files = {os.path.join(directory, f): pattern.match(f).group(1).upper()
                   for f in uploaded_files if pattern.match(f)}
    if not valid_files:
        print("❌ No valid raw stock data files found in the folder.")
    return valid_files

def select_file(directory="/content/drive/MyDrive/StockData"):
    """Selects a raw stock data file from the specified directory."""
    valid_files = get_valid_files(directory)
    if not valid_files:
        print("❌ No valid raw stock data files found. Please upload one to the folder.")
        return None, None
    if len(valid_files) == 1:
        full_path = list(valid_files.keys())[0]
        stock_ticker = valid_files[full_path]
        print(f"✅ Automatically selected: {os.path.basename(full_path)} ({stock_ticker})")
        return full_path, stock_ticker
    print("🔍 Multiple stock raw files detected. Please choose one:")
    full_paths = list(valid_files.keys())
    for i, full_path in enumerate(full_paths):
        print(f"{i + 1}. {os.path.basename(full_path)} ({valid_files[full_path]})")
    choice = int(input("Enter the number of the file to use: ")) - 1
    selected_path = full_paths[choice]
    stock_ticker = valid_files[selected_path]
    print(f"✅ Selected: {os.path.basename(selected_path)} ({stock_ticker})")
    return selected_path, stock_ticker

# ===== Technical Indicator Functions =====
def compute_sma(df, window=14):
    return df['Close'].rolling(window=window).mean()

def compute_ema(df, span=14):
    return df['Close'].ewm(span=span, adjust=False).mean()

def compute_rsi(df, window=14):
    delta = df['Close'].diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)
    avg_gain = gain.rolling(window=window, min_periods=window).mean()
    avg_loss = loss.rolling(window=window, min_periods=window).mean()
    rs = avg_gain / (avg_loss + 1e-10)
    rsi = 100 - (100 / (1 + rs))
    return rsi

def compute_macd(df, span_short=12, span_long=26, span_signal=9):
    ema_short = df['Close'].ewm(span=span_short, adjust=False).mean()
    ema_long = df['Close'].ewm(span=span_long, adjust=False).mean()
    macd_line = ema_short - ema_long
    signal_line = macd_line.ewm(span=span_signal, adjust=False).mean()
    return macd_line, signal_line

def compute_bollinger_bands(df, window=20, num_std=2):
    sma = df['Close'].rolling(window=window).mean()
    rstd = df['Close'].rolling(window=window).std()
    upper_band = sma + num_std * rstd
    lower_band = sma - num_std * rstd
    return sma, upper_band, lower_band

# ===== Preprocessing for Predictive Problem =====
def preprocess_data(df):
    # Convert Date column to datetime if needed
    if not np.issubdtype(df['Date'].dtype, np.datetime64):
        df['Date'] = pd.to_datetime(df['Date'])
    df.sort_values('Date', inplace=True)

    # Compute technical indicators
    df['SMA_14'] = compute_sma(df, window=14)
    df['EMA_14'] = compute_ema(df, span=14)
    df['RSI_14'] = compute_rsi(df, window=14)
    df['MACD'], df['MACD_Signal'] = compute_macd(df)
    bb_sma, bb_upper, bb_lower = compute_bollinger_bands(df)
    df['BB_Middle'] = bb_sma
    df['BB_Upper'] = bb_upper
    df['BB_Lower'] = bb_lower

    # Fill missing values and drop any remaining NAs
    df.ffill(inplace=True)
    df.bfill(inplace=True)
    df.dropna(inplace=True)

    # Shift target column by one day (predict tomorrow's Close)
    df['Close_tomorrow'] = df['Close'].shift(-1)
    df = df.iloc[:-1]  # Drop last row as it has no next day value

    # Drop the Date column (keep Close as a feature)
    df.drop(columns=['Date'], inplace=True)
    return df

def split_data(df, train_ratio=0.7, val_ratio=0.15):
    """Splits the DataFrame into train, validation, and test sets."""
    n = len(df)
    train_end = int(n * train_ratio)
    val_end = train_end + int(n * val_ratio)
    train_df = df.iloc[:train_end]
    val_df = df.iloc[train_end:val_end]
    test_df = df.iloc[val_end:]
    return train_df, val_df, test_df

def scale_train_val_test(train_df, val_df, test_df, target_col='Close_tomorrow'):
    """
    Separates the target column from features and fits separate MinMaxScalers
    on the training set for X and y. Then transforms the train, val, and test sets.
    """
    from sklearn.preprocessing import MinMaxScaler
    sentiment_cols = ['sentiment_polarity', 'sentiment_subjectivity']

    # For training set, separate target and features
    if target_col not in train_df.columns:
        raise ValueError(f"Target column '{target_col}' not found.")
    y_train = train_df[target_col]
    X_train = train_df.drop(columns=[target_col])

    # Determine feature columns for scaling (exclude sentiments)
    x_cols = [col for col in X_train.columns if col not in sentiment_cols]

    # Fit scaler on training features
    scaler_x = MinMaxScaler(feature_range=(-1, 1))
    X_train_scaled = scaler_x.fit_transform(X_train[x_cols])
    X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=x_cols, index=X_train.index)
    # Add sentiment columns unchanged
    for col in sentiment_cols:
        if col in X_train.columns:
            X_train_scaled_df[col] = X_train[col].values

    # Fit scaler on training target
    scaler_y = MinMaxScaler(feature_range=(-1, 1))
    y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1))

    # Transform function for val/test
    def transform_features(df_in):
        y_temp = df_in[target_col]
        X_temp = df_in.drop(columns=[target_col])
        X_temp_scaled = scaler_x.transform(X_temp[x_cols])
        X_temp_scaled_df = pd.DataFrame(X_temp_scaled, columns=x_cols, index=X_temp.index)
        for col in sentiment_cols:
            if col in X_temp.columns:
                X_temp_scaled_df[col] = X_temp[col].values
        y_temp_scaled = scaler_y.transform(y_temp.values.reshape(-1, 1))
        return X_temp_scaled_df, y_temp_scaled

    X_val_scaled_df, y_val_scaled = transform_features(val_df)
    X_test_scaled_df, y_test_scaled = transform_features(test_df)

    return (X_train_scaled_df, y_train_scaled,
            X_val_scaled_df, y_val_scaled,
            X_test_scaled_df, y_test_scaled,
            scaler_x, scaler_y)

# ===== New: Create Time-Series Sequences =====
def create_sequences(X, y, window_size=10):
    """
    Create sequences from features X and targets y.
    X: pandas DataFrame or numpy array of shape (n_samples, n_features)
    y: numpy array of shape (n_samples, 1)
    Returns:
      X_seq: numpy array of shape (n_samples - window_size, window_size, n_features)
      y_seq: numpy array of shape (n_samples - window_size, 1)
    """
    X_arr = X.values if isinstance(X, pd.DataFrame) else X
    sequences = []
    targets = []
    for i in range(len(X_arr) - window_size):
        sequences.append(X_arr[i:i+window_size])
        targets.append(y[i+window_size])  # predict the target following the window
    return np.array(sequences), np.array(targets)

# ===== New: Save Pipeline Outputs as Sequences =====
def save_pipeline_outputs_sequences(stock_ticker, scaler_x, scaler_y,
                                    X_train_seq, X_val_seq, X_test_seq,
                                    y_train_seq, y_val_seq, y_test_seq):
    """
    Saves the two scalers and sequence dataset splits in the stock folder.
    Overwrites the old files.
    """
    base_path = "/content/drive/MyDrive/stocks"
    stock_path = os.path.join(base_path, stock_ticker)
    os.makedirs(stock_path, exist_ok=True)

    # Save scalers (overwrite old files)
    scaler_x_path = os.path.join(stock_path, "scaler_x_stock.pkl")
    with open(scaler_x_path, "wb") as f:
        pickle.dump(scaler_x, f)
    print(f"✅ Saved scaler_x to {scaler_x_path}")

    scaler_y_path = os.path.join(stock_path, "scaler_y.pkl")
    with open(scaler_y_path, "wb") as f:
        pickle.dump(scaler_y, f)
    print(f"✅ Saved scaler_y to {scaler_y_path}")

    # Save sequence splits as .npy files
    np.save(os.path.join(stock_path, "X_train.npy"), X_train_seq)
    print(f"✅ Saved X_train.npy (sequences) to {stock_path}")
    np.save(os.path.join(stock_path, "X_val.npy"), X_val_seq)
    print(f"✅ Saved X_val.npy (sequences) to {stock_path}")
    np.save(os.path.join(stock_path, "X_test.npy"), X_test_seq)
    print(f"✅ Saved X_test.npy (sequences) to {stock_path}")

    np.save(os.path.join(stock_path, "y_train.npy"), y_train_seq)
    print(f"✅ Saved y_train.npy (sequences) to {stock_path}")
    np.save(os.path.join(stock_path, "y_val.npy"), y_val_seq)
    print(f"✅ Saved y_val.npy (sequences) to {stock_path}")
    np.save(os.path.join(stock_path, "y_test.npy"), y_test_seq)
    print(f"✅ Saved y_test.npy (sequences) to {stock_path}")

    print(f"✅ All sequence dataset splits saved in folder: {stock_path}")

# ===== New: Process All Stock Files =====
def process_all_stock_files():
    valid_files = get_valid_files()  # uses default directory "/content/drive/MyDrive/StockData"
    if not valid_files:
        return
    for filename, stock_ticker in valid_files.items():
        print("\n" + "="*50)
        print(f"Processing file: {os.path.basename(filename)} for stock {stock_ticker}")

        # Load raw data
        df = pd.read_csv(filename)

        # Preprocess the data (predict tomorrow's Close)
        df = preprocess_data(df)

        # Split the DataFrame into train, validation, and test sets
        train_df, val_df, test_df = split_data(df)

        # Scale train/val/test sets using the shifted target column "Close_tomorrow"
        (X_train_scaled_df, y_train_scaled,
         X_val_scaled_df, y_val_scaled,
         X_test_scaled_df, y_test_scaled,
         scaler_x, scaler_y) = scale_train_val_test(train_df, val_df, test_df, target_col="Close_tomorrow")

        # --- Create time-series sequences from the scaled data ---
        window_size = 10  # Adjust window size as needed
        X_train_seq, y_train_seq = create_sequences(X_train_scaled_df, y_train_scaled, window_size)
        X_val_seq, y_val_seq = create_sequences(X_val_scaled_df, y_val_scaled, window_size)
        X_test_seq, y_test_seq = create_sequences(X_test_scaled_df, y_test_scaled, window_size)

        # Save results (this will overwrite the old files in the stock folder)
        save_pipeline_outputs_sequences(stock_ticker, scaler_x, scaler_y,
                                        X_train_seq, X_val_seq, X_test_seq,
                                        y_train_seq, y_val_seq, y_test_seq)
        print(f"✅ Preprocessing pipeline complete with time-series sequences for {stock_ticker}")

# Run the pipeline for all stocks
process_all_stock_files()


Mounted at /content/drive

Processing file: QQQ_2021-01-01_to_2025-02-27_raw.csv for stock QQQ


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['Date'], inplace=True)


✅ Saved scaler_x to /content/drive/MyDrive/stocks/QQQ/scaler_x_stock.pkl
✅ Saved scaler_y to /content/drive/MyDrive/stocks/QQQ/scaler_y.pkl
✅ Saved X_train.npy (sequences) to /content/drive/MyDrive/stocks/QQQ
✅ Saved X_val.npy (sequences) to /content/drive/MyDrive/stocks/QQQ
✅ Saved X_test.npy (sequences) to /content/drive/MyDrive/stocks/QQQ
✅ Saved y_train.npy (sequences) to /content/drive/MyDrive/stocks/QQQ
✅ Saved y_val.npy (sequences) to /content/drive/MyDrive/stocks/QQQ
✅ Saved y_test.npy (sequences) to /content/drive/MyDrive/stocks/QQQ
✅ All sequence dataset splits saved in folder: /content/drive/MyDrive/stocks/QQQ
✅ Preprocessing pipeline complete with time-series sequences for QQQ

Processing file: SPY_2021-01-01_to_2025-03-04_raw.csv for stock SPY


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['Date'], inplace=True)


✅ Saved scaler_x to /content/drive/MyDrive/stocks/SPY/scaler_x_stock.pkl
✅ Saved scaler_y to /content/drive/MyDrive/stocks/SPY/scaler_y.pkl
✅ Saved X_train.npy (sequences) to /content/drive/MyDrive/stocks/SPY
✅ Saved X_val.npy (sequences) to /content/drive/MyDrive/stocks/SPY
✅ Saved X_test.npy (sequences) to /content/drive/MyDrive/stocks/SPY
✅ Saved y_train.npy (sequences) to /content/drive/MyDrive/stocks/SPY
✅ Saved y_val.npy (sequences) to /content/drive/MyDrive/stocks/SPY
✅ Saved y_test.npy (sequences) to /content/drive/MyDrive/stocks/SPY
✅ All sequence dataset splits saved in folder: /content/drive/MyDrive/stocks/SPY
✅ Preprocessing pipeline complete with time-series sequences for SPY

Processing file: TSLA_2021-01-01_to_2025-03-04_raw.csv for stock TSLA


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['Date'], inplace=True)


✅ Saved scaler_x to /content/drive/MyDrive/stocks/TSLA/scaler_x_stock.pkl
✅ Saved scaler_y to /content/drive/MyDrive/stocks/TSLA/scaler_y.pkl
✅ Saved X_train.npy (sequences) to /content/drive/MyDrive/stocks/TSLA
✅ Saved X_val.npy (sequences) to /content/drive/MyDrive/stocks/TSLA
✅ Saved X_test.npy (sequences) to /content/drive/MyDrive/stocks/TSLA
✅ Saved y_train.npy (sequences) to /content/drive/MyDrive/stocks/TSLA
✅ Saved y_val.npy (sequences) to /content/drive/MyDrive/stocks/TSLA
✅ Saved y_test.npy (sequences) to /content/drive/MyDrive/stocks/TSLA
✅ All sequence dataset splits saved in folder: /content/drive/MyDrive/stocks/TSLA
✅ Preprocessing pipeline complete with time-series sequences for TSLA

Processing file: MSFT_2021-01-01_to_2025-03-04_raw.csv for stock MSFT


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['Date'], inplace=True)


✅ Saved scaler_x to /content/drive/MyDrive/stocks/MSFT/scaler_x_stock.pkl
✅ Saved scaler_y to /content/drive/MyDrive/stocks/MSFT/scaler_y.pkl
✅ Saved X_train.npy (sequences) to /content/drive/MyDrive/stocks/MSFT
✅ Saved X_val.npy (sequences) to /content/drive/MyDrive/stocks/MSFT
✅ Saved X_test.npy (sequences) to /content/drive/MyDrive/stocks/MSFT
✅ Saved y_train.npy (sequences) to /content/drive/MyDrive/stocks/MSFT
✅ Saved y_val.npy (sequences) to /content/drive/MyDrive/stocks/MSFT
✅ Saved y_test.npy (sequences) to /content/drive/MyDrive/stocks/MSFT
✅ All sequence dataset splits saved in folder: /content/drive/MyDrive/stocks/MSFT
✅ Preprocessing pipeline complete with time-series sequences for MSFT

Processing file: AMZN_2021-01-01_to_2025-03-05_raw.csv for stock AMZN


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['Date'], inplace=True)


✅ Saved scaler_x to /content/drive/MyDrive/stocks/AMZN/scaler_x_stock.pkl
✅ Saved scaler_y to /content/drive/MyDrive/stocks/AMZN/scaler_y.pkl
✅ Saved X_train.npy (sequences) to /content/drive/MyDrive/stocks/AMZN
✅ Saved X_val.npy (sequences) to /content/drive/MyDrive/stocks/AMZN
✅ Saved X_test.npy (sequences) to /content/drive/MyDrive/stocks/AMZN
✅ Saved y_train.npy (sequences) to /content/drive/MyDrive/stocks/AMZN
✅ Saved y_val.npy (sequences) to /content/drive/MyDrive/stocks/AMZN
✅ Saved y_test.npy (sequences) to /content/drive/MyDrive/stocks/AMZN
✅ All sequence dataset splits saved in folder: /content/drive/MyDrive/stocks/AMZN
✅ Preprocessing pipeline complete with time-series sequences for AMZN

Processing file: CAT_2021-01-01_to_2025-03-05_raw.csv for stock CAT


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['Date'], inplace=True)


✅ Saved scaler_x to /content/drive/MyDrive/stocks/CAT/scaler_x_stock.pkl
✅ Saved scaler_y to /content/drive/MyDrive/stocks/CAT/scaler_y.pkl
✅ Saved X_train.npy (sequences) to /content/drive/MyDrive/stocks/CAT
✅ Saved X_val.npy (sequences) to /content/drive/MyDrive/stocks/CAT
✅ Saved X_test.npy (sequences) to /content/drive/MyDrive/stocks/CAT
✅ Saved y_train.npy (sequences) to /content/drive/MyDrive/stocks/CAT
✅ Saved y_val.npy (sequences) to /content/drive/MyDrive/stocks/CAT
✅ Saved y_test.npy (sequences) to /content/drive/MyDrive/stocks/CAT
✅ All sequence dataset splits saved in folder: /content/drive/MyDrive/stocks/CAT
✅ Preprocessing pipeline complete with time-series sequences for CAT

Processing file: TQQQ_2021-01-01_to_2025-03-05_raw.csv for stock TQQQ


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['Date'], inplace=True)


✅ Saved scaler_x to /content/drive/MyDrive/stocks/TQQQ/scaler_x_stock.pkl
✅ Saved scaler_y to /content/drive/MyDrive/stocks/TQQQ/scaler_y.pkl
✅ Saved X_train.npy (sequences) to /content/drive/MyDrive/stocks/TQQQ
✅ Saved X_val.npy (sequences) to /content/drive/MyDrive/stocks/TQQQ
✅ Saved X_test.npy (sequences) to /content/drive/MyDrive/stocks/TQQQ
✅ Saved y_train.npy (sequences) to /content/drive/MyDrive/stocks/TQQQ
✅ Saved y_val.npy (sequences) to /content/drive/MyDrive/stocks/TQQQ
✅ Saved y_test.npy (sequences) to /content/drive/MyDrive/stocks/TQQQ
✅ All sequence dataset splits saved in folder: /content/drive/MyDrive/stocks/TQQQ
✅ Preprocessing pipeline complete with time-series sequences for TQQQ

Processing file: AAPL_2021-01-01_to_2025-03-05_raw.csv for stock AAPL


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['Date'], inplace=True)


✅ Saved scaler_x to /content/drive/MyDrive/stocks/AAPL/scaler_x_stock.pkl
✅ Saved scaler_y to /content/drive/MyDrive/stocks/AAPL/scaler_y.pkl
✅ Saved X_train.npy (sequences) to /content/drive/MyDrive/stocks/AAPL
✅ Saved X_val.npy (sequences) to /content/drive/MyDrive/stocks/AAPL
✅ Saved X_test.npy (sequences) to /content/drive/MyDrive/stocks/AAPL
✅ Saved y_train.npy (sequences) to /content/drive/MyDrive/stocks/AAPL
✅ Saved y_val.npy (sequences) to /content/drive/MyDrive/stocks/AAPL
✅ Saved y_test.npy (sequences) to /content/drive/MyDrive/stocks/AAPL
✅ All sequence dataset splits saved in folder: /content/drive/MyDrive/stocks/AAPL
✅ Preprocessing pipeline complete with time-series sequences for AAPL

Processing file: F_2021-01-01_to_2025-03-05_raw.csv for stock F


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['Date'], inplace=True)


✅ Saved scaler_x to /content/drive/MyDrive/stocks/F/scaler_x_stock.pkl
✅ Saved scaler_y to /content/drive/MyDrive/stocks/F/scaler_y.pkl
✅ Saved X_train.npy (sequences) to /content/drive/MyDrive/stocks/F
✅ Saved X_val.npy (sequences) to /content/drive/MyDrive/stocks/F
✅ Saved X_test.npy (sequences) to /content/drive/MyDrive/stocks/F
✅ Saved y_train.npy (sequences) to /content/drive/MyDrive/stocks/F
✅ Saved y_val.npy (sequences) to /content/drive/MyDrive/stocks/F
✅ Saved y_test.npy (sequences) to /content/drive/MyDrive/stocks/F
✅ All sequence dataset splits saved in folder: /content/drive/MyDrive/stocks/F
✅ Preprocessing pipeline complete with time-series sequences for F

Processing file: BTC_2021-01-01_to_2025-03-06_raw.csv for stock BTC


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['Date'], inplace=True)


✅ Saved scaler_x to /content/drive/MyDrive/stocks/BTC/scaler_x_stock.pkl
✅ Saved scaler_y to /content/drive/MyDrive/stocks/BTC/scaler_y.pkl
✅ Saved X_train.npy (sequences) to /content/drive/MyDrive/stocks/BTC
✅ Saved X_val.npy (sequences) to /content/drive/MyDrive/stocks/BTC
✅ Saved X_test.npy (sequences) to /content/drive/MyDrive/stocks/BTC
✅ Saved y_train.npy (sequences) to /content/drive/MyDrive/stocks/BTC
✅ Saved y_val.npy (sequences) to /content/drive/MyDrive/stocks/BTC
✅ Saved y_test.npy (sequences) to /content/drive/MyDrive/stocks/BTC
✅ All sequence dataset splits saved in folder: /content/drive/MyDrive/stocks/BTC
✅ Preprocessing pipeline complete with time-series sequences for BTC

Processing file: MAIN_2021-01-01_to_2025-03-06_raw.csv for stock MAIN


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['Date'], inplace=True)


✅ Saved scaler_x to /content/drive/MyDrive/stocks/MAIN/scaler_x_stock.pkl
✅ Saved scaler_y to /content/drive/MyDrive/stocks/MAIN/scaler_y.pkl
✅ Saved X_train.npy (sequences) to /content/drive/MyDrive/stocks/MAIN
✅ Saved X_val.npy (sequences) to /content/drive/MyDrive/stocks/MAIN
✅ Saved X_test.npy (sequences) to /content/drive/MyDrive/stocks/MAIN
✅ Saved y_train.npy (sequences) to /content/drive/MyDrive/stocks/MAIN
✅ Saved y_val.npy (sequences) to /content/drive/MyDrive/stocks/MAIN
✅ Saved y_test.npy (sequences) to /content/drive/MyDrive/stocks/MAIN
✅ All sequence dataset splits saved in folder: /content/drive/MyDrive/stocks/MAIN
✅ Preprocessing pipeline complete with time-series sequences for MAIN

Processing file: PEP_2021-01-01_to_2025-03-06_raw.csv for stock PEP


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['Date'], inplace=True)


✅ Saved scaler_x to /content/drive/MyDrive/stocks/PEP/scaler_x_stock.pkl
✅ Saved scaler_y to /content/drive/MyDrive/stocks/PEP/scaler_y.pkl
✅ Saved X_train.npy (sequences) to /content/drive/MyDrive/stocks/PEP
✅ Saved X_val.npy (sequences) to /content/drive/MyDrive/stocks/PEP
✅ Saved X_test.npy (sequences) to /content/drive/MyDrive/stocks/PEP
✅ Saved y_train.npy (sequences) to /content/drive/MyDrive/stocks/PEP
✅ Saved y_val.npy (sequences) to /content/drive/MyDrive/stocks/PEP
✅ Saved y_test.npy (sequences) to /content/drive/MyDrive/stocks/PEP
✅ All sequence dataset splits saved in folder: /content/drive/MyDrive/stocks/PEP
✅ Preprocessing pipeline complete with time-series sequences for PEP


In [None]:
# %% [code]
import os
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import joblib
import json
import keras_tuner as kt
from datetime import datetime
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Conv1D, MaxPooling1D, Flatten,
                                     SimpleRNN, LSTM, Dense, Dropout,
                                     Concatenate, Multiply, Attention, Lambda)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# === Get List of Stock Folders ===
base_stocks_folder = '/content/drive/MyDrive/stocks'
stock_folders = [f for f in os.listdir(base_stocks_folder) if os.path.isdir(os.path.join(base_stocks_folder, f))]
if not stock_folders:
    print("No stock folders found in", base_stocks_folder)
    raise SystemExit

print("Processing the following stock tickers:")
for stock in stock_folders:
    print(f" - {stock}")

# === Checkpoint Setup ===
checkpoint_path = os.path.join(base_stocks_folder, "processed_stocks_rolling.txt")
if os.path.exists(checkpoint_path):
    with open(checkpoint_path, "r") as f:
        processed_stocks = f.read().splitlines()
else:
    processed_stocks = []

# Loop through each stock folder (process ALL stocks)
for selected_ticker in stock_folders:
    if selected_ticker in processed_stocks:
        print(f"Skipping {selected_ticker} (already processed).")
        continue

    print("\n" + "="*50)
    print(f"Processing stock: {selected_ticker}")

    stock_path = os.path.join(base_stocks_folder, selected_ticker)

    # === Define Ensemble Save Folder (inside stock folder) as EnsembleRolling ===
    ensemble_folder = os.path.join(stock_path, "EnsembleRolling")
    os.makedirs(ensemble_folder, exist_ok=True)

    # === Load Preprocessed Data from Stock Folder ===
    X_train = np.load(os.path.join(stock_path, "X_train.npy"))
    y_train = np.load(os.path.join(stock_path, "y_train.npy"))
    X_val   = np.load(os.path.join(stock_path, "X_val.npy"))
    y_val   = np.load(os.path.join(stock_path, "y_val.npy"))
    X_test  = np.load(os.path.join(stock_path, "X_test.npy"))
    y_test  = np.load(os.path.join(stock_path, "y_test.npy"))

    # Load scaler for target variable (for inverse scaling later)
    scaler_y = joblib.load(os.path.join(stock_path, "scaler_y.pkl"))

    print(f"✅ Data Loaded for {selected_ticker}:")
    print(f"   X_train: {X_train.shape}, y_train: {y_train.shape}")
    print(f"   X_val: {X_val.shape}, y_val: {y_val.shape}")
    print(f"   X_test: {X_test.shape}, y_test: {y_test.shape}")

    # === Reshape Data if Needed ===
    # If data is 2D, add a time-axis (samples, 1, features); if already time-series, do nothing.
    if X_train.ndim == 2:
        X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
        X_val   = X_val.reshape((X_val.shape[0], 1, X_val.shape[1]))
        X_test  = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))
    input_shape = (X_train.shape[1], X_train.shape[2])
    print(f"✅ Input shape for model: {input_shape}")

    # === Hyperparameter Tuning / Load Best Hyperparameters ===
    best_hps_path = os.path.join(ensemble_folder, "best_hyperparameters.json")
    tuning_flag_path = os.path.join(ensemble_folder, "hp_tuning_complete.flag")

    # Define the model building function for the sigmoid ensemble
    def build_ensemble_model(hp):
        inputs = Input(shape=input_shape)
        # Determine kernel size options based on timesteps
        if input_shape[0] == 1:
            kernel_size_options = [1]
            apply_pooling = False
        else:
            kernel_size_options = [3, 5, 7]
            apply_pooling = True

        # === CNN Branch ===
        cnn = Conv1D(filters=hp.Choice('cnn_filters', [64, 128, 256]),
                     kernel_size=hp.Choice('cnn_kernel_size', kernel_size_options),
                     activation='relu')(inputs)
        if apply_pooling:
            cnn = MaxPooling1D(pool_size=2)(cnn)
        cnn = Flatten()(cnn)
        # Sigmoid weight for CNN branch
        cnn_weight = Dense(1, activation='sigmoid')(cnn)

        # === RNN Branch ===
        rnn = SimpleRNN(units=hp.Choice('rnn_units', [75, 100, 125]), return_sequences=True)(inputs)
        rnn = SimpleRNN(units=hp.Choice('rnn_units_2', [75, 100, 125]), return_sequences=True)(rnn)
        rnn = Attention()([rnn, rnn])
        rnn = Flatten()(rnn)
        # Sigmoid weight for RNN branch
        rnn_weight = Dense(1, activation='sigmoid')(rnn)

        # === LSTM Branch ===
        lstm = LSTM(units=hp.Choice('lstm_units', [50, 75, 100]), return_sequences=True)(inputs)
        lstm = LSTM(units=hp.Choice('lstm_units_2', [50, 75, 100]), return_sequences=True)(lstm)
        lstm = Attention()([lstm, lstm])
        lstm = Flatten()(lstm)
        # Sigmoid weight for LSTM branch
        lstm_weight = Dense(1, activation='sigmoid')(lstm)

        # === Adaptive Weighted Fusion ===
        cnn_scaled = Multiply()([cnn, cnn_weight])
        rnn_scaled = Multiply()([rnn, rnn_weight])
        lstm_scaled = Multiply()([lstm, lstm_weight])
        merged = Concatenate()([cnn_scaled, rnn_scaled, lstm_scaled])
        merged = Dense(units=hp.Choice('dense_units', [50, 100, 150]),
                       activation="relu")(merged)
        merged = Dropout(hp.Choice('dropout_rate', [0.1, 0.2, 0.3]))(merged)
        output = Dense(1)(merged)

        model = Model(inputs, output)
        model.compile(
            optimizer=Adam(learning_rate=hp.Choice('learning_rate', [0.001, 0.0005, 0.0001])),
            loss="mse",
            metrics=["mae"]
        )
        return model

    if not os.path.exists(tuning_flag_path):
        # Force re-tuning the first time: remove existing hyperparameters if present
        if os.path.exists(best_hps_path):
            os.remove(best_hps_path)
        print(f"🔍 No tuning flag found for {selected_ticker}. Running hyperparameter tuning for Sigmoid Ensemble...")
        tuner = kt.RandomSearch(
            build_ensemble_model,
            objective="val_loss",
            max_trials=15,
            executions_per_trial=3,
            directory=os.path.join(ensemble_folder, "tuning"),
            project_name="stock_prediction_ensemble"
        )
        tuner.search(X_train, y_train, epochs=50, validation_data=(X_val, y_val), verbose=1)
        best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
        best_hps_dict = {param: best_hps.get(param) for param in best_hps.values.keys()}
        with open(best_hps_path, "w") as f:
            json.dump(best_hps_dict, f)
        # Create a flag file to indicate that tuning has been completed
        with open(tuning_flag_path, "w") as f:
            f.write("tuning complete")
        best_model = tuner.hypermodel.build(best_hps)
    else:
        print(f"✅ Loading best hyperparameters from file for {selected_ticker}")
        with open(best_hps_path, "r") as f:
            best_hps_dict = json.load(f)
        best_hps = kt.HyperParameters()
        for key, value in best_hps_dict.items():
            best_hps.Fixed(key, value)
        best_model = build_ensemble_model(best_hps)

    print(f"✅ Best hyperparameters for {selected_ticker}:")
    print(best_hps_dict)

    # === Build and Train the Best Model ===
    BATCH_SIZE = 32
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-5, verbose=1)
    early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    history = best_model.fit(
        X_train, y_train,
        epochs=500,
        batch_size=BATCH_SIZE,
        validation_data=(X_val, y_val),
        callbacks=[reduce_lr, early_stop],
        verbose=1
    )

    # === Save the Best Model and Training History ===
    best_model_path = os.path.join(ensemble_folder, "best_ensemble_model.keras")
    best_model.save(best_model_path)
    print(f"✅ Best Ensemble Model for {selected_ticker} saved to {best_model_path}")

    plt.figure(figsize=(12, 6))
    plt.plot(history.history['loss'], label='Train Loss', color='blue')
    plt.plot(history.history['val_loss'], label='Validation Loss', color='red')
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Training & Validation Loss")
    plt.legend()
    history_plot_path = os.path.join(ensemble_folder, "training_history.png")
    plt.savefig(history_plot_path)
    plt.show()
    print(f"✅ Training history graph for {selected_ticker} saved to {history_plot_path}")

    # === Evaluate the Model ===
    loss, mae = best_model.evaluate(X_test, y_test)
    print(f"✅ Best Model Test Loss for {selected_ticker}: {loss:.4f}")
    print(f"✅ Best Model Test MAE for {selected_ticker}: {mae:.4f}")

    # === Make Predictions and Inverse Scale ===
    predictions = best_model.predict(X_test)
    def inverse_transform_single_feature(scaler, data):
        data = np.array(data).reshape(-1, 1)
        return scaler.inverse_transform(data)
    predictions_rescaled = inverse_transform_single_feature(scaler_y, predictions)
    y_test_rescaled = inverse_transform_single_feature(scaler_y, y_test)

    # === Plot Predicted vs Actual Prices ===
    plt.figure(figsize=(12, 6))
    plt.plot(y_test_rescaled, label="Actual Price", color="blue")
    plt.plot(predictions_rescaled, label="Predicted Price", color="red", linestyle="dashed")
    plt.xlabel("Time")
    plt.ylabel("Stock Price")
    plt.title(f"{selected_ticker} - Predicted vs. Actual Stock Price")
    plt.legend()
    pred_vs_actual_path = os.path.join(ensemble_folder, "pred_vs_actual.png")
    plt.savefig(pred_vs_actual_path)
    plt.show()
    print(f"✅ Prediction vs. Actual plot for {selected_ticker} saved to {pred_vs_actual_path}")

    print(f"\n🎯 Sigmoid Ensemble Model Training & Prediction Complete for {selected_ticker}! 🚀")

    # Update checkpoint file
    processed_stocks.append(selected_ticker)
    with open(os.path.join(base_stocks_folder, "processed_stocks_rolling.txt"), "w") as f:
        for s in processed_stocks:
            f.write(s + "\n")
    print(f"✅ Checkpoint updated. Processed stocks: {processed_stocks}")
