<a href="https://colab.research.google.com/github/NathanDietrich/Artificial-Intelligence-and-Machine-Learning-portfolio/blob/main/Pipeline_and_Daily.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install yfinance textblob



In [2]:
!pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [3]:
import os
import tensorflow as tf
from tensorflow.keras import mixed_precision

# ✅ Enable GPU & Force TensorFlow to Use It
gpu_devices = tf.config.list_physical_devices('GPU')
if gpu_devices:
    try:
        tf.config.experimental.set_memory_growth(gpu_devices[0], True)
        print(f"✅ GPU detected: {gpu_devices[0].name} (Memory Growth Enabled)")
    except:
        print("⚠️ GPU found, but could not enable memory growth.")
else:
    print("❌ No GPU detected. Running on CPU.")

# ✅ Enable Mixed Precision for Faster Training (Uses float16 on GPU)
mixed_precision.set_global_policy('mixed_float16')
print("✅ Mixed Precision Enabled (float16) for Faster GPU Training")

# ✅ Check GPU Usage Before Training
!nvidia-smi --query-gpu=memory.used,memory.total --format=csv

# ✅ Function to Monitor GPU Usage Live
def monitor_gpu():
    print("\n🔍 Checking GPU Usage...")
    os.system("nvidia-smi --query-gpu=memory.used,memory.total --format=csv")

monitor_gpu()

✅ GPU detected: /physical_device:GPU:0 (Memory Growth Enabled)
✅ Mixed Precision Enabled (float16) for Faster GPU Training
memory.used [MiB], memory.total [MiB]
2 MiB, 15360 MiB

🔍 Checking GPU Usage...


In [8]:
import os
import requests
import datetime
import pandas as pd
import time
from textblob import TextBlob
from google.colab import drive
from google.colab import userdata
userdata.get('Polygon_Key')

# Mount Google Drive for saving raw data
drive.mount('/content/drive')

# --- Provided functions for stock & sentiment data collection ---
def fetch_stock_data_polygon(ticker, start_date, end_date, api_key):
    """
    Fetches historical stock data from Polygon.io.
    """
    url = f"https://api.polygon.io/v2/aggs/ticker/{ticker}/range/1/day/{start_date}/{end_date}?apiKey={api_key}"
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching stock data for {ticker}: {response.text}")
        return None
    data = response.json()
    if "results" not in data:
        print(f"No results found for {ticker}.")
        return None
    df = pd.DataFrame(data["results"])
    df["Date"] = pd.to_datetime(df["t"], unit="ms").dt.date
    df.rename(columns={"o": "Open", "h": "High", "l": "Low", "c": "Close", "v": "Volume"}, inplace=True)
    df = df[["Date", "Open", "High", "Low", "Close", "Volume"]]
    return df

def fetch_sentiment_data_polygon(ticker, start_date, end_date, api_key, limit=1000):
    """
    Fetches sentiment data from Polygon.io in chunks and computes daily sentiment scores.
    """
    url = "https://api.polygon.io/v2/reference/news"
    all_results = []
    current_start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
    final_end_date = datetime.datetime.strptime(end_date, "%Y-%m-%d")
    while current_start_date < final_end_date:
        chunk_end_date = current_start_date + datetime.timedelta(days=30)
        if chunk_end_date > final_end_date:
            chunk_end_date = final_end_date
        chunk_start_str = current_start_date.strftime("%Y-%m-%d")
        chunk_end_str = chunk_end_date.strftime("%Y-%m-%d")
        print(f"📡 Fetching sentiment data for {ticker} from {chunk_start_str} to {chunk_end_str}...")
        params = {
            "ticker": ticker,
            "published_utc.gte": chunk_start_str,
            "published_utc.lte": chunk_end_str,
            "apiKey": api_key,
            "limit": limit
        }
        while True:
            response = requests.get(url, params=params)
            if response.status_code == 200:
                data = response.json()
                results = data.get("results", [])
                all_results.extend(results)
                next_cursor = data.get("next_cursor")
                if not next_cursor:
                    break
                params["cursor"] = next_cursor
            else:
                print(f"⚠️ Error fetching sentiment data for {ticker}: {response.status_code}, {response.text}")
                break
        current_start_date = chunk_end_date
        time.sleep(14)  # Avoid hitting API rate limits
    return all_results

def analyze_sentiment(news_data):
    """
    Uses TextBlob to compute sentiment polarity and subjectivity for each news article.
    """
    analyzed_data = []
    for article in news_data:
        title = article.get("title", "")
        description = article.get("description", "")
        full_text = f"{title} {description}"
        sentiment = TextBlob(full_text).sentiment
        analyzed_data.append({
            "title": title,
            "description": description,
            "published_date": article.get("published_utc", ""),
            "sentiment_polarity": sentiment.polarity,
            "sentiment_subjectivity": sentiment.subjectivity
        })
    return analyzed_data

def merge_stock_and_sentiment(stock_df, sentiment_data):
    """
    Merges stock data with sentiment data by date.
    """
    sentiment_df = pd.DataFrame(sentiment_data)
    if sentiment_df.empty:
        print("⚠️ No sentiment data found, proceeding without sentiment.")
        stock_df["sentiment_polarity"] = 0  # Default neutral
        stock_df["sentiment_subjectivity"] = 0
        return stock_df
    sentiment_df['published_date'] = pd.to_datetime(sentiment_df['published_date'], errors='coerce')
    sentiment_df['Date'] = sentiment_df['published_date'].dt.date
    daily_sentiment = sentiment_df.groupby('Date').agg({
        'sentiment_polarity': 'mean',
        'sentiment_subjectivity': 'mean'
    }).reset_index()
    merged_df = pd.merge(stock_df, daily_sentiment, on="Date", how="left")
    merged_df[['sentiment_polarity', 'sentiment_subjectivity']] = (
        merged_df[['sentiment_polarity', 'sentiment_subjectivity']]
        .replace(0, pd.NA)
        .ffill()
    )
    merged_df.fillna(0, inplace=True)
    return merged_df

# --- Main raw data collection for selected tickers ---
def collect_raw_data():
    # Set fixed start/end dates (adjust as needed)
    start_date = "2021-01-01"
    end_date = datetime.date.today().strftime("%Y-%m-%d")
    # List of tickers for stock + sentiment collection
    tickers = ["AAPL", "AMZN", "MSFT", "SPY", "QQQ"]

    # Load your Polygon API key (assume stored in Google Colab user data or environment)
    # For example, you can store it in a file or use environment variables.
    # Here, we simulate fetching it:
    api_key = userdata.get("Polygon_Key")
    if api_key == "YOUR_POLYGON_API_KEY":
        print("Please set your Polygon API key in the environment variable POLYGON_API_KEY")
        return

    for ticker in tickers:
        print(f"\n================== Processing {ticker} ==================")
        print(f"📊 Fetching stock data for {ticker} from {start_date} to {end_date}...")
        stock_df = fetch_stock_data_polygon(ticker, start_date, end_date, api_key)
        if stock_df is None:
            print(f"❌ No stock data found for {ticker}. Skipping.")
            continue
        print(f"📰 Fetching sentiment data for {ticker} from {start_date} to {end_date}...")
        news_data = fetch_sentiment_data_polygon(ticker, start_date, end_date, api_key, limit=1000)
        if not news_data:
            print(f"⚠️ No news data found for {ticker}. Proceeding without sentiment data.")
        print("💡 Performing sentiment analysis...")
        sentiment_data = analyze_sentiment(news_data)
        print("🔗 Merging stock and sentiment data...")
        merged_df = merge_stock_and_sentiment(stock_df, sentiment_data)
        # Save raw merged data to Google Drive
        save_dir = "/content/drive/MyDrive/StockData"
        os.makedirs(save_dir, exist_ok=True)
        filename = os.path.join(save_dir, f"{ticker}_{start_date}_to_{end_date}_raw.csv")
        merged_df.to_csv(filename, index=False)
        print(f"✅ Raw data for {ticker} saved to: {filename}")

# Run raw data collection
collect_raw_data()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

📊 Fetching stock data for AAPL from 2021-01-01 to 2025-03-16...
📰 Fetching sentiment data for AAPL from 2021-01-01 to 2025-03-16...
📡 Fetching sentiment data for AAPL from 2021-01-01 to 2021-01-31...
📡 Fetching sentiment data for AAPL from 2021-01-31 to 2021-03-02...
📡 Fetching sentiment data for AAPL from 2021-03-02 to 2021-04-01...
📡 Fetching sentiment data for AAPL from 2021-04-01 to 2021-05-01...
📡 Fetching sentiment data for AAPL from 2021-05-01 to 2021-05-31...
📡 Fetching sentiment data for AAPL from 2021-05-31 to 2021-06-30...
📡 Fetching sentiment data for AAPL from 2021-06-30 to 2021-07-30...
📡 Fetching sentiment data for AAPL from 2021-07-30 to 2021-08-29...
📡 Fetching sentiment data for AAPL from 2021-08-29 to 2021-09-28...
📡 Fetching sentiment data for AAPL from 2021-09-28 to 2021-10-28...
📡 Fetching sentiment data for AAPL from 2021-10-28 to 2021

  .ffill()


📰 Fetching sentiment data for AMZN from 2021-01-01 to 2025-03-16...
📡 Fetching sentiment data for AMZN from 2021-01-01 to 2021-01-31...
📡 Fetching sentiment data for AMZN from 2021-01-31 to 2021-03-02...
📡 Fetching sentiment data for AMZN from 2021-03-02 to 2021-04-01...
📡 Fetching sentiment data for AMZN from 2021-04-01 to 2021-05-01...
📡 Fetching sentiment data for AMZN from 2021-05-01 to 2021-05-31...
📡 Fetching sentiment data for AMZN from 2021-05-31 to 2021-06-30...
📡 Fetching sentiment data for AMZN from 2021-06-30 to 2021-07-30...
📡 Fetching sentiment data for AMZN from 2021-07-30 to 2021-08-29...
📡 Fetching sentiment data for AMZN from 2021-08-29 to 2021-09-28...
📡 Fetching sentiment data for AMZN from 2021-09-28 to 2021-10-28...
📡 Fetching sentiment data for AMZN from 2021-10-28 to 2021-11-27...
📡 Fetching sentiment data for AMZN from 2021-11-27 to 2021-12-27...
📡 Fetching sentiment data for AMZN from 2021-12-27 to 2022-01-26...
📡 Fetching sentiment data for AMZN from 2022-01-

  .ffill()


📰 Fetching sentiment data for SPY from 2021-01-01 to 2025-03-16...
📡 Fetching sentiment data for SPY from 2021-01-01 to 2021-01-31...
📡 Fetching sentiment data for SPY from 2021-01-31 to 2021-03-02...
📡 Fetching sentiment data for SPY from 2021-03-02 to 2021-04-01...
📡 Fetching sentiment data for SPY from 2021-04-01 to 2021-05-01...
📡 Fetching sentiment data for SPY from 2021-05-01 to 2021-05-31...
📡 Fetching sentiment data for SPY from 2021-05-31 to 2021-06-30...
📡 Fetching sentiment data for SPY from 2021-06-30 to 2021-07-30...
📡 Fetching sentiment data for SPY from 2021-07-30 to 2021-08-29...
📡 Fetching sentiment data for SPY from 2021-08-29 to 2021-09-28...
📡 Fetching sentiment data for SPY from 2021-09-28 to 2021-10-28...
📡 Fetching sentiment data for SPY from 2021-10-28 to 2021-11-27...
📡 Fetching sentiment data for SPY from 2021-11-27 to 2021-12-27...
📡 Fetching sentiment data for SPY from 2021-12-27 to 2022-01-26...
📡 Fetching sentiment data for SPY from 2022-01-26 to 2022-02-2

  .ffill()


📰 Fetching sentiment data for QQQ from 2021-01-01 to 2025-03-16...
📡 Fetching sentiment data for QQQ from 2021-01-01 to 2021-01-31...
📡 Fetching sentiment data for QQQ from 2021-01-31 to 2021-03-02...
📡 Fetching sentiment data for QQQ from 2021-03-02 to 2021-04-01...
📡 Fetching sentiment data for QQQ from 2021-04-01 to 2021-05-01...
📡 Fetching sentiment data for QQQ from 2021-05-01 to 2021-05-31...
📡 Fetching sentiment data for QQQ from 2021-05-31 to 2021-06-30...
📡 Fetching sentiment data for QQQ from 2021-06-30 to 2021-07-30...
📡 Fetching sentiment data for QQQ from 2021-07-30 to 2021-08-29...
📡 Fetching sentiment data for QQQ from 2021-08-29 to 2021-09-28...
📡 Fetching sentiment data for QQQ from 2021-09-28 to 2021-10-28...
📡 Fetching sentiment data for QQQ from 2021-10-28 to 2021-11-27...
📡 Fetching sentiment data for QQQ from 2021-11-27 to 2021-12-27...
📡 Fetching sentiment data for QQQ from 2021-12-27 to 2022-01-26...
📡 Fetching sentiment data for QQQ from 2022-01-26 to 2022-02-2

  .ffill()


In [13]:
import pandas as pd

def calculate_technical_indicators(df):
    """
    Adds technical indicators: SMA, EMA, RSI, MACD, and MACD Signal.
    Drops edge cases (rows with NaN) after calculation.
    Version: 2025-03-15
    """
    df['SMA_10'] = df['Close'].rolling(window=10).mean()
    df['SMA_20'] = df['Close'].rolling(window=20).mean()
    df['EMA_10'] = df['Close'].ewm(span=10, adjust=False).mean()
    df['EMA_20'] = df['Close'].ewm(span=20, adjust=False).mean()

    delta = df['Close'].diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)
    avg_gain = gain.rolling(window=14, min_periods=14).mean()
    avg_loss = loss.rolling(window=14, min_periods=14).mean()
    rs = avg_gain / avg_loss
    df['RSI'] = 100 - (100 / (1 + rs))

    ema12 = df['Close'].ewm(span=12, adjust=False).mean()
    ema26 = df['Close'].ewm(span=26, adjust=False).mean()
    df['MACD'] = ema12 - ema26
    df['MACD_Signal'] = df['MACD'].ewm(span=9, adjust=False).mean()

    # Drop edge cases that have NaN values (e.g., initial rows)
    df.dropna(inplace=True)

    return df

def preprocess_and_save(ticker, raw_filepath):
    """
    Loads raw CSV, calculates technical indicators, drops edge cases and the Date column, then saves the processed file.
    Version: 2025-03-15
    """
    df = pd.read_csv(raw_filepath, parse_dates=["Date"])
    df = calculate_technical_indicators(df)
    # Drop the Date column as it's not used for training
    df.drop(columns=["Date"], inplace=True, errors="ignore")

    processed_filepath = raw_filepath.replace("_raw.csv", "_processed.csv")
    df.to_csv(processed_filepath, index=False)
    print(f"✅ Processed data for {ticker} saved to: {processed_filepath}")
    return processed_filepath

import glob, os
raw_files = glob.glob("/content/drive/MyDrive/StockData/*_raw.csv")
for file in raw_files:
    ticker = os.path.basename(file).split("_")[0]
    preprocess_and_save(ticker, file)


✅ Processed data for AAPL saved to: /content/drive/MyDrive/StockData/AAPL_2021-01-01_to_2025-03-16_processed.csv
✅ Processed data for AMZN saved to: /content/drive/MyDrive/StockData/AMZN_2021-01-01_to_2025-03-16_processed.csv
✅ Processed data for MSFT saved to: /content/drive/MyDrive/StockData/MSFT_2021-01-01_to_2025-03-16_processed.csv
✅ Processed data for SPY saved to: /content/drive/MyDrive/StockData/SPY_2021-01-01_to_2025-03-16_processed.csv
✅ Processed data for QQQ saved to: /content/drive/MyDrive/StockData/QQQ_2021-01-01_to_2025-03-16_processed.csv


In [16]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import joblib
import os
import datetime

def scale_data(df, target_col='Close', exclude_cols=['sentiment_polarity', 'sentiment_subjectivity']):
    """
    Scales numeric columns (excluding exclude_cols and target_col for separate scaler).
    Returns the scaled dataframe, a features scaler, and a target scaler.
    Version: 2025-03-15
    """
    # Features: all numeric columns except target and excluded sentiment columns
    feature_cols = [col for col in df.select_dtypes(include=['float64','int64']).columns
                    if col not in exclude_cols + [target_col]]
    scaler_features = MinMaxScaler()
    df_features_scaled = df.copy()
    df_features_scaled[feature_cols] = scaler_features.fit_transform(df_features_scaled[feature_cols])

    # Target scaler: scale the target column separately
    scaler_target = MinMaxScaler()
    df_features_scaled[target_col] = scaler_target.fit_transform(df_features_scaled[[target_col]])

    return df_features_scaled, scaler_features, scaler_target

def scale_and_save(ticker, processed_filepath):
    """
    Loads processed data, scales it, and saves the feature and target scalers with the ticker and current date.
    Version: 2025-03-15
    """
    df = pd.read_csv(processed_filepath)
    df_scaled, scaler_features, scaler_target = scale_data(df)

    scaler_dir = "/content/drive/MyDrive/StockScalers"
    os.makedirs(scaler_dir, exist_ok=True)
    today_str = datetime.date.today().strftime("%Y%m%d")
    scaler_features_path = os.path.join(scaler_dir, f"{ticker}_{today_str}_features_scaler.pkl")
    scaler_target_path = os.path.join(scaler_dir, f"{ticker}_{today_str}_target_scaler.pkl")
    joblib.dump(scaler_features, scaler_features_path)
    joblib.dump(scaler_target, scaler_target_path)
    print(f"✅ Feature scaler for {ticker} saved to: {scaler_features_path}")
    print(f"✅ Target scaler for {ticker} saved to: {scaler_target_path}")

    scaled_filepath = processed_filepath.replace("_processed.csv", "_scaled.csv")
    df_scaled.to_csv(scaled_filepath, index=False)
    print(f"✅ Scaled data for {ticker} saved to: {scaled_filepath}")
    return scaled_filepath, scaler_features, scaler_target

import glob, os
processed_files = glob.glob("/content/drive/MyDrive/StockData/*_processed.csv")
for file in processed_files:
    ticker = os.path.basename(file).split("_")[0]
    scale_and_save(ticker, file)


✅ Feature scaler for AAPL saved to: /content/drive/MyDrive/StockScalers/AAPL_20250316_features_scaler.pkl
✅ Target scaler for AAPL saved to: /content/drive/MyDrive/StockScalers/AAPL_20250316_target_scaler.pkl
✅ Scaled data for AAPL saved to: /content/drive/MyDrive/StockData/AAPL_2021-01-01_to_2025-03-16_scaled.csv
✅ Feature scaler for AMZN saved to: /content/drive/MyDrive/StockScalers/AMZN_20250316_features_scaler.pkl
✅ Target scaler for AMZN saved to: /content/drive/MyDrive/StockScalers/AMZN_20250316_target_scaler.pkl
✅ Scaled data for AMZN saved to: /content/drive/MyDrive/StockData/AMZN_2021-01-01_to_2025-03-16_scaled.csv
✅ Feature scaler for MSFT saved to: /content/drive/MyDrive/StockScalers/MSFT_20250316_features_scaler.pkl
✅ Target scaler for MSFT saved to: /content/drive/MyDrive/StockScalers/MSFT_20250316_target_scaler.pkl
✅ Scaled data for MSFT saved to: /content/drive/MyDrive/StockData/MSFT_2021-01-01_to_2025-03-16_scaled.csv
✅ Feature scaler for SPY saved to: /content/drive/My

In [18]:
import glob
import os
import numpy as np
import pandas as pd

# Same helper function for creating sequences:
def create_sequences(df, feature_cols, label_col='Close', sequence_length=60):
    """
    Creates training sequences from the DataFrame.
    Returns X with shape (num_samples, sequence_length, num_features) and y with shape (num_samples,).
    Version: 2025-03-16
    """
    data_array = df[feature_cols].values
    labels = df[label_col].values
    X, y = [], []
    for i in range(len(df) - sequence_length):
        seq_x = data_array[i : i + sequence_length]
        seq_y = labels[i + sequence_length]
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y)

# Customize these to match your columns and desired window length
feature_cols = [
    'Open', 'High', 'Low', 'Close', 'Volume',
    'SMA_10', 'SMA_20', 'EMA_10', 'EMA_20', 'RSI',
    'MACD', 'MACD_Signal', 'sentiment_polarity', 'sentiment_subjectivity'
]
sequence_length = 60

# Location of your scaled CSV files
scaled_files = glob.glob("/content/drive/MyDrive/StockData/*_scaled.csv")

for file in scaled_files:
    filename = os.path.basename(file)  # e.g. "AAPL_2021-01-01_to_2025-03-16_scaled.csv"
    # Parse the ticker from the first underscore-delimited segment
    # e.g. "AAPL" from "AAPL_2021-01-01_to_2025-03-16_scaled.csv"
    ticker = filename.split("_")[0]

    print(f"\n=== Building sequences for {ticker} from file: {filename} ===")

    # Load the scaled CSV
    df_scaled = pd.read_csv(file)

    # Create sequences
    X_all, y_all = create_sequences(df_scaled, feature_cols, label_col='Close', sequence_length=sequence_length)
    total_samples = len(X_all)
    if total_samples == 0:
        print(f"⚠️ Not enough data to create sequences for {ticker}. Skipping.")
        continue

    # Train/Validation/Test Split (70/15/15)
    train_end = int(total_samples * 0.7)
    val_end = int(total_samples * 0.85)

    X_train = X_all[:train_end]
    y_train = y_all[:train_end]
    X_val   = X_all[train_end:val_end]
    y_val   = y_all[train_end:val_end]
    X_test  = X_all[val_end:]
    y_test  = y_all[val_end:]

    # Save the .npy files (so Code Cell 5 can load them)
    # You can choose any folder structure. Here, we’ll save in the same folder as the scaled CSV.
    save_dir = os.path.dirname(file)
    np.save(os.path.join(save_dir, f"{ticker}_X_train.npy"), X_train)
    np.save(os.path.join(save_dir, f"{ticker}_y_train.npy"), y_train)
    np.save(os.path.join(save_dir, f"{ticker}_X_val.npy"), X_val)
    np.save(os.path.join(save_dir, f"{ticker}_y_val.npy"), y_val)
    np.save(os.path.join(save_dir, f"{ticker}_X_test.npy"), X_test)
    np.save(os.path.join(save_dir, f"{ticker}_y_test.npy"), y_test)

    print(f"✅ Sequences created and saved for {ticker}.")
    print(f"   X_train: {X_train.shape}, y_train: {y_train.shape}")
    print(f"   X_val:   {X_val.shape}, y_val:   {y_val.shape}")
    print(f"   X_test:  {X_test.shape}, y_test:  {y_test.shape}")



=== Building sequences for AAPL from file: AAPL_2021-01-01_to_2025-03-16_scaled.csv ===
✅ Sequences created and saved for AAPL.
   X_train: (294, 60, 14), y_train: (294,)
   X_val:   (63, 60, 14), y_val:   (63,)
   X_test:  (64, 60, 14), y_test:  (64,)

=== Building sequences for AMZN from file: AMZN_2021-01-01_to_2025-03-16_scaled.csv ===
✅ Sequences created and saved for AMZN.
   X_train: (294, 60, 14), y_train: (294,)
   X_val:   (63, 60, 14), y_val:   (63,)
   X_test:  (64, 60, 14), y_test:  (64,)

=== Building sequences for MSFT from file: MSFT_2021-01-01_to_2025-03-16_scaled.csv ===
✅ Sequences created and saved for MSFT.
   X_train: (294, 60, 14), y_train: (294,)
   X_val:   (63, 60, 14), y_val:   (63,)
   X_test:  (64, 60, 14), y_test:  (64,)

=== Building sequences for SPY from file: SPY_2021-01-01_to_2025-03-16_scaled.csv ===
✅ Sequences created and saved for SPY.
   X_train: (294, 60, 14), y_train: (294,)
   X_val:   (63, 60, 14), y_val:   (63,)
   X_test:  (64, 60, 14), y_

In [19]:
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import joblib
import keras_tuner as kt
import tensorflow as tf
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (Input, Conv1D, MaxPooling1D, Flatten, Dense, Dropout,
                                     SimpleRNN, LSTM, Concatenate, Multiply, Activation, Lambda)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
from matplotlib.lines import Line2D

# ----------------------------------------------------
# Helper: inverse transform a single feature with a scaler
# ----------------------------------------------------
def inverse_transform_single_feature(scaler, data):
    """
    Inverse-transforms a single feature using the given scaler.
    Version: 2025-03-16
    """
    data = np.array(data).reshape(-1, 1)
    return scaler.inverse_transform(data)

# ----------------------------------------------------
# Helper: Build the Ensemble Model (CNN, RNN, LSTM)
# ----------------------------------------------------
def build_ensemble_model(hp, input_shape):
    """
    Builds an ensemble model combining CNN, RNN, and LSTM branches with adaptive fusion.
    Version: 2025-03-16
    """
    inputs = Input(shape=input_shape)

    # --- CNN Branch ---
    cnn = Conv1D(
        filters=hp.Choice('cnn_filters', [64, 128, 256]),
        kernel_size=hp.Choice('cnn_kernel_size', [3, 5, 7]),
        activation='relu',
        padding='same'
    )(inputs)
    if input_shape[0] > 1:
        cnn = MaxPooling1D(pool_size=2)(cnn)
    cnn = Flatten()(cnn)
    cnn = Dense(50, activation='relu')(cnn)

    # --- RNN Branch ---
    rnn = SimpleRNN(units=hp.Choice('rnn_units', [75, 100, 125, 150]), return_sequences=True)(inputs)
    rnn = Dropout(hp.Choice('dropout_rate', [0.05, 0.1, 0.2]))(rnn)
    rnn = SimpleRNN(units=hp.Choice('rnn_units_2', [75, 100, 125, 150]))(rnn)
    rnn = Dropout(hp.Choice('dropout_rate_2', [0.05, 0.1, 0.2]))(rnn)
    rnn = Dense(50, activation='relu')(rnn)

    # --- LSTM Branch ---
    lstm = LSTM(units=hp.Choice('lstm_units', [50, 75, 100]), return_sequences=True)(inputs)
    lstm = LSTM(units=hp.Choice('lstm_units_2', [50, 75, 100]))(lstm)
    lstm = Dense(50, activation='relu')(lstm)
    lstm = Dropout(hp.Choice('dropout_rate_lstm', [0.1, 0.2, 0.3]))(lstm)

    # --- Adaptive Fusion ---
    combined = Concatenate()([cnn, rnn, lstm])
    weight_logits = Dense(3)(combined)
    branch_weights = Activation('softmax')(weight_logits)
    cnn_weight  = Lambda(lambda x: tf.reshape(x[:, 0], (-1, 1)))(branch_weights)
    rnn_weight  = Lambda(lambda x: tf.reshape(x[:, 1], (-1, 1)))(branch_weights)
    lstm_weight = Lambda(lambda x: tf.reshape(x[:, 2], (-1, 1)))(branch_weights)
    cnn_scaled  = Multiply()([cnn, cnn_weight])
    rnn_scaled  = Multiply()([rnn, rnn_weight])
    lstm_scaled = Multiply()([lstm, lstm_weight])
    merged = Concatenate()([cnn_scaled, rnn_scaled, lstm_scaled])

    # --- Dense Layers ---
    merged = Dense(
        units=hp.Choice('dense_units', [50, 100, 150]),
        activation="relu",
        kernel_regularizer=l2(0.001)
    )(merged)
    merged = Dropout(hp.Choice('dropout_rate_dense', [0.1, 0.2, 0.3]))(merged)
    output = Dense(1)(merged)

    model = Model(inputs, output)
    model.compile(
        optimizer=Adam(learning_rate=hp.Choice('learning_rate', [0.001, 0.0005, 0.0001])),
        loss="mse",
        metrics=["mae"]
    )
    return model

# ----------------------------------------------------
# Main training pipeline (Version 2025-03-16)
# ----------------------------------------------------

# Where your *_X_train.npy, *_y_train.npy, etc. are located:
data_dir = "/content/drive/MyDrive/StockData"

# We'll look for files named e.g. AAPL_X_train.npy, AMZN_X_train.npy, etc.
train_files = glob.glob(os.path.join(data_dir, "*_X_train.npy"))

for xtrain_path in train_files:
    # Example: xtrain_path = "/content/drive/MyDrive/StockData/AAPL_X_train.npy"
    filename = os.path.basename(xtrain_path)  # e.g. "AAPL_X_train.npy"
    ticker = filename.split("_")[0]           # e.g. "AAPL"

    # Construct paths for y_train, X_val, y_val, X_test, y_test
    ytrain_path = os.path.join(data_dir, f"{ticker}_y_train.npy")
    xval_path   = os.path.join(data_dir, f"{ticker}_X_val.npy")
    yval_path   = os.path.join(data_dir, f"{ticker}_y_val.npy")
    xtest_path  = os.path.join(data_dir, f"{ticker}_X_test.npy")
    ytest_path  = os.path.join(data_dir, f"{ticker}_y_test.npy")

    # Make sure all files exist
    needed_files = [ytrain_path, xval_path, yval_path, xtest_path, ytest_path]
    if not all(os.path.exists(p) for p in needed_files):
        print(f"⚠️ Missing some .npy files for {ticker}. Skipping.")
        continue

    # Load the data
    X_train = np.load(xtrain_path)
    y_train = np.load(ytrain_path)
    X_val   = np.load(xval_path)
    y_val   = np.load(yval_path)
    X_test  = np.load(xtest_path)
    y_test  = np.load(ytest_path)

    # Check shapes
    if X_train.ndim == 2:
        X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
        X_val   = X_val.reshape((X_val.shape[0], 1, X_val.shape[1]))
        X_test  = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

    input_shape = (X_train.shape[1], X_train.shape[2])
    print(f"\n===== Training Ensemble Model for {ticker} =====")
    print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
    print(f"X_val:   {X_val.shape},   y_val:   {y_val.shape}")
    print(f"X_test:  {X_test.shape},  y_test:  {y_test.shape}")

    # Load target scaler for inverse transform of predictions (optional)
    # Adjust if your naming is different. For example, if you saved them as {ticker}_YYYYMMDD_target_scaler.pkl
    # We'll do a simple approach: pick the most recent target scaler for that ticker.
    scaler_search = glob.glob(os.path.join("/content/drive/MyDrive/StockScalers", f"{ticker}_*_target_scaler.pkl"))
    if not scaler_search:
        print(f"⚠️ No target scaler found for {ticker}. Will skip inverse scaling.")
        scaler_y = None
    else:
        # Just pick the last (or only) match
        scaler_search.sort()
        scaler_y_path = scaler_search[-1]
        scaler_y = joblib.load(scaler_y_path)
        print(f"Loaded target scaler for {ticker} from: {scaler_y_path}")

    # Create a folder for saving the model and outputs
    model_folder = os.path.join(data_dir, f"BestEnsembleModel_{ticker}")
    os.makedirs(model_folder, exist_ok=True)

    # Hyperparameter Tuning
    best_hps_file = os.path.join(model_folder, "best_hyperparameters.json")
    tuning_flag_file = os.path.join(model_folder, "hp_tuning_complete.flag")

    def model_builder(hp):
        return build_ensemble_model(hp, input_shape=input_shape)

    if not os.path.exists(tuning_flag_file):
        # If not tuned yet, run tuner
        if os.path.exists(best_hps_file):
            os.remove(best_hps_file)
        print(f"🔍 Hyperparameter tuning for {ticker}...")
        tuner = kt.BayesianOptimization(
            model_builder,
            objective="val_loss",
            max_trials=15,
            executions_per_trial=2,
            directory=os.path.join(model_folder, "tuning"),
            project_name=f"ensemble_{ticker}"
        )
        tuner.search(X_train, y_train, epochs=50, validation_data=(X_val, y_val), verbose=1)
        best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
        best_hps_dict = {param: best_hps.get(param) for param in best_hps.values.keys()}
        with open(best_hps_file, "w") as f:
            json.dump(best_hps_dict, f)
        with open(tuning_flag_file, "w") as f:
            f.write("tuning complete")
        model = tuner.hypermodel.build(best_hps)
    else:
        # Already tuned; load best hyperparameters
        print(f"✅ Loading best hyperparameters for {ticker} from file...")
        with open(best_hps_file, "r") as f:
            best_hps_dict = json.load(f)
        best_hps = kt.HyperParameters()
        for key, value in best_hps_dict.items():
            best_hps.Fixed(key, value)
        model = build_ensemble_model(best_hps, input_shape)

    print("Best hyperparameters:", best_hps_dict)

    # Train the Model
    BATCH_SIZE = 32
    early_stop = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)
    history = model.fit(
        X_train, y_train,
        epochs=500,
        batch_size=BATCH_SIZE,
        validation_data=(X_val, y_val),
        callbacks=[early_stop],
        verbose=1
    )

    best_model_path = os.path.join(model_folder, "best_ensemble_model.keras")
    model.save(best_model_path)
    print(f"✅ Best Ensemble Model for {ticker} saved to {best_model_path}")

    # Plot training history
    plt.figure(figsize=(12, 6))
    plt.plot(history.history['loss'], label='Train Loss', color='blue')
    plt.plot(history.history['val_loss'], label='Validation Loss', color='red')
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title(f"{ticker} - Training & Validation Loss")
    plt.legend()
    history_plot_path = os.path.join(model_folder, "training_history.png")
    plt.savefig(history_plot_path)
    plt.close()
    print(f"✅ Training history graph saved to {history_plot_path}")

    # Evaluate the model on the test set
    loss, mae = model.evaluate(X_test, y_test, verbose=0)
    print(f"Test Loss: {loss:.4f}, Test MAE: {mae:.4f}")

    predictions = model.predict(X_test)
    if scaler_y is not None:
        predictions_rescaled = inverse_transform_single_feature(scaler_y, predictions)
        y_test_rescaled = inverse_transform_single_feature(scaler_y, y_test)
    else:
        predictions_rescaled = predictions.flatten()
        y_test_rescaled = y_test.flatten()

    # Calculate Directional Accuracy
    correct_direction = 0
    for i in range(len(y_test_rescaled)-1):
        if (y_test_rescaled[i+1]-y_test_rescaled[i])*(predictions_rescaled[i+1]-predictions_rescaled[i]) >= 0:
            correct_direction += 1
    directional_accuracy = (correct_direction / (len(y_test_rescaled)-1)) * 100 if len(y_test_rescaled) > 1 else 0
    print(f"Directional Accuracy: {directional_accuracy:.2f}%")

    # Plot predicted vs actual with directional coloring
    x_vals = np.arange(len(y_test_rescaled))
    plt.figure(figsize=(12, 6))
    plt.plot(x_vals, y_test_rescaled, label="Actual Price", color="blue")
    for i in range(len(x_vals)-1):
        color = 'green' if (y_test_rescaled[i+1]-y_test_rescaled[i])*(predictions_rescaled[i+1]-predictions_rescaled[i]) >= 0 else 'red'
        plt.plot(x_vals[i:i+2], predictions_rescaled[i:i+2], color=color)
    blue_line = Line2D([0], [0], color='blue', label='Actual Price')
    green_line = Line2D([0], [0], color='green', label='Predicted (Correct Dir)')
    red_line = Line2D([0], [0], color='red', label='Predicted (Wrong Dir)')
    plt.legend(handles=[blue_line, green_line, red_line])
    plt.xlabel("Time")
    plt.ylabel("Price")
    plt.title(f"{ticker} - Predicted vs Actual")
    pred_plot_path = os.path.join(model_folder, "pred_vs_actual.png")
    plt.savefig(pred_plot_path)
    plt.close()
    print(f"✅ Prediction vs Actual plot saved to {pred_plot_path}")

    # Save performance stats
    stats_filepath = os.path.join(model_folder, "model_performance.txt")
    with open(stats_filepath, "w") as f:
        f.write(f"Test Loss: {loss:.4f}\n")
        f.write(f"Test MAE: {mae:.4f}\n")
        f.write(f"Directional Accuracy: {directional_accuracy:.2f}%\n")
    print(f"✅ Model performance stats saved to {stats_filepath}")

    print(f"🎉 Finished training for {ticker}!\n")


Trial 15 Complete [00h 03m 31s]
val_loss: 0.015526477713137865

Best val_loss So Far: 0.00865392992272973
Total elapsed time: 00h 54m 18s
Best hyperparameters: {'cnn_filters': 128, 'cnn_kernel_size': 7, 'rnn_units': 75, 'dropout_rate': 0.1, 'rnn_units_2': 100, 'dropout_rate_2': 0.05, 'lstm_units': 100, 'lstm_units_2': 50, 'dropout_rate_lstm': 0.2, 'dense_units': 100, 'dropout_rate_dense': 0.1, 'learning_rate': 0.001}
Epoch 1/500
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 270ms/step - loss: 0.2094 - mae: 0.2479 - val_loss: 0.1195 - val_mae: 0.1025
Epoch 2/500
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 118ms/step - loss: 0.1146 - mae: 0.0834 - val_loss: 0.1012 - val_mae: 0.0639
Epoch 3/500
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 118ms/step - loss: 0.1013 - mae: 0.0731 - val_loss: 0.0966 - val_mae: 0.0986
Epoch 4/500
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 124ms/step - loss: 0.0897 - mae: 0.0613 - va

In [12]:
import os
import numpy as np
import datetime
import pandas as pd
import joblib
import requests
import time

def daily_data_pipeline(ticker, date, model, scaler, sequence_length=60):
    """
    Pipeline to:
      1. Fetch the latest daily stock data (with sentiment and technical indicators).
      2. Preprocess and calculate technical indicators.
      3. Scale using the saved scaler.
      4. Build a rolling window sequence and predict.
    """
    # Set your Polygon API key from environment variable
    api_key = os.environ.get("POLYGON_API_KEY", "YOUR_POLYGON_API_KEY")
    if api_key == "YOUR_POLYGON_API_KEY":
        print("Please set your Polygon API key in the environment variable POLYGON_API_KEY")
        return None

    # Fetch stock data for the day (using the raw data function)
    url = f"https://api.polygon.io/v2/aggs/ticker/{ticker}/range/1/day/{date}/{date}?apiKey={api_key}"
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Error fetching data for {ticker} on {date}: {response.text}")
        return None
    data = response.json()
    if "results" not in data:
        print(f"No stock data available for {ticker} on {date}.")
        return None
    df_stock = pd.DataFrame(data["results"])
    df_stock["Date"] = pd.to_datetime(df_stock["t"], unit="ms").dt.date
    df_stock.rename(columns={"o": "Open", "h": "High", "l": "Low", "c": "Close", "v": "Volume"}, inplace=True)
    df_stock = df_stock[["Date", "Open", "High", "Low", "Close", "Volume"]]

    # (Optionally) Fetch and process sentiment data for the day...
    # For simplicity, assume neutral sentiment if not available.
    df_stock["sentiment_polarity"] = 0
    df_stock["sentiment_subjectivity"] = 0

    # Calculate technical indicators
    df_stock = calculate_technical_indicators(df_stock)

    # In a production setting, you would build your sequence using the previous (historical) data.
    # Here we simulate a rolling window by reading a stored scaled CSV file and appending the new day.
    historical_filepath = f"/content/drive/MyDrive/StockData/{ticker}_processed.csv"
    if not os.path.exists(historical_filepath):
        print("Historical data file not found for", ticker)
        return None
    df_hist = pd.read_csv(historical_filepath, parse_dates=["Date"])
    df_hist = calculate_technical_indicators(df_hist)

    # Append today's data and then scale using the saved scaler
    df_combined = pd.concat([df_hist, df_stock], ignore_index=True)
    df_scaled = df_combined.copy()
    # Identify columns to scale (same as in scaling function)
    numeric_cols = [col for col in df_scaled.select_dtypes(include=['float64','int64']).columns
                    if col not in ['sentiment_polarity', 'sentiment_subjectivity']]
    df_scaled[numeric_cols] = scaler.transform(df_scaled[numeric_cols])

    # Build sequence: take the last 'sequence_length' rows for features.
    try:
        # Define the feature set for prediction (adjust column names as needed)
        feature_cols = ['Open', 'High', 'Low', 'Close', 'Volume',
                        'SMA_10', 'SMA_20', 'EMA_10', 'EMA_20', 'RSI']
        seq = df_scaled[feature_cols].tail(sequence_length).values
    except Exception as e:
        print("Error building feature sequence:", e)
        return None

    if seq.shape[0] < sequence_length:
        pad = np.tile(seq[0], (sequence_length - seq.shape[0], 1))
        seq = np.vstack([pad, seq])
    seq = seq.reshape(1, sequence_length, len(feature_cols))

    prediction = model.predict(seq)
    print(f"Predicted price for {ticker} on {date}: {prediction[0][0]}")
    return prediction[0][0]

# Example usage for daily deployment:
# Load your trained model and the saved scaler (for the target features, if different, adjust accordingly)
# Assume best_model has been loaded and scaler was saved in "/content/drive/MyDrive/StockScalers/{ticker}_scaler.pkl"
ticker = "AAPL"
today_date = datetime.date.today().strftime("%Y-%m-%d")
scaler_path = f"/content/drive/MyDrive/StockScalers/{ticker}_scaler.pkl"
if os.path.exists(scaler_path):
    scaler = joblib.load(scaler_path)
else:
    print("Scaler file not found for", ticker)
    scaler = None

# Load the model (or use the one from training)
from tensorflow.keras.models import load_model
model_path = f"/content/drive/MyDrive/StockModels/Ensemble/best_ensemble_model.keras"
if os.path.exists(model_path):
    deployed_model = load_model(model_path)
else:
    print("Model file not found at", model_path)
    deployed_model = None

if scaler is not None and deployed_model is not None:
    daily_prediction = daily_data_pipeline(ticker, today_date, deployed_model, scaler)


ValueError: The `{arg_name}` of this `Lambda` layer is a Python lambda. Deserializing it is unsafe. If you trust the source of the config artifact, you can override this error by passing `safe_mode=False` to `from_config()`, or calling `keras.config.enable_unsafe_deserialization().