<a href="https://colab.research.google.com/github/NathanDietrich/Iron-Knight-Investments/blob/main/DailyRawData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import requests
import datetime
import pandas as pd
import time
from textblob import TextBlob

def fetch_recent_stock_data_polygon(ticker, lookback_days, api_key):
    """
    Fetch roughly ``lookback_days`` calendar days of daily bars from Polygon.io.

    The requested window ends on today's date (taken from the local clock),
    so a partial bar for today is included whenever Polygon already serves one.

    Args:
        ticker: Symbol to query, e.g. "AAPL".
        lookback_days: Number of calendar days before today to start from.
        api_key: Polygon.io API key.

    Returns:
        A DataFrame with columns Date, Open, High, Low, Close, Volume sorted
        ascending by Date (Date holds ``datetime.date`` objects derived from
        the bar's millisecond timestamp ``t``), or None when the request
        fails or Polygon returns no bars.
    """
    end_date_dt = datetime.date.today()  # "Today"
    start_date_dt = end_date_dt - datetime.timedelta(days=lookback_days)

    url = (
        f"https://api.polygon.io/v2/aggs/ticker/{ticker}/range/1/day/"
        f"{start_date_dt:%Y-%m-%d}/{end_date_dt:%Y-%m-%d}"
    )
    try:
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(url, params={"apiKey": api_key}, timeout=30)
    except requests.RequestException as exc:
        # Only the exception type is printed: the full message can echo the
        # request URL, which carries the API key.
        print(f"Error fetching stock data: {type(exc).__name__}")
        return None

    if response.status_code != 200:
        print(f"Error fetching stock data: {response.text}")
        return None

    try:
        data = response.json()
    except ValueError:
        print("Error fetching stock data: response was not valid JSON")
        return None

    # "results" may be absent OR present-but-empty; both mean there are no
    # bars, and an empty frame would otherwise fail on the missing "t" column.
    results = data.get("results")
    if not results:
        print("No results found in Polygon data.")
        return None

    df = pd.DataFrame(results)
    df["Date"] = pd.to_datetime(df["t"], unit="ms").dt.date
    df = df.rename(
        columns={"o": "Open", "h": "High", "l": "Low", "c": "Close", "v": "Volume"}
    )
    df = df[["Date", "Open", "High", "Low", "Close", "Volume"]]
    return df.sort_values("Date").reset_index(drop=True)


def fetch_recent_sentiment_polygon(ticker, lookback_days, api_key, limit=1000):
    """
    Fetch Polygon.io news for ~``lookback_days`` and average TextBlob sentiment per day.

    The window ends on today's date (local clock) and is queried in chunks of
    at most 30 days. Each article's title and description are scored with
    TextBlob, and polarity/subjectivity are averaged per publication date.

    Args:
        ticker: Symbol whose news is requested.
        lookback_days: Number of calendar days before today to start from.
        api_key: Polygon.io API key.
        limit: Page size sent to the news endpoint.

    Returns:
        A DataFrame with columns Date (``datetime.date``), sentiment_polarity
        and sentiment_subjectivity, one row per publication date. When no
        article is retrieved the frame is empty but still has these columns,
        so it can be merged on "Date".
    """
    url = "https://api.polygon.io/v2/reference/news"
    final_end_date = datetime.date.today()
    current_start_date = final_end_date - datetime.timedelta(days=lookback_days)

    articles = []
    seen_ids = set()

    while current_start_date < final_end_date:
        chunk_end_date = min(
            current_start_date + datetime.timedelta(days=30), final_end_date
        )
        request_url = url
        request_params = {
            "ticker": ticker,
            "published_utc.gte": current_start_date.strftime("%Y-%m-%d"),
            "published_utc.lte": chunk_end_date.strftime("%Y-%m-%d"),
            "apiKey": api_key,
            "limit": limit,
        }

        while request_url:
            try:
                response = requests.get(request_url, params=request_params, timeout=30)
            except requests.RequestException as exc:
                # Print only the type: the message may echo the URL with the key.
                print(f"⚠️ Error fetching sentiment data: {type(exc).__name__}")
                break
            if response.status_code != 200:
                print(f"⚠️ Error fetching sentiment data: {response.status_code}, {response.text}")
                break
            try:
                data = response.json()
            except ValueError:
                print("⚠️ Error fetching sentiment data: response was not valid JSON")
                break

            for article in data.get("results") or []:
                # Consecutive chunks share their boundary day (both bounds are
                # inclusive), so drop repeats to avoid double-weighting that day.
                article_id = article.get("id")
                if article_id is not None:
                    if article_id in seen_ids:
                        continue
                    seen_ids.add(article_id)
                articles.append(article)

            # Polygon paginates through a ready-made "next_url" that already
            # carries the cursor and filters; only the key has to be re-sent.
            request_url = data.get("next_url")
            request_params = {"apiKey": api_key}

        current_start_date = chunk_end_date
        if current_start_date < final_end_date:
            time.sleep(1.5)  # small delay between chunks

    if not articles:
        return pd.DataFrame({
            "Date": pd.Series(dtype="object"),
            "sentiment_polarity": pd.Series(dtype="float64"),
            "sentiment_subjectivity": pd.Series(dtype="float64"),
        })

    # Score each article; `or ""` keeps missing fields from becoming "None"/"nan".
    records = []
    for article in articles:
        title = article.get("title") or ""
        description = article.get("description") or ""
        sentiment = TextBlob(f"{title} {description}").sentiment
        records.append({
            "published_utc": article.get("published_utc", ""),
            "sentiment_polarity": sentiment.polarity,
            "sentiment_subjectivity": sentiment.subjectivity,
        })
    sentiment_df = pd.DataFrame(records)

    # Unparseable timestamps become NaT and are excluded from the daily means.
    published = pd.to_datetime(sentiment_df["published_utc"], errors="coerce", utc=True)
    sentiment_df["Date"] = published.dt.date
    sentiment_df = sentiment_df[published.notna()]

    daily_sentiment = (
        sentiment_df
        .groupby("Date")[["sentiment_polarity", "sentiment_subjectivity"]]
        .mean()
        .reset_index()
    )
    return daily_sentiment


def assemble_today_prediction_row(ticker, api_key, lookback_days=30):
    """
    Build the single feature row used to predict today's close.

      - 'prev_' columns come from the previous available bar (yesterday's EOD)
      - 'Open_current' comes from today's (possibly partial) bar

    Args:
        ticker: Symbol to query.
        api_key: Polygon.io API key, passed to both fetch helpers.
        lookback_days: Calendar days of history to request.

    Returns:
        A DataFrame with exactly 1 row and the columns
          [prev_Open, prev_High, prev_Low, prev_Close, prev_Volume,
           prev_sentiment_polarity, prev_sentiment_subjectivity, Open_current]
        or None when stock data is missing, fewer than two bars exist, or the
        latest bar is not dated today.
    """
    sentiment_cols = ["sentiment_polarity", "sentiment_subjectivity"]

    # 1) Daily bars (includes yesterday + partial for today if available)
    stock_df = fetch_recent_stock_data_polygon(ticker, lookback_days, api_key)
    if stock_df is None or stock_df.empty:
        print("No stock data to assemble features.")
        return None

    # 2) Daily sentiment for the same period
    sentiment_df = fetch_recent_sentiment_polygon(ticker, lookback_days, api_key)

    # 3) Merge. With no news at all the sentiment frame may be empty and lack
    #    a "Date" column, which would make the merge raise; fall back to the
    #    same neutral 0 that missing sentiment is filled with below.
    if (
        sentiment_df is None
        or sentiment_df.empty
        or not {"Date", *sentiment_cols}.issubset(sentiment_df.columns)
    ):
        merged_df = stock_df.copy()
        for col in sentiment_cols:
            merged_df[col] = 0.0
    else:
        merged_df = pd.merge(
            stock_df, sentiment_df[["Date", *sentiment_cols]], on="Date", how="left"
        )

    # Sort first so the forward-fill always carries older sentiment forward.
    merged_df = merged_df.sort_values("Date").reset_index(drop=True)
    merged_df[sentiment_cols] = merged_df[sentiment_cols].ffill().fillna(0)

    if len(merged_df) < 2:
        # We need at least yesterday + today's row
        print("Not enough data to build a prediction row (need at least 2 days).")
        return None

    # The last row is "today" (partial), the second-to-last is "yesterday"
    today_row = merged_df.iloc[-1]
    yesterday_row = merged_df.iloc[-2]

    # Refuse to build features unless the latest bar really is dated today.
    if today_row["Date"] != datetime.date.today():
        print("Polygon data doesn’t have today's partial bar yet. No row created.")
        return None

    # "prev_" features from yesterday, then today's open.
    features = {
        f"prev_{col}": yesterday_row[col]
        for col in ["Open", "High", "Low", "Close", "Volume", *sentiment_cols]
    }
    features["Open_current"] = today_row["Open"]

    return pd.DataFrame([features])


# Example usage (interactive):
if __name__ == "__main__":
    import getpass

    # Prompt for the key without echoing it, then build the feature row.
    polygon_key = getpass.getpass("Enter your Polygon.io API key: ")
    ticker = "AAPL"
    one_row_df = assemble_today_prediction_row(ticker, polygon_key)

    # Report the outcome: either nothing could be built, or show the row.
    if one_row_df is None:
        print("No row returned.")
    else:
        print("Single-row DataFrame for prediction:")
        print(one_row_df)


Enter your Polygon.io API key: ··········
Polygon data doesn’t have today's partial bar yet. No row created.
No row returned.


Hard-coding the date so that the run falls on a trading day.

In [2]:
import os
import requests
import datetime
import pandas as pd
import time
from textblob import TextBlob

# -------------------------------------
# Stand-in for "today": pinned to March 7, 2025 so the cell can be
# run as if it were that date.
FAKE_TODAY = datetime.date(year=2025, month=3, day=7)
# -------------------------------------

def fetch_recent_stock_data_polygon(ticker, lookback_days, api_key):
    """
    Fetch roughly ``lookback_days`` calendar days of daily bars from Polygon.io.

    The requested window ends on FAKE_TODAY (used in place of
    ``datetime.date.today()``), so that date is treated as 'today'.

    Args:
        ticker: Symbol to query, e.g. "AAPL".
        lookback_days: Number of calendar days before FAKE_TODAY to start from.
        api_key: Polygon.io API key.

    Returns:
        A DataFrame with columns Date, Open, High, Low, Close, Volume sorted
        ascending by Date (Date holds ``datetime.date`` objects derived from
        the bar's millisecond timestamp ``t``), or None when the request
        fails or Polygon returns no bars.
    """
    end_date_dt = FAKE_TODAY
    start_date_dt = end_date_dt - datetime.timedelta(days=lookback_days)

    url = (
        f"https://api.polygon.io/v2/aggs/ticker/{ticker}/range/1/day/"
        f"{start_date_dt:%Y-%m-%d}/{end_date_dt:%Y-%m-%d}"
    )
    try:
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(url, params={"apiKey": api_key}, timeout=30)
    except requests.RequestException as exc:
        # Only the exception type is printed: the full message can echo the
        # request URL, which carries the API key.
        print(f"Error fetching stock data: {type(exc).__name__}")
        return None

    if response.status_code != 200:
        print(f"Error fetching stock data: {response.text}")
        return None

    try:
        data = response.json()
    except ValueError:
        print("Error fetching stock data: response was not valid JSON")
        return None

    # "results" may be absent OR present-but-empty; both mean there are no
    # bars, and an empty frame would otherwise fail on the missing "t" column.
    results = data.get("results")
    if not results:
        print("No results found in Polygon data.")
        return None

    df = pd.DataFrame(results)
    df["Date"] = pd.to_datetime(df["t"], unit="ms").dt.date
    df = df.rename(
        columns={"o": "Open", "h": "High", "l": "Low", "c": "Close", "v": "Volume"}
    )
    df = df[["Date", "Open", "High", "Low", "Close", "Volume"]]
    return df.sort_values("Date").reset_index(drop=True)


def fetch_recent_sentiment_polygon(ticker, lookback_days, api_key, limit=1000):
    """
    Fetch Polygon.io news for ~``lookback_days`` and average TextBlob sentiment per day.

    The window ends on FAKE_TODAY (used in place of the real current date)
    and is queried in chunks of at most 30 days. Each article's title and
    description are scored with TextBlob, and polarity/subjectivity are
    averaged per publication date.

    Args:
        ticker: Symbol whose news is requested.
        lookback_days: Number of calendar days before FAKE_TODAY to start from.
        api_key: Polygon.io API key.
        limit: Page size sent to the news endpoint.

    Returns:
        A DataFrame with columns Date (``datetime.date``), sentiment_polarity
        and sentiment_subjectivity, one row per publication date. When no
        article is retrieved the frame is empty but still has these columns,
        so it can be merged on "Date".
    """
    url = "https://api.polygon.io/v2/reference/news"
    final_end_date = FAKE_TODAY
    current_start_date = final_end_date - datetime.timedelta(days=lookback_days)

    articles = []
    seen_ids = set()

    while current_start_date < final_end_date:
        chunk_end_date = min(
            current_start_date + datetime.timedelta(days=30), final_end_date
        )
        request_url = url
        request_params = {
            "ticker": ticker,
            "published_utc.gte": current_start_date.strftime("%Y-%m-%d"),
            "published_utc.lte": chunk_end_date.strftime("%Y-%m-%d"),
            "apiKey": api_key,
            "limit": limit,
        }

        while request_url:
            try:
                response = requests.get(request_url, params=request_params, timeout=30)
            except requests.RequestException as exc:
                # Print only the type: the message may echo the URL with the key.
                print(f"⚠️ Error fetching sentiment data: {type(exc).__name__}")
                break
            if response.status_code != 200:
                print(f"⚠️ Error fetching sentiment data: {response.status_code}, {response.text}")
                break
            try:
                data = response.json()
            except ValueError:
                print("⚠️ Error fetching sentiment data: response was not valid JSON")
                break

            for article in data.get("results") or []:
                # Consecutive chunks share their boundary day (both bounds are
                # inclusive), so drop repeats to avoid double-weighting that day.
                article_id = article.get("id")
                if article_id is not None:
                    if article_id in seen_ids:
                        continue
                    seen_ids.add(article_id)
                articles.append(article)

            # Polygon paginates through a ready-made "next_url" that already
            # carries the cursor and filters; only the key has to be re-sent.
            request_url = data.get("next_url")
            request_params = {"apiKey": api_key}

        current_start_date = chunk_end_date
        if current_start_date < final_end_date:
            time.sleep(1.5)  # small delay between chunks

    if not articles:
        return pd.DataFrame({
            "Date": pd.Series(dtype="object"),
            "sentiment_polarity": pd.Series(dtype="float64"),
            "sentiment_subjectivity": pd.Series(dtype="float64"),
        })

    # Score each article; `or ""` keeps missing fields from becoming "None"/"nan".
    records = []
    for article in articles:
        title = article.get("title") or ""
        description = article.get("description") or ""
        sentiment = TextBlob(f"{title} {description}").sentiment
        records.append({
            "published_utc": article.get("published_utc", ""),
            "sentiment_polarity": sentiment.polarity,
            "sentiment_subjectivity": sentiment.subjectivity,
        })
    sentiment_df = pd.DataFrame(records)

    # Unparseable timestamps become NaT and are excluded from the daily means.
    published = pd.to_datetime(sentiment_df["published_utc"], errors="coerce", utc=True)
    sentiment_df["Date"] = published.dt.date
    sentiment_df = sentiment_df[published.notna()]

    daily_sentiment = (
        sentiment_df
        .groupby("Date")[["sentiment_polarity", "sentiment_subjectivity"]]
        .mean()
        .reset_index()
    )
    return daily_sentiment


def assemble_today_prediction_row(ticker, api_key, lookback_days=30):
    """
    Build the single feature row used to predict FAKE_TODAY's close.

      - 'prev_' columns come from the previous available bar (YESTERDAY's EOD)
      - 'Open_current' comes from FAKE_TODAY's (possibly partial) bar

    Args:
        ticker: Symbol to query.
        api_key: Polygon.io API key, passed to both fetch helpers.
        lookback_days: Calendar days of history to request.

    Returns:
        A DataFrame with exactly 1 row and the columns
          [prev_Open, prev_High, prev_Low, prev_Close, prev_Volume,
           prev_sentiment_polarity, prev_sentiment_subjectivity, Open_current]
        or None when stock data is missing, fewer than two bars exist, or the
        latest bar is not dated FAKE_TODAY.
    """
    sentiment_cols = ["sentiment_polarity", "sentiment_subjectivity"]

    # 1) Daily bars (includes yesterday + partial for FAKE_TODAY if available)
    stock_df = fetch_recent_stock_data_polygon(ticker, lookback_days, api_key)
    if stock_df is None or stock_df.empty:
        print("No stock data to assemble features.")
        return None

    # 2) Daily sentiment for the same period
    sentiment_df = fetch_recent_sentiment_polygon(ticker, lookback_days, api_key)

    # 3) Merge. With no news at all the sentiment frame may be empty and lack
    #    a "Date" column, which would make the merge raise; fall back to the
    #    same neutral 0 that missing sentiment is filled with below.
    if (
        sentiment_df is None
        or sentiment_df.empty
        or not {"Date", *sentiment_cols}.issubset(sentiment_df.columns)
    ):
        merged_df = stock_df.copy()
        for col in sentiment_cols:
            merged_df[col] = 0.0
    else:
        merged_df = pd.merge(
            stock_df, sentiment_df[["Date", *sentiment_cols]], on="Date", how="left"
        )

    # Sort first so the forward-fill always carries older sentiment forward.
    merged_df = merged_df.sort_values("Date").reset_index(drop=True)
    merged_df[sentiment_cols] = merged_df[sentiment_cols].ffill().fillna(0)

    if len(merged_df) < 2:
        # We need at least yesterday + 'FAKE_TODAY' row
        print("Not enough data to build a prediction row (need at least 2 days).")
        return None

    # The last row is "FAKE_TODAY" (partial), the second-to-last is "YESTERDAY"
    today_row = merged_df.iloc[-1]
    yesterday_row = merged_df.iloc[-2]

    # Refuse to build features unless the latest bar really is dated FAKE_TODAY.
    if today_row["Date"] != FAKE_TODAY:
        print("Polygon data does not have a row for FAKE_TODAY. No row created.")
        return None

    # "prev_" features from yesterday, then FAKE_TODAY's open.
    features = {
        f"prev_{col}": yesterday_row[col]
        for col in ["Open", "High", "Low", "Close", "Volume", *sentiment_cols]
    }
    features["Open_current"] = today_row["Open"]

    return pd.DataFrame([features])


# Example usage (interactive):
if __name__ == "__main__":
    import getpass

    # Prompt for the key without echoing it, then build the feature row
    # for the pinned date.
    polygon_key = getpass.getpass("Enter your Polygon.io API key: ")
    ticker = "AAPL"
    one_row_df = assemble_today_prediction_row(ticker, polygon_key)

    # Report the outcome: either nothing could be built, or show the row.
    if one_row_df is None:
        print("No row returned.")
    else:
        print("Single-row DataFrame for prediction (FAKE 3/7/2025):")
        print(one_row_df)


Enter your Polygon.io API key: ··········
Single-row DataFrame for prediction (FAKE 3/7/2025):
   prev_Open  prev_High  prev_Low  prev_Close  prev_Volume  \
0    234.435     237.86  233.1581      235.33   43505844.0   

   prev_sentiment_polarity  prev_sentiment_subjectivity  Open_current  
0                -0.102431                     0.439699       235.105  
