<a href="https://colab.research.google.com/github/NathanDietrich/Artificial-Intelligence-and-Machine-Learning-portfolio/blob/main/BothDatas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook gathers news sentiment and historical stock data (with technical indicators) for a user-selected stock and time period. It also preprocesses the data (everything except dropping the date column), saves the result to an output CSV file, and saves the fitted scaler to a .pkl file for later use.


In [None]:
!pip install yfinance textblob ta

Collecting ta
  Downloading ta-0.11.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: ta
  Building wheel for ta (setup.py) ... [?25l[?25hdone
  Created wheel for ta: filename=ta-0.11.0-py3-none-any.whl size=29412 sha256=77d703dd4912a83813e54e6acf42ef9f9907a5255d4a32e7c12a6eba25cc4398
  Stored in directory: /root/.cache/pip/wheels/a1/d7/29/7781cc5eb9a3659d032d7d15bdd0f49d07d2b24fec29f44bc4
Successfully built ta
Installing collected packages: ta
Successfully installed ta-0.11.0


In [4]:
import requests
import datetime
import time
import numpy as np
import pandas as pd
import yfinance as yf
import joblib
from textblob import TextBlob
from ta.volatility import AverageTrueRange, BollingerBands, DonchianChannel, KeltnerChannel
from sklearn.preprocessing import MinMaxScaler
from ta.momentum import WilliamsRIndicator
from google.colab import userdata

def get_historical_news_chunked(ticker, start_date, end_date, api_key, limit=1000):
    """
    Fetch historical news for a ticker from Polygon in chunks of up to 30 days.

    The date range is walked in 30-day windows. Within each window every
    result page is retrieved by following the ``next_url`` link contained in
    the API response. A pause is inserted between consecutive requests to stay
    under the 5 requests/minute rate limit.

    Args:
        ticker: Ticker symbol sent as the ``ticker`` query parameter.
        start_date: Start of the range as a ``YYYY-MM-DD`` string.
        end_date: End of the range as a ``YYYY-MM-DD`` string.
        api_key: Polygon API key sent as the ``apiKey`` query parameter.
        limit: Maximum number of results requested per page.

    Returns:
        A list with the raw article dicts from every successfully fetched
        page. The list is empty when ``start_date`` is not earlier than
        ``end_date`` (no request is made in that case).

    Raises:
        ValueError: If either date string does not match ``YYYY-MM-DD``.
    """
    url = "https://api.polygon.io/v2/reference/news"
    # 5 requests/min means one request every 12 s; 14 s leaves a small margin.
    request_interval = 14
    all_results = []
    # Consecutive chunks share their boundary date (lte of one == gte of the
    # next), so the same article may be returned twice; track ids to skip it.
    seen_ids = set()

    current_start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
    final_end_date = datetime.datetime.strptime(end_date, "%Y-%m-%d")

    while current_start_date < final_end_date:
        # Each chunk spans at most 30 days and never runs past the final date.
        chunk_end_date = min(
            current_start_date + datetime.timedelta(days=30),
            final_end_date,
        )

        chunk_start_str = current_start_date.strftime("%Y-%m-%d")
        chunk_end_str = chunk_end_date.strftime("%Y-%m-%d")

        print(f"Fetching news from {chunk_start_str} to {chunk_end_str}...")

        request_url = url
        params = {
            "ticker": ticker,
            "published_utc.gte": chunk_start_str,
            "published_utc.lte": chunk_end_str,
            "apiKey": api_key,
            "limit": limit,
        }

        while request_url:
            # A timeout keeps a stalled connection from hanging the run forever.
            response = requests.get(request_url, params=params, timeout=30)
            if response.status_code != 200:
                # Best effort: report the failure and move on to the next chunk.
                print(f"Error: {response.status_code}, {response.text}")
                break

            data = response.json()
            for article in data.get("results") or []:
                article_id = article.get("id")
                if article_id is not None:
                    if article_id in seen_ids:
                        continue
                    seen_ids.add(article_id)
                all_results.append(article)

            # Polygon paginates through a ready-made `next_url`; it already
            # carries the cursor and filters, so only the API key is re-sent.
            request_url = data.get("next_url")
            if request_url:
                params = {"apiKey": api_key}
                # Follow-up pages count against the rate limit as well.
                time.sleep(request_interval)

        # Move to the next chunk
        current_start_date = chunk_end_date

        # Only pause when another request is actually going to follow.
        if current_start_date < final_end_date:
            print(f"Waiting {request_interval} seconds to respect API rate limits...")
            time.sleep(request_interval)

    return all_results

def analyze_sentiment(news_data):
    """
    Compute TextBlob sentiment for each news article.

    The title and description of every article are joined into one text and
    scored with TextBlob.

    Args:
        news_data: Iterable of article dicts. The keys ``title``,
            ``description`` and ``published_utc`` are read; all are optional.

    Returns:
        A list with one dict per article containing ``title``,
        ``description``, ``published_date``, ``sentiment_polarity`` and
        ``sentiment_subjectivity``. Empty input yields an empty list.
    """
    analyzed_data = []
    for article in news_data:
        # `or ""` also covers keys that are present but null; a plain
        # .get(key, "") would return None and the literal text "None"
        # would end up in the analysed string.
        title = article.get("title") or ""
        description = article.get("description") or ""
        # Combine title and description for sentiment analysis
        full_text = f"{title} {description}"
        sentiment = TextBlob(full_text).sentiment

        analyzed_data.append({
            "title": title,
            "description": description,
            "published_date": article.get("published_utc", ""),
            "sentiment_polarity": sentiment.polarity,
            "sentiment_subjectivity": sentiment.subjectivity,
        })
    return analyzed_data

def fetch_and_calculate_technical_indicators(ticker, start_date, end_date):
    """
    Fetch historical stock data using yfinance and calculate technical indicators.

    Args:
        ticker: Ticker symbol passed to ``yf.Ticker``.
        start_date: Start of the history window, passed to ``history(start=...)``.
        end_date: End of the history window, passed to ``history(end=...)``.

    Returns:
        A DataFrame with a ``Date`` column (``datetime.date`` values), the
        ``Open``/``High``/``Low``/``Close``/``Volume`` columns and one column
        per indicator. Rows containing NaN in any column (the warm-up rows of
        the rolling windows) are dropped, so the result can be empty when the
        requested range is short.

    Raises:
        ValueError: If yfinance returns no rows for the ticker and range.

    NOTE(review): several indicators here are simplified, close-only variants
    (e.g. ``CCI`` uses the rolling std of Close, ``Ulcer_Index`` uses absolute
    rather than percentage drawdown). They are kept as-is so previously
    generated feature files stay comparable.
    """
    # Fetch data from Yahoo Finance
    df = yf.Ticker(ticker).history(start=start_date, end=end_date)
    if df.empty:
        # Fail early with a clear message instead of an opaque error from
        # inside the indicator calculations.
        raise ValueError(
            f"No price data returned for {ticker} between {start_date} and {end_date}."
        )

    # Keep only necessary columns; copy so the column assignments below write
    # to an independent frame rather than a slice of the download.
    df = df[['Open', 'High', 'Low', 'Close', 'Volume']].copy()

    high = df['High']
    low = df['Low']
    close = df['Close']

    # Average True Range (ATR)
    df['ATR'] = AverageTrueRange(high=high, low=low, close=close).average_true_range()

    # Bollinger Bands
    bb = BollingerBands(close=close)
    df['BB_High'] = bb.bollinger_hband()
    df['BB_low'] = bb.bollinger_lband()

    # Donchian Channel
    dc = DonchianChannel(high=high, low=low, close=close)
    df['DC_High'] = dc.donchian_channel_hband()
    df['DC_low'] = dc.donchian_channel_lband()

    # Keltner Channel
    kc = KeltnerChannel(high=high, low=low, close=close)
    df['KC_High'] = kc.keltner_channel_hband()
    df['KC_Low'] = kc.keltner_channel_lband()

    # Chaikin Volatility: rolling mean of the daily range over its rolling std
    high_low_range = high - low
    df['Chaikin_volatility'] = (
        high_low_range.rolling(window=10).mean() /
        high_low_range.rolling(window=10).std()
    )

    # Historical Volatility: 30-day std of log returns, annualised with sqrt(252)
    log_returns = np.log(close / close.shift(1))
    df['Historical_volatility'] = log_returns.rolling(window=30).std() * np.sqrt(252)

    # Standard Deviation
    df['Standard_Deviation'] = close.rolling(window=14).std()

    # Williams %R
    wr = WilliamsRIndicator(high=high, low=low, close=close)
    df['Williams_%R'] = wr.williams_r()

    # Commodity Channel Index (CCI), close-only variant
    df['CCI'] = (
        (close - close.rolling(20).mean())
        / (0.015 * close.rolling(20).std())
    )

    # 14-day high-low range of Close. Both features below are built from it,
    # so it is computed once with vectorised rolling max/min instead of two
    # Python-level rolling.apply calls.
    close_range_14 = close.rolling(window=14).max() - close.rolling(window=14).min()

    # RSI-Based Volatility (simple example)
    df['RSI_Based_Volatility'] = close_range_14 / close

    # Ulcer Index
    df['Ulcer_Index'] = (
        (close - close.rolling(window=14).max()) ** 2
    ).rolling(window=14).mean()

    # True Strength Index (TSI) example
    df['TSI'] = (
        log_returns.ewm(span=25).mean() /
        log_returns.abs().ewm(span=13).mean()
    ) * 100

    # Fractal Chaos Oscillator
    df['Fractal_chaos_Oscillator'] = close_range_14

    # Drop any rows with NaN caused by rolling calculations
    df.dropna(inplace=True)

    # Convert index to a column named "Date"
    df.reset_index(inplace=True)
    # Keep only the calendar date so rows can be matched against per-day data
    df['Date'] = pd.to_datetime(df['Date']).dt.date

    return df

def main():
    """
    Run the interactive pipeline: prompt for inputs, fetch data, merge, scale, save.

    Steps:
        1. Read ticker, start date and end date from the user.
        2. Load the Polygon API key from Colab secrets.
        3. Fetch price history and compute technical indicators.
        4. Fetch news and compute per-article sentiment, averaged per day.
        5. Left-merge the daily sentiment onto the stock rows by ``Date``.
        6. Fill missing values with 0 and min-max scale every numeric column
           except the two sentiment columns.
        7. Write the combined frame to
           ``combined_<ticker>_<start>_to_<end>.csv`` and the fitted scaler to
           ``scaler.pkl``.

    When no news is returned, the unscaled stock/indicator frame is written to
    the same CSV name and no scaler is saved.

    Returns:
        None. Problems with the inputs are reported with ``print`` and end the
        run early.
    """
    # === Step 1: Get user inputs ===
    ticker = input("Enter the stock ticker (e.g., TSLA): ").strip().upper()
    start_date = input("Enter start date (YYYY-MM-DD): ").strip()
    end_date = input("Enter end date (YYYY-MM-DD): ").strip()

    # Validate the dates up front so a typo is caught before any network call.
    try:
        parsed_start = datetime.datetime.strptime(start_date, "%Y-%m-%d")
        parsed_end = datetime.datetime.strptime(end_date, "%Y-%m-%d")
    except ValueError:
        print("Invalid date. Please enter dates in YYYY-MM-DD format.")
        return
    if parsed_start >= parsed_end:
        print("Start date must be earlier than end date.")
        return

    # === Step 2: Load Polygon API key from Colab secrets ===
    api_key = userdata.get('Polygon_Key')
    if not api_key:
        print("Polygon API key not found in userdata. Please set it in Colab secrets.")
        return

    # === Step 3: Fetch and process stock data / technical indicators ===
    print(f"\nFetching stock data for {ticker} from {start_date} to {end_date}...")
    stock_df = fetch_and_calculate_technical_indicators(ticker, start_date, end_date)
    if stock_df.empty:
        # The rolling-window warm-up rows are dropped, so a short range can
        # leave nothing to scale; stop here instead of failing in the scaler.
        print("No stock rows left after computing indicators. Try a longer date range.")
        return

    # === Step 4: Fetch news data, perform sentiment analysis ===
    print(f"\nFetching news for {ticker} from {start_date} to {end_date}...")
    news_data = get_historical_news_chunked(ticker, start_date, end_date, api_key, limit=1000)

    final_csv = f"combined_{ticker}_{start_date}_to_{end_date}.csv"

    if not news_data:
        # NOTE: this path writes the raw (unscaled) indicators and no scaler.
        print("No news data found. Proceeding without sentiment data.")
        stock_df.to_csv(final_csv, index=False)
        print(f"Output saved to {final_csv}")
        return

    print("\nPerforming sentiment analysis on fetched news...")
    analyzed_news = analyze_sentiment(news_data)
    sentiment_df = pd.DataFrame(analyzed_news)

    # Convert published_date to datetime, then to just date.
    # Unparseable timestamps become NaT and fall out of the groupby below.
    sentiment_df['published_date'] = pd.to_datetime(sentiment_df['published_date'], errors='coerce')
    sentiment_df['Date'] = sentiment_df['published_date'].dt.date

    # Group sentiment by Date to get daily average (or any other aggregation you prefer)
    daily_sentiment = sentiment_df.groupby('Date').agg({
        'sentiment_polarity': 'mean',
        'sentiment_subjectivity': 'mean'
    }).reset_index()

    # === Step 5: Merge sentiment with stock data ===
    print("\nMerging sentiment data with stock data...")
    combined_df = pd.merge(
        stock_df,
        daily_sentiment,
        on='Date',
        how='left'  # keep all stock rows, match sentiment where available
    )

    # === Step 6: Additional Preprocessing for LSTM/RNN/CNN ===
    print("\nPerforming final preprocessing on the combined data...")

    # (A) Fill any remaining NaN values (e.g., sentiment on days without news)
    combined_df.fillna(0, inplace=True)

    # (B) Scale all numeric columns except 'Date' and sentiment scores
    numeric_cols = combined_df.select_dtypes(include=[np.number]).columns.difference(
        ['sentiment_polarity', 'sentiment_subjectivity']
    )
    scaler = MinMaxScaler()
    combined_df[numeric_cols] = scaler.fit_transform(combined_df[numeric_cols])

    # Save the scaler
    joblib.dump(scaler, 'scaler.pkl')

    # === Step 7: Save final combined data to CSV ===
    combined_df.to_csv(final_csv, index=False)

    print(f"\nAll done! Preprocessed data (scaled technical indicators + daily sentiment) saved to {final_csv} and scaler saved to scaler.pkl")

# Run the interactive pipeline only when this file is executed directly
# (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()


Enter the stock ticker (e.g., TSLA): TSLA
Enter start date (YYYY-MM-DD): 2024-10-10
Enter end date (YYYY-MM-DD): 2025-02-17

Fetching stock data for TSLA from 2024-10-10 to 2025-02-17...

Fetching news for TSLA from 2024-10-10 to 2025-02-17...
Fetching news from 2024-10-10 to 2024-11-09...
Waiting for 1 minute to respect API rate limits...
Fetching news from 2024-11-09 to 2024-12-09...
Waiting for 1 minute to respect API rate limits...
Fetching news from 2024-12-09 to 2025-01-08...
Waiting for 1 minute to respect API rate limits...
Fetching news from 2025-01-08 to 2025-02-07...
Waiting for 1 minute to respect API rate limits...
Fetching news from 2025-02-07 to 2025-02-17...
Waiting for 1 minute to respect API rate limits...

Performing sentiment analysis on fetched news...

Merging sentiment data with stock data...

Performing final preprocessing on the combined data...

All done! Preprocessed data (scaled technical indicators + daily sentiment) saved to combined_TSLA_2024-10-10_to_202