<a href="https://colab.research.google.com/github/NathanDietrich/Artificial-Intelligence-and-Machine-Learning-portfolio/blob/main/polygonsentimentcollection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install ta
import yfinance as yf
import pandas as pd
import numpy as np
from ta.volatility import AverageTrueRange, BollingerBands, DonchianChannel, KeltnerChannel
from ta.momentum import WilliamsRIndicator
#define start and end dates to line up with sentiment collection
start_date = "2022-09-29"
end_date = "2025-01-08"
# Fetch the historical data
tsla = yf.Ticker("TSLA")
data = tsla.history(start = start_date, end = end_date)
data = data[['Open', 'High', 'Low', 'Close', 'Volume']]

# Calculate Technical Indicators
# Average True Range (ATR)
data['ATR'] = AverageTrueRange(high=data['High'], low=data['Low'], close=data['Close']).average_true_range()

#bollinger Bands
bb = BollingerBands(close=data['Close'])
data['BB_High'] = bb.bollinger_hband()
data['BB_low'] = bb.bollinger_lband()

# Donichian Channel
dc = DonchianChannel(high=data['High'], low=data['Low'], close=data['Close'])
data['DC_High'] = dc.donchian_channel_hband()
data['DC_low'] = dc.donchian_channel_lband()

# Keltner Channel
kc = KeltnerChannel(high=data['High'], low=data['Low'], close=data['Close'])
data['KC_High'] = kc.keltner_channel_hband()
data['KC_Low'] = kc.keltner_channel_lband()

#Chaikin Volatility
high_low_range = data['High'] - data['Low']
data['Chaikin_volatility'] = high_low_range.rolling(window=10).mean() / high_low_range.rolling(window=10).std()

#Historical Volatility
log_returns = np.log(data['Close'] / data['Close'].shift(1))
data['Historical_volatility'] = log_returns.rolling(window=30).std() * np.sqrt(252)

# Standard Deviation
data['Standard_Deviation'] = data['Close']. rolling(window=14).std()

# Williams %R
wr = WilliamsRIndicator(high=data['High'], low=data['Low'], close=data['Close'])
data['Williams_%R'] = wr.williams_r()

# Commodity Channel Index(CCI)
data['CCI'] = (data['Close'] - data['Close'].rolling(20).mean()) / (0.015 * data['Close'].rolling(20).std())

# RSI-Based Volatility
rsi_diff = data['Close'].rolling(window=14).apply(lambda x: max(x) - min(x))
data['RSI_Based_Volatility'] = rsi_diff / data['Close']

#Ulcer Index
data['Ulcer_Index'] = ((data['Close'] - data['Close'].rolling(window=14).max()) ** 2).rolling(window=14).mean()

# True Strength Index (TSI)
data['TSI'] = (log_returns.ewm(span=25).mean() / log_returns.abs().ewm(span=13).mean()) * 100

# Fractal Chaos Oscillator
data['Fractal_chaos_Oscillator'] = data['Close'].rolling(window=14).apply(lambda x: np.ptp(x))

# Drop NAN values
data.dropna(inplace=True)

#save data to a csv
output_path = f"tsla_with_technical_indicators_{start_date}_to_{end_date}.csv"
data.to_csv(output_path)

print(f"Data with technical indicators saved to {output_path}")
print(data)

Sentiment analysis completed. Results saved to TSLA_news_sentiment_2021-02-02_to_2025-01-15.csv


In [None]:
print

this program collects news data from polygon api and collects 1000 articles in chunks of 1 month but can only have 5 api requests per minute

In [None]:
import requests
from textblob import TextBlob
import pandas as pd
import datetime
import time
from google.colab import userdata

def get_historical_news_chunked(ticker, start_date, end_date, api_key, limit=1000):
    url = f"https://api.polygon.io/v2/reference/news"
    all_results = []

    current_start_date = datetime.datetime.strptime(start_date, "%Y-%m-%d")
    final_end_date = datetime.datetime.strptime(end_date, "%Y-%m-%d")

    while current_start_date < final_end_date:
        # Calculate the chunk end date (1 months from the start date)
        chunk_end_date = current_start_date + datetime.timedelta(days=1 * 30)
        if chunk_end_date > final_end_date:
            chunk_end_date = final_end_date

        chunk_start_str = current_start_date.strftime("%Y-%m-%d")
        chunk_end_str = chunk_end_date.strftime("%Y-%m-%d")

        print(f"Fetching news from {chunk_start_str} to {chunk_end_str}...")

        params = {
            "ticker": ticker,
            "published_utc.gte": chunk_start_str,
            "published_utc.lte": chunk_end_str,
            "apiKey": api_key,
            "limit": limit
        }

        while True:
            response = requests.get(url, params=params)
            if response.status_code == 200:
                data = response.json()
                results = data.get("results", [])
                all_results.extend(results)

                # Check for pagination
                next_cursor = data.get("next_cursor")
                if not next_cursor:
                    break

                # Update the cursor for the next request
                params["cursor"] = next_cursor
            else:
                print(f"Error: {response.status_code}, {response.text}")
                break

        # Move to the next chunk
        current_start_date = chunk_end_date

        # Pause once after processing the entire chunk
        print("Waiting for 1 minute to respect API rate limits...")
        time.sleep(14)

    return all_results

def analyze_sentiment(news_data):
    analyzed_data = []

    for article in news_data:
        title = article.get("title", "")
        description = article.get("description", "")

        # Combine title and description
        full_text = f"{title} {description}"
        sentiment = TextBlob(full_text).sentiment

        analyzed_data.append({
            "title": title,
            "description": description,
            "published_date": article.get("published_utc", ""),
            "sentiment_polarity": sentiment.polarity,
            "sentiment_subjectivity": sentiment.subjectivity
        })

    return analyzed_data

def main():
    # API config
    api_key = userdata.get('Polygon_Key')
    ticker = "TSLA"
    start_date = (datetime.datetime.now() - datetime.timedelta(days=2.5 * 365)).strftime("%Y-%m-%d")
    end_date = datetime.datetime.now().strftime("%Y-%m-%d")  # 2.5 years backward from today

    # Fetch news in chunks
    news_data = get_historical_news_chunked(ticker, start_date, end_date, api_key, limit=1000)

    if not news_data:
        print("No news data found.")
        return

    # Perform sentiment analysis
    analyzed_news = analyze_sentiment(news_data)

    # Convert to DataFrame for better visualization
    df = pd.DataFrame(analyzed_news)

    # Sort the data by published_date
    df['published_date'] = pd.to_datetime(df['published_date'], errors='coerce')
    df = df.sort_values(by='published_date')

    # Save to CSV file
    csv_file = f"{ticker}_news_sentiment_{start_date}_to_{end_date}.csv"
    df.to_csv(csv_file, index=False)

    print(f"Sentiment analysis completed. Results saved to {csv_file}")

if __name__ == "__main__":
    main()

Fetching news from 2022-07-26 to 2022-08-25...
Waiting for 1 minute to respect API rate limits...
Fetching news from 2022-08-25 to 2022-09-24...
Waiting for 1 minute to respect API rate limits...
Fetching news from 2022-09-24 to 2022-10-24...
Waiting for 1 minute to respect API rate limits...
Fetching news from 2022-10-24 to 2022-11-23...
Waiting for 1 minute to respect API rate limits...
Fetching news from 2022-11-23 to 2022-12-23...
Waiting for 1 minute to respect API rate limits...
Fetching news from 2022-12-23 to 2023-01-22...
Waiting for 1 minute to respect API rate limits...
Fetching news from 2023-01-22 to 2023-02-21...
Waiting for 1 minute to respect API rate limits...
Fetching news from 2023-02-21 to 2023-03-23...
Waiting for 1 minute to respect API rate limits...
Fetching news from 2023-03-23 to 2023-04-22...
Waiting for 1 minute to respect API rate limits...
Fetching news from 2023-04-22 to 2023-05-22...
Waiting for 1 minute to respect API rate limits...
Fetching news from 2

In [None]:
import pandas as pd

def process_news_data(input_file, output_file):
    """
    Orders the dataset by date, removes duplicates, and saves it to a specified file.

    Parameters:
        input_file (str): Path to the input CSV file.
        output_file (str): Path to save the processed CSV file.

    Returns:
        None
    """
    # Load the dataset
    data = pd.read_csv(input_file)

    # Convert 'published_date' to datetime for sorting
    if 'published_date' in data.columns:
        data['published_date'] = pd.to_datetime(data['published_date'])
    else:
        raise ValueError("The dataset must contain a 'published_date' column.")

    # Sort the dataset by 'published_date'
    data = data.sort_values('published_date')

    # Remove duplicate rows
    data = data.drop_duplicates()

    # Save the cleaned and sorted dataset to a new CSV file
    data.to_csv(output_file, index=False)

    print(f"Processed dataset saved as: {output_file}")


process_news_data('TSLA_news_sentiment_2022-07-26_to_2025-01-23.csv', 'TSLA_news_sentiment_sorted.csv')


Processed dataset saved as: TSLA_news_sentiment_sorted.csv


In [None]:
import pandas as pd

def preprocess_for_lstm(input_file, output_file):
    # Load the CSV file
    df = pd.read_csv(input_file)

    # Ensure the published_date is in datetime format
    df['published_date'] = pd.to_datetime(df['published_date'], errors='coerce')

    # Drop rows with invalid or missing dates
    df = df.dropna(subset=['published_date'])

    # Sort the data by published_date
    df = df.sort_values(by='published_date')

    # Keep only the columns needed for LSTM: published_date, sentiment_polarity, sentiment_subjectivity
    lstm_data = df[['published_date', 'sentiment_polarity', 'sentiment_subjectivity']]

    # Save the preprocessed data to a new CSV file
    lstm_data.to_csv(output_file, index=False)

    print(f"Preprocessed data saved to {output_file}")

# Example usage
input_file = 'TSLA_news_sentiment_sorted.csv'  # Replace with your file path
output_file = 'TSLA_news_preprocessed_for_lstm.csv'  # Replace with your desired output file path
preprocess_for_lstm(input_file, output_file)

Preprocessed data saved to TSLA_news_preprocessed_for_lstm.csv


In [2]:
import pandas as pd
import numpy as np

def preprocess_sentiment_and_stock(sentiment_file, stock_file, output_file):
    # Load the sentiment data
    sentiment_df = pd.read_csv(sentiment_file)
    sentiment_df['published_date'] = pd.to_datetime(sentiment_df['published_date'], errors='coerce')

    # Drop rows with invalid or missing dates
    sentiment_df = sentiment_df.dropna(subset=['published_date'])

    # Group sentiment data by day and calculate daily averages and article counts
    daily_sentiment = sentiment_df.groupby(sentiment_df['published_date'].dt.date).agg({
        'sentiment_polarity': 'mean',
        'sentiment_subjectivity': 'mean',
        'published_date': 'count'  # Count the number of articles per day
    }).rename(columns={'published_date': 'article_count'}).reset_index()
    daily_sentiment.rename(columns={'published_date': 'date'}, inplace=True)

    # Load the stock price data
    stock_df = pd.read_csv(stock_file)
    stock_df['date'] = pd.to_datetime(stock_df['date'], errors='coerce', utc=True).dt.date

    # Drop rows with invalid or missing dates
    stock_df = stock_df.dropna(subset=['date'])

    # Merge stock data with sentiment data on the 'date' column
    combined_df = pd.merge(stock_df, daily_sentiment, on='date', how='left')

    # Handle missing sentiment data
    combined_df['sentiment_polarity'].fillna(0, inplace=True)  # Fill missing polarity with 0
    combined_df['sentiment_subjectivity'].fillna(0, inplace=True)  # Fill missing subjectivity with 0
    combined_df['article_count'].fillna(0, inplace=True)  # Fill missing article counts with 0

    # Forward-fill stock prices for non-trading days (if required)
    combined_df.sort_values(by='date', inplace=True)
    combined_df.ffill(inplace=True)

    # Dynamically normalize all numerical features, including article_count
    numeric_features = combined_df.select_dtypes(include=[np.number]).columns
    combined_df[numeric_features] = (combined_df[numeric_features] - combined_df[numeric_features].mean()) / combined_df[numeric_features].std()

    # Save the preprocessed data to a new CSV file
    combined_df.to_csv(output_file, index=False)
    print(f"Preprocessed data saved to {output_file}")



# Example usage
sentiment_file = 'TSLA_news_preprocessed_for_lstm_input.csv'  # Replace with your sentiment data file
stock_file = 'tsla_with_technical_indicators2022-09-29_to_2025-01-08.csv'  # Replace with your stock price data file
output_file = 'TSLA_combined_preprocessed_fixed1.csv'  # Output file path
preprocess_sentiment_and_stock(sentiment_file, stock_file, output_file)


Preprocessed data saved to TSLA_combined_preprocessed_fixed1.csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df['sentiment_polarity'].fillna(0, inplace=True)  # Fill missing polarity with 0
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df['sentiment_subjectivity'].fillna(0, inplace=True)  # Fill missing subjectivity with 0
The behavior will change in pandas 3.0. This