Extraction and quantification of market sentiment from textual data such as news headlines or social media posts related to stock tickers. It fetches recent text data using AlphaVantage Global News API, preprocesses and cleans the text for analysis, and then applies the VADER sentiment analyser to assign sentiment scores (positive, negaitve, neutral and compound) to each piece of text. 
Scores are then aggregated over chosen time intervals to create a time-aligned sentiment dataset that can be merged with market price data for further modeling and visualisation. 

In [43]:
# import libraries 
import requests 
import pandas as pd 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from datetime import datetime, timedelta
import yfinance as yf 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import feedparser
import re


In [2]:
def fetch_alphavantage_news_df(api_url: str) -> pd.DataFrame: 
    """
    Fetches news data from Alphavantage Global News API URL and returns
    a cleaned pandas DataFrame with parsed dates.
    
    Parameters:
        api_url (str): Fully constructed API URL with key and parameters.
    
    Returns:
        pd.DataFrame: DataFrame with news articles.
    """
    response = requests.get(api_url)
    data = response.json()

    # Extract the "feed" list from the JSON response 
    news_list = data.get("feed", [])
    
    # Convert list of news dictionaries into a DataFrame
    df = pd.DataFrame(news_list)

    # Parse the published dates into datetime objects 
    df["time_published"] = pd.to_datetime(df["time_published"], errors = "coerce")

    return df

In [None]:
# Dynamically create the url in order to specify the ticker, date, and apikey
ALPHA_API = "F54RUITJVPIPTGZF"

def build_alphavantage_news_url(ticker, date):
    url = f"https://www.alphavantage.co/query?function=NEWS_SENTIMENT&date={date}&tickers={ticker}&apikey={ALPHA_API}"
    return url

In [None]:
news_url = build_alphavantage_news_url("AAPL", "10-08-2025")

In [6]:
news_df = fetch_alphavantage_news_df(news_url)

In [7]:
# Visualise the dataframe 
print(news_df.columns.tolist())
print(news_df.head())

['title', 'url', 'time_published', 'authors', 'summary', 'banner_image', 'source', 'category_within_source', 'source_domain', 'topics', 'overall_sentiment_score', 'overall_sentiment_label', 'ticker_sentiment']
                                               title  \
0  GameStop Short Seller Andrew Left Is Taking On...   
1          3 No-Brainer Chip Stocks to Buy Right Now   
2  Perplexity Pulls A Tesla: All Talk, No Deals…Y...   
3  Berkshire Hathaway 13F Preview: Did Buffett Tr...   
4  Performance Comparison: Apple And Competitors ...   

                                                 url      time_published  \
0  https://www.benzinga.com/trading-ideas/short-i... 2025-08-13 20:44:36   
1  https://www.fool.com/investing/2025/08/13/3-no... 2025-08-13 19:30:00   
2  https://www.benzinga.com/markets/tech/25/08/47... 2025-08-13 16:48:07   
3  https://www.benzinga.com/trading-ideas/long-id... 2025-08-13 15:33:22   
4  https://www.benzinga.com/insights/news/25/08/4... 2025-08-13 15:00:54 

In [8]:
# Apply VADER sentiment analysis on the news headlines using the "title" and "summary" columns

sia = SentimentIntensityAnalyzer()
def get_sentiment_scores(text): 
    if isinstance(text, str):
        return sia.polarity_scores(text)
    else:
        return {'neg': None, 'neu': None, 'pos': None, 'compound': None}


In [9]:
# Create sentiment score columns 
news_df[["neg", "neu", "pos", "compound"]] = news_df["title"].apply(get_sentiment_scores).apply(pd.Series)

In [10]:
news_df

Unnamed: 0,title,url,time_published,authors,summary,banner_image,source,category_within_source,source_domain,topics,overall_sentiment_score,overall_sentiment_label,ticker_sentiment,neg,neu,pos,compound
0,GameStop Short Seller Andrew Left Is Taking On...,https://www.benzinga.com/trading-ideas/short-i...,2025-08-13 20:44:36,[Chris Katje],Short seller Andrew Left announced he is short...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,Trading,www.benzinga.com,"[{'topic': 'Retail & Wholesale', 'relevance_sc...",0.218479,Somewhat-Bullish,"[{'ticker': 'AAPL', 'relevance_score': '0.1141...",0.0,1.0,0.0,0.0
1,3 No-Brainer Chip Stocks to Buy Right Now,https://www.fool.com/investing/2025/08/13/3-no...,2025-08-13 19:30:00,[Keithen Drury],Chip companies are the ones making serious mon...,https://g.foolcdn.com/image/?url=https%3A%2F%2...,Motley Fool,,www.fool.com,"[{'topic': 'Financial Markets', 'relevance_sco...",0.229756,Somewhat-Bullish,"[{'ticker': 'NVDA', 'relevance_score': '0.3056...",0.0,1.0,0.0,0.0
2,"Perplexity Pulls A Tesla: All Talk, No Deals…Y...",https://www.benzinga.com/markets/tech/25/08/47...,2025-08-13 16:48:07,[Surbhi Jain],Perplexity mirrors Tesla's early years with he...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,Trading,www.benzinga.com,"[{'topic': 'Financial Markets', 'relevance_sco...",0.127612,Neutral,"[{'ticker': 'GOOG', 'relevance_score': '0.4579...",0.155,0.845,0.0,-0.296
3,Berkshire Hathaway 13F Preview: Did Buffett Tr...,https://www.benzinga.com/trading-ideas/long-id...,2025-08-13 15:33:22,[Chris Katje],Berkshire Hathaway has to file its Q2 13F by A...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,Trading,www.benzinga.com,"[{'topic': 'Energy & Transportation', 'relevan...",0.116881,Neutral,"[{'ticker': 'AAPL', 'relevance_score': '0.1368...",0.0,1.0,0.0,0.0
4,Performance Comparison: Apple And Competitors ...,https://www.benzinga.com/insights/news/25/08/4...,2025-08-13 15:00:54,[Benzinga Insights],Amidst the fast-paced and highly competitive b...,https://www.benzinga.com/next-assets/images/sc...,Benzinga,Markets,www.benzinga.com,"[{'topic': 'Earnings', 'relevance_score': '0.8...",0.266536,Somewhat-Bullish,"[{'ticker': 'AAPL', 'relevance_score': '0.5400...",0.0,1.0,0.0,0.0
5,50 Bps Rate Cut Fever Grips Market; Small Caps...,https://www.benzinga.com/markets/equities/25/0...,2025-08-13 14:48:59,[The Arora Report],"To gain an edge, this is what you need to know...",https://www.benzinga.com/next-assets/images/sc...,Benzinga,Markets,www.benzinga.com,"[{'topic': 'Economy - Monetary', 'relevance_sc...",0.206369,Somewhat-Bullish,"[{'ticker': 'KBH', 'relevance_score': '0.09239...",0.095,0.905,0.0,-0.2732
6,"S&P 500 ETFs Hit Record Highs as Index Tops 6,...",https://www.zacks.com/stock/news/2707453/sp-50...,2025-08-13 14:30:00,[Sweta Killa],"ETFs like VOO, SPY and IVV hit record highs as...",https://staticx-tuner.zacks.com/images/article...,Zacks Commentary,,www.zacks.com,"[{'topic': 'Economy - Monetary', 'relevance_sc...",0.02921,Neutral,"[{'ticker': 'MSFT', 'relevance_score': '0.2478...",0.0,0.769,0.231,0.5106
7,"How To Trade SPY, Top Tech Stocks Using Techni...",https://www.benzinga.com/markets/equities/25/0...,2025-08-13 13:38:09,[RIPS],Today's economic calendar unfolds with an extr...,https://www.benzinga.com/next-assets/images/sc...,Benzinga,Trading,www.benzinga.com,"[{'topic': 'Economy - Monetary', 'relevance_sc...",-0.0025,Neutral,"[{'ticker': 'MSFT', 'relevance_score': '0.1169...",0.0,0.899,0.101,0.2023
8,Nvidia and Microsoft Stocks Have Reached a $4 ...,https://www.fool.com/investing/2025/08/13/nvid...,2025-08-13 12:30:00,[Jennifer Saibil],"Is Apple Falling behind, or will it come from ...",https://g.foolcdn.com/image/?url=https%3A%2F%2...,Motley Fool,,www.fool.com,"[{'topic': 'Retail & Wholesale', 'relevance_sc...",0.182456,Somewhat-Bullish,"[{'ticker': 'MSFT', 'relevance_score': '0.2888...",0.0,0.896,0.104,0.1027
9,"Apple, PG&E, D.R. Horton And An Industrial Sto...",https://www.benzinga.com/trading-ideas/long-id...,2025-08-13 12:18:15,[Avi Kapoor],Jim Lebenthal names PG&E as his final trade. S...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,Trading,www.benzinga.com,"[{'topic': 'Financial Markets', 'relevance_sco...",0.276358,Somewhat-Bullish,"[{'ticker': 'AAPL', 'relevance_score': '0.3950...",0.0,1.0,0.0,0.0


In [11]:
news_df.isnull().sum()

title                      0
url                        0
time_published             0
authors                    0
summary                    0
banner_image               0
source                     0
category_within_source     0
source_domain              0
topics                     0
overall_sentiment_score    0
overall_sentiment_label    0
ticker_sentiment           0
neg                        0
neu                        0
pos                        0
compound                   0
dtype: int64

In [12]:
### Exploration of Sentiment Data 

## Structural inspection 
# Check the first few rows 
print(news_df.head())

# Check the column names 
print("\nColumn names: ")
print(news_df.columns.tolist())

# Check data types and missing values 
print("\nDataFrame info:")
print(news_df.info())

# Check for missing values 
print("\nMissing values per column:")
print(news_df.isnull().sum())

# Quick statistics for numerical columns: 
print("\nSummary statistics:")
print(news_df.describe())

                                               title  \
0  GameStop Short Seller Andrew Left Is Taking On...   
1          3 No-Brainer Chip Stocks to Buy Right Now   
2  Perplexity Pulls A Tesla: All Talk, No Deals…Y...   
3  Berkshire Hathaway 13F Preview: Did Buffett Tr...   
4  Performance Comparison: Apple And Competitors ...   

                                                 url      time_published  \
0  https://www.benzinga.com/trading-ideas/short-i... 2025-08-13 20:44:36   
1  https://www.fool.com/investing/2025/08/13/3-no... 2025-08-13 19:30:00   
2  https://www.benzinga.com/markets/tech/25/08/47... 2025-08-13 16:48:07   
3  https://www.benzinga.com/trading-ideas/long-id... 2025-08-13 15:33:22   
4  https://www.benzinga.com/insights/news/25/08/4... 2025-08-13 15:00:54   

               authors                                            summary  \
0        [Chris Katje]  Short seller Andrew Left announced he is short...   
1      [Keithen Drury]  Chip companies are the ones 

All the columns have the expected names and types 
There are 0 missing values 
The time_published is a datetime object which is to be expected

In [13]:
### Extract relevant ticker-level information from the ticker_sentiment column 

# Extract "ticker" and "relevance_score" into separate columns 
news_df_exploded = news_df.explode("ticker_sentiment").reset_index(drop = True)
ticker_sentiment_expanded = pd.json_normalize(news_df_exploded["ticker_sentiment"])
news_df_final = pd.concat([news_df_exploded.drop(columns = ["ticker_sentiment"]), ticker_sentiment_expanded], axis = 1)
news_df_final.drop(columns = "category_within_source", inplace = True)
news_df_final

Unnamed: 0,title,url,time_published,authors,summary,banner_image,source,source_domain,topics,overall_sentiment_score,overall_sentiment_label,neg,neu,pos,compound,ticker,relevance_score,ticker_sentiment_score,ticker_sentiment_label
0,GameStop Short Seller Andrew Left Is Taking On...,https://www.benzinga.com/trading-ideas/short-i...,2025-08-13 20:44:36,[Chris Katje],Short seller Andrew Left announced he is short...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,www.benzinga.com,"[{'topic': 'Retail & Wholesale', 'relevance_sc...",0.218479,Somewhat-Bullish,0.0,1.000,0.000,0.0000,AAPL,0.114137,0.100949,Neutral
1,GameStop Short Seller Andrew Left Is Taking On...,https://www.benzinga.com/trading-ideas/short-i...,2025-08-13 20:44:36,[Chris Katje],Short seller Andrew Left announced he is short...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,www.benzinga.com,"[{'topic': 'Retail & Wholesale', 'relevance_sc...",0.218479,Somewhat-Bullish,0.0,1.000,0.000,0.0000,FSLR,0.114137,0.0,Neutral
2,GameStop Short Seller Andrew Left Is Taking On...,https://www.benzinga.com/trading-ideas/short-i...,2025-08-13 20:44:36,[Chris Katje],Short seller Andrew Left announced he is short...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,www.benzinga.com,"[{'topic': 'Retail & Wholesale', 'relevance_sc...",0.218479,Somewhat-Bullish,0.0,1.000,0.000,0.0000,AMZN,0.170475,0.088005,Neutral
3,GameStop Short Seller Andrew Left Is Taking On...,https://www.benzinga.com/trading-ideas/short-i...,2025-08-13 20:44:36,[Chris Katje],Short seller Andrew Left announced he is short...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,www.benzinga.com,"[{'topic': 'Retail & Wholesale', 'relevance_sc...",0.218479,Somewhat-Bullish,0.0,1.000,0.000,0.0000,GME,0.280294,0.101056,Neutral
4,GameStop Short Seller Andrew Left Is Taking On...,https://www.benzinga.com/trading-ideas/short-i...,2025-08-13 20:44:36,[Chris Katje],Short seller Andrew Left announced he is short...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,www.benzinga.com,"[{'topic': 'Retail & Wholesale', 'relevance_sc...",0.218479,Somewhat-Bullish,0.0,1.000,0.000,0.0000,PLTR,0.170475,0.197071,Somewhat-Bullish
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,Alibaba Faces Fierce AI Talent Poaching As Riv...,https://www.benzinga.com/markets/tech/25/08/47...,2025-08-11 13:29:07,[Anusuya Lahiri],Alibaba loses top Tongyi Lab talent to rivals ...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,www.benzinga.com,"[{'topic': 'Earnings', 'relevance_score': '0.1...",0.134427,Neutral,0.0,0.796,0.204,0.5574,BABA,0.731515,0.192174,Somewhat-Bullish
246,Alibaba Faces Fierce AI Talent Poaching As Riv...,https://www.benzinga.com/markets/tech/25/08/47...,2025-08-11 13:29:07,[Anusuya Lahiri],Alibaba loses top Tongyi Lab talent to rivals ...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,www.benzinga.com,"[{'topic': 'Earnings', 'relevance_score': '0.1...",0.134427,Neutral,0.0,0.796,0.204,0.5574,GOOG,0.097854,0.253501,Somewhat-Bullish
247,Alibaba Faces Fierce AI Talent Poaching As Riv...,https://www.benzinga.com/markets/tech/25/08/47...,2025-08-11 13:29:07,[Anusuya Lahiri],Alibaba loses top Tongyi Lab talent to rivals ...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,www.benzinga.com,"[{'topic': 'Earnings', 'relevance_score': '0.1...",0.134427,Neutral,0.0,0.796,0.204,0.5574,META,0.194242,0.358744,Bullish
248,Alibaba Faces Fierce AI Talent Poaching As Riv...,https://www.benzinga.com/markets/tech/25/08/47...,2025-08-11 13:29:07,[Anusuya Lahiri],Alibaba loses top Tongyi Lab talent to rivals ...,https://cdn.benzinga.com/files/images/story/20...,Benzinga,www.benzinga.com,"[{'topic': 'Earnings', 'relevance_score': '0.1...",0.134427,Neutral,0.0,0.796,0.204,0.5574,AAPL,0.194242,-0.018988,Neutral


In [14]:
### Aggregate sentiment data at the level appropriate such that it matches the market data- daily per ticker 

# Sort the dataframe by ticker 
news_df_sorted = news_df_final.sort_values(by = "ticker")

# Convert time_published to date only 
news_df_sorted["time_published"] = pd.to_datetime(news_df_sorted["time_published"], errors = "coerce")
news_df_sorted["date"] = news_df_sorted["time_published"].dt.day

# Sort the values by ticker and date 
news_df_sorted.sort_values(by = ["ticker", "date"], inplace = True)

## Combine the multiple sentiment scores into a single summary value per group 

# New column with weighted sentiment per article 
news_df_sorted["ticker_sentiment_score"] = pd.to_numeric(news_df_sorted["ticker_sentiment_score"], errors = "coerce")
news_df_sorted["relevance_score"] = pd.to_numeric(news_df_sorted["relevance_score"], errors = "coerce")
news_df_sorted["weighted_sentiment_score"] = news_df_sorted["ticker_sentiment_score"] * news_df_sorted["relevance_score"]

# Group by ticket and date and then aggregate to sum the weighted sentiments, relevance scores and compute the weighted average sentiment by dividng these sums for each news article 
grouped = news_df_sorted.groupby(["ticker", "date"]).agg(
    total_weighted_sentiment = ("weighted_sentiment_score", "sum"),
    total_relevance = ("relevance_score", "sum")
).reset_index()  # optional, to turn MultiIndex into columns

grouped["weighted_avg_sentiment"] = grouped["total_weighted_sentiment"] / grouped["total_relevance"]

grouped = grouped.rename(columns = {"ticker": "Ticker", "date":"Date"})
grouped

Unnamed: 0,Ticker,Date,total_weighted_sentiment,total_relevance,weighted_avg_sentiment
0,AAPL,11,0.921635,3.376728,0.272937
1,AAPL,12,1.651391,6.584226,0.250810
2,AAPL,13,0.709400,3.920045,0.180967
3,AMAT,12,0.005357,0.035661,0.150228
4,AMD,12,0.000000,0.109956,0.000000
...,...,...,...,...,...
111,VRSN,13,0.014213,0.091509,0.155316
112,VRTS,13,0.000000,0.102888,0.000000
113,VZ,13,0.000000,0.091509,0.000000
114,WMT,13,0.037966,0.323078,0.117514


In [15]:
### Coordinate the sentiment data and the market data 

# Determine date range from sentiment data to define the market data window
start_date = (news_df_final["time_published"]).dt.date.min()
end_date = (news_df_final["time_published"]).dt.date.max()

# Extract unique tickers from news_df_final["ticker"]
unique_tickers = news_df_final["ticker"].unique()

print(start_date, end_date)

2025-08-11 2025-08-13


In [16]:
unique_tickers

array(['AAPL', 'FSLR', 'AMZN', 'GME', 'PLTR', 'NVDA', 'AVGO', 'GOOG',
       'META', 'TSLA', 'C', 'BAC', 'CVX', 'VZ', 'BRK-A', 'NU', 'SIRI',
       'HEI', 'DPZ', 'TMUS', 'DMPZF', 'OXY', 'STZ', 'POOL', 'UPS', 'COF',
       'FDX', 'CHTR', 'CB', 'VRSN', 'KBH', 'MSFT', 'APG', 'IVZ', 'NFLX',
       'ORCL', 'BLD', 'VRTS', 'PCG', 'TGT', 'COST', 'RH', 'WMT', 'ON',
       'TSM', 'FOREX:USD', 'SSNLF', 'UPST', 'INTC', 'AXP', 'KO', 'AMD',
       'BROS', 'LPLA', 'SFTBF', 'DASH', 'ETN', 'MDNDF', 'OPEN', 'BMNR',
       'SOUN', 'HOOD', 'UBER', 'JHG', 'AMAT', 'GFS', 'TXN', 'SPOT',
       'STKL', 'PTON', 'SFM', 'BODY', 'GLW', 'ROKU', 'XOM', 'GE', 'SPGI',
       'LCID', 'RIVN', 'SNOW', 'MS', 'MU', 'ASCCF', 'FOREX:AMD', 'BABA',
       'TCTZF'], dtype=object)

In [17]:
# Check Yahoo Finance download results

data = yf.download(unique_tickers.tolist(), start=start_date, end=end_date, group_by='ticker')
print("Downloaded data shape:", data.shape)
print("Downloaded data columns:", data.columns[:10])  # first 10 for preview
print("First few rows:\n", data.head())

  data = yf.download(unique_tickers.tolist(), start=start_date, end=end_date, group_by='ticker')
[*************         28%                       ]  24 of 86 completedHTTP Error 404: 
[**************        30%                       ]  26 of 86 completedHTTP Error 404: 
[*********************100%***********************]  86 of 86 completed

3 Failed downloads:
['FOREX:USD', 'FOREX:AMD', 'BODY']: YFTzMissingError('possibly delisted; no timezone found')


Downloaded data shape: (2, 433)
Downloaded data columns: MultiIndex([('UBER',   'Open'),
            ('UBER',   'High'),
            ('UBER',    'Low'),
            ('UBER',  'Close'),
            ('UBER', 'Volume'),
            ( 'GFS',   'Open'),
            ( 'GFS',   'High'),
            ( 'GFS',    'Low'),
            ( 'GFS',  'Close'),
            ( 'GFS', 'Volume')],
           names=['Ticker', 'Price'])
First few rows:
 Ticker           UBER                                                GFS  \
Price            Open    High        Low      Close    Volume       Open   
Date                                                                       
2025-08-11  89.809998  91.430  88.919998  90.580002  16668900  32.349998   
2025-08-12  91.190002  92.555  91.139999  91.730003  12509000  31.920000   

Ticker                                             ...          CB  \
Price            High     Low      Close   Volume  ...        Open   
Date                                          

In [18]:
data

Ticker,UBER,UBER,UBER,UBER,UBER,GFS,GFS,GFS,GFS,GFS,...,CB,CB,CB,CB,CB,BLD,BLD,BLD,BLD,BLD
Price,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume,...,Open,High,Low,Close,Volume,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2025-08-11,89.809998,91.43,88.919998,90.580002,16668900,32.349998,32.77,31.51,31.620001,3192700,...,271.660004,272.829987,270.160004,271.0,1276300,415.410004,418.839996,406.470001,415.089996,343100
2025-08-12,91.190002,92.555,91.139999,91.730003,12509000,31.92,33.139999,31.875,32.84,2654900,...,271.700012,272.269989,269.709991,269.769989,1436500,418.160004,429.959991,412.640015,429.660004,499000


In [19]:
# Download data from yfinance dataset 

def fetch_and_process_daily_data(tickers, start_date, end_date):
    if isinstance(tickers, str):
        tickers = [tickers]

    df = yf.download(tickers.tolist(), start=start_date, end=end_date, interval="1d", group_by=None)

    if isinstance(df.columns, pd.MultiIndex):
        df = df.stack(level=0, future_stack=True).reset_index()
        df = df.rename(columns={'level_1': 'Ticker'})
    else:
        df = df.reset_index()
        df['Ticker'] = tickers[0]

    df["log_return"] = df.groupby("Ticker")["Close"].transform(lambda x: np.log(x / x.shift(1)))
    print(df[["Ticker", "Date", "Close", "log_return"]].head(10))
    print(df.groupby("Ticker").size())
    df = df.dropna(subset=["log_return"])

    return df


In [20]:
final_df = fetch_and_process_daily_data(unique_tickers, "2025-03-03", "2025-08-12")

  df = yf.download(tickers.tolist(), start=start_date, end=end_date, interval="1d", group_by=None)
[*********************100%***********************]  86 of 86 completed

3 Failed downloads:
['FOREX:USD', 'FOREX:AMD', 'BODY']: YFTzMissingError('possibly delisted; no timezone found')


Price Ticker       Date       Close  log_return
0         RH 2025-03-03  307.230011         NaN
1       RIVN 2025-03-03   11.510000         NaN
2        IVZ 2025-03-03   16.535105         NaN
3       CHTR 2025-03-03  370.660004         NaN
4       SNOW 2025-03-03  173.649994         NaN
5        DPZ 2025-03-03  479.997101         NaN
6        PCG 2025-03-03   16.376520         NaN
7       UBER 2025-03-03   74.440002         NaN
8        GFS 2025-03-03   36.880001         NaN
9       AMZN 2025-03-03  205.020004         NaN
Ticker
AAPL    112
AMAT    112
AMD     112
AMZN    112
APG     112
       ... 
VRSN    112
VRTS    112
VZ      112
WMT     112
XOM     112
Length: 86, dtype: int64


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [44]:
### Process sentiment data 

# Group by date & ticker to get daily average sentiment using weighted sentiment score
sentiment_daily = (
    grouped
    .groupby(["Date", "Ticker"])["weighted_avg_sentiment"]
    .mean()
    .reset_index()
)

# Make sure both are Date columns are datetime 
final_df["Date"] = pd.to_datetime(final_df["Date"]).dt.normalize()
sentiment_daily["Date"] = pd.to_datetime(sentiment_daily["Date"]).dt.normalize()

### Process market data
final_df["Date"] = pd.to_datetime(final_df["Date"])

### Merge market and sentiment data 
merged_df = pd.merge(final_df, sentiment_daily, on=["Date", "Ticker"], how="left")

In [45]:
merged_df

Unnamed: 0,Date,Ticker,Open,High,Low,Close,Volume,Adj Close,log_return,weighted_avg_sentiment
0,2025-03-04,RH,300.000000,300.000000,278.540009,292.179993,1477500.0,,-0.050227,
1,2025-03-04,RIVN,11.150000,11.580000,10.850000,11.260000,31774700.0,,-0.021960,
2,2025-03-04,IVZ,16.278594,16.278594,15.509061,15.785303,5165400.0,,-0.046406,
3,2025-03-04,CHTR,376.480011,377.489990,361.859985,368.359985,1316700.0,,-0.006225,
4,2025-03-04,SNOW,171.750000,176.639999,165.800003,173.600006,6988000.0,,-0.000288,
...,...,...,...,...,...,...,...,...,...,...
9140,2025-08-11,MS,143.570007,144.190002,142.660004,143.880005,4024700.0,,0.004877,
9141,2025-08-11,SSNLF,40.599899,40.599899,40.599899,40.599899,0.0,,0.000000,
9142,2025-08-11,SFTBF,94.000000,96.910004,94.000000,95.440002,6400.0,,0.015203,
9143,2025-08-11,OXY,44.349998,44.450001,43.619999,43.709999,8904100.0,,-0.013859,
