In [3]:
import pandas as pd
import warnings

# NOTE(review): this silences every warning for the rest of the session,
# including pandas deprecation notices. Consider narrowing it to the specific
# categories that are known to be noise.
warnings.filterwarnings('ignore')

# Load stock price data (must contain a 'Date' column)
stock_data = pd.read_parquet("Cleaned_Stock_Data.parquet")

# Load news data (must contain a 'providerPublishTime' column)
news_data = pd.read_parquet("Yahoo_Finance_News_with_Sentiment.parquet")

# Ensure Date format consistency
stock_data['Date'] = pd.to_datetime(stock_data['Date'])

# pd.to_datetime reads bare numbers as nanoseconds since the epoch by default,
# which would place every article on 1970-01-01 and make the later join on
# 'Date' match nothing. When the column is numeric, parse it as epoch seconds.
# NOTE(review): assumes numeric publish times are in seconds (not ms) — confirm
# against the raw feed. Strings / datetimes are parsed exactly as before.
publish_time_unit = (
    's' if pd.api.types.is_numeric_dtype(news_data['providerPublishTime']) else None
)
news_data['providerPublishTime'] = pd.to_datetime(
    news_data['providerPublishTime'], unit=publish_time_unit
)

# Extract only the date (no time) so both frames share the same merge key type
news_data['Date'] = news_data['providerPublishTime'].dt.date
stock_data['Date'] = stock_data['Date'].dt.date

In [4]:
# Compute daily average sentiment score (one row per calendar date)
daily_sentiment = news_data.groupby('Date', as_index=False)['sentiment_score'].mean()

# Merge with stock price data. A sentiment_score column left over from a
# previous run of this cell is dropped first; otherwise a re-run would create
# sentiment_score_x / sentiment_score_y and the fill below would raise KeyError.
stock_data = (
    stock_data
    .drop(columns='sentiment_score', errors='ignore')
    .merge(daily_sentiment, on='Date', how='left')
)

# Report join coverage BEFORE filling: once NaN is replaced by 0, a join that
# matched nothing is indistinguishable from genuinely neutral sentiment.
matched_rows = int(stock_data['sentiment_score'].notna().sum())
print(f"Rows with news sentiment: {matched_rows} of {len(stock_data)}")

# Fill missing sentiment values with neutral (0). Assign the result back rather
# than calling fillna(inplace=True) on the column selection, which is deprecated
# and does not update the frame under pandas Copy-on-Write.
stock_data['sentiment_score'] = stock_data['sentiment_score'].fillna(0)

# Display merged dataset
print(stock_data.head())

# Save the final dataset with sentiment
stock_data.to_parquet("Stock_Data_with_Sentiment.parquet", index=False)

         Date      Open      High       Low     Close     Volume  \
0  2010-01-04  6.422877  6.455077  6.391278  6.440331  493729600   
1  2010-01-05  6.458088  6.487880  6.417461  6.451467  601904800   
2  2010-01-06  6.451466  6.477045  6.342226  6.348846  552160000   
3  2010-01-07  6.372319  6.379843  6.291067  6.337110  477131200   
4  2010-01-08  6.328681  6.379840  6.291366  6.379238  447610800   

   sentiment_score  
0              0.0  
1              0.0  
2              0.0  
3              0.0  
4              0.0  


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for stock_data.
# If sentiment_score shows std == 0, every row holds the same value (e.g. the
# neutral fill value), so verify that the merge on 'Date' matched any rows.
stock_data.describe()

Unnamed: 0,Open,High,Low,Close,Volume,sentiment_score
count,3774.0,3774.0,3774.0,3774.0,3774.0,3774.0
mean,67.46551,68.176227,66.797248,67.52169,229920000.0,0.0
std,65.523835,66.212381,64.904099,65.602672,218072600.0,0.0
min,5.789106,5.898343,5.725309,5.779477,23234700.0,0.0
25%,17.465267,17.645444,17.316001,17.461676,84922750.0,0.0
50%,35.354725,35.72874,34.963342,35.323717,140392000.0,0.0
75%,126.339099,127.999182,124.827834,126.909637,308138600.0,0.0
max,257.906429,259.814335,257.347047,258.735504,1880998000.0,0.0
