In [35]:
import sys
import os


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob

# Go one folder up
sys.path.append(os.path.abspath(".."))

from analysis_utils import preprocess_news, preprocess_stock, merge_and_correlate

In [36]:
# new_data
news_data = pd.read_csv("../data/raw_analyst_ratings.csv")  # shared news dataset


In [37]:
stock_df_apple = pd.read_csv("../Data/AAPL_historical_data.csv")  
stock_df_amzn = pd.read_csv("../Data/AMZN_historical_data.csv")  
stock_df_goog = pd.read_csv("../Data/GOOG_historical_data.csv")  
stock_df_meta = pd.read_csv("../Data/META_historical_data.csv")  
stock_df_msft = pd.read_csv("../Data/MSFT_historical_data.csv")  
stock_df_nvda = pd.read_csv("../Data/NVDA_historical_data.csv")  
stock_df_tsla = pd.read_csv("../Data/TSLA_historical_data.csv")  


In [38]:

# Normalize Date Formats (correct way)
stock_df_apple['Date'] = pd.to_datetime(stock_df_apple['Date']).dt.date
stock_df_amzn['Date'] = pd.to_datetime(stock_df_amzn['Date']).dt.date
stock_df_goog['Date'] = pd.to_datetime(stock_df_goog['Date']).dt.date
stock_df_meta['Date'] = pd.to_datetime(stock_df_meta['Date']).dt.date
stock_df_msft['Date'] = pd.to_datetime(stock_df_msft['Date']).dt.date
stock_df_nvda['Date'] = pd.to_datetime(stock_df_nvda['Date']).dt.date
news_data['date'] = pd.to_datetime(news_data['date'], errors='coerce')  # Let pandas infer the format
news_data = news_data.dropna(subset=['date'])  # Drop rows where date parsing failed
news_data['date'] = news_data['date'].dt.date  # Extract only the date part

# Drop rows with invalid dates
news_data = news_data.dropna(subset=['date'])



In [39]:
# Sentiment Analysis Function
def get_sentiment(text):
    return TextBlob(str(text)).sentiment.polarity

# Apply Sentiment Analysis
news_data['sentiment'] = news_data['headline'].apply(get_sentiment)

# Aggregate Sentiment by Date
daily_sentiment = news_data.groupby('date')['sentiment'].mean().reset_index()
daily_sentiment.columns = ['Date', 'Average_Sentiment']



In [None]:
# Sort and Compute Daily Returns in Stock Data of apple
stock_df_apple = stock_df_apple.sort_values('Date')
stock_df_apple['Daily_Return'] = stock_df_apple['Close'].pct_change()

# Merge Stock and News Data on Date of apple
merged_data_apple = pd.merge(stock_df_apple, daily_sentiment, on='Date', how='inner')

# Correlation Analysis
correlation = merged_data_apple[['Daily_Return', 'Average_Sentiment']].corr().iloc[0, 1]

# Print Sample Merged Data and Correlation Result
print("Sample Merged Data APPLE:")
print(merged_data_apple[['Date', 'Close', 'Daily_Return', 'Average_Sentiment']].head())

print("\nPearson Correlation between Sentiment and Daily Return:")
print(round(correlation, 4))


Sample Merged Data apple:
         Date      Close  Daily_Return  Average_Sentiment
0  2011-04-27  12.505357     -0.000771           0.000000
1  2011-04-28  12.383929     -0.009710           0.068182
2  2011-04-29  12.504643      0.009748           0.166667
3  2011-05-02  12.367143     -0.010996          -0.009259
4  2011-05-03  12.435714      0.005545           0.000000

Pearson Correlation between Sentiment and Daily Return:
-0.002


In [42]:
# Sort and Compute Daily Returns in Stock Data of Amazon
stock_df_amzn = stock_df_amzn.sort_values('Date')
stock_df_amzn['Daily_Return'] = stock_df_amzn['Close'].pct_change()

# Merge Stock and News Data on Date of apple
merged_data_amazon = pd.merge(stock_df_amzn, daily_sentiment, on='Date', how='inner')

# Correlation Analysis
correlation = merged_data_amazon[['Daily_Return', 'Average_Sentiment']].corr().iloc[0, 1]

# Print Sample Merged Data and Correlation Result
print("Sample Merged Data AMAZON:")
print(merged_data_amazon[['Date', 'Close', 'Daily_Return', 'Average_Sentiment']].head())

print("\nPearson Correlation between Sentiment and Daily Return:")
print(round(correlation, 4))

Sample Merged Data AMAZON:
         Date    Close  Daily_Return  Average_Sentiment
0  2011-04-27   9.8315      0.078607           0.000000
1  2011-04-28   9.7535     -0.007934           0.068182
2  2011-04-29   9.7905      0.003793           0.166667
3  2011-05-02  10.0595      0.027476          -0.009259
4  2011-05-03   9.9225     -0.013619           0.000000

Pearson Correlation between Sentiment and Daily Return:
-0.0194


In [44]:

# Sort and Compute Daily Returns in Stock Data of Amazon
stock_df_goog = stock_df_goog.sort_values('Date')
stock_df_goog['Daily_Return'] = stock_df_goog['Close'].pct_change()

# Merge Stock and News Data on Date of apple
merged_data_goog = pd.merge(stock_df_goog, daily_sentiment, on='Date', how='inner')

# Correlation Analysis
correlation = merged_data_goog[['Daily_Return', 'Average_Sentiment']].corr().iloc[0, 1]

# Print Sample Merged Data and Correlation Result
print("Sample Merged Data GOOGLE")
print(merged_data_goog[['Date', 'Close', 'Daily_Return', 'Average_Sentiment']].head())

print("\nPearson Correlation between Sentiment and Daily Return:")
print(round(correlation, 4))


Sample Merged Data GOOGLE
         Date      Close  Daily_Return  Average_Sentiment
0  2011-04-27  13.393797      0.009271           0.000000
1  2011-04-28  13.399027      0.000390           0.068182
2  2011-04-29  13.551705      0.011395           0.166667
3  2011-05-02  13.413722     -0.010182          -0.009259
4  2011-05-03  13.297408     -0.008671           0.000000

Pearson Correlation between Sentiment and Daily Return:
0.0143


In [45]:
# Sort and Compute Daily Returns in Stock Data of Amazon
stock_df_meta = stock_df_meta.sort_values('Date')
stock_df_meta['Daily_Return'] = stock_df_meta['Close'].pct_change()

# Merge Stock and News Data on Date of apple
merged_data_meta = pd.merge(stock_df_meta, daily_sentiment, on='Date', how='inner')

# Correlation Analysis
correlation = merged_data_meta[['Daily_Return', 'Average_Sentiment']].corr().iloc[0, 1]

# Print Sample Merged Data and Correlation Result
print("Sample Merged Data META:")
print(merged_data_meta[['Date', 'Close', 'Daily_Return', 'Average_Sentiment']].head())

print("\nPearson Correlation between Sentiment and Daily Return:")
print(round(correlation, 4))

Sample Merged Data META:
         Date      Close  Daily_Return  Average_Sentiment
0  2012-12-12  27.580000           NaN           0.000000
1  2012-12-13  28.240000      0.023930           0.008333
2  2012-12-14  26.809999     -0.050637          -0.070000
3  2012-12-17  26.750000     -0.002238           0.000000
4  2012-12-18  27.709999      0.035888           0.000000

Pearson Correlation between Sentiment and Daily Return:
-0.0061


In [46]:
# Sort and Compute Daily Returns in Stock Data of Amazon
stock_df_msft = stock_df_msft.sort_values('Date')
stock_df_msft['Daily_Return'] = stock_df_msft['Close'].pct_change()

# Merge Stock and News Data on Date of apple
merged_data_msft = pd.merge(stock_df_msft, daily_sentiment, on='Date', how='inner')

# Correlation Analysis
correlation = merged_data_msft[['Daily_Return', 'Average_Sentiment']].corr().iloc[0, 1]

# Print Sample Merged Data and Correlation Result
print("Sample Merged Data MICROSOFT:")
print(merged_data_msft[['Date', 'Close', 'Daily_Return', 'Average_Sentiment']].head())

print("\nPearson Correlation between Sentiment and Daily Return:")
print(round(correlation, 4))

Sample Merged Data MICROSOFT:
         Date      Close  Daily_Return  Average_Sentiment
0  2011-04-27  26.379999      0.007255           0.000000
1  2011-04-28  26.709999      0.012509           0.068182
2  2011-04-29  25.920000     -0.029577           0.166667
3  2011-05-02  25.660000     -0.010031          -0.009259
4  2011-05-03  25.809999      0.005846           0.000000

Pearson Correlation between Sentiment and Daily Return:
-0.0128


In [47]:
# Sort and Compute Daily Returns in Stock Data of Amazon
stock_df_nvda = stock_df_nvda.sort_values('Date')
stock_df_nvda['Daily_Return'] = stock_df_nvda['Close'].pct_change()

# Merge Stock and News Data on Date of apple
merged_data_nvda = pd.merge(stock_df_nvda, daily_sentiment, on='Date', how='inner')

# Correlation Analysis
correlation = merged_data_nvda[['Daily_Return', 'Average_Sentiment']].corr().iloc[0, 1]

# Print Sample Merged Data and Correlation Result
print("Sample Merged Data NIVIDA:")
print(merged_data_nvda[['Date', 'Close', 'Daily_Return', 'Average_Sentiment']].head())

print("\nPearson Correlation between Sentiment and Daily Return:")
print(round(correlation, 4))

Sample Merged Data NIVIDA:
         Date    Close  Daily_Return  Average_Sentiment
0  2011-04-27  0.48250      0.000000           0.000000
1  2011-04-28  0.48775      0.010881           0.068182
2  2011-04-29  0.50000      0.025115           0.166667
3  2011-05-02  0.49325     -0.013500          -0.009259
4  2011-05-03  0.46975     -0.047643           0.000000

Pearson Correlation between Sentiment and Daily Return:
0.0093


In [None]:
# Sort and Compute Daily Returns in Stock Data of Amazon
stock_df_tsla = stock_df_tsla.sort_values('Date')
stock_df_tsla['Daily_Return'] = stock_df_tsla['Close'].pct_change()

# Merge Stock and News Data on Date of apple
merged_data_tsla = pd.merge(stock_df_tsla, daily_sentiment, on='Date', how='inner')

# Correlation Analysis
correlation = merged_data_tsla[['Daily_Return', 'Average_Sentiment']].corr().iloc[0, 1]

# Print Sample Merged Data and Correlation Result
print("Sample Merged Data TSLA:")
print(merged_data_tsla[['Date', 'Close', 'Daily_Return', 'Average_Sentiment']].head())

print("\nPearson Correlation between Sentiment and Daily Return:")
print(round(correlation, 4))

Sample Merged Data TSLA:
Empty DataFrame
Columns: [Date, Close, Daily_Return, Average_Sentiment]
Index: []

Pearson Correlation between Sentiment and Daily Return:
nan
