### load dataset

In [1]:
import pandas as pd

# Path to the CSV file relative to the current working directory
file_path = "../data/raw_analyst_ratings.csv"

# Load the CSV file into a DataFrame
data = pd.read_csv(file_path)

# Inspect the data
print(data.head())

   Unnamed: 0                                           headline  \
0           0            Stocks That Hit 52-Week Highs On Friday   
1           1         Stocks That Hit 52-Week Highs On Wednesday   
2           2                      71 Biggest Movers From Friday   
3           3       46 Stocks Moving In Friday's Mid-Day Session   
4           4  B of A Securities Maintains Neutral on Agilent...   

                                                 url          publisher  \
0  https://www.benzinga.com/news/20/06/16190091/s...  Benzinga Insights   
1  https://www.benzinga.com/news/20/06/16170189/s...  Benzinga Insights   
2  https://www.benzinga.com/news/20/05/16103463/7...         Lisa Levin   
3  https://www.benzinga.com/news/20/05/16095921/4...         Lisa Levin   
4  https://www.benzinga.com/news/20/05/16095304/b...         Vick Meyer   

                        date stock  
0  2020-06-05 10:30:54-04:00     A  
1  2020-06-03 10:45:20-04:00     A  
2  2020-05-26 04:30:07-04:00 

##### Inspect and Explore the Dataset

In [2]:
# Check the first few rows
print(data.head())

# Check the column names and data types
print(data.info())

# Check for missing values
print(data.isnull().sum())

   Unnamed: 0                                           headline  \
0           0            Stocks That Hit 52-Week Highs On Friday   
1           1         Stocks That Hit 52-Week Highs On Wednesday   
2           2                      71 Biggest Movers From Friday   
3           3       46 Stocks Moving In Friday's Mid-Day Session   
4           4  B of A Securities Maintains Neutral on Agilent...   

                                                 url          publisher  \
0  https://www.benzinga.com/news/20/06/16190091/s...  Benzinga Insights   
1  https://www.benzinga.com/news/20/06/16170189/s...  Benzinga Insights   
2  https://www.benzinga.com/news/20/05/16103463/7...         Lisa Levin   
3  https://www.benzinga.com/news/20/05/16095921/4...         Lisa Levin   
4  https://www.benzinga.com/news/20/05/16095304/b...         Vick Meyer   

                        date stock  
0  2020-06-05 10:30:54-04:00     A  
1  2020-06-03 10:45:20-04:00     A  
2  2020-05-26 04:30:07-04:00 

###  Normalize Dates

In [3]:
# Check the column names
print(data.columns)

Index(['Unnamed: 0', 'headline', 'url', 'publisher', 'date', 'stock'], dtype='object')


In [4]:
# Sort the data by date (optional but useful for alignment)
data = data.sort_values('date')

##### Preprocessing

In [5]:
# Convert 'date' column to datetime format, handling any errors by coercing them
data['date'] = pd.to_datetime(data['date'], errors='coerce', utc=True)

# Sort the data by date
data = data.sort_values('date')

# Verify the changes
print(data.head())

         Unnamed: 0                                           headline  \
879310       883755                       How Treasuries and ETFs Work   
519806       522587      Update on the Luxury Sector: 2nd Quarter 2009   
1390006     1396488      Update on the Luxury Sector: 2nd Quarter 2009   
1432           1834                             Going Against the Herd   
67712         68387  Charles Sizemore Radio Interview Saturday Morning   

                                                       url  \
879310   https://www.benzinga.com/28044/how-treasuries-...   
519806   https://www.benzinga.com/charles-lewis-sizemor...   
1390006  https://www.benzinga.com/charles-lewis-sizemor...   
1432     https://www.benzinga.com/charles-lewis-sizemor...   
67712    https://www.benzinga.com/11218/charles-sizemor...   

                          publisher                      date stock  
879310                 Paco Ahlgren 2009-02-14 00:00:00+00:00   NAV  
519806   Charles Lewis Sizemore CFA 2009-0

### Apply Sentiment Analysis:

In [26]:
from textblob import TextBlob

# Define a function to get sentiment polarity
def get_sentiment(text):
    return TextBlob(text).sentiment.polarity

# Apply the sentiment function to each headline
data['sentiment'] = data['headline'].apply(get_sentiment)

# Check the first few rows to confirm the sentiment column has been added
print(data[['headline', 'sentiment']].head())

ModuleNotFoundError: No module named 'textblob'

###  Aggregate Daily Sentiment Scores

In [25]:
import pandas as pd
from textblob import TextBlob

# Sample DataFrame (replace this with your actual data)
data = pd.DataFrame({
    'Unnamed: 0': [1, 2, 3],
    'headline': ['Good news for the market', 'Bad news for stocks', 'Market remains stable'],
    'date': ['2023-01-01', '2023-01-01', '2023-01-02'],
    'stock': ['AAPL', 'AAPL', 'AAPL']
})  # Ensure this is properly formatted

# Function to calculate sentiment polarity
def get_sentiment(text):
    return TextBlob(text).sentiment.polarity

# Apply sentiment analysis to headlines
data['sentiment'] = data['headline'].apply(get_sentiment)

# Group by 'date' and calculate the mean sentiment score
daily_sentiment = data.groupby('date')['sentiment'].mean().reset_index()

# Rename columns for clarity
daily_sentiment.columns = ['date', 'average_sentiment']

# Check the results
print(daily_sentiment.head())

ModuleNotFoundError: No module named 'textblob'

###  Calculate Daily Stock Returns

In [8]:
print(data.columns)

Index(['Unnamed: 0', 'headline', 'url', 'publisher', 'date', 'stock'], dtype='object')


In [9]:
# Ensure the 'stock' column is in numeric format (in case it's stored as strings)
data['stock'] = pd.to_numeric(data['stock'], errors='coerce')

# Check if the 'stock' column exists
if 'stock' in data.columns:
    # Calculate daily stock returns as percentage change
    data['stock_returns'] = data['stock'].pct_change() * 100
else:
    print("Stock column not found.")
    
# Check the first few rows to confirm the stock returns are calculated
print(data[['date', 'stock', 'stock_returns']].head())

                             date  stock  stock_returns
879310  2009-02-14 00:00:00+00:00    NaN            NaN
519806  2009-04-27 00:00:00+00:00    NaN            NaN
1390006 2009-04-27 00:00:00+00:00    NaN            NaN
1432    2009-04-29 00:00:00+00:00    NaN            NaN
67712   2009-05-22 00:00:00+00:00    NaN            NaN


  data['stock_returns'] = data['stock'].pct_change() * 100


### Align Sentiment Data with Stock Data

In [12]:
import pandas as pd

# Sample DataFrames (replace these with your actual data)
# Example DataFrame for stock prices
data = pd.DataFrame({
    'date': ['2023-01-01', '2023-01-02', '2023-01-03'],
    'stock_price': [150, 152, 153]
})

# Example DataFrame for sentiment analysis results
daily_sentiment = pd.DataFrame({
    'date': ['2023-01-01', '2023-01-02', '2023-01-03'],
    'sentiment_score': [0.1, 0.2, -0.1]
})

# Ensure the 'date' columns are in datetime format for proper merging
data['date'] = pd.to_datetime(data['date'])
daily_sentiment['date'] = pd.to_datetime(daily_sentiment['date'])

# Merge sentiment and stock data on 'date'
merged_data = pd.merge(data, daily_sentiment, on='date')

# Inspect the merged data
print(merged_data.head())

        date  stock_price  sentiment_score
0 2023-01-01          150              0.1
1 2023-01-02          152              0.2
2 2023-01-03          153             -0.1


### Perform Correlation Analysis

In [11]:
import pandas as pd

try:
    # Load the data (adjust file path as necessary)
    data = pd.read_csv('../data/raw_analyst_ratings.csv')  # Replace with your actual file path
    print("Data loaded successfully!")

except Exception as e:
    print(f"Error loading data: {e}")
    exit()

# Print the column names to inspect them
print("Column names in the dataset:", data.columns)

# Check if the necessary columns exist
if 'stock' in data.columns and 'sentiment' in data.columns and 'date' in data.columns:
    # Step 1: Calculate daily returns
    data['Daily_Return'] = data['stock'].pct_change() * 100

    # Step 2: Ensure 'date' is in datetime format
    data['date'] = pd.to_datetime(data['date'], errors='coerce')

    # Step 3: Group by 'date' and calculate average sentiment for each day
    daily_sentiment = data.groupby('date')['sentiment'].mean().reset_index()

    # Rename columns for clarity
    daily_sentiment.columns = ['date', 'Average_Sentiment']

    # Step 4: Merge sentiment and stock returns data on 'date'
    merged_data = pd.merge(data, daily_sentiment, on='date')

    # Step 5: Calculate Pearson correlation
    correlation = merged_data['Daily_Return'].corr(merged_data['Average_Sentiment'])

    # Step 6: Print the correlation result
    print(f"The correlation between sentiment and stock returns is: {correlation}")
else:
    print("Missing necessary columns: 'stock', 'sentiment', or 'date'. Please check your data.")

Data loaded successfully!
Column names in the dataset: Index(['Unnamed: 0', 'headline', 'url', 'publisher', 'date', 'stock'], dtype='object')
Missing necessary columns: 'stock', 'sentiment', or 'date'. Please check your data.
