In [6]:
import pandas as pd
import os

# Historical prices: one DataFrame per ticker, keyed by the ticker symbol
# taken from the file name prefix (text before the first underscore).
data_folder = '../yfinance_data'
stock_data = {}

for file in os.listdir(data_folder):
    # Only the per-ticker price files are wanted here
    if not file.endswith('_historical_data.csv'):
        continue
    ticker = file.split('_')[0]
    file_path = os.path.join(data_folder, file)
    price_frame = pd.read_csv(file_path)
    price_frame['Date'] = pd.to_datetime(price_frame['Date'])
    stock_data[ticker] = price_frame

# Analyst ratings / news headlines live in the same folder
ratings_path = os.path.join(data_folder, 'raw_analyst_ratings.csv')
fns_pid_data = pd.read_csv(ratings_path)

# Parse the 'date' column as datetimes.
# NOTE(review): errors='coerce' turns every value that fails to parse into
# NaT without any warning — confirm how many rows survive the conversion.
parsed_dates = pd.to_datetime(fns_pid_data['date'], errors='coerce')
fns_pid_data['date'] = parsed_dates

In [12]:
# Headline length (in characters), stored as a new column, plus its summary
headline_lengths = fns_pid_data['headline'].str.len()
fns_pid_data['headline_length'] = headline_lengths
length_summary = fns_pid_data['headline_length'].describe()
print(length_summary)

# How many articles each publisher contributed (most prolific first)
publisher_column = fns_pid_data['publisher']
articles_per_publisher = publisher_column.value_counts()
print(articles_per_publisher)

# Articles per calendar day, printed in chronological order
fns_pid_data['date'] = pd.to_datetime(fns_pid_data['date'])
publication_days = fns_pid_data['date'].dt.date
publication_trends = publication_days.value_counts()
chronological_trends = publication_trends.sort_index()
print(chronological_trends)

count    1.407328e+06
mean     7.312051e+01
std      4.073531e+01
min      3.000000e+00
25%      4.700000e+01
50%      6.400000e+01
75%      8.700000e+01
max      5.120000e+02
Name: headline_length, dtype: float64
publisher
Paul Quintaro        228373
Lisa Levin           186979
Benzinga Newsdesk    150484
Charles Gross         96732
Monica Gerson         82380
                      ...  
Matthew Ely               1
Frank Ochoa               1
Jeremie Capron            1
Marvin Dumont             1
Igor Gonta                1
Name: count, Length: 1034, dtype: int64
date
2011-04-27      1
2011-04-28      2
2011-04-29      2
2011-04-30      1
2011-05-01      1
             ... 
2020-06-07     25
2020-06-08    765
2020-06-09    804
2020-06-10    806
2020-06-11    544
Name: count, Length: 2528, dtype: int64


In [14]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words over all headlines, ignoring English stop words.
vectorizer = CountVectorizer(stop_words='english')

# CountVectorizer rejects NaN documents, so missing headlines are replaced
# with empty strings (they contribute no tokens).
X = vectorizer.fit_transform(fns_pid_data['headline'].fillna(''))

# Column sums of the document-term matrix = total occurrences per word
common_words = X.sum(axis=0).A1
words = vectorizer.get_feature_names_out()

# .tolist() converts numpy scalars to plain Python str/int, so the printed
# result reads ('vs', 162099) instead of ('vs', np.int64(162099)).
word_counts = dict(zip(words.tolist(), common_words.tolist()))

# Most frequent first; ties keep vocabulary order because sorted() is stable
sorted_words = sorted(word_counts.items(), key=lambda item: item[1], reverse=True)
print(sorted_words[:10])  # Top 10 common words

[('vs', np.int64(162099)), ('stocks', np.int64(161776)), ('est', np.int64(140604)), ('eps', np.int64(128897)), ('market', np.int64(120558)), ('shares', np.int64(114313)), ('reports', np.int64(108710)), ('update', np.int64(91723)), ('earnings', np.int64(87399)), ('sales', np.int64(79645))]


In [11]:
from textblob import TextBlob
# For every ticker's frame: make sure a 'headline' column exists (blank text
# when the file did not provide one), then store the TextBlob polarity of
# each headline in a 'Sentiment' column.
for ticker, price_frame in stock_data.items():
    if 'headline' not in price_frame.columns:
        price_frame['headline'] = ''  # placeholder so the scoring below can run
    price_frame['Sentiment'] = price_frame['headline'].apply(
        lambda headline: TextBlob(headline).sentiment.polarity
    )

def get_sentiment(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Parameters
    ----------
    text : str
        Text to score (a news headline in this notebook).

    Returns
    -------
    float
        Polarity between -1 (negative) and 1 (positive). Missing values
        (``None``, ``NaN`` or any other non-string) and blank strings are
        scored as neutral ``0.0`` instead of being handed to TextBlob.
    """
    # pandas represents a missing headline as a float NaN; treat anything
    # that is not real text as neutral rather than letting TextBlob fail.
    if not isinstance(text, str) or not text.strip():
        return 0.0
    analysis = TextBlob(text)
    return analysis.sentiment.polarity

# Score every news headline and keep the polarity next to it
headline_polarity = fns_pid_data['headline'].apply(get_sentiment)
fns_pid_data['Sentiment'] = headline_polarity