In [1]:
import pandas as pd
import nltk
# Fetch the VADER lexicon through NLTK's downloader; the sentiment-scoring
# cell later in this notebook instantiates the VADER analyzer that uses it.
nltk.download("vader_lexicon")

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\MIT\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [2]:
# Load the raw analyst-ratings headlines and show the dtype pandas inferred
# for each column.
RAW_RATINGS_CSV = "../../raw_analyst_ratings.csv"
df = pd.read_csv(RAW_RATINGS_CSV)
df.dtypes

Unnamed: 0     int64
headline      object
url           object
publisher     object
date          object
stock         object
dtype: object

In [3]:
# Peek at one randomly chosen row. No random_state is passed, so a different
# row may be shown on every run.
df.sample()

Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock
1245557,1251464,Stocks That Hit 52-Week Highs On Thursday,https://www.benzinga.com/news/20/01/15101221/s...,Lisa Levin,2020-01-09 00:00:00,TRI


In [4]:
# Count the missing values in every column.
df.isna().sum()

Unnamed: 0    0
headline      0
url           0
publisher     0
date          0
stock         0
dtype: int64

**Descriptive Statistics**

In [5]:
def count_words(column):
    """Return the number of whitespace-separated words in one headline.

    Despite its name, ``column`` is a single cell value (one headline), not a
    whole column: the function is meant to be applied element-wise.

    Args:
        column: The headline text. Any non-string value (for example the
            ``None``/``NaN`` pandas uses for a missing cell) is treated as
            containing no words.

    Returns:
        int: The word count; 0 for empty, whitespace-only or missing input.
    """
    # Missing cells arrive as None or float('nan'), neither of which has
    # .split(); report zero words instead of raising AttributeError.
    if not isinstance(column, str):
        return 0
    return len(column.split())

# Store each headline's word count in a new 'word_count' column.
df['word_count'] = df['headline'].map(count_words)

In [6]:
# Summary statistics of headline length, rounded to two decimals for display.
word_count_stats = df['word_count'].describe()
word_count_stats.round(2)

count    1407328.00
mean          11.42
std            6.35
min            1.00
25%            7.00
50%           10.00
75%           13.00
max           77.00
Name: word_count, dtype: float64

Headlines in this dataset average about 11 words (median 10). Lengths range from a single word at the minimum to 77 words at the maximum.

In [7]:
# Number of non-null headlines per publisher. The result keeps 'publisher' as
# a regular column, and the count is stored under the column name 'headline'.
articles_per_pub = df.groupby('publisher', as_index=False)['headline'].count()

In [8]:
# Rank publishers from most to fewest headlines, then show the first ten.
sorted_articlesPerPub = articles_per_pub.sort_values('headline', ascending=False)
sorted_articlesPerPub.iloc[:10]

Unnamed: 0,publisher,headline
686,Paul Quintaro,228373
532,Lisa Levin,186979
96,Benzinga Newsdesk,150484
169,Charles Gross,96732
622,Monica Gerson,82380
275,Eddie Staley,57254
353,Hal Lindon,49047
270,ETF Professor,28489
478,Juan Lopez,28438
98,Benzinga Staff,28114


These are the top 10 most active publishers based on the number of articles they publish.

In [9]:
# Number of non-null headlines per distinct value of the raw 'date' column
# (still strings at this point). The count is stored under 'headline'.
articles_per_date = df.groupby('date', as_index=False)['headline'].count()

In [10]:
# Show five randomly chosen rows of the per-date counts. No random_state is
# passed, so the rows shown may change between runs.
articles_per_date.sample(5)

Unnamed: 0,date,headline
26816,2020-02-25 18:21:37-04:00,1
6502,2015-05-07 17:32:01-04:00,1
26364,2020-02-12 18:08:11-04:00,1
21710,2019-05-16 09:32:46-04:00,1
28140,2020-03-23 08:39:28-04:00,1


In [11]:
# Parse the date strings as UTC timestamps and use them as the index, working
# on a new frame so articles_per_date itself is left unchanged.
date_indexed_df = (
    articles_per_date
    .assign(date=lambda frame: pd.to_datetime(frame["date"], format='ISO8601', utc=True))
    .set_index("date")
)

# Sum the headline counts into month-end bins, then list the ten busiest months.
monthly_data = date_indexed_df.resample('ME').sum()
sorted_articlesPerDate = monthly_data.sort_values(by='headline', ascending=False)
sorted_articlesPerDate.head(10)

Unnamed: 0_level_0,headline
date,Unnamed: 1_level_1
2020-03-31 00:00:00+00:00,24995
2020-04-30 00:00:00+00:00,20757
2020-02-29 00:00:00+00:00,18878
2020-05-31 00:00:00+00:00,17992
2018-08-31 00:00:00+00:00,16681
2019-10-31 00:00:00+00:00,16560
2019-08-31 00:00:00+00:00,16462
2018-10-31 00:00:00+00:00,16294
2020-01-31 00:00:00+00:00,16145
2018-11-30 00:00:00+00:00,14829


The two months with the most stock-market headlines are March and April 2020. This may be due to the stock market crash that followed the instability caused by the COVID-19 pandemic; the crash spanned the period from February 20 to April 7, 2020.

**Text Analysis**

In [12]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Instantiate the sentiment intensity analyzer with the existing lexicon
vader = SentimentIntensityAnalyzer()

# Score every headline; each result is a dict of polarity scores
# (the neg / neu / pos / compound columns in the output below)
scores = [vader.polarity_scores(headline) for headline in df['headline']]

# The scores are in the same row order as df. Give them df's index so the
# column-wise concat below (which aligns on index labels) pairs every score
# with its own headline even if df does not carry the default 0..n-1 index.
scores_df = pd.DataFrame(scores, index=df.index)

# Join the headline columns and the score columns side by side
scored_news = pd.concat([df, scores_df], axis=1)

# Convert the date column from string to timezone-aware (UTC) datetime
scored_news['date'] = pd.to_datetime(scored_news['date'], format='ISO8601', utc=True)

# Spot-check one random scored row (unseeded, so it varies between runs)
scored_news.sample()

Unnamed: 0.1,Unnamed: 0,headline,url,publisher,date,stock,word_count,neg,neu,pos,compound
128983,129888,10x Return Stocks - True-Growth Stocks (Part 3),https://www.benzinga.com/general/10/10/534237/...,Daniel Ho,2010-10-20 04:00:00+00:00,AZO,8,0.0,1.0,0.0,0.0


In [14]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter

# Make sure the tokenizer model and the stopword list are available locally
nltk.download('punkt')
nltk.download('stopwords')

# Lower-case and tokenize every headline, keeping only the alphanumeric
# tokens that are not English stopwords
stop_words = set(stopwords.words('english'))
keywords = [
    token
    for headline in df['headline']
    for token in word_tokenize(headline.lower())
    if token.isalnum() and token not in stop_words
]

# Tally the surviving tokens and keep the 20 most frequent ones
keyword_freq = Counter(keywords)
common_keywords = keyword_freq.most_common(20)

# Print the ranking on one line, each entry followed by ' | '
print("Common Keywords:")
for keyword, frequency in common_keywords:
    print(f"{keyword}: {frequency} times",end=' | ')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MIT\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MIT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Common Keywords:
stocks: 161702 times | vs: 138835 times | eps: 128801 times | est: 122289 times | shares: 114140 times | reports: 108688 times | update: 91645 times | market: 91080 times | earnings: 87183 times | sales: 79528 times | top: 78493 times | benzinga: 74466 times | pt: 73059 times | announces: 66531 times | price: 64217 times | buy: 63928 times | downgrades: 61942 times | trading: 61146 times | raises: 57793 times | upgrades: 56802 times | 