In [1]:
# Standard library
import os
import sys

# Third-party
import nltk
import pandas as pd
import plotly.express as px
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch the 'vader_lexicon' resource (a no-op message is logged when it is already present).
nltk.downloader.download('vader_lexicon')

# Put the parent of the current working directory at the front of sys.path so the
# project-local `scripts` package resolves when the notebook runs from its own folder.
cur = os.getcwd()
par = os.path.dirname(cur)
sys.path.insert(0, par)
from scripts.classes import FinancialAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\abenet\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Read the raw analyst-ratings CSV into a DataFrame.
# The location can be overridden with the NEWS_CSV_PATH environment variable so the
# notebook runs on other machines; the original local path remains the default.
NEWS_CSV_PATH = os.environ.get(
    'NEWS_CSV_PATH',
    'C:/Users/abenet/Desktop/data/raw_analyst_ratings.csv',
)
news_df = pd.read_csv(NEWS_CSV_PATH)

Obtaining basic statistics for the headlines

In [13]:
# Number of whitespace-separated words in each headline.
# The vectorised .str accessor replaces the per-row lambda; fillna('') makes a missing
# headline count as 0 words instead of raising AttributeError on a NaN float.
news_df['headline length'] = news_df['headline'].fillna('').str.split().str.len()
news_df.head()

Unnamed: 0,headline,url,publisher,date,stock,headline length
0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 14:30:54+00:00,A,7
1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 14:45:20+00:00,A,7
2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 08:30:07+00:00,A,5
3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 16:45:06+00:00,A,7
4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 15:38:59+00:00,A,14


In [4]:
# Average headline size, rounded to the nearest whole number of words
mean_words = round(news_df['headline length'].mean())
print(f"The mean headline length is {mean_words} words")

The mean headline length is 11 words


In [5]:
# Ten publishers with the highest article counts
publisher_counts = news_df['publisher'].value_counts()
publisher_counts.head(10)

publisher
Paul Quintaro        228373
Lisa Levin           186979
Benzinga Newsdesk    150484
Charles Gross         96732
Monica Gerson         82380
Eddie Staley          57254
Hal Lindon            49047
ETF Professor         28489
Juan Lopez            28438
Benzinga Staff        28114
Name: count, dtype: int64

In [6]:
# Parse the publication timestamps as timezone-aware UTC datetimes.
# errors='coerce' turns every value pandas cannot parse into NaT instead of raising,
# so report how many rows were lost that way rather than dropping them silently.
# NOTE(review): if the CSV mixes timestamp layouts (e.g. with and without a UTC offset),
# the non-matching rows may be coerced to NaT; pandas >= 2.0 accepts format='ISO8601'
# to parse both — confirm against the raw file before changing the call.
missing_before = news_df['date'].isna().sum()
news_df['date'] = pd.to_datetime(news_df['date'], errors='coerce', utc=True)
unparsed = news_df['date'].isna().sum() - missing_before
if unparsed:
    print(f"Warning: {unparsed} of {len(news_df)} dates could not be parsed and were set to NaT")

# Count headlines per calendar day. groupby skips NaT keys, so unparsed rows are
# not part of these totals.
dates = (
    news_df.groupby(by=news_df['date'].dt.date)
    .size()
    .rename_axis('date')
    .reset_index(name='headline_count')
)
dates

Unnamed: 0,date,headline_count
0,2011-04-28,3
1,2011-04-29,2
2,2011-04-30,1
3,2011-05-01,1
4,2011-05-02,9
...,...,...
2497,2020-06-07,25
2498,2020-06-08,765
2499,2020-06-09,803
2500,2020-06-10,807


In [7]:
# Daily headline volume over time. Columns are referenced by name (the frame is already
# passed as data_frame), and the figure gets a title and readable axis labels so it
# stands on its own when the notebook is skimmed.
fig = px.line(
    dates,
    x='date',
    y='headline_count',
    title='Headlines published per day',
    labels={'date': 'Date', 'headline_count': 'Number of headlines'},
)
fig

In [14]:
# Build the sentiment analyzer once and reuse it for every headline
sia = SentimentIntensityAnalyzer()


def _compound_score(headline):
    """Return the 'compound' entry of the analyzer's polarity scores for one headline."""
    return sia.polarity_scores(text=headline)['compound']


news_df['sentiment'] = news_df['headline'].apply(_compound_score)
news_df.head()

Unnamed: 0,headline,url,publisher,date,stock,headline length,sentiment
0,Stocks That Hit 52-Week Highs On Friday,https://www.benzinga.com/news/20/06/16190091/s...,Benzinga Insights,2020-06-05 14:30:54+00:00,A,7,0.0
1,Stocks That Hit 52-Week Highs On Wednesday,https://www.benzinga.com/news/20/06/16170189/s...,Benzinga Insights,2020-06-03 14:45:20+00:00,A,7,0.0
2,71 Biggest Movers From Friday,https://www.benzinga.com/news/20/05/16103463/7...,Lisa Levin,2020-05-26 08:30:07+00:00,A,5,0.0
3,46 Stocks Moving In Friday's Mid-Day Session,https://www.benzinga.com/news/20/05/16095921/4...,Lisa Levin,2020-05-22 16:45:06+00:00,A,7,0.0
4,B of A Securities Maintains Neutral on Agilent...,https://www.benzinga.com/news/20/05/16095304/b...,Vick Meyer,2020-05-22 15:38:59+00:00,A,14,0.296


In [15]:
# Bucket the compound sentiment score into five ordered categories.
# The narrow (-0.0001, 0.0001] band around zero is the 'Neutral' bucket.
SENTIMENT_BINS = [-1, -0.5, -0.0001, 0.0001, 0.5, 1]
SENTIMENT_LABELS = [
    'Very Negative',
    'Slightly Negative',
    'Neutral',
    'Slightly Positive',
    'Very Positive',  # was misspelled 'Very Postive'
]

# include_lowest=True closes the first interval on the left, so a score of exactly -1
# lands in 'Very Negative' instead of becoming NaN (pd.cut bins are right-closed by default).
news_df['sentiment_category'] = pd.cut(
    news_df['sentiment'],
    bins=SENTIMENT_BINS,
    labels=SENTIMENT_LABELS,
    include_lowest=True,
)
news_df['sentiment_category'].value_counts()

sentiment_category
Neutral              731893
Slightly Positive    339022
Slightly Negative    182864
Very Postive         109718
Very Negative         43831
Name: count, dtype: int64

Publisher Analysis

In [16]:
# Article counts per publisher, limited to the ten most frequent
# (value_counts returns the counts in descending order)
news_df['publisher'].value_counts().iloc[:10]

publisher
Paul Quintaro        228373
Lisa Levin           186979
Benzinga Newsdesk    150484
Charles Gross         96732
Monica Gerson         82380
Eddie Staley          57254
Hal Lindon            49047
ETF Professor         28489
Juan Lopez            28438
Benzinga Staff        28114
Name: count, dtype: int64

In [17]:
# Publishers who used an email address as their publisher name.
# na=False keeps the mask strictly boolean when a publisher is missing (a NaN in the mask
# makes boolean indexing raise); regex=False treats '@' as a literal character.
has_email = news_df['publisher'].str.contains('@', regex=False, na=False)

# n=1 splits on the first '@' only, so the result always has exactly two columns even
# if a name contains several '@' characters (otherwise the column assignment below fails).
dr = news_df.loc[has_email, 'publisher'].str.split('@', n=1, expand=True)
dr.columns = ['user', 'handle']

# How often each part after the '@' appears
dr['handle'].value_counts()

handle
benzinga.com              7937
gmail.com                  139
andyswan.com                 5
investdiva.com               2
tothetick.com                2
eosdetroit.io                1
forextraininggroup.com       1
stockmetrix.net              1
Name: count, dtype: int64

In [18]:
# Run the project's FinancialAnalyzer for 'AAPL': fetch its ticker data, pass it through
# technical_indicators, then display whatever plot_macd returns.
# NOTE(review): FinancialAnalyzer is defined in scripts.classes and is not visible here;
# the arguments look like (ticker, start date, end date) — confirm against the class.
finance = FinancialAnalyzer('AAPL', '2020-01-01', '2020-12-30')
df = finance.technical_indicators(finance.ticker_data())
row = finance.plot_macd(df)
row
