In [7]:
# Import dependencies
import pandas as pd
from bs4 import BeautifulSoup as soup
from urllib.request import Request, urlopen
from pandasgui import show

In [8]:
# Set up scraper
url = "https://finviz.com/news.ashx"
req = Request(url, headers={"User-Agent": "Mozilla/5.0"})
webpage = urlopen(req).read()
html = soup(webpage, "html.parser")

In [9]:
# Define function to scrape and process news
def scrape_news(html, idx):
  try:
    news = pd.read_html(str(html))[idx]
    news.columns = ["0", "Time", "Headlines"]
    news = news.drop(columns=["0"])
    news = news.set_index("Time")
    return news
  except Exception as e:
    print(f"Error: {e}")
    return None

In [10]:
# Scrape and show general news
news_df = scrape_news(html, 5)
if news_df is not None:
  print("\nGeneral News: ")
  show(news_df)

PandasGUI INFO — pandasgui.gui — Opening PandasGUI



General News: 


In [None]:
# Scrape and show blog news
blog_news_df = scrape_news(html, 6)
if blog_news_df is not None:
  print("\nBlog News: ")
  show(blog_news_df)

# NLP Article Analysis

In [12]:
# Import libraries
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup
import os
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# NLTK VADER for sentiment analysis
import nltk
nltk.downloader.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/khaled/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [18]:
finwiz_url = 'https://finviz.com/quote.ashx?t='
news_tables = {}
tickers = ["AAPL","MSFT","AMZN","META", "NFLX", "NVDA", "TSLA", "AMD","INTC","SNOW","PLTR", 
               "ORCL","IVV","GOOG","CSCO","MRO", "MRO","QCOM","ARM", "DAL", "NKE", "DIS", 
               "MCD", "GM", "HD", "HD", "ADBE", "EQT", "VOO","XLE"]
for ticker in tickers:
    print("treating "+ticker)
    url = finwiz_url + ticker
    req = Request(url=url,headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0'}) 
    response = urlopen(req)    
    # Read the contents of the file into 'html'
    html = BeautifulSoup(response)
    # Find 'news-table' in the Soup and load it into 'news_table'
    news_table = html.find(id='news-table')
    # Add the table to our dictionary
    news_tables[ticker] = news_table

treating AAPL
treating MSFT
treating AMZN
treating META
treating NFLX
treating NVDA
treating TSLA
treating AMD
treating INTC
treating SNOW
treating PLTR
treating ORCL
treating IVV
treating GOOG
treating CSCO
treating MRO
treating MRO
treating QCOM
treating ARM
treating DAL
treating NKE
treating DIS
treating MCD
treating GM
treating HD
treating HD
treating ADBE
treating EQT
treating VOO
treating XLE


In [3]:
# Read one single day of headlines for ‘AMZN’ 
amzn = news_tables['AMZN']
# Get all the table rows tagged in HTML with <tr> into ‘amzn_tr’
amzn_tr = amzn.findAll('tr')
for i, table_row in enumerate(amzn_tr):
    # Read the text of the element ‘a’ into ‘link_text’
    a_text = table_row.a.text
    # Read the text of the element ‘td’ into ‘data_text’
    td_text = table_row.td.text
    # Print the contents of ‘link_text’ and ‘data_text’ 
    print(a_text)
    print(td_text)
    # Exit after printing 4 rows of data
    if i == 3:
        break

3 Stocks That Could Create Lasting Generational Wealth

            Today 05:25AM
        
Better Buy: Amazon vs. Apple

            05:05AM
        
Meta Connect 2023, Two Meta Pivots, The Elephant in the Room

            Oct-02-23 11:39PM
        
SentinelOne (S) Benefits from Robust Portfolio, Partner Base Strength

            09:45PM
        


In [19]:
from datetime import datetime
parsed_news = []
# Iterate through the news
for file_name, news_table in news_tables.items():
    # Iterate through all tr tags in 'news_table'
    for x in news_table.findAll('tr'):
        # read the text from each tr tag into text
        # get text from a only
        text = x.a.get_text() 
        # splite text in the td tag into a list 
        date_scrape = x.td.text.split()
        # if the length of 'date_scrape' is 1, load 'time' as the only element
        if len(date_scrape) == 1:
                    time = date_scrape[0]
                    
                # else load 'date' as the 1st element and 'time' as the second    
        else:
                    date = date_scrape[0]
                    if date=="Today":
                        date=datetime.today()
                    time = date_scrape[1]
                # Extract the ticker from the file name, get the string up to the 1st '_'  
        ticker = file_name.split('_')[0]
                
                # Append ticker, date, time and headline as a list to the 'parsed_news' list
        parsed_news.append([ticker, date, time, text])
                
parsed_news[:5] # print first 5 rows of news

[['AAPL',
  datetime.datetime(2023, 10, 4, 15, 21, 32, 866831),
  '08:41AM',
  'These Stocks Are Moving the Most Today: Apple, Intel, Moderna, Cal-Maine Foods, A10 Networks, Fluor, and More'],
 ['AAPL',
  datetime.datetime(2023, 10, 4, 15, 21, 32, 866831),
  '08:26AM',
  '2 Dow Stocks to Hold for a Decade or More'],
 ['AAPL',
  datetime.datetime(2023, 10, 4, 15, 21, 32, 866831),
  '08:13AM',
  'Apple stock slides after rare KeyBanc downgrade on iPhone sales concern'],
 ['AAPL',
  datetime.datetime(2023, 10, 4, 15, 21, 32, 866831),
  '06:53AM',
  'Apple CEO Tim Cook Gets $41 Million From\xa0Biggest Share Sale Since 2021'],
 ['AAPL',
  datetime.datetime(2023, 10, 4, 15, 21, 32, 866831),
  '06:51AM',
  '1 Easily Overlooked Reason Apple Is the Most Attractive FAANG Stock']]

In [20]:
# Instantiate the sentiment intensity analyzer
vader = SentimentIntensityAnalyzer()
# Set column names
columns = ['ticker', 'date', 'time', 'headline']
# Convert the parsed_news list into a DataFrame called 'parsed_and_scored_news'
parsed_and_scored_news = pd.DataFrame(parsed_news, columns=columns)
# Iterate through the headlines and get the polarity scores using vader
scores = parsed_and_scored_news['headline'].apply(vader.polarity_scores).tolist()
# Convert the 'scores' list of dicts into a DataFrame
scores_df = pd.DataFrame(scores)
# Join the DataFrames of the news and the list of dicts
parsed_and_scored_news = parsed_and_scored_news.join(scores_df, rsuffix='_right')
# Convert the date column from string to datetime
parsed_and_scored_news['date'] = pd.to_datetime(parsed_and_scored_news.date).dt.date
parsed_and_scored_news

Unnamed: 0,ticker,date,time,headline,neg,neu,pos,compound
0,AAPL,2023-10-04,08:41AM,"These Stocks Are Moving the Most Today: Apple,...",0.000,1.000,0.000,0.0000
1,AAPL,2023-10-04,08:26AM,2 Dow Stocks to Hold for a Decade or More,0.000,1.000,0.000,0.0000
2,AAPL,2023-10-04,08:13AM,Apple stock slides after rare KeyBanc downgrad...,0.000,1.000,0.000,0.0000
3,AAPL,2023-10-04,06:53AM,Apple CEO Tim Cook Gets $41 Million From Bigge...,0.000,0.845,0.155,0.2960
4,AAPL,2023-10-04,06:51AM,1 Easily Overlooked Reason Apple Is the Most A...,0.080,0.511,0.408,0.6697
...,...,...,...,...,...,...,...,...
2795,XLE,2022-11-17,04:11PM,"Markets slide ahead of the closing bell, energ...",0.000,0.710,0.290,0.5423
2796,XLE,2022-11-14,09:15AM,Energy ETF (XLE) Hits New 52-Week High,0.000,0.741,0.259,0.2732
2797,XLE,2022-11-07,03:11PM,Markets trend upward ahead of midterm election...,0.000,0.690,0.310,0.5423
2798,XLE,2022-11-03,04:13PM,"Markets under pressure heading into the close,...",0.179,0.650,0.171,-0.0258


In [23]:
for ticker in tickers:
    print(ticker+": "+str(round(parsed_and_scored_news[parsed_and_scored_news["ticker"]==ticker]["compound"].mean(),3)))

AAPL: 0.085
MSFT: 0.138
AMZN: 0.192
META: 0.054
NFLX: 0.05
NVDA: 0.175
TSLA: -0.027
AMD: 0.236
INTC: 0.055
SNOW: 0.161
PLTR: 0.229
ORCL: 0.21
IVV: 0.081
GOOG: 0.116
CSCO: 0.134
MRO: 0.188
MRO: 0.188
QCOM: 0.11
ARM: 0.107
DAL: 0.053
NKE: 0.094
DIS: 0.073
MCD: 0.167
GM: -0.019
HD: 0.158
HD: 0.158
ADBE: 0.12
EQT: 0.162
VOO: 0.133
XLE: 0.074


In [29]:
parsed_and_scored_news.to_csv("/tmp/parsed.csv",index=False)