In [1]:
# Scraping: Selenium drives the browser, BeautifulSoup parses the HTML,
# urllib downloads the individual article pages.
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options
from bs4 import BeautifulSoup
import time
import urllib.request
# Data handling and plotting helpers.
import pandas as pd
from collections import Counter
from plotly import graph_objs as go
import plotly.express as px
import plotly.figure_factory as ff
import numpy as np

# Text cleaning (stop words + lemmatization) and TextBlob sentiment.
# NOTE(review): the nltk corpora used here (stopwords, wordnet, vader_lexicon)
# presumably have to be downloaded beforehand - confirm.
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Shared lemmatizer instance used by clean_text.
lemma = WordNetLemmatizer()
import re
from textblob import TextBlob

from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Shared VADER analyzer; cleaning_sentiment_scoring extends its lexicon in place.
sid =SentimentIntensityAnalyzer()

Function to scroll

In [2]:
def scroll(driver, timeout, ScrollNumber):
    """Scroll the page to the bottom repeatedly so lazily loaded items render.

    Args:
        driver: Object exposing ``execute_script`` (a Selenium WebDriver).
        timeout: Seconds to pause after each scroll so new content can load.
        ScrollNumber: Number of scrolls to perform; values <= 0 do nothing.
    """
    # range(ScrollNumber) performs exactly ScrollNumber scrolls; the previous
    # range(1, ScrollNumber) silently did one fewer than the user asked for.
    for _ in range(ScrollNumber):
        # Jump to the current bottom of the document instead of a fixed pixel
        # offset, so pages that grow beyond 50000px keep loading more items.
        driver.execute_script(
            "window.scrollTo(0, Math.max(document.body.scrollHeight, "
            "document.documentElement.scrollHeight))"
        )
        time.sleep(timeout)

In [3]:


def get_all_links(url, tmot, scrln):
    """Open ``url`` in Chrome, scroll to load the news stream, and collect article hrefs.

    Args:
        url: Page to open (the quote page whose news stream is scraped).
        tmot: Seconds to pause after each scroll (forwarded to ``scroll``).
        scrln: Number of scrolls to perform (forwarded to ``scroll``).

    Returns:
        List of ``href`` values of the matching ``<a>`` elements, in page
        order. Anchors without an ``href`` are skipped.
    """
    print("\n Getting all links to the articles")
    # The anchor is matched on its full class attribute string, so this
    # selector must be updated whenever the page markup changes.
    headline_class = "js-content-viewer wafer-caas Fw(b) Fz(18px) Lh(23px) LineClamp(2,46px) Fz(17px)--sm1024 Lh(19px)--sm1024 LineClamp(2,38px)--sm1024 mega-item-header-link Td(n) C(#0078ff):h C(#000) LineClamp(2,46px) LineClamp(2,38px)--sm1024 not-isInStreamVideoEnabled"

    # Set up the driver (Chrome; chromedriver must be available).
    driver = webdriver.Chrome()
    try:
        # implicitly_wait tells the driver how long to wait for elements
        # before throwing an exception.
        driver.implicitly_wait(30)
        driver.get(url)
        # Scroll so that lazily loaded headlines are present in the DOM.
        scroll(driver, timeout=tmot, ScrollNumber=scrln)
        # Parse the page source while the browser session is still alive.
        soup_a = BeautifulSoup(driver.page_source, 'lxml')
    finally:
        # quit() ends the whole session and the driver process, even when
        # loading or scrolling raised; close() only closed the window and was
        # skipped entirely on errors.
        driver.quit()

    article_urls = soup_a.find_all("a", {"class": headline_class})

    # link.get('href') returns None for anchors without an href; drop those so
    # callers never have to build a URL from None.
    links = []
    for link in article_urls:
        href = link.get('href')
        if href:
            links.append(href)

    return links

In [4]:

def get_article(url, retries=1, retry_delay=3):
    """Download one article page and extract its timestamp text and body text.

    Args:
        url: Absolute URL of the article page.
        retries: Extra download attempts made after the first one fails.
        retry_delay: Seconds to wait before each retry.

    Returns:
        A ``(time_text, article_text)`` tuple. Both items are ``None`` when
        the page cannot be downloaded, or when either the ``caas-body`` div or
        the ``<time>`` tag is missing from the page.
    """
    user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
    headers = {'User-Agent': user_agent}

    html = None
    for attempt in range(retries + 1):
        try:
            request = urllib.request.Request(url, None, headers)
            # The context manager closes the HTTP response once it is read.
            with urllib.request.urlopen(request) as response:
                html = response.read()
            break
        except Exception:
            # Best-effort scraping: a failed download must not abort the whole
            # run. Catching Exception (not a bare except) keeps Ctrl-C working.
            if attempt < retries:
                time.sleep(retry_delay)
    if html is None:
        return None, None

    bsObj1 = BeautifulSoup(html, 'lxml')
    body_tag = bsObj1.find('div', {'class': 'caas-body'})
    # Named time_tag, not `time`: a local called `time` shadows the time
    # module for the whole function and broke the time.sleep() call above.
    time_tag = bsObj1.find('time')
    if body_tag is None or time_tag is None:
        return None, None
    return time_tag.get_text(), body_tag.get_text()
 
    
def get_all_articles(list_urls):
    """Fetch every linked article and return the timestamps and texts.

    Args:
        list_urls: Iterable of article hrefs; relative, root-relative and
            absolute links are all accepted. Empty/``None`` entries are skipped.

    Returns:
        ``(timestamps, articles)``: two lists of equal length holding what
        ``get_article`` returned for each visited link.
    """
    # Local import so this cell does not depend on an extra top-level import.
    from urllib.parse import urljoin

    print("\n Getting all articles from links")
    print(" *This will take some time please be patient*")
    base = 'https://finance.yahoo.com/'
    timestamps = []
    articles = []
    for link in list_urls:
        if not link:
            # Anchors without an href yield None; nothing to download.
            continue
        # urljoin handles "/news/x" (no double slash) and leaves absolute
        # URLs untouched, unlike plain string concatenation.
        t, a = get_article(urljoin(base, link))
        timestamps.append(t)
        articles.append(a)
    return timestamps, articles

In [5]:
def make_df(time, articles):
    """Build the two-column frame used by the rest of the pipeline.

    Args:
        time: Sequence of timestamp values, stored under ``Datetime``.
        articles: Sequence of article texts, stored under ``Article``.

    Returns:
        A DataFrame with the columns ``Datetime`` and ``Article``.
    """
    column_data = dict(Datetime=time, Article=articles)
    return pd.DataFrame(column_data)

_STOPWORDS_CACHE = None


def _normalized_stopwords():
    """Return the English stop words with apostrophes removed, built only once."""
    global _STOPWORDS_CACHE
    if _STOPWORDS_CACHE is None:
        _STOPWORDS_CACHE = frozenset(
            word.replace("'", '') for word in stopwords.words('english')
        )
    return _STOPWORDS_CACHE


def clean_text(text):
    """Normalize article text for sentiment scoring.

    Lower-cases the text, removes apostrophes, replaces every non-letter with
    a space, drops stop words and one-character tokens, and lemmatizes what
    is left.

    Args:
        text: Raw article text. Non-string values (``None``, NaN) are treated
            as missing.

    Returns:
        The cleaned words joined by single spaces; ``''`` for missing input.
    """
    if not isinstance(text, str):
        # Failed downloads arrive as None/NaN; return empty text, don't crash.
        return ''
    text = text.lower().replace("'", '')
    text = re.sub('[^a-zA-Z]', ' ', text)
    # The stop-word set is cached: rebuilding it per article is wasted work.
    totalStopwords = _normalized_stopwords()
    words = [
        lemma.lemmatize(word)
        for word in text.split()
        if word not in totalStopwords and len(word) > 1
    ]
    return " ".join(words)





def cleaning_sentiment_scoring(df):
    """Clean the article text and attach TextBlob and VADER sentiment columns.

    Rows whose ``Article`` is missing (``get_article`` stores ``None`` for
    pages it could not read) are dropped first, because they carry no text to
    score. The work is done on a new frame, so use the returned value.

    Args:
        df: DataFrame with the columns ``Datetime`` and ``Article``.

    Returns:
        A DataFrame with cleaned ``Article`` text, ``Datetime`` parsed by
        ``pd.to_datetime``, and the added columns ``Polarity``,
        ``Subjectivity``, ``compound``, ``neg``, ``neu`` and ``pos``.
    """
    print("\n Cleaning Text")
    # Drop unreadable articles and renumber so the saved index has no gaps.
    df = df.dropna(subset=['Article']).reset_index(drop=True)
    df['Article'] = df['Article'].apply(clean_text)
    df['Datetime'] = pd.to_datetime(df['Datetime'])
    print("\n Analyzing Sentiment")

    # Market vocabulary added to the VADER lexicon of the shared analyzer.
    new_words = {
        'fall': -2.0,
        'edge': 1,
        'rise': 2.0,
        'slip': -2.0,
        'drop': -2.0,
        'gain': 2.0,
        'jump': 2.0,
        'climb': 2.0,
        'rally': 2.0,
        'hit': -1,
        'end': 0.4,
    }
    sid.lexicon.update(new_words)

    # TextBlob metrics.
    desc_blob = [TextBlob(desc) for desc in df['Article']]
    df['Polarity'] = [b.sentiment.polarity for b in desc_blob]
    df['Subjectivity'] = [b.sentiment.subjectivity for b in desc_blob]

    # VADER metrics: score each article once and fan the result out to the
    # four columns instead of re-running the analyzer for every column.
    vader_scores = [sid.polarity_scores(v) for v in df['Article']]
    for key in ('compound', 'neg', 'neu', 'pos'):
        df[key] = [scores[key] for scores in vader_scores]
    return df

In [6]:
def show_sentiment_by_hr(df):
    """Plot the hourly mean of the ``compound`` column as a red line.

    Args:
        df: DataFrame with a ``Datetime`` column and a ``compound`` column.
    """
    hourly_mean = df.set_index('Datetime').resample('H')['compound'].mean()
    # Hours without any article have no mean; leave them out of the line.
    axes = hourly_mean.dropna().plot(color='r', label='Sentiment')
    axes.legend(loc="upper right")
    axes.set_xlabel('Datetime')
    axes.set_ylabel('Sentiment')
    # Both axis captions are drawn in blue.
    for axis in (axes.yaxis, axes.xaxis):
        axis.label.set_color('blue')

    


def show_sentiment_by_day(df):
    """Plot the daily mean of the ``compound`` column as a red line.

    Args:
        df: DataFrame with a ``Datetime`` column and a ``compound`` column.
    """
    daily_mean = df.set_index('Datetime').resample('D')['compound'].mean()
    axes = daily_mean.plot(color='r', label='Sentiment')
    axes.legend(loc="upper right")
    axes.set_xlabel('Datetime')
    axes.set_ylabel('Sentiment')
    # Both axis captions are drawn in blue.
    for axis in (axes.yaxis, axes.xaxis):
        axis.label.set_color('blue')


    
def show_sentiment_by_week(df):
    """Plot the weekly mean of the ``compound`` column as a red line.

    Args:
        df: DataFrame with a ``Datetime`` column and a ``compound`` column.
    """
    weekly_mean = df.set_index('Datetime').resample('W')['compound'].mean()
    axes = weekly_mean.plot(color='r', label='Sentiment')
    axes.legend(loc="upper right")
    axes.set_xlabel('Datetime')
    axes.set_ylabel('Sentiment')
    # Both axis captions are drawn in blue.
    for axis in (axes.yaxis, axes.xaxis):
        axis.label.set_color('blue')

    
    
    
    
    
def save_to_excel(df, name):
    """Write ``df`` to ``yahoonews/<name>NewsSentiment.xlsx``.

    The target directory is created when it does not exist yet, so the first
    run in a fresh working directory no longer fails.

    Args:
        df: Object exposing ``to_excel(path)`` (a pandas DataFrame).
        name: Prefix of the file name, e.g. the ticker symbol.
    """
    # Local import: os is not part of the notebook's import cell.
    import os

    path = 'yahoonews/' + name + 'NewsSentiment.xlsx'
    os.makedirs(os.path.dirname(path), exist_ok=True)
    df.to_excel(path)
    print('File saved as: ', name+'NewsSentiment.xlsx')

Function to search

In [7]:
def search():
    """Interactively scrape, score and save the news for one ticker.

    Prompts for the ticker, the wait time and the scroll count, then runs the
    pipeline: collect links, download articles, build the frame, score
    sentiment, and save the result to Excel.

    Returns:
        The scored DataFrame returned by ``cleaning_sentiment_scoring``.
    """
    quote_base = 'https://finance.yahoo.com/quote/'
    ticker = input('Please enter tag for example: Apple:AAPL :')
    wait_seconds = int(input('Please enter time to retry i.e. 3 for 3 sec. :'))
    scroll_count = int(input('Please enter pages to scroll i.e. 10 for 10 scrolls :'))

    # Collect the article links from the ticker's quote page.
    article_links = get_all_links(quote_base + ticker, wait_seconds, scroll_count)
    timestamps, texts = get_all_articles(article_links)

    scored = cleaning_sentiment_scoring(make_df(timestamps, texts))
    save_to_excel(scored, ticker)
    print('\n **Done**')
    return scored
    

In [9]:
# Run the interactive pipeline; it prompts for the ticker, the wait time and
# the scroll count, and returns the scored DataFrame.
df = search()

Please enter tag for example: Apple:AAPL :aapl
Please enter time to retry i.e. 3 for 3 sec. :1
Please enter pages to scroll i.e. 10 for 10 scrolls :10

 Getting all links to the articles

 Getting all articles from links
 *This will take some time please be patient*

 Cleaning Text

 Analyzing Sentiment
File saved as:  aaplNewsSentiment.xlsx

 **Done**


Sentiment by hour

In [None]:
#Hourly sentiment: mean compound score per hour
show_sentiment_by_hr(df)

Sentiment by day

In [None]:
#Daily sentiment: mean compound score per day
show_sentiment_by_day(df)