## 1. Web Crawling for Stock Related News

In [22]:
#import the library
import requests
from bs4 import BeautifulSoup
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk import sent_tokenize
import pandas as pd
from pandas_datareader import data as web
import datetime
import re
from selenium import webdriver
from selenium.webdriver.common.keys import Keys  
from selenium.webdriver.chrome.options import Options
import time
import numpy as np
import statsmodels.api as sm

In [10]:
class Context:
    '''
    Global context shared by the crawler and stock classes: which stock to
    predict and which index universe the estimator is trained on.
    '''
    # 'nasdaq' is accepted as a legacy spelling because stock.get_stock_return
    # in this notebook compares universe_condition against that value.
    VALID_UNIVERSES = ('DJIA', 'S&P500', 'nasdaq100', 'nasdaq')

    def __init__(self, stock_code = None, universe_condition = 'DJIA'):
        '''
        Users choose the stock they are going to predict as well as 
        the stock universe the prediction is based on.
        
        Params:
            stock_code:           stock code, the stock to predict
                                        'AAPL' - apple; 'GOOG' - google, etc.
            universe_condition:   stock index, the training of estimator is 
                                        based on the Index constituent stocks.
                                        'DJIA' - Dow Jones Industrial Average, 
                                        'S&P500' - Standard and Poor 500 Index,
                                        'nasdaq100' -  NASDAQ-100.
        Raises:
            ValueError: if universe_condition is not a supported index name.
        '''
        # The previous `assert [cond]` tested a one-element list, which is
        # always truthy, so nothing was ever validated. An explicit raise also
        # survives `python -O`, which strips assert statements.
        if universe_condition not in self.VALID_UNIVERSES:
            raise ValueError(
                'universe_condition must be one of %s, got %r'
                % (list(self.VALID_UNIVERSES), universe_condition))
        self.stock_code = stock_code
        self.universe_condition = universe_condition
        

In [None]:
class news(Context):
    '''
    Crawl Yahoo Finance news for a stock and score each article with VADER.

    Typical use: call web_crawling(stock_name) and then vader_comparison().
    '''
    def __init__(self, stock_code, universe_condition):
        Context.__init__(self, stock_code, universe_condition)

    def _scrolling_down_page(self, stock_name, scroll_times=1000, pause=1):
        '''
        Scroll Down Page to Get More News, use chromedriver to open the web.

        Params:
            stock_name:    ticker symbol such as 'AAPL' or 'GOOG'
                           (not a company name like 'apple').
            scroll_times:  number of DOWN key presses sent to the page.
            pause:         seconds to sleep after each key press.
        Returns:
            BeautifulSoup object built from the page source after scrolling.
        '''
        from selenium.webdriver import ActionChains

        chrome_options = Options()
        chrome_options.add_argument('--headless')
        url = "https://finance.yahoo.com/quote/"+ str(stock_name) +"/?p=" + str(stock_name)
        # `options=` replaces the deprecated `chrome_options=` keyword.
        driver = webdriver.Chrome(options=chrome_options)
        try:
            driver.get(url)
            time.sleep(3)

            # Selenium automates the browser: keep pressing DOWN so the page
            # lazily loads more headlines. send_keys presses *and* releases
            # the key; key_down is documented for modifier keys only.
            for i in range(scroll_times):
                ActionChains(driver).send_keys(Keys.DOWN).perform()
                print(f'have finished {i} times')
                time.sleep(pause)

            # Acquire page source code
            html_ = driver.page_source.encode('utf-8')
        finally:
            # Always shut the browser down, otherwise every call leaks a
            # headless Chrome process.
            driver.quit()
        return BeautifulSoup(html_,'lxml')

    def _get_article_link(self, results_page):
        '''
        Get links of article about the stock from Yahoo Finance.

        Params:
            results_page:  BeautifulSoup page as returned by
                           _scrolling_down_page.
        Returns:
            List of absolute article URLs (empty when nothing is found).
        '''
        home_url = 'https://finance.yahoo.com'
        all_links = []
        if results_page is None:
            return all_links
        # the href lives in the <a> inside each headline <h3>
        for tag in results_page.find_all('h3', {'class': "Mb(5px)"}):
            anchor = tag.find('a')
            href = anchor.get('href') if anchor is not None else None
            if not href:
                # A headline without a link is skipped instead of aborting
                # the whole crawl.
                continue
            if href.startswith(('http://', 'https://')):
                all_links.append(href)
            else:
                all_links.append(home_url + href)
        return all_links

    def _get_article_content(self, all_links, timeout=10):
        '''
        Download every article and collect [time, paragraph, paragraph, ...].

        Params:
            all_links:  iterable of article URLs (None is treated as empty).
            timeout:    seconds to wait for each HTTP response.
        Returns:
            List with one list per successfully downloaded article; the first
            element is the text of the page's <time> tag, the rest are the
            paragraph texts. Articles that fail to download, do not answer
            with status 200, or have no <time> tag are skipped.
        '''
        article_content = []
        for link in all_links or []:
            try:
                response = requests.get(link, timeout=timeout)
            except requests.RequestException:
                continue
            if response.status_code != 200:
                continue
            result_page = BeautifulSoup(response.content,'lxml')
            time_tag = result_page.find('time')
            if time_tag is None:
                continue
            contents = [time_tag.get_text()]
            content_tag = result_page.find_all('p',{'class':'canvas-atom canvas-text Mb(1.0em) Mb(0)--sm Mt(0.8em)--sm'})
            for content in content_tag:
                contents.append(content.get_text())
            article_content.append(contents)
        # Returned after the loop: returning inside it stopped the crawl
        # after the very first link.
        return article_content

    def _artical_format(self, artical_content):
        '''
        change artical content format for text mining

        Returns a list of [time, text] pairs; entries that carry a time but
        no paragraphs are dropped.
        '''
        artical_texts = []
        for i in artical_content or []:
            if len(i)>1:
                # Join paragraphs with a space so the last sentence of one
                # paragraph is not glued to the first of the next, which
                # would stop sent_tokenize from splitting them.
                artical_texts.append([i[0], ' '.join(i[1:])])
        return artical_texts

    def web_crawling(self, stock_name):
        '''
        Run the full crawl for one ticker and return its [time, text] pairs.

        The result is also stored on self.artical_texts so that
        vader_comparison() can be called without arguments afterwards.
        '''
        results_page = self._scrolling_down_page(stock_name)
        all_links = self._get_article_link(results_page)
        article_content = self._get_article_content(all_links)
        self.artical_texts = self._artical_format(article_content)
        print('Web crawling for', stock_name, 'done!')
        return self.artical_texts

    def vader_comparison(self, texts=None):
        '''
        Score articles with VADER, averaging the sentence-level scores.

        Params:
            texts:  list of [time, text] pairs; defaults to the result of
                    the last web_crawling() call.
        Returns:
            DataFrame with columns Name, pos, neg, neu, compound, sorted by
            Name; also stored on self.sentiment_result.
        Raises:
            ValueError: if no texts are given and web_crawling() has not
                        been run yet.
        '''
        if texts is None:
            texts = getattr(self, 'artical_texts', None)
        if texts is None:
            raise ValueError('no article texts available; run web_crawling() first')

        headers = ['Name','pos','neg','neu','compound']
        print("Name\t",'  pos\t','neg\t','neu\t','compound')
        analyzer = SentimentIntensityAnalyzer()
        rows = []
        for entry in texts:
            name = entry[0]
            sentences = sent_tokenize(entry[1])
            pos=compound=neu=neg=0
            for sentence in sentences:
                vs = analyzer.polarity_scores(sentence)
                pos+=vs['pos']/(len(sentences))
                neu+=vs['neu']/(len(sentences))
                neg+=vs['neg']/(len(sentences))
                compound+=vs['compound']/(len(sentences))
            rows.append([name,pos,neg,neu,compound])
        # Build the frame once: DataFrame.append was removed in pandas 2.0
        # and copied the whole frame on every call before that.
        self.sentiment_result = pd.DataFrame(rows, columns=headers)
        self.sentiment_result = self.sentiment_result.sort_values(by='Name')
        self.sentiment_result.index = range(len(self.sentiment_result))
        return self.sentiment_result


In [None]:
# NOTE(review): `get_article_content` has no module-level definition in the
# visible notebook (only the method news._get_article_content exists), and
# `all_links` is first assigned in a later cell. Presumably left over from
# an earlier notebook session; confirm before re-running.
get_article_content(all_links)

## 2. Scrolling Down Page to Get More News

In [25]:
!pip install --upgrade pip

Requirement already up-to-date: pip in /usr/local/lib/python3.6/site-packages (18.1)


In [11]:
!pip install selenium

Collecting selenium
[?25l  Downloading https://files.pythonhosted.org/packages/80/d6/4294f0b4bce4de0abf13e17190289f9d0613b0a44e5dd6a7f5ca98459853/selenium-3.141.0-py2.py3-none-any.whl (904kB)
[K    100% |################################| 911kB 10.7MB/s ta 0:00:01    99% |############################### | 901kB 44.1MB/s eta 0:00:01
Installing collected packages: selenium
Successfully installed selenium-3.141.0


In [27]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys  
from selenium.webdriver.chrome.options import Options

import time

In [28]:
# Module-level Chrome configuration; scrolling_down_page() reads this global.
chrome_options = Options()
# Run Chrome without opening a visible browser window.
chrome_options.add_argument('--headless')

In [29]:
def scrolling_down_page(stock_name, scroll_times=1000, pause=1):
    '''
    Open the Yahoo Finance quote page of a stock in Chrome and scroll down
    so that more news is loaded.

    Params:
        stock_name:    ticker symbol such as 'AAPL'.
        scroll_times:  number of DOWN key presses sent to the page.
        pause:         seconds to sleep after each key press.
    Returns:
        BeautifulSoup object built from the page source after scrolling.
    '''
    from selenium.webdriver import ActionChains

    #Using chromedriver to open the web
    url = "https://finance.yahoo.com/quote/"+ str(stock_name) +"/?p=" + str(stock_name)
    # `options=` replaces the deprecated `chrome_options=` keyword; the
    # module-level `chrome_options` object is still the one being used.
    driver = webdriver.Chrome(options=chrome_options)
    try:
        # The URL was built but never loaded before, so the scraped source
        # was that of a blank page.
        driver.get(url)
        time.sleep(3)

        #Selenium automates browsers, scrolling down the page until contents are fully updated
        for i in range(scroll_times):
            # send_keys presses and releases the key; key_down is documented
            # for modifier keys only.
            ActionChains(driver).send_keys(Keys.DOWN).perform()
            print(f'have finished {i} times')
            time.sleep(pause)

        #Acquire page source code
        html_ = driver.page_source.encode('utf-8')
    finally:
        # Always close the browser so repeated calls do not leak Chrome
        # processes.
        driver.quit()

    return BeautifulSoup(html_,'lxml')

## 3. Text Mining

In [None]:
#import the library
import requests
from bs4 import BeautifulSoup

In [None]:
#change artical content format for text mining
def artical_format(artical_content):
    '''
    Turn crawled articles into [time, text] pairs for text mining.

    Params:
        artical_content:  list of lists shaped [time, paragraph, ...];
                          None is treated as an empty list.
    Returns:
        List of [time, text] pairs. Entries holding only a time and no
        paragraphs are dropped.
    '''
    artical_texts = []
    for i in artical_content or []:
        if len(i)>1:
            # Join paragraphs with a space: concatenating them directly glued
            # the last sentence of a paragraph to the first sentence of the
            # next one, so the sentence tokenizer could not split them.
            artical_texts.append([i[0], ' '.join(i[1:])])
    return artical_texts

In [None]:
#return text analysis results
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk import sent_tokenize
import pandas as pd
def vader_comparison(texts):
    '''
    Score articles with VADER, averaging the sentence-level scores.

    Params:
        texts:  list of [name, text] pairs (name is the article time as
                produced by artical_format).
    Returns:
        DataFrame with columns Name, pos, neg, neu, compound, one row per
        article, sorted by Name with a fresh 0..n-1 index. A text without
        any sentence gets 0 for every score.
    '''
    headers = ['Name','pos','neg','neu','compound']
    print("Name\t",'  pos\t','neg\t','neu\t','compound')
    analyzer = SentimentIntensityAnalyzer()
    rows = []
    for entry in texts:
        name = entry[0]
        sentences = sent_tokenize(entry[1])
        pos=compound=neu=neg=0
        for sentence in sentences:
            vs = analyzer.polarity_scores(sentence)
            pos+=vs['pos']/(len(sentences))
            neu+=vs['neu']/(len(sentences))
            neg+=vs['neg']/(len(sentences))
            compound+=vs['compound']/(len(sentences))
        rows.append([name,pos,neg,neu,compound])
        print('%-10s'%name,'%1.2f\t'%pos,'%1.2f\t'%neg,'%1.2f\t'%neu,'%1.2f\t'%compound)
    # Build the frame once from the collected rows: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    result = pd.DataFrame(rows, columns=headers)
    result = result.sort_values(by='Name')
    result.index = range(len(result))
    return result

In [None]:
# Driver cell: crawl the news for one ticker and score it.
# NOTE(review): `get_article_link` and `get_article_content` have no
# module-level definition in the visible notebook; similarly named methods
# exist on the `news` class (`_get_article_link`, `_get_article_content`)
# but take different arguments. Presumably this cell relies on definitions
# from an earlier notebook session; confirm.
stock_name = 'AAPL'
all_links=get_article_link(stock_name)
artical_content=get_article_content(all_links)
artical_texts=artical_format(artical_content)
# `news_result` is read later by news_stock_analysis().
news_result=vader_comparison(artical_texts)

## 4.  Acquiring Stock Data

In [None]:
class stock(Context):
    '''
    Look up the constituents of the chosen index on Wikipedia and fetch
    their most recent daily returns from Yahoo Finance.
    '''
    def __init__(self, stock_code, universe_condition):
        Context.__init__(self, stock_code, universe_condition)

    def _get_table_codes(self, url, column):
        '''
        Read the ticker column of the first 'wikitable sortable' table at url.

        Params:
            url:     page to download.
            column:  0-based index of the <td> holding the ticker.
        Returns:
            List of ticker strings with surrounding whitespace removed.
        Raises:
            RuntimeError: if the page contains no such table.
        '''
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'lxml')
        table = soup.find('table', {'class': 'wikitable sortable'})
        if table is None:
            raise RuntimeError('no constituent table found at ' + url)
        codes = []
        for row in table.find_all('tr')[1:]:
            cells = row.find_all('td')
            # Skip rows that are too short (e.g. header or spacer rows)
            # instead of failing with IndexError.
            if len(cells) > column:
                # Cell text carries the surrounding whitespace of the HTML;
                # strip it so the ticker is usable as a lookup key.
                codes.append(cells[column].text.strip())
        return codes

    def _get_djia_return(self):
        '''
        Store the Dow Jones Industrial Average tickers in self.codes.
        (Despite its name this method collects codes, not returns.)
        '''
        self.codes = self._get_table_codes(
            'https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average', 2)

    def _get_sp500_codes(self):
        '''Store the S&P 500 tickers in self.codes.'''
        self.codes = self._get_table_codes(
            'http://en.wikipedia.org/wiki/List_of_S%26P_500_companies', 0)

    def _get_nasdaq100_codes(self):
        '''
        Store the NASDAQ-100 tickers in self.codes.

        Tickers are taken from the "(TICKER)" suffix of each list item.
        Raises:
            RuntimeError: if the constituent list is not found on the page.
        '''
        nasdaq = requests.get('https://en.wikipedia.org/wiki/NASDAQ-100')
        soup = BeautifulSoup(nasdaq.text, 'lxml')
        table = soup.find('div', class_='div-col columns column-width')
        if table is None:
            raise RuntimeError('no constituent list found on the NASDAQ-100 page')
        pattern = re.compile(r'\(([A-Z]+)\)')
        self.codes = []
        for row in table.find_all('li'):
            match = pattern.search(str(row.text))
            # Items without a "(TICKER)" part are skipped rather than
            # raising IndexError.
            if match:
                self.codes.append(match.group(1))

    def get_stock_return(self):
        '''
        Fetch the latest daily return of every stock in the universe.

        Returns:
            (self.ret_all, axes): the list of last daily percentage changes
            of 'Close', one per code in self.codes, and the result of
            plotting the return series of the last code (None when the
            universe is empty).
        Raises:
            ValueError: if self.universe_condition is not a known index.

        The `%matplotlib inline` magic that used to sit in this method was
        dropped: it is IPython-only syntax; run it in a notebook cell if
        inline plots are wanted.
        '''
        # Imported locally because later notebook cells rebind the global
        # name `datetime` to the class via `from datetime import datetime`.
        import datetime as _dt

        end = _dt.datetime.today()
        start = end - _dt.timedelta(3)

        # 'nasdaq100' is the documented value; 'nasdaq' is kept because it
        # is the spelling this method used to check for.
        loaders = {
            'DJIA': self._get_djia_return,
            'S&P500': self._get_sp500_codes,
            'nasdaq100': self._get_nasdaq100_codes,
            'nasdaq': self._get_nasdaq100_codes,
        }
        loader = loaders.get(self.universe_condition)
        if loader is None:
            raise ValueError('unknown universe_condition: %r' % (self.universe_condition,))
        loader()

        self.ret_all = []
        ma1 = None
        for i in self.codes:
            df = web.DataReader(i, 'yahoo', start, end)
            ma1 = df['Close'].pct_change()
            # .iloc makes the positional "last row" lookup explicit on the
            # date-indexed series.
            self.ret_all.append(ma1.iloc[-1])
        plotted = ma1.plot() if ma1 is not None else None
        return self.ret_all, plotted


In [None]:
class news_stock(news, stock):
    '''
    Combine the news crawler and the stock universe: collect average news
    sentiment for every stock of the universe.
    '''
    def __init__(self, stock_code, universe_condition):
        Context.__init__(self, stock_code, universe_condition)

    def get_sentiment(self, codes=None):
        '''
        Crawl and score the news of each stock, split into an older "train"
        part and a newer "test" part.

        Params:
            codes:  iterable of ticker symbols. Defaults to self.codes (set
                    by the stock code loaders) and, when that is missing,
                    to the single self.stock_code.
        Returns:
            [compound_train, pos_train, neg_train,
             compound_test, pos_test, neg_test, error_code]
            The six score lists hold one mean per successfully processed
            ticker, in processing order; error_code lists the tickers that
            raised an exception and therefore have no scores.
        '''
        # The loop used to iterate the undefined name `stock_code`.
        if codes is None:
            codes = getattr(self, 'codes', None)
        if codes is None:
            codes = [self.stock_code] if self.stock_code is not None else []

        sentiment_com_tr = []
        sentiment_pos_tr = []
        sentiment_neg_tr = []
        sentiment_com_te = []
        sentiment_pos_te = []
        sentiment_neg_te = []
        error_code = []
        for i in codes:
            try:
                # Crawl through the inherited news pipeline; the module-level
                # crawler functions called here before are not defined.
                texts = self.web_crawling(i)
                # Module-level vader_comparison(texts), as before.
                a = vader_comparison(texts)
                a['Name'] = pd.to_datetime(a['Name'])
                # Newest first: the last 70% of rows are the oldest articles
                # (train), the first 30% the newest (test).
                a = a.sort_values(['Name'],ascending = False)
                a_train = a.iloc[-round(len(a)*0.7):]
                a_test = a.iloc[:round(len(a)*0.3)]
                scores = (
                    a_train['compound'].mean(),
                    a_train['pos'].mean(),
                    a_train['neg'].mean(),
                    a_test['compound'].mean(),
                    a_test['pos'].mean(),
                    a_test['neg'].mean(),
                )
            except Exception:
                # Best effort: remember the failing ticker and carry on.
                error_code.append(i)
                continue
            # Append only after every value was computed so the six lists
            # always stay aligned with each other.
            sentiment_com_tr.append(scores[0])
            sentiment_pos_tr.append(scores[1])
            sentiment_neg_tr.append(scores[2])
            sentiment_com_te.append(scores[3])
            sentiment_pos_te.append(scores[4])
            sentiment_neg_te.append(scores[5])
        return [sentiment_com_tr,sentiment_pos_tr,sentiment_neg_tr,sentiment_com_te,sentiment_pos_te,sentiment_neg_te,error_code]
    


In [30]:

def get_stock_return(stock_name):
    '''
    Plot the daily percentage change of a stock's closing price since 2007.

    Params:
        stock_name:  ticker symbol passed to the Yahoo Finance reader.
    Returns:
        The object returned by Series.plot() for the return series.

    The `%matplotlib inline` magic that used to sit in this function was
    dropped: it is IPython-only syntax; run it in a notebook cell if inline
    plots are wanted.
    '''
    # Imported locally because later notebook cells rebind the global name
    # `datetime` to the class via `from datetime import datetime`.
    import datetime as _dt

    start = _dt.datetime(2007, 1, 1)
    end = _dt.datetime.today()
    print(start,end)

    #Get Stock data from Yahoo Finance
    # Pass the variable: the quoted 'stock_name' requested a ticker that is
    # literally called "stock_name".
    df = web.DataReader(stock_name, 'yahoo', start, end)

    #Calculate percent changes
    ma1 = df['Close'].pct_change()

    return ma1.plot()

## 5. Data Visualization

In [None]:
from pandas_datareader import data as web
from datetime import datetime
import math
def stock_analysis(start, end, stock_name):
    '''
    Download Yahoo Finance data for stock_name between start and end and
    add a 'logReturn' column holding log(Close / Open) for every row.

    Returns the downloaded DataFrame with the extra column.
    '''
    print(start,end)
    quotes = web.DataReader(stock_name, 'yahoo', start, end)
    close_over_open = quotes['Close']/quotes['Open']
    log_values = list(map(math.log, close_over_open))
    quotes['logReturn'] = pd.Series(log_values, index=quotes.index)
    quotes.describe() #Get summary statistics
    return quotes

In [None]:
# Driver cell: download price data from 2000-01-01 until today.
# `datetime` here is the class (imported with `from datetime import datetime`
# in the stock_analysis cell), and `stock_name` comes from an earlier cell.
start=datetime(2000, 1, 1)
end=datetime.today()
# `stock_data` is read later by news_stock_analysis().
stock_data=stock_analysis(start, end, stock_name)

In [None]:
from datetime import datetime
def news_stock_analysis(news_result, stock_data):
    '''
    Average the sentiment scores per news date and attach the stock data of
    the same day.

    Params:
        news_result:  DataFrame with columns Name, pos, neg, neu, compound,
                      where Name is a date written like 'January 02, 2019'.
        stock_data:   DataFrame whose index holds the trading dates; the
                      first 10 characters of str(index value) are used as
                      the 'YYYY-MM-DD' key.
    Returns:
        DataFrame with one row per news date (pos, neg, neu, compound, Date)
        followed by the stock columns; stock columns are NaN for news dates
        without stock data (left join).

    Neither input frame is modified.
    '''
    # Imported locally so the function works whether the global name
    # `datetime` is the module or the class (the notebook uses both).
    from datetime import datetime

    headers = ['Name','pos','neg','neu','compound']
    daily_news = news_result.groupby(['Name'])[headers[1:]].mean()
    daily_news['Date'] = [datetime.strptime(label,'%B %d, %Y').strftime('%Y-%m-%d')
                          for label in daily_news.index]
    daily_news = daily_news.reset_index(drop=True)

    # Work on a copy: writing 'Date' and a new index into the caller's frame
    # destroyed its date index, so a second call produced garbage keys.
    prices = stock_data.copy()
    prices['Date'] = [str(stamp)[:10] for stamp in prices.index]
    prices = prices.reset_index(drop=True)

    compound_data = daily_news.merge(prices, on='Date', how='left')
    return compound_data

In [None]:
# Driver cell: merge the sentiment scores with the price data by date.
# `news_result` and `stock_data` are produced by earlier cells.
c=news_stock_analysis(news_result, stock_data)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.dates
def compare_plot(c, data1, data2):
    '''
    Plot two columns of c against its 'Date' column on the current axes.

    Params:
        c:      DataFrame with a 'Date' column of date strings plus the two
                columns to draw.
        data1:  column drawn as a solid olive line.
        data2:  column drawn as a dashed blue line.
    '''
    dates = matplotlib.dates.datestr2num(list(c['Date']))
    # Plot the arrays directly and label them with the requested column
    # names. Before, the values were stored under the fixed names
    # 'news_compound' / 'stock_logReturn' but looked up under data1 / data2,
    # which failed for any other column names.
    plt.plot(dates, np.asarray(c[data1]), marker='', color='olive', linewidth=2,
             label=data1)
    plt.plot(dates, np.asarray(c[data2]), marker='', color='blue', linewidth=2,
             linestyle='dashed', label=data2)
    # The x values are matplotlib date numbers; show them as dates.
    plt.gca().xaxis_date()
    plt.legend()