In [39]:
# Import the libraries (standard library first, then third-party)
import datetime
import re
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import statsmodels.api as sm
from bs4 import BeautifulSoup
from mpl_toolkits.mplot3d import Axes3D
from nltk import sent_tokenize
from pandas_datareader import data as web
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from sklearn import linear_model
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [10]:
class Context:
    '''
    Shared settings: which stock to predict and which index supplies the
    stock universe.
    '''
    def __init__(self, stock_code = None, universe_condition = 'DJIA'):
        '''
        Store the user's choice of target stock and stock universe.

        Params:
            stock_code:          ticker of the stock to predict,
                                 e.g. 'AAPL' (apple) or 'GOOG' (google).
            universe_condition:  stock index; the estimator is trained on
                                 the constituent stocks of this index.
                                 Must be one of
                                     'DJIA'      - Dow Jones Industrial Average,
                                     'S&P500'    - Standard and Poor 500 Index,
                                     'nasdaq100' - NASDAQ-100.

        Raises:
            AssertionError: if universe_condition is not one of the three
                            supported index names.
        '''
        self.stock_code, self.universe_condition = stock_code, universe_condition
        assert universe_condition in ('DJIA', 'S&P500', 'nasdaq100')
        

In [None]:
class news(Context):
    '''
    Crawl Yahoo Finance news for a ticker and score each article with VADER.

    Tickers must be exchange codes such as 'AAPL' or 'GOOG', not company
    names such as 'apple' or 'google'.
    '''
    # Location of the chromedriver binary. Override this attribute (on the
    # class or an instance) or pass driver_path to _scrolling_down_page to
    # run on a different machine.
    chromedriver_path = '/Users/zhaonian/downloads/chromedriver'
    # Seconds to wait for a single article download before skipping it.
    request_timeout = 30

    def __init__(self, stock_code, universe_condition):
        Context.__init__(self, stock_code, universe_condition)

    def _scrolling_down_page(self, stock_name, scroll_steps = 1000, scroll_pause = 1, driver_path = None):
        '''
        Open the Yahoo Finance quote page of stock_name in headless Chrome,
        scroll down so that more news is loaded, and return the parsed page.

        Params:
            stock_name:   ticker, e.g. 'AAPL'.
            scroll_steps: number of DOWN key presses to send (default 1000).
            scroll_pause: seconds to sleep after each key press (default 1).
            driver_path:  chromedriver binary; defaults to
                          self.chromedriver_path.

        Returns:
            BeautifulSoup object built from the final page source.
        '''
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        url = "https://finance.yahoo.com/quote/"+ str(stock_name) +"/?p=" + str(stock_name)
        driver = webdriver.Chrome(executable_path = driver_path or self.chromedriver_path, chrome_options=chrome_options)
        try:
            driver.get(url)
            time.sleep(3)

            # Scroll step by step so the page keeps appending news items.
            for i in range(scroll_steps):
                ActionChains(driver).key_down(Keys.DOWN).perform()
                # Report progress occasionally instead of once per key press.
                if (i + 1) % 100 == 0:
                    print(f'have finished {i + 1} times')
                time.sleep(scroll_pause)

            # Acquire page source code
            html_ = driver.page_source.encode('utf-8')
        finally:
            # Always shut the browser down, otherwise every crawled ticker
            # leaves a Chrome process behind.
            driver.quit()
        return BeautifulSoup(html_,'lxml')

    def _get_article_link(self, results_page):
        '''
        Collect the article URLs found in the <h3 class="Mb(5px)"> headlines
        of a parsed quote page.

        Headlines without an anchor or href are skipped. Relative links are
        prefixed with the Yahoo Finance home URL.

        Returns:
            list of absolute URLs (empty when nothing is found).
        '''
        home_url = 'https://finance.yahoo.com'
        all_links = []
        if results_page is None:
            return all_links
        for tag in results_page.find_all('h3', {'class': "Mb(5px)"}):
            anchor = tag.find('a')
            href = anchor.get('href') if anchor is not None else None
            if not href:
                continue
            # Only a leading scheme marks an absolute URL; a relative path
            # that merely contains 'http' still needs the home prefix.
            if href.startswith('http'):
                all_links.append(href)
            else:
                all_links.append(home_url + href)
        return all_links

    def _get_article_content(self, all_links):
        '''
        Download every article and return [time_text, paragraph, ...] lists.

        Links that cannot be downloaded, do not answer with HTTP 200, or
        carry no <time> tag are skipped so that one bad article does not
        discard the others.

        Returns:
            list of lists; element 0 of each inner list is the text of the
            article's <time> tag, the remaining elements are paragraph texts.
        '''
        paragraph_class = 'canvas-atom canvas-text Mb(1.0em) Mb(0)--sm Mt(0.8em)--sm'
        article_content = []
        for link in all_links or []:
            try:
                response = requests.get(link, timeout=self.request_timeout)
            except requests.RequestException:
                continue
            if response.status_code != 200:
                continue
            result_page = BeautifulSoup(response.content,'lxml')
            time_tag = result_page.find('time')
            if time_tag is None:
                continue
            contents = [time_tag.get_text()]
            for paragraph in result_page.find_all('p', {'class': paragraph_class}):
                contents.append(paragraph.get_text())
            article_content.append(contents)
        return article_content

    def _article_format(self, article_content):
        '''
        Turn [time, paragraph, ...] lists into [time, full_text] pairs.

        Entries without any paragraph are dropped. Paragraphs are joined
        with a space so that the last sentence of one paragraph and the
        first sentence of the next stay separable by the sentence tokenizer.
        '''
        article_texts = []
        for entry in article_content or []:
            if len(entry) > 1:
                article_texts.append([entry[0], ' '.join(entry[1:])])
        return article_texts

    def web_crawling(self, stock_name):
        '''
        Run the full pipeline for one ticker: load the quote page, collect
        article links, download the articles and format them.

        Returns:
            list of [time_text, article_text] pairs.
        '''
        results_page = self._scrolling_down_page(stock_name)
        all_links = self._get_article_link(results_page)
        article_content = self._get_article_content(all_links)
        article_texts = self._article_format(article_content)
        print('Web crawling for', stock_name, 'done!')
        return article_texts

    def vader_comparison(self, article_texts):
        '''
        Score every article with VADER, averaging the sentence-level scores.

        Params:
            article_texts: list of [name, text] pairs as produced by
                           web_crawling; name is the article's time text.

        Returns:
            DataFrame with columns Name, pos, neg, neu, compound, sorted by
            Name and re-indexed from 0. An article with no sentences gets
            all-zero scores.
        '''
        headers = ['Name','pos','neg','neu','compound']
        score_keys = headers[1:]
        analyzer = SentimentIntensityAnalyzer()
        rows = []
        for entry in article_texts:
            name, text = entry[0], entry[1]
            sentences = sent_tokenize(text)
            totals = dict.fromkeys(score_keys, 0.0)
            for sentence in sentences:
                vs = analyzer.polarity_scores(sentence)
                for key in score_keys:
                    totals[key] += vs[key]
            count = len(sentences) or 1
            rows.append([name] + [totals[key] / count for key in score_keys])
        # Build the frame once; appending row by row is quadratic and
        # DataFrame.append no longer exists in pandas 2.
        sentiment_result = pd.DataFrame(rows, columns=headers)
        return sentiment_result.sort_values(by='Name').reset_index(drop=True)




In [None]:
class stock(Context):
    '''
    Resolve the constituent tickers of the chosen index from Wikipedia and
    download their most recent daily return.
    '''
    def __init__(self, stock_code, universe_condition):
        Context.__init__(self, stock_code, universe_condition)

    def _fetch_soup(self, url):
        '''Download url and parse it with lxml; raise on an HTTP error status.'''
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return BeautifulSoup(response.text, 'lxml')

    def _find_or_fail(self, soup, tag, css_class, what):
        '''Return the first matching element or raise a descriptive ValueError.'''
        element = soup.find(tag, {'class': css_class})
        if element is None:
            raise ValueError('Could not locate the ' + what + ' constituents on the Wikipedia page')
        return element

    def _get_djia_codes(self):
        '''Fill self.codes with the tickers in the third column of the DJIA table.'''
        soup = self._fetch_soup('https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average')
        table = self._find_or_fail(soup, 'table', 'wikitable sortable', 'DJIA')
        self.codes = []
        for row in table.find_all('tr')[1:]:
            cells = row.find_all('td')
            if len(cells) > 2:
                # strip() removes the surrounding whitespace/newline without
                # assuming it is exactly one trailing character.
                self.codes.append(cells[2].text.strip())

    # Original (misleading) name kept so existing callers keep working.
    _get_djia_return = _get_djia_codes

    def _get_sp500_codes(self):
        '''Fill self.codes with the tickers in the first column of the S&P 500 table.'''
        soup = self._fetch_soup('https://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
        table = self._find_or_fail(soup, 'table', 'wikitable sortable', 'S&P500')
        self.codes = []
        for row in table.find_all('tr')[1:]:
            cells = row.find_all('td')
            if cells:
                self.codes.append(cells[0].text.strip())

    def _get_nasdaq100_codes(self):
        '''Fill self.codes with the "(TICKER)" entries of the NASDAQ-100 list.'''
        soup = self._fetch_soup('https://en.wikipedia.org/wiki/NASDAQ-100')
        table = self._find_or_fail(soup, 'div', 'div-col columns column-width', 'nasdaq100')
        self.codes = []
        pattern = r'\(([A-Z]+)\)'
        for row in table.find_all('li'):
            matches = re.findall(pattern, str(row.text))
            if matches:
                # The capture group already excludes the parentheses, so the
                # ticker must be used whole (slicing it would drop a letter).
                self.codes.append(matches[0])

    def get_stock_return(self, lookback_days = 4):
        '''
        Load the constituents of self.universe_condition into self.codes and
        download the latest daily close-to-close return of each of them.

        Params:
            lookback_days: length in calendar days of the price window that
                           is downloaded (default 4).

        Returns:
            self.ret_all, a list with one return per entry of self.codes, in
            the same order.
        '''
        end = datetime.datetime.today()
        start = end - datetime.timedelta(lookback_days)
        if self.universe_condition == 'DJIA':
            self._get_djia_codes()
        elif self.universe_condition == 'S&P500':
            self._get_sp500_codes()
        elif self.universe_condition == 'nasdaq100':
            self._get_nasdaq100_codes()
        self.ret_all = []
        for code in self.codes:
            df = web.DataReader(code, 'yahoo', start, end)
            # .iloc selects the last row by position; plain [-1] on a
            # date-indexed Series relies on a deprecated fallback.
            self.ret_all.append(df['Close'].pct_change().iloc[-1])
        return self.ret_all
        #%matplotlib inline
        #return self.ret_all#, ma1.plot()



In [None]:
class news_stock(news, stock):
    '''
    Relate news sentiment to stock returns: regress the returns of the index
    constituents on their news sentiment, predict the return of the target
    stock and plot the relationship.

    Typical order of calls: get_stock_return() -> get_sentiment() ->
    regress() -> prediction() / plot_*().
    '''
    def __init__(self, stock_code, universe_condition):
        Context.__init__(self, stock_code, universe_condition)

    def _sentiment_means(self, code):
        '''
        Crawl and score the news of one ticker and return its mean sentiment.

        Articles are ordered newest first. The oldest ~70% form the training
        part and the remaining newest articles form the test part, so the
        two parts never overlap.

        Returns:
            (train_means, test_means): two pandas Series indexed by
            'compound', 'pos' and 'neg'. A part without articles yields NaN.
        '''
        article_texts = self.web_crawling(code)
        scores = self.vader_comparison(article_texts)
        scores['Name'] = pd.to_datetime(scores['Name'])
        scores = scores.sort_values(['Name'], ascending=False)
        # Average only the numeric score columns: a frame-wide mean would
        # also try to average the datetime 'Name' column.
        numeric = scores[['compound', 'pos', 'neg']].astype(float).fillna(0)
        total = len(numeric)
        n_train = round(total * 0.7)
        train = numeric.iloc[total - n_train:]
        test = numeric.iloc[:total - n_train]
        return train.mean(), test.mean()

    def get_sentiment(self):
        '''
        Compute train/test mean sentiment for every ticker in self.codes.

        Stores self.sentiment as
            [compound_train, pos_train, neg_train,
             compound_test,  pos_test,  neg_test, error_code]
        where the first six entries are lists aligned with self.codes and
        error_code is an (always empty) list kept for compatibility.

        Raises:
            AttributeError: if get_stock_return() has not populated
                            self.codes yet.
        '''
        if not hasattr(self, 'codes'):
            raise AttributeError('self.codes is not set; call get_stock_return() first')
        keys = ['compound', 'pos', 'neg']
        train_lists = {key: [] for key in keys}
        test_lists = {key: [] for key in keys}
        error_code = []
        for code in self.codes:
            train_means, test_means = self._sentiment_means(code)
            for key in keys:
                train_lists[key].append(train_means[key])
                test_lists[key].append(test_means[key])
        self.sentiment = ([train_lists[key] for key in keys]
                          + [test_lists[key] for key in keys]
                          + [error_code])

    def regress(self):
        '''
        Fit OLS of the constituents' returns (self.ret_all) on their training
        sentiment (compound, pos, neg) plus an intercept, and store the
        fitted results in self.OLS_results.
        '''
        X = np.nan_to_num(np.column_stack(self.sentiment[:3]))
        # Force the intercept column: the default silently skips it when a
        # sentiment column happens to be constant, which would leave fewer
        # coefficients than prediction() expects.
        X = sm.add_constant(X, has_constant='add')
        y = self.ret_all
        model = sm.OLS(y, X, missing='drop')
        self.OLS_results = model.fit()

    def prediction(self):
        '''
        Predict the return of the target stock(s) from their test sentiment.

        self.stock_code may be a single ticker or an iterable of tickers.
        For a ticker that belongs to self.codes the test sentiment computed
        by get_sentiment() is used; any other ticker is crawled on demand.

        Returns:
            self.predict_result_all, one predicted return per target ticker.
        '''
        if isinstance(self.stock_code, str):
            stock_codes = [self.stock_code]
        else:
            stock_codes = list(self.stock_code or [])
        self.predict_result_all = []
        for code in stock_codes:
            if code in self.codes:
                # Sentiment lists are aligned with self.codes, so look the
                # ticker up by its position in the universe.
                idx = self.codes.index(code)
                features = [self.sentiment[3][idx], self.sentiment[4][idx], self.sentiment[5][idx]]
            else:
                _, test_means = self._sentiment_means(code)
                features = [test_means['compound'], test_means['pos'], test_means['neg']]
            # Same NaN -> 0 convention as regress().
            features = np.nan_to_num(np.array([1.0] + features, dtype=float))
            predict_return = np.dot(features, self.OLS_results.params)
            print('Predicted return for stock ', code, ' is ', predict_return)
            self.predict_result_all.append(predict_return)
        return self.predict_result_all

    def _resolve_returns(self, stock_return):
        '''Use self.ret_all when the caller passes no returns.'''
        return self.ret_all if stock_return is None else stock_return

    def plot_scatter(self, stock_return = None):
        '''
        Scatter the three training sentiment series against the returns.

        Params:
            stock_return: returns aligned with self.codes; defaults to
                          self.ret_all.
        '''
        x = self.sentiment
        y = self._resolve_returns(stock_return)
        fig, ax = plt.subplots()
        ax.scatter(x[0], y, marker='x', color='r', s=30, label='compound')
        ax.scatter(x[1], y, marker='o', color='r', s=20, label='pos')
        ax.scatter(x[2], y, marker='^', color='r', s=30, label='neg')
        ax.set_xlabel('Sentiment')
        ax.set_ylabel('Stock Return')
        ax.legend()
        plt.show()

    def plot_3D_scatter(self, stock_return = None):
        '''
        3D scatter of sentiment (x) against return (y); the z coordinate is
        1, 2 or 3 for the compound, pos and neg series respectively.

        Params:
            stock_return: returns aligned with self.codes; defaults to
                          self.ret_all.
        '''
        y = self._resolve_returns(stock_return)
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        color_list = ['r', 'b', 'g']
        marker_list = ['o', '^', '+']
        for i in range(3):
            x = self.sentiment[i]
            z = [i + 1] * len(x)
            ax.scatter(x, y, z, c=color_list[i], marker=marker_list[i])
        ax.set_xlabel('Sentiment')
        ax.set_ylabel('Stock Return')
        ax.set_zlabel('Series (1=compound, 2=pos, 3=neg)')
        plt.show()

    def reg_scatter_plot(self, stock_return = None):
        '''
        Draw one panel per training sentiment series with the sample points
        and a fitted univariate regression line of return on sentiment.

        Params:
            stock_return: returns aligned with self.codes; defaults to
                          self.ret_all.
        '''
        y = np.asarray(self._resolve_returns(stock_return), dtype=float)
        f, axes = plt.subplots(3, sharex=False, sharey=True)
        panels = [
            ('compound', 'r', '+', 30),
            ('pos', 'b', 'o', 20),
            ('neg', 'g', '^', 30),
        ]
        for i, (label, color, marker, size) in enumerate(panels):
            ax = axes[i]
            x = np.nan_to_num(np.asarray(self.sentiment[i], dtype=float))
            ax.scatter(x, y, s=size, c=color, marker=marker, label='Sample')
            # Fit on the finite observations only, with a fresh estimator
            # per panel.
            usable = np.isfinite(y)
            if usable.any():
                regr = linear_model.LinearRegression()
                regr.fit(x[usable].reshape(-1, 1), y[usable])
                # The line is the model evaluated on the sorted sentiment
                # values, so x and y of the line correspond point by point.
                x_sorted = np.sort(x)
                ax.plot(x_sorted, regr.predict(x_sorted.reshape(-1, 1)), color='black', linewidth=2)
            ax.set_xlabel('Sentiment (' + label + ')')
            ax.set_ylabel('Stock Return')

        f.subplots_adjust(hspace=0.5)
        plt.show()

In [None]:
a = news_stock('AAPL','DJIA')

In [None]:
stock_return=a.get_stock_return()

In [None]:
a.get_sentiment()a.regress()

In [None]:
a.regress()
a.prediction()
a.plot_scatter(stock_return)
a.plot_3D_scatter(stock_return)
a.reg_scatter_plot(stock_return)