## 1. Web Crawling for Stock Related News

In [None]:
#import the library
import requests
from bs4 import BeautifulSoup

In [None]:
# then we combine the crawling steps into a single function
# stock_name must be a ticker symbol such as 'AAPL' or 'GOOG'; company names like 'apple' or 'google' do not work
def get_article_link(stock_name):
    """Collect article links from the Yahoo Finance quote page of a ticker.

    Parameters
    ----------
    stock_name : str
        Ticker symbol such as 'AAPL' or 'GOOG' (not a company name).

    Returns
    -------
    list of str or None
        Absolute URLs taken from the ``<a>`` tags inside the page's
        ``<h3 class="Mb(5px)">`` headlines. ``None`` is returned when the
        page does not answer with status 200 or cannot be parsed.
    """
    import requests
    from bs4 import BeautifulSoup
    from urllib.parse import urljoin

    home_url = 'https://finance.yahoo.com'
    url = home_url + "/quote/" + str(stock_name) + "/?p=" + str(stock_name)
    # send the request and get the response
    response = requests.get(url)
    if response.status_code != 200:
        return None
    try:
        results_page = BeautifulSoup(response.content, 'lxml')
        # the article hrefs live in <a> tags nested inside these <h3> headlines
        all_h3_tags = results_page.find_all('h3', {'class': "Mb(5px)"})
        all_links = []
        for tag in all_h3_tags:
            anchor = tag.find('a')
            href = anchor.get('href') if anchor is not None else None
            if not href:
                # one headline without a usable link must not discard the rest
                continue
            # urljoin keeps absolute links as they are and resolves relative
            # ones against the Yahoo Finance home URL
            all_links.append(urljoin(home_url, href))
        return all_links
    except Exception:
        # best-effort contract of this function: parsing problems yield None
        return None

In [None]:
# run the crawler for the AAPL ticker; as the last expression of the cell,
# the returned list of links is displayed by the notebook
stock_name = 'AAPL'
get_article_link(stock_name)

In [None]:
# we have already found the article links for a given stock,
# so next we fetch the article content and the article publication date through such a link.
# we first work through a single link and then combine the steps into a function
get_article_link(stock_name)[1]

In [None]:
# `url` inside get_article_link is local to that function, so the link to
# inspect has to be bound here: take the second article found for the ticker
url = get_article_link(stock_name)[1]
response = requests.get(url)
if response.status_code != 200:
    print('there is something wrong with link')

In [None]:
# parse the downloaded article HTML with the lxml parser and display the parsed page
result_page = BeautifulSoup(response.content,'lxml')
result_page

In [None]:
# the publication time is stored in the <time> tag of result_page
time_tag = result_page.find('time')
print(time_tag)
# bound to `article_time` rather than `time` so the `time` module that the
# scrolling code calls (time.sleep) is not shadowed when this cell is re-run
article_time = time_tag.get_text()
article_time

In [None]:
# with the datetime extracted, collect the text of the article paragraphs,
# which is the input of the sentiment analysis
content_tag = result_page.find_all(
    'p', {'class': 'canvas-atom canvas-text Mb(1.0em) Mb(0)--sm Mt(0.8em)--sm'})
articles = [paragraph.get_text() for paragraph in content_tag]
print(articles)

In [None]:
# we combine the time and the article paragraphs of each link into one list, and collect these lists in an outer list
def get_article_content(all_links):
    """Download every linked article and extract its time and paragraphs.

    Parameters
    ----------
    all_links : iterable of str or None
        Article URLs, e.g. the result of ``get_article_link``. ``None`` or
        an empty iterable yields an empty result.

    Returns
    -------
    list of list of str
        One entry per successfully parsed article, shaped
        ``[time_text, paragraph_1, paragraph_2, ...]``. Links that do not
        answer with status 200 or whose page has no ``<time>`` tag are
        skipped.
    """
    article_content = []
    if not all_links:
        # nothing to download (get_article_link returns None on failure)
        return article_content

    # imports stay local so this cell keeps working on its own
    import requests
    from bs4 import BeautifulSoup

    for link in all_links:
        response = requests.get(link)
        if response.status_code != 200:
            # one unreachable article should not discard the others
            continue
        result_page = BeautifulSoup(response.content, 'lxml')
        time_tag = result_page.find('time')
        if time_tag is None:
            # without a time there is nothing to key the article on
            continue
        contents = [time_tag.get_text()]
        content_tag = result_page.find_all(
            'p', {'class': 'canvas-atom canvas-text Mb(1.0em) Mb(0)--sm Mt(0.8em)--sm'})
        contents.extend(paragraph.get_text() for paragraph in content_tag)
        article_content.append(contents)
    # the result was previously built but never handed back to the caller
    return article_content

## 2. Scrolling Down Page to Get More News

In [25]:
!pip install --upgrade pip

Requirement already up-to-date: pip in /usr/local/lib/python3.6/site-packages (18.1)


In [26]:
!pip install selenium



In [27]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys  
from selenium.webdriver.chrome.options import Options

import time

In [28]:
# Chrome options handed to the webdriver created in scrolling_down_page;
# the '--headless' argument is passed through to Chrome
chrome_options = Options()
chrome_options.add_argument('--headless')

In [29]:
def scrolling_down_page(stock_name, scroll_times=1000, pause=1):
    """Open a ticker's Yahoo Finance page in Chrome, scroll down and parse it.

    Parameters
    ----------
    stock_name : str
        Ticker symbol used to build the quote-page URL.
    scroll_times : int, optional
        Number of DOWN key presses sent to the page (default 1000).
    pause : float, optional
        Seconds to wait after each key press (default 1).

    Returns
    -------
    BeautifulSoup
        The page source after scrolling, parsed with the lxml parser.
    """
    from selenium.webdriver import ActionChains

    # Using chromedriver to open the web
    url = "https://finance.yahoo.com/quote/"+ str(stock_name) +"/?p=" + str(stock_name)
    # `options=` replaces the deprecated `chrome_options=` keyword
    driver = webdriver.Chrome(options=chrome_options)
    try:
        # the page has to be loaded explicitly; creating the driver alone
        # leaves the browser on a blank page
        driver.get(url)
        time.sleep(3)

        # Selenium automates the browser, scrolling down so more content loads
        for i in range(scroll_times):
            # send_keys presses and releases the key; key_down is meant for
            # modifier keys only
            ActionChains(driver).send_keys(Keys.DOWN).perform()
            print(f'have finished {i} times')
            time.sleep(pause)

        # Acquire page source code
        html_ = driver.page_source.encode('utf-8')
    finally:
        # always shut the browser down, even when scrolling fails
        driver.quit()

    results_page = BeautifulSoup(html_,'lxml')
    return results_page

## 3. Text Mining

In [None]:
#import the library
import requests
from bs4 import BeautifulSoup

In [None]:
#change article content format for text mining
def artical_format(artical_content):
    """Reshape crawled articles into ``[time, full_text]`` pairs.

    Parameters
    ----------
    artical_content : list of list of str or None
        Entries shaped ``[time, paragraph_1, paragraph_2, ...]``.

    Returns
    -------
    list of list of str
        One ``[time, text]`` pair per entry that has at least one paragraph;
        entries holding only the time are dropped. ``None`` or empty input
        gives an empty list.
    """
    if not artical_content:
        return []
    # paragraphs are joined with a space so that the last sentence of one
    # paragraph and the first of the next stay separable for the tokenizer
    return [
        [entry[0], ' '.join(entry[1:])]
        for entry in artical_content
        if len(entry) > 1
    ]

In [None]:
#return text analysis results
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk import sent_tokenize
import pandas as pd
def vader_comparison(texts):
    """Score each text with VADER, averaged over its sentences.

    Parameters
    ----------
    texts : sequence of sequences
        Each entry holds a name at index 0 and the text at index 1.

    Returns
    -------
    pandas.DataFrame
        Columns ``Name, pos, neg, neu, compound`` with one row per entry,
        sorted by ``Name`` and re-indexed from 0. Each score is the mean of
        the per-sentence VADER scores; an entry without sentences scores 0.
        The same values are also printed as a table.
    """
    headers = ['Name','pos','neg','neu','compound']
    print("Name\t",'  pos\t','neg\t','neu\t','compound')
    analyzer = SentimentIntensityAnalyzer()
    rows = []
    for entry in texts:
        name = entry[0]
        sentences = sent_tokenize(entry[1])
        count = len(sentences)
        pos=compound=neu=neg=0
        for sentence in sentences:
            vs = analyzer.polarity_scores(sentence)
            pos+=vs['pos']/count
            neu+=vs['neu']/count
            neg+=vs['neg']/count
            compound+=vs['compound']/count
        rows.append([name,pos,neg,neu,compound])
        print('%-10s'%name,'%1.2f\t'%pos,'%1.2f\t'%neg,'%1.2f\t'%neu,'%1.2f\t'%compound)
    # build the frame once: DataFrame.append copied the frame on every row
    # and no longer exists in pandas 2.x
    result = pd.DataFrame(rows, columns=headers)
    return result.sort_values(by='Name').reset_index(drop=True)

In [None]:
# end-to-end run for the AAPL ticker:
# crawl the links -> download the articles -> reformat -> score the sentiment
stock_name = 'AAPL'
all_links=get_article_link(stock_name)
artical_content=get_article_content(all_links)
artical_texts=artical_format(artical_content)
news_result=vader_comparison(artical_texts)

## 4.  Acquiring Stock Data

In [22]:
!pip install pandas



In [23]:
!pip install pandas_datareader



In [30]:
from pandas_datareader import data as web
import datetime

def get_stock_return(stock_name):
    start=datetime.datetime(2007, 1, 1)
    end=datetime.datetime.today()
    print(start,end)
    
    #Get Stock data from Yahoo Finance
    df = web.DataReader('stock_name', 'yahoo', start, end)
    df.describe() #Get summary statistics
    
    #Calculate percent changes
    ma1 = df['Close'].pct_change() 
    
    %matplotlib inline
    return ma1.plot()

## 5. Data Visualization

In [None]:
from pandas_datareader import data as web
from datetime import datetime
import math
def stock_analysis(start, end, stock_name):
    """Load Yahoo price data for a ticker and add a ``logReturn`` column.

    ``start`` and ``end`` bound the requested period and are printed first.
    ``logReturn`` is the natural log of Close divided by Open for each row.
    The frame including the new column is returned.
    """
    print(start,end)
    stock_df = web.DataReader(stock_name, 'yahoo', start, end)
    close_over_open = stock_df['Close'] / stock_df['Open']
    log_values = list(map(math.log, close_over_open))
    stock_df['logReturn'] = pd.Series(log_values, index=stock_df.index)
    # summary statistics are computed here but the result is not used
    stock_df.describe()
    return stock_df

In [None]:
# load the price history from 2000-01-01 until today for the ticker held in
# the global `stock_name`
start=datetime(2000, 1, 1)
end=datetime.today()
stock_data=stock_analysis(start, end, stock_name)

In [None]:
from datetime import datetime
def news_stock_analysis(news_result, stock_data):
    """Join per-day news sentiment with the stock data of the same date.

    Parameters
    ----------
    news_result : pandas.DataFrame
        Columns ``Name, pos, neg, neu, compound`` where ``Name`` is a date
        written like ``'January 02, 2019'`` (format ``'%B %d, %Y'``).
    stock_data : pandas.DataFrame
        Stock data whose index values start with a ``YYYY-MM-DD`` date when
        converted to ``str``. The frame is left unmodified.

    Returns
    -------
    pandas.DataFrame
        One row per news date holding the mean sentiment scores, a ``Date``
        column (``YYYY-MM-DD``) and the stock columns of that date; dates
        without stock data keep NaN in the stock columns (left join).
    """
    score_columns = ['pos', 'neg', 'neu', 'compound']
    daily_news = news_result.groupby(['Name'])[score_columns].mean()
    daily_news['Date'] = [
        str(datetime.strptime(name, '%B %d, %Y'))[:10] for name in daily_news.index
    ]
    daily_news = daily_news.reset_index(drop=True)

    # work on a copy: writing the Date column and a new index into the
    # caller's frame made a second call on the same frame merge on wrong dates
    stock = stock_data.copy()
    stock['Date'] = [str(stamp)[:10] for stamp in stock.index]
    stock = stock.reset_index(drop=True)

    return daily_news.merge(stock, on='Date', how='left')

In [None]:
# merge the sentiment scores with the stock data on their shared date
c=news_stock_analysis(news_result, stock_data)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.dates
def compare_plot(c, data1, data2):
    """Plot two columns of ``c`` against its ``Date`` column on one figure.

    Parameters
    ----------
    c : pandas.DataFrame
        Frame with a ``Date`` column of date strings plus the two columns
        to compare.
    data1 : str
        Column drawn as a solid olive line.
    data2 : str
        Column drawn as a dashed blue line.

    The legend labels the lines with the column names. Nothing is returned.
    """
    dates = matplotlib.dates.datestr2num(c['Date'])
    # plot the arrays directly: looking up data1/data2 in an intermediate
    # frame whose columns carried other names did not find the values
    plt.plot(dates, np.array(c[data1]), label=data1,
             marker='', color='olive', linewidth=2)
    plt.plot(dates, np.array(c[data2]), label=data2,
             marker='', color='blue', linewidth=2, linestyle='dashed')
    plt.legend()