## 1. Web Crawling for Stock Related News

In [1]:
#import the library
import requests
from bs4 import BeautifulSoup

ModuleNotFoundError: No module named 'requests'

In [None]:
# Apple (AAPL) is used as the worked example; the final version is meant to
# take the ticker from user input (e.g. input()) instead of hard-coding it.
url = "https://finance.yahoo.com/quote/AAPL/?p=AAPL"
# Send the request and keep the response. A timeout is set so that an
# unresponsive server raises an exception instead of blocking the kernel forever.
response = requests.get(url, timeout=10)
print(response)
# Check whether the request was successful: HTTP 200 means the page is accessible.
if response.status_code == 200:
    print("Success")
else:
    print("Failure")

In [None]:
# Parse the raw response body with the lxml parser.
results_page = BeautifulSoup(response.content,'lxml')
# Show only the beginning of the document: printing the whole page would flood
# the notebook output with the complete HTML source.
print(str(results_page)[:1000])

In [None]:
# Collect every <h3> element carrying the "Mb(5px)" class; these are the tags
# that contain the links to the articles.
all_h3_tags = results_page.find_all(name='h3', attrs={'class': "Mb(5px)"})

In [None]:
# Inspect one sample <h3> tag to check that the right tags were selected.
# The index is guarded: when fewer than six tags matched, the (possibly empty)
# result list is shown instead of raising IndexError.
all_h3_tags[5] if len(all_h3_tags) > 5 else all_h3_tags

In [None]:
# Scrape the article link (href) out of every headline tag.
# Tags without an <a> child, and anchors without an href, are skipped so the
# list only ever contains usable strings for the next step.
article_link = []
for tag in all_h3_tags:
    anchor = tag.find('a')
    href = anchor.get('href') if anchor is not None else None
    if href:
        article_link.append(href)
print(article_link)

In [None]:
# The hrefs come in two shapes: absolute URLs ("http://...", "https://...") and
# site-relative paths such as "/video/..." or "/news/...".
# Video links are kept on purpose, because those pages still show articles
# about the stock below the video.
# Absolute URLs are stored unchanged; relative paths get the Yahoo Finance home
# URL prepended. The page URL held in `url` is deliberately not reassigned here.
home_url = 'https://finance.yahoo.com'
all_links = []
for link in article_link:
    if link.startswith('http'):
        all_links.append(link)
    else:
        all_links.append(home_url + link)
print(all_links)

## 2. Scrolling Down Page to Get More News

## 3. Text Mining

In [None]:
#import the library
import requests
from bs4 import BeautifulSoup

In [None]:
#change artical content format for text mining
def artical_format(artical_content):
    """Reshape scraped articles into two-element entries for text mining.

    Every item of ``artical_content`` that has more than one element becomes
    ``[first_element, rest]``, where ``rest`` is the remaining elements
    concatenated with no separator. Items with zero or one element are dropped.
    """
    return [
        [article[0], ''.join(article[1:])]
        for article in artical_content
        if len(article) > 1
    ]

In [None]:
#return text analysis results
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk import sent_tokenize
import pandas as pd
def vader_comparison(texts):
    """Score each text with VADER and return the per-text average sentiment.

    Parameters
    ----------
    texts : sequence
        Each item is indexed as ``item[0]`` (the value stored in the ``Name``
        column) and ``item[1]`` (the text that is split into sentences).

    Returns
    -------
    pandas.DataFrame
        Columns ``Name, pos, neg, neu, compound``, one row per input item,
        sorted by ``Name`` and re-indexed 0..n-1. Each score is the mean of the
        sentence-level VADER scores; a text without sentences scores 0.

    One formatted line per text is also printed.
    """
    headers = ['Name','pos','neg','neu','compound']
    print("Name\t",'  pos\t','neg\t','neu\t','compound')
    analyzer = SentimentIntensityAnalyzer()
    # Rows are collected in a plain list and turned into a frame once:
    # DataFrame.append no longer exists in pandas 2.x and growing a frame
    # row by row is quadratic.
    rows = []
    for item in texts:
        name = item[0]
        sentences = sent_tokenize(item[1])
        scores = [analyzer.polarity_scores(sentence) for sentence in sentences]
        count = len(scores)
        pos, neg, neu, compound = (
            (sum(vs[key] for vs in scores) / count) if count else 0
            for key in ('pos', 'neg', 'neu', 'compound')
        )
        rows.append([name, pos, neg, neu, compound])
        print('%-10s'%name,'%1.2f\t'%pos,'%1.2f\t'%neg,'%1.2f\t'%neu,'%1.2f\t'%compound)
    result = pd.DataFrame(rows, columns=headers)
    result = result.sort_values(by='Name')
    result.index = range(len(result))
    return result

In [None]:
# Ticker symbol used for the rest of the pipeline.
stock_name = 'AAPL'
# NOTE(review): get_article_link and get_article_content are not defined in the
# visible part of this notebook — confirm they are defined before this cell runs.
all_links=get_article_link(stock_name)
artical_content=get_article_content(all_links)
# Reshape the scraped content, then compute the VADER sentiment scores.
artical_texts=artical_format(artical_content)
news_result=vader_comparison(artical_texts)

## 4.  Acquiring Stock Data

## 5. Data Visualization

In [None]:
from pandas_datareader import data as web
from datetime import datetime
import math
def stock_analysis(start, end, stock_name):
    """Download price data for one ticker and add an open-to-close log return.

    Parameters
    ----------
    start, end
        Date range, passed through to ``web.DataReader``.
    stock_name
        Ticker symbol requested from the ``'yahoo'`` data source.

    Returns
    -------
    The frame returned by ``web.DataReader`` with an additional ``logReturn``
    column holding ``log(Close / Open)`` for every row.

    Raises
    ------
    ValueError
        From ``math.log`` when a ``Close / Open`` ratio is not positive.
    """
    print(start,end)
    stock_df = web.DataReader(stock_name, 'yahoo', start, end)
    # math.log is applied element-wise so that a non-positive ratio still
    # raises instead of silently producing NaN.
    stock_df['logReturn'] = (stock_df['Close'] / stock_df['Open']).map(math.log)
    # No summary statistics are computed here: a describe() result that is not
    # returned is thrown away. Call .describe() on the returned frame instead.
    return stock_df

In [None]:
# Date range for the price history: 1 January 2000 up to the moment this cell runs.
start=datetime(2000, 1, 1)
end=datetime.today()
# stock_name is the ticker assigned in an earlier cell.
stock_data=stock_analysis(start, end, stock_name)

In [None]:
from datetime import datetime
def news_stock_analysis(news_result, stock_data):
    """Join per-date news sentiment with stock data on a ``Date`` string.

    Parameters
    ----------
    news_result : pandas.DataFrame
        Must contain ``Name, pos, neg, neu, compound``. ``Name`` is parsed with
        the format ``'%B %d, %Y'`` (e.g. ``'January 02, 2020'``).
    stock_data : pandas.DataFrame
        Stock rows; the first 10 characters of ``str(index_value)`` are used
        as the date key.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``Name`` with the mean sentiment scores, a
        ``Date`` column (``YYYY-MM-DD``) and the matching stock columns
        (left join, so dates without stock data keep NaN there).

    Neither input frame is modified, so the function can be called repeatedly
    with the same arguments and gives the same result.
    """
    score_columns = ['pos', 'neg', 'neu', 'compound']
    daily_news = news_result.groupby(['Name'])[score_columns].mean()
    daily_news['Date'] = [str(datetime.strptime(name, '%B %d, %Y'))[:10]
                          for name in daily_news.index]
    daily_news.index = range(len(daily_news))

    # Work on a copy: adding 'Date' and replacing the index on the caller's
    # frame would destroy its date index, and a second call would then derive
    # 'Date' from the integers 0..n-1.
    prices = stock_data.copy()
    prices['Date'] = [str(stamp)[:10] for stamp in prices.index]
    prices.index = range(len(prices))

    compound_data = daily_news.merge(prices, on='Date', how='left')
    return compound_data

In [None]:
# Join the per-date news sentiment with the stock data on the 'Date' column.
c=news_stock_analysis(news_result, stock_data)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.dates
def compare_plot(c, data1, data2):
    """Plot two columns of ``c`` against its ``Date`` column on one axis.

    Parameters
    ----------
    c : pandas.DataFrame
        Must contain a ``Date`` column of date strings plus the columns named
        by ``data1`` and ``data2``.
    data1, data2 : str
        Column names to plot; ``data1`` is drawn as a solid olive line and
        ``data2`` as a dashed blue line. The names are used as legend labels.
    """
    # The plotting frame uses data1/data2 themselves as column names, because
    # plt.plot(..., data=df) looks the series up by exactly those names.
    df=pd.DataFrame({'Date':matplotlib.dates.datestr2num(c['Date']),
                     data1:np.array(c[data1]),
                     data2:np.array(c[data2])})
    plt.plot('Date',data1,data=df,marker='', color='olive', linewidth=2)
    plt.plot('Date',data2,data=df,marker='', color='blue', linewidth=2, linestyle='dashed')
    # 'Date' holds matplotlib date numbers; render the ticks as calendar dates.
    plt.gca().xaxis_date()
    plt.legend()