# Part 1: Web Scraping of Financial Statements (in progress)
## Structure
For web scraping, there are two kinds of open data sources we can use: twits or 10-K/10-Q filings. The original idea here is to download 10-K filings by CIK and use TF-IDF/BERT to extract sentiment/features from them, then use yearly/quarterly stock returns as labels for supervised learning.

### Load Packages

In [1]:
import nltk
import numpy as np
import pandas as pd
import pickle
import pprint
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from tqdm import tqdm

## CIK Lookup Table

In [2]:
# Ticker symbol -> CIK string. Every value is a 10-character, zero-padded
# string of digits (presumably the SEC Central Index Key of the company).
# NOTE(review): the original trailing note marks this table as unused; the only
# visible reference to it is in commented-out code further down — confirm before
# relying on it or removing it.
cik_lookup = {
    'AMZN': '0001018724',
    'HD': '0000354950',
    'NKE': '0000320187',
    'MCD': '0000063908',
    'SBUX': '0000887557',
    'LOW': '0000060667',
    'BKNG': '0001075531',
    'TJX': '0000109198',
    'TGT': '0000027419',
    'GM': '0000040730',
    'LVS': '0000850994',
    'MAR': '0001048286',
    'ROST': '0000745732',
    'DG': '0000029534',
    'VFC': '0001258370',
    'F': '0000037996',
    'ORLY': '0000898173',
    'CCL': '0000815097',
    'HLT': '0001593790',
    'YUM': '0001041061',
    'EBAY': '0001065088',
    'AZO': '0000866787',
    'RCL': '0000884887',
    'APTV': '0001521332',
    'CMG': '0001058090'}  # table is currently not used


### Get 10-ks
AMZN is used as the example.

With the helper functions below, let's pull the filed 10-Ks from the SEC for each company.

In [15]:
from bs4 import BeautifulSoup
from bs4.element import Comment
import requests
import urllib.request
import re

def _get_most_recent_index_page(ticker, filing):
    """
    Return the URL of the filing-index page for the first matching filing.

    Queries the SEC EDGAR company browser for `ticker`, restricted to the
    `filing` form type, and scans the table whose ``summary`` attribute is
    "Results" for the first row whose first cell equals `filing`. The link in
    the second cell of that row is returned as an absolute URL.

    NOTE(review): "most recent" relies on EDGAR listing the newest filing
    first — the code itself simply takes the first matching row.

    Args:
        ticker: Value sent as the ``CIK`` query parameter.
        filing: Form type to look for, e.g. '10-K'.

    Returns:
        The index-page URL as a str, or None when no matching row with a
        link is found (a message is printed in that case).

    Raises:
        Exception: If the HTTP request itself fails; the original
            ``requests`` error is attached as ``__cause__``.
    """
    try:
        # Let requests build and URL-encode the query string instead of
        # concatenating it by hand.
        page = requests.get(
            "https://www.sec.gov/cgi-bin/browse-edgar",
            params={
                "action": "getcompany",
                "CIK": ticker,
                "type": filing,
                "owner": "exclude",
                "count": "40",
                "search_text": "",
            },
        )
    except requests.exceptions.RequestException as e:
        print("Error when getting", filing, "file index page for", ticker)
        raise Exception(e) from e

    soup = BeautifulSoup(page.content, 'html.parser')
    table = soup.find('table', attrs={'summary': 'Results'})
    if table is not None:
        for tr in table.find_all('tr'):
            tds = tr.find_all('td')
            # The first cell holds the form type; header rows have no <td> at all.
            if len(tds) < 2 or tds[0].text.strip() != filing:
                continue
            link = tds[1].find('a')
            if link is None or not link.get('href'):
                continue
            # The href is site-relative and starts with '/'; strip it so the
            # result has a single slash after the host (the old '/'.join
            # produced "http://sec.gov//...").
            return 'http://sec.gov/' + link['href'].lstrip('/')
    print("Cannot find", filing, "file index page for", ticker)
    return None

def _get_file_url_by_index_url(index_url, filing):
    """
    Return the URL of the primary `filing` document listed on an index page.

    Downloads `index_url`, finds the table whose ``summary`` attribute is
    "Document Format Files" and returns the link in the third cell of the
    first row whose second cell contains `filing`.

    Links that go through the inline-XBRL viewer ("/ix?doc=/Archives/...")
    are reduced to the underlying document path; plain "/Archives/..." links
    are kept intact. (The previous implementation dropped the first path
    segment unconditionally, which was only correct for viewer links.)

    Args:
        index_url: URL of a filing index page.
        filing: Form type to look for, e.g. '10-K'.

    Returns:
        The document URL as a str, or None when no matching row with a link
        is found (a message is printed in that case).

    Raises:
        Exception: If the HTTP request itself fails; the original
            ``requests`` error is attached as ``__cause__``.
    """
    try:
        page = requests.get(index_url)
    except requests.exceptions.RequestException as e:
        print("Error when getting index page", index_url)
        raise Exception(e) from e

    soup = BeautifulSoup(page.content, 'html.parser')
    table = soup.find('table', attrs={'summary': 'Document Format Files'})
    if table is not None:
        for tr in table.find_all('tr'):
            tds = tr.find_all('td')
            # Three cells are needed: tds[1] is matched against `filing`
            # and tds[2] carries the document link.
            if len(tds) < 3 or filing not in tds[1].text:
                continue
            link = tds[2].find('a')
            if link is None or not link.get('href'):
                continue
            href = link['href']
            # Keep only what follows the viewer prefix when there is one.
            _, viewer_marker, wrapped_path = href.partition('ix?doc=')
            path = wrapped_path if viewer_marker else href
            return 'http://sec.gov/' + path.lstrip('/')
    print("Cannot find", filing, "document link on index page", index_url)
    return None

def get_most_recent_file(ticker, filing):
    """
    Download the `filing` document for `ticker` and return the HTTP response.

    The lookup runs in two steps: resolve the filing index page, then resolve
    the document link on that page. An Exception is raised as soon as either
    step comes back empty; otherwise the result of ``requests.get`` on the
    document URL is returned.
    """
    index_page_url = _get_most_recent_index_page(ticker, filing)
    if index_page_url is None:
        raise Exception("Cannot find index page")

    document_url = _get_file_url_by_index_url(index_page_url, filing)
    if document_url is None:
        raise Exception("Cannot find file")

    response = requests.get(document_url)
    return response

Let's pull the files using the `get_most_recent_file` function and display one of the results. To show some of the data, we'll use the first ticker as an example.

In [46]:
# Load the ticker universe: the "ticker" column of the mapping spreadsheet, lower-cased.
# NOTE(review): the original note here said duplicate headlines had been removed
# from a CSV ("24/40W left, most are still missing"); no such step is visible in
# this cell — confirm whether that note still applies.
tickers = [w.lower() for w in pd.read_excel('./input/full_ticker_mapping.xlsx',engine='openpyxl')["ticker"]]
example_ticker = tickers[0]  # first row of the sheet; the original note says amzn — TODO confirm

# Disabled variant: fetch the latest 10-K for every entry of cik_lookup.
# for ticker, cik in cik_lookup.items():
#     page[ticker] = get_most_recent_file(ticker, '10-K')
# Active variant: fetch the 10-K for the example ticker only.
page = get_most_recent_file(example_ticker, '10-K')
# pprint.pprint(page[example_ticker])

In [48]:
def remove_consecutive_span(s):
    """
    Merge adjacent spans by deleting every ``</span><span ...>`` boundary.

    Each occurrence of ``</span>`` immediately followed by an opening
    ``<span`` tag (including its attributes, up to and including the closing
    ``>``) is removed, so

        '<span a>Total</span><span b> revenue</span>'

    becomes

        '<span a>Total revenue</span>'

    A trailing ``</span><span`` with no closing ``>`` is left untouched; the
    previous hand-written scanner never terminated on such input.

    Args:
        s: HTML/XML text.

    Returns:
        `s` with all span boundaries removed (unchanged if there are none).
    """
    # [^>]* also spans newlines, matching the old "search for the next '>'" scan.
    return re.sub(r'</span><span[^>]*>', '', s)

# Decode the downloaded filing as UTF-8 and strip span boundaries before parsing.
html = remove_consecutive_span(page.content.decode('utf-8'))
html[:100]  # notebook preview: first 100 characters

'<?xml version="1.0" ?><!--XBRL Document Created with Wdesk from Workiva--><!--Copyright 2021 Workiva'

In [49]:
def text_from_html(html):
    """
    Return the visible text of an HTML document as one space-joined string.

    The ``ix:header`` element is removed when present. Every remaining text
    node is then kept unless it is an HTML comment or its parent tag is one
    of style/script/head/title/meta/[document]. Each kept node is stripped
    of surrounding whitespace before joining.

    Args:
        html: HTML source as a str.

    Returns:
        The stripped text nodes joined by single spaces.
    """
    invisible = {'style', 'script', 'head', 'title', 'meta', '[document]'}
    soup = BeautifulSoup(html, 'html.parser')

    # Not every document carries an ix:header; calling decompose() on a
    # missing element used to raise AttributeError.
    header = soup.find('ix:header')
    if header is not None:
        header.decompose()

    visible = [
        node.strip()
        for node in soup.find_all(string=True)
        if node.parent.name not in invisible and not isinstance(node, Comment)
    ]
    return " ".join(visible)
# Extract the visible text of the filing and preview its first 100 characters.
visible_txt = text_from_html(html)
visible_txt[:100]

'Table of Contents UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549 __________'

### Preprocess
This step takes a long string as input and performs some preprocessing, such as removing company tickers, which would not be useful for later feature engineering.

In [27]:
# Fetch the NLTK data used by preprocess(): 'wordnet' backs WordNetLemmatizer,
# 'punkt' backs word_tokenize.
nltk.download('wordnet')
nltk.download('punkt')

def preprocess(message):
    """
    Turn raw text into a list of lemmatised, lower-case word tokens.

    Steps, in order:
        - lowercase the text
        - blank out a URL that starts a line, together with the rest of that line
        - blank out lines that start with '>'
        - replace every character that is not a letter with a space
        - tokenize with nltk.tokenize.word_tokenize
        - drop tokens that appear in the module-level `tickers` collection
        - lemmatize the remaining tokens with WordNetLemmatizer

    NOTE(review): earlier documentation also promised removal of
    single-character tokens; the code never did that and still does not —
    confirm whether that filter is wanted.

    Args:
        message: Text to process.

    Returns:
        List of lemmatised tokens.
    """
    text = message.lower()

    # `https?` replaces the original `http?`, which matched "http://" but
    # could never match "https://".
    text = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', text, flags=re.MULTILINE)

    # Blank out lines beginning with '>'. The original comment described
    # this as ticker-symbol removal — presumably inherited from another
    # data source; verify.
    text = re.sub(r'^>.*\s', ' ', text, flags=re.MULTILINE)

    # Replace everything that is not a letter with a space.
    text = re.sub(r'[^a-zA-Z]', ' ', text)

    tokens = nltk.tokenize.word_tokenize(text)

    # Build the exclusion set once so each membership test is O(1).
    excluded = set(tickers)
    wnl = nltk.stem.WordNetLemmatizer()
    return [wnl.lemmatize(t) for t in tokens if t not in excluded]
# Tokenise the visible text of the example filing.
words = preprocess(visible_txt)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/z20171126/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/z20171126/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Define the frequently used pronouns and prepositions (the latter are stored in a variable named `propositions`).

In [42]:
# Pronouns (personal, reflexive and possessive forms, including archaic and
# dialect variants) that are excluded from the frequency count.
pronouns = {
    'i', 'me', 'myself', 'mine', 'my',
    'we', 'us', 'ourself', 'ourselves', 'ours', 'our',
    'you', 'yourself', 'yourselves', 'yours', 'your',
    'thou', 'thee', 'thyself', 'thine', 'thy',
    'ye', 'yeers', "y\'all", 'youse', 'yeerselves',
    'he', 'him', 'himself', 'his',
    'she', 'her', 'herself', 'hers',
    'it', 'itself', 'its',
}

# Prepositions excluded from the frequency count. The variable keeps its
# historical name `propositions` because other cells refer to it.
propositions = {
    'aboard', 'about', 'above', 'across', 'after', 'against', 'along',
    'amid', 'among', 'around', 'as',
    'at', 'before', 'behind', 'below', 'beneath', 'beside', 'between',
    'beyond', 'but', 'by', 'concerning', 'considering',
    'despite', 'down', 'during', 'except', 'following', 'for', 'from',
    'in', 'inside', 'into', 'like', 'minus', 'near', 'next',
    'of', 'off', 'on', 'onto', 'opposite', 'out', 'outside', 'over',
    'past', 'per', 'plus', 'regarding', 'round', 'save', 'since',
    'than', 'through', 'till', 'to', 'toward', 'under', 'underneath',
    'unlike', 'until', 'up', 'upon', 'versus', 'via',
    'with', 'within', 'without',
}

### Download NLP Corpora
Download the stopwords corpus, remove stopwords, and count the most frequent words.

In [50]:
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

ps = PorterStemmer()
cnt = {}             # stem -> total number of occurrences
stem_to_source = {}  # stem -> {surface word -> number of occurrences}

# Loop invariants, built once instead of once per word: the stopword list
# (turned into a set for O(1) membership tests) and the compiled
# "digits / dots / spaces / dashes only" pattern.
english_stopwords = set(stopwords.words('english'))
pure_num = re.compile("^[0-9. -]*$")

for raw_word in words:
    word = raw_word.lower()
    word = re.sub(r"[^a-z0-9'.-]", " ", word)  # Keep periods so that phrases like I.R.S stay intact.
    if word.endswith('.'):                      # endswith() is safe on an empty string, unlike word[-1].
        word = word[:-1]                        # Only drop a period that ends the token.
    word = word.strip()

    # Skip purely numeric (or empty) tokens and stopwords.
    if pure_num.match(word) or word in english_stopwords:
        continue

    stemmed_word = ps.stem(word)
    if stemmed_word in pronouns or stemmed_word in propositions:
        continue

    cnt[stemmed_word] = cnt.get(stemmed_word, 0) + 1
    sources = stem_to_source.setdefault(stemmed_word, {})
    sources[word] = sources.get(word, 0) + 1

# The ten most frequent stems in the 10-K, most frequent first.
print(sorted(cnt.items(), key=lambda item: -item[1])[:10])

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/z20171126/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[('leas', 483), ('vehicl', 450), ('may', 415), ('oper', 389), ('energi', 388), ('decemb', 371), ('product', 354), ('million', 349), ('cost', 346), ('note', 336)]


### Future Work
Using the word lists, we can generate a sentiment bag of words from the 10-K documents. Alternatively, we can calculate the Jaccard similarity on the bags of words and plot it over time using `from sklearn.metrics import jaccard_similarity_score` (named `jaccard_score` in newer scikit-learn releases), or we can generate TF-IDF features from the 10-K documents. Directly applying a sentiment-analysis package to the 10-K seems too trivial to be useful.