# 10K Dataset

In [3]:
import nltk
import numpy as np
import pandas as pd
import pickle
import pprint
import project_helper

from tqdm import tqdm

## Download NLP Corpora
Use the stopwords and wordnet corpora from nltk to preprocess the dataset later.

In [4]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /Users/kathy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/kathy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Get 10Ks Dataset
We will use the company's unique CIK (Central Index Key) to identify each company.  
Find the CIK through: https://www.sec.gov/edgar/searchedgar/companysearch.html

In [5]:
# Set the company CIK here
# Tickers processed by this notebook, mapped to their 10-character,
# zero-padded SEC Central Index Key (CIK).
cik_lookup = {
    'AMZN': '0001018724',
    'FB': '0001326801',
    'GOOG': '0001652044',
    'MSFT': '0000789019',
    'TSLA': '0001318605',
}

# Further ticker -> CIK pairs. No cell shown in this section reads this
# dict; merge entries into cik_lookup to include them in the download.
additional_cik = {
    'AEP': '0000004904',
    'AXP': '0000004962',
    'BA': '0000012927',
    'BK': '0001390777',
    'CAT': '0000018230',
    'DE': '0000315189',
    'DIS': '0001001039',
    'DTE': '0000936340',
    'ED': '0001047862',
    'EMR': '0000032604',
    'ETN': '0001551182',
    'GE': '0000040545',
    'IBM': '0000051143',
    'IP': '0000051434',
    'JNJ': '0000200406',
    'KO': '0000021344',
    'LLY': '0000059478',
    'MCD': '0000063908',
    'MO': '0000764180',
    'MRK': '0000310158',
    'MRO': '0000101778',
    'PCG': '0001004980',
    'PEP': '0000077476',
    'PFE': '0000078003',
    'PG': '0000080424',
    'PNR': '0000077360',
    'SYY': '0000096021',
    'TXN': '0000097476',
    'UTX': '0000101829',
    'WFC': '0000072971',
    'WMT': '0000104169',
    'WY': '0000106535',
    'XOM': '0000034088',
}

In [6]:
# Create the SecAPI client used to fetch the 10-K data.
# Later cells retrieve both the filing index feed and the raw filing text
# through sec_api.get(url).
# NOTE(review): SecAPI lives in project_helper and is not shown here —
# presumably it throttles/caches requests to sec.gov; confirm before reuse.
sec_api = project_helper.SecAPI()

In [7]:
# Pull a list of filed 10-Ks from the API for each company
from bs4 import BeautifulSoup

def get_sec_data(cik, doc_type, start=0, count=60):
    """Fetch a company's EDGAR filing index and return its entries.

    Parameters
    ----------
    cik : str
        Central Index Key placed in the ``CIK`` query parameter.
    doc_type : str
        Filing type placed in the ``type`` query parameter (e.g. '10-K').
    start : int, optional
        Value of the ``start`` query parameter.
    count : int, optional
        Value of the ``count`` query parameter.

    Returns
    -------
    list of tuple
        One ``(filing_href, filing_type, filing_date)`` tuple of strings for
        each ``<entry>`` that is a direct child of the Atom ``<feed>``.
    """
    url_template = (
        'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany'
        '&CIK={}&type={}&start={}&count={}&owner=exclude&output=atom')
    rss_url = url_template.format(cik, doc_type, start, count)

    response_text = sec_api.get(rss_url)
    # NOTE(review): encoding as ASCII raises UnicodeEncodeError if the feed
    # ever contains a non-ASCII character.
    feed = BeautifulSoup(response_text.encode('ascii'), 'xml').feed

    entries = []
    for entry in feed.find_all('entry', recursive=False):
        content = entry.content
        href = content.find('filing-href').getText()
        filing_type = content.find('filing-type').getText()
        filing_date = content.find('filing-date').getText()
        entries.append((href, filing_type, filing_date))

    return entries

In [8]:
# Fetch the 10-K filing index for every ticker in cik_lookup, then preview
# the first five index entries of one example company.
example_ticker = 'AMZN'
sec_data = {
    ticker: get_sec_data(cik, '10-K')
    for ticker, cik in cik_lookup.items()}

pprint.pprint(sec_data[example_ticker][:5])

[('https://www.sec.gov/Archives/edgar/data/1018724/000101872420000004/0001018724-20-000004-index.htm',
  '10-K',
  '2020-01-31'),
 ('https://www.sec.gov/Archives/edgar/data/1018724/000101872419000004/0001018724-19-000004-index.htm',
  '10-K',
  '2019-02-01'),
 ('https://www.sec.gov/Archives/edgar/data/1018724/000101872418000005/0001018724-18-000005-index.htm',
  '10-K',
  '2018-02-02'),
 ('https://www.sec.gov/Archives/edgar/data/1018724/000101872417000011/0001018724-17-000011-index.htm',
  '10-K',
  '2017-02-10'),
 ('https://www.sec.gov/Archives/edgar/data/1018724/000101872416000172/0001018724-16-000172-index.htm',
  '10-K',
  '2016-01-29')]


In [9]:
# Download the full-text submission behind each index URL from the last step.
# Result layout: raw_fillings_by_ticker[ticker][file_date] -> raw filing text.
# Two filings of one ticker sharing a file_date would overwrite each other.
raw_fillings_by_ticker = {}

for ticker, index_entries in sec_data.items():
    downloaded = {}
    raw_fillings_by_ticker[ticker] = downloaded
    progress = tqdm(index_entries, desc='Downloading {} Fillings'.format(ticker), unit='filling')
    for index_url, file_type, file_date in progress:
        # Only entries typed exactly '10-K' are fetched; anything else is skipped.
        if file_type != '10-K':
            continue
        # '...-index.htm' -> '....txt'. An '-index.html' URL would leave a
        # stray 'l' ('.txtl'), which the second replace repairs.
        file_url = index_url.replace('-index.htm', '.txt').replace('.txtl', '.txt')
        downloaded[file_date] = sec_api.get(file_url)

# Show the first 1000 characters of the first filing stored for the example ticker.
example_filling = next(iter(raw_fillings_by_ticker[example_ticker].values()))
print('Example Document:\n\n{}...'.format(example_filling[:1000]))

Downloading AMZN Fillings: 100%|██████████| 25/25 [00:06<00:00,  3.61filling/s]
Downloading FB Fillings: 100%|██████████| 10/10 [00:03<00:00,  2.90filling/s]
Downloading GOOG Fillings: 100%|██████████| 7/7 [00:02<00:00,  3.21filling/s]
Downloading MSFT Fillings: 100%|██████████| 27/27 [00:09<00:00,  2.98filling/s]
Downloading TSLA Fillings: 100%|██████████| 12/12 [00:04<00:00,  2.55filling/s]

Example Document:

<SEC-DOCUMENT>0001018724-20-000004.txt : 20200131
<SEC-HEADER>0001018724-20-000004.hdr.sgml : 20200131
<ACCEPTANCE-DATETIME>20200130204613
ACCESSION NUMBER:		0001018724-20-000004
CONFORMED SUBMISSION TYPE:	10-K
PUBLIC DOCUMENT COUNT:		109
CONFORMED PERIOD OF REPORT:	20191231
FILED AS OF DATE:		20200131
DATE AS OF CHANGE:		20200130

FILER:

	COMPANY DATA:	
		COMPANY CONFORMED NAME:			AMAZON COM INC
		CENTRAL INDEX KEY:			0001018724
		STANDARD INDUSTRIAL CLASSIFICATION:	RETAIL-CATALOG & MAIL-ORDER HOUSES [5961]
		IRS NUMBER:				911646860
		STATE OF INCORPORATION:			DE
		FISCAL YEAR END:			1231

	FILING VALUES:
		FORM TYPE:		10-K
		SEC ACT:		1934 Act
		SEC FILE NUMBER:	000-22513
		FILM NUMBER:		20562951

	BUSINESS ADDRESS:	
		STREET 1:		410 TERRY AVENUE NORTH
		CITY:			SEATTLE
		STATE:			WA
		ZIP:			98109
		BUSINESS PHONE:		2062661000

	MAIL ADDRESS:	
		STREET 1:		410 TERRY AVENUE NORTH
		CITY:			SEATTLE
		STATE:			WA
		ZIP:			98109
</SEC-HEADER>
<DOCUMENT>
<TYPE>10-K
<




In [10]:
# Get documents from the filings
import re

# Return a list of documents from a filing
def get_documents(text):
    """Split a raw filing into the bodies of its <DOCUMENT> sections.

    The i-th ``<DOCUMENT>`` tag is paired with the i-th ``</DOCUMENT>`` tag;
    surplus tags of either kind are ignored.

    Parameters
    ----------
    text : str
        Raw filing text.

    Returns
    -------
    list of str
        The text strictly between each paired opening and closing tag, in
        order of appearance.
    """
    open_tag = re.compile(r'<DOCUMENT>')
    close_tag = re.compile(r'</DOCUMENT>')

    # A body starts right after an opening tag and stops right before a closing tag.
    body_starts = (match.end() for match in open_tag.finditer(text))
    body_ends = (match.start() for match in close_tag.finditer(text))

    return [text[begin:finish] for begin, finish in zip(body_starts, body_ends)]

In [11]:
# Split every downloaded filing into its embedded documents.
# Result layout: filling_documents_by_ticker[ticker][file_date] -> list of document strings.
filling_documents_by_ticker = {}

for ticker, raw_fillings in raw_fillings_by_ticker.items():
    progress = tqdm(raw_fillings.items(), desc='Getting Documents from {} Fillings'.format(ticker), unit='filling')
    filling_documents_by_ticker[ticker] = {
        file_date: get_documents(filling)
        for file_date, filling in progress}

# Preview the first 200 characters of the first three documents of the example ticker.
document_previews = [
    'Document {} Filed on {}:\n{}...'.format(doc_i, file_date, doc[:200])
    for file_date, docs in filling_documents_by_ticker[example_ticker].items()
    for doc_i, doc in enumerate(docs)]
print('\n\n'.join(document_previews[:3]))

Getting Documents from AMZN Fillings: 100%|██████████| 20/20 [00:00<00:00, 75.73filling/s]
Getting Documents from FB Fillings: 100%|██████████| 8/8 [00:00<00:00, 41.60filling/s]
Getting Documents from GOOG Fillings: 100%|██████████| 5/5 [00:00<00:00, 31.93filling/s]
Getting Documents from MSFT Fillings: 100%|██████████| 27/27 [00:00<00:00, 56.09filling/s]
Getting Documents from TSLA Fillings: 100%|██████████| 10/10 [00:00<00:00, 29.18filling/s]

Document 0 Filed on 2020-01-31:

<TYPE>10-K
<SEQUENCE>1
<FILENAME>amzn-20191231x10k.htm
<DESCRIPTION>10-K
<TEXT>
<XBRL>
<?xml version="1.0" encoding="UTF-8"?>
<!--XBRL Document Created with Wdesk from Workiva-->
<!--p:c57a17684e854b...

Document 1 Filed on 2020-01-31:

<TYPE>EX-4.6
<SEQUENCE>2
<FILENAME>amzn-20191231xex46.htm
<DESCRIPTION>EXHIBIT 4.6
<TEXT>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>...

Document 2 Filed on 2020-01-31:

<TYPE>EX-21.1
<SEQUENCE>3
<FILENAME>amzn-20191231xex211.htm
<DESCRIPTION>EXHIBIT 21.1
<TEXT>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<ht...





In [12]:
# Define the function to get the type of a document
def get_document_type(doc):
    """Return the lower-cased type declared by a document's first <TYPE> tag.

    Parameters
    ----------
    doc : str
        One document extracted from a filing, e.g. ``'\\n<TYPE>10-K\\n...'``.

    Returns
    -------
    str
        The text following the first ``<TYPE>`` up to the end of that line,
        stripped of surrounding whitespace and lower-cased. An empty string
        is returned when the document has no ``<TYPE>`` tag, so callers that
        compare against a concrete type simply skip such documents.
    """
    type_match = re.search(r'<TYPE>([^\n]+)', doc)
    if type_match is None:
        # No <TYPE> tag: previously this raised IndexError on findall(...)[0].
        return ''

    # strip() drops a trailing '\r' (CRLF files) or padding that would
    # otherwise make '10-k\r' compare unequal to '10-k'.
    return type_match.group(1).strip().lower()

In [13]:
# Keep only the documents whose <TYPE> is 10-K. Each kept document becomes a
# record carrying the company's CIK, the document text and the filing date.
ten_ks_by_ticker = {}

for ticker, filling_documents in filling_documents_by_ticker.items():
    ten_ks_by_ticker[ticker] = [
        {'cik': cik_lookup[ticker], 'file': document, 'file_date': file_date}
        for file_date, documents in filling_documents.items()
        for document in documents
        if get_document_type(document) == '10-k']


project_helper.print_ten_k_data(ten_ks_by_ticker[example_ticker][:5], ['cik', 'file', 'file_date'])

[
  {
    cik: '0001018724'
    file: '\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-2019123...
    file_date: '2020-01-31'},
  {
    cik: '0001018724'
    file: '\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-2018123...
    file_date: '2019-02-01'},
  {
    cik: '0001018724'
    file: '\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-2017123...
    file_date: '2018-02-02'},
  {
    cik: '0001018724'
    file: '\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-2016123...
    file_date: '2017-02-10'},
  {
    cik: '0001018724'
    file: '\n<TYPE>10-K\n<SEQUENCE>1\n<FILENAME>amzn-2015123...
    file_date: '2016-01-29'},
]


## Preprocess the Data

### Clean up - remove the html tags and lowercase all the text

In [14]:
def remove_html_tags(text):
    """Return the text content of `text` as extracted by BeautifulSoup's 'html.parser'."""
    return BeautifulSoup(text, 'html.parser').get_text()

def clean_text(text):
    """Lower-case `text`, then strip its markup with remove_html_tags."""
    lowered = text.lower()
    return remove_html_tags(lowered)

In [15]:
# Store the cleaned text of every 10-K record under the 'file_clean' key.
for ticker, ten_ks in ten_ks_by_ticker.items():
    progress = tqdm(ten_ks, desc='Cleaning {} 10-Ks'.format(ticker), unit='10-K')
    for ten_k in progress:
        ten_k.update(file_clean=clean_text(ten_k['file']))

project_helper.print_ten_k_data(ten_ks_by_ticker[example_ticker][:5], ['file_clean'])

Cleaning AMZN 10-Ks: 100%|██████████| 20/20 [00:30<00:00,  1.50s/10-K]
Cleaning FB 10-Ks: 100%|██████████| 8/8 [00:12<00:00,  1.59s/10-K]
Cleaning GOOG 10-Ks: 100%|██████████| 5/5 [00:10<00:00,  2.03s/10-K]
Cleaning MSFT 10-Ks: 100%|██████████| 27/27 [00:40<00:00,  1.49s/10-K]
Cleaning TSLA 10-Ks: 100%|██████████| 10/10 [00:18<00:00,  1.82s/10-K]

[
  {
    file_clean: '\n10-k\n1\namzn-20191231x10k.htm\n10-k\n\n\n\n\n\...},
  {
    file_clean: '\n10-k\n1\namzn-20181231x10k.htm\n10-k\n\n\n\n\n\...},
  {
    file_clean: '\n10-k\n1\namzn-20171231x10k.htm\n10-k\n\n\n\n\n\...},
  {
    file_clean: '\n10-k\n1\namzn-20161231x10k.htm\nform 10-k\n\n\n...},
  {
    file_clean: '\n10-k\n1\namzn-20151231x10k.htm\nform 10-k\n\n\n...},
]





### Lemmatize
Grouping together various inflections of a word to analyze them as a single item, identified by the word’s lemma

In [16]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet


def lemmatize_words(words):
    """Lemmatize each word as a verb with NLTK's WordNetLemmatizer.

    Parameters
    ----------
    words : iterable of str
        Words to lemmatize.

    Returns
    -------
    list of str
        The lemma of each input word, in input order, obtained with
        part-of-speech tag ``'v'``.
    """
    # Build the lemmatizer once instead of once per word.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word, 'v') for word in words]

In [18]:
# Raw string literal: '\w' in a plain string is an invalid escape sequence
# that Python warns about; r'\w+' compiles to the same pattern without it.
word_pattern = re.compile(r'\w+')

# Tokenize each cleaned 10-K into word characters runs and store the
# lemmatized tokens under the 'file_lemma' key.
for ticker, ten_ks in ten_ks_by_ticker.items():
    for ten_k in tqdm(ten_ks, desc='Lemmatize {} 10-Ks'.format(ticker), unit='10-K'):
        ten_k['file_lemma'] = lemmatize_words(word_pattern.findall(ten_k['file_clean']))


project_helper.print_ten_k_data(ten_ks_by_ticker[example_ticker][:5], ['file_lemma'])

Lemmatize AMZN 10-Ks: 100%|██████████| 20/20 [00:04<00:00,  4.3610-K/s]
Lemmatize FB 10-Ks: 100%|██████████| 8/8 [00:01<00:00,  4.9410-K/s]
Lemmatize GOOG 10-Ks: 100%|██████████| 5/5 [00:01<00:00,  4.7410-K/s]
Lemmatize MSFT 10-Ks: 100%|██████████| 27/27 [00:03<00:00,  6.8410-K/s]
Lemmatize TSLA 10-Ks: 100%|██████████| 10/10 [00:03<00:00,  3.0310-K/s]

[
  {
    file_lemma: '['10', 'k', '1', 'amzn', '20191231x10k', 'htm', '...},
  {
    file_lemma: '['10', 'k', '1', 'amzn', '20181231x10k', 'htm', '...},
  {
    file_lemma: '['10', 'k', '1', 'amzn', '20171231x10k', 'htm', '...},
  {
    file_lemma: '['10', 'k', '1', 'amzn', '20161231x10k', 'htm', '...},
  {
    file_lemma: '['10', 'k', '1', 'amzn', '20151231x10k', 'htm', '...},
]





### Remove Stopwords

In [19]:
from nltk.corpus import stopwords

# Stop words are lemmatized the same way as the documents so they match
# the tokens stored in 'file_lemma'.
lemma_english_stopwords = lemmatize_words(stopwords.words('english'))
# Set copy for constant-time membership tests; the list above is kept
# unchanged for any code that expects it.
lemma_english_stopword_set = set(lemma_english_stopwords)

# Drop stop words from every 10-K's lemma list (replaces 'file_lemma').
for ticker, ten_ks in ten_ks_by_ticker.items():
    for ten_k in tqdm(ten_ks, desc='Remove Stop Words for {} 10-Ks'.format(ticker), unit='10-K'):
        ten_k['file_lemma'] = [word for word in ten_k['file_lemma'] if word not in lemma_english_stopword_set]

Remove Stop Words for AMZN 10-Ks: 100%|██████████| 20/20 [00:01<00:00, 18.0010-K/s]
Remove Stop Words for FB 10-Ks: 100%|██████████| 8/8 [00:00<00:00, 14.8810-K/s]
Remove Stop Words for GOOG 10-Ks: 100%|██████████| 5/5 [00:00<00:00, 14.1910-K/s]
Remove Stop Words for MSFT 10-Ks: 100%|██████████| 27/27 [00:01<00:00, 19.8910-K/s]
Remove Stop Words for TSLA 10-Ks: 100%|██████████| 10/10 [00:01<00:00,  8.8310-K/s]


Here, the keys for each ten_k are ['cik', 'file', 'file_date', 'file_clean', 'file_lemma'].

### Transform the Data Format - from Dict to DataFrame

In [31]:
# Flatten the per-ticker records into (ticker, record) pairs, then build one
# column list per DataFrame field.
ticker_filing_pairs = [
    (ticker, ten_k)
    for ticker, ten_ks in ten_ks_by_ticker.items()
    for ten_k in ten_ks]

# NOTE(review): the 'company' column holds the ticker symbol and the 'ticker'
# column holds the CIK. The labels look swapped, but they are kept as-is
# because later cells may depend on them — confirm before renaming.
ten_ks_df_dict = {
    'date': [ten_k['file_date'] for _, ten_k in ticker_filing_pairs],
    'company': [ticker for ticker, _ in ticker_filing_pairs],
    'ticker': [cik_lookup[ticker] for ticker, _ in ticker_filing_pairs],
    'text': [ten_k['file_lemma'] for _, ten_k in ticker_filing_pairs]}

ten_ks_df = pd.DataFrame(ten_ks_df_dict)

ten_ks_df.head()

Unnamed: 0,date,company,ticker,text
0,2020-01-31,AMZN,1018724,"[10, k, 1, amzn, 20191231x10k, htm, 10, k, doc..."
1,2019-02-01,AMZN,1018724,"[10, k, 1, amzn, 20181231x10k, htm, 10, k, doc..."
2,2018-02-02,AMZN,1018724,"[10, k, 1, amzn, 20171231x10k, htm, 10, k, doc..."
3,2017-02-10,AMZN,1018724,"[10, k, 1, amzn, 20161231x10k, htm, form, 10, ..."
4,2016-01-29,AMZN,1018724,"[10, k, 1, amzn, 20151231x10k, htm, form, 10, ..."


Since each company files only one 10-K report per year, we treat the 10-K text as unchanged throughout the year in which it was filed.

In [33]:
# The filing date is a 'YYYY-MM-DD' string; the year is the text before the first '-'.
ten_ks_df['year'] = ten_ks_df['date'].map(lambda file_date: file_date.partition('-')[0])

ten_ks_df.head()

Unnamed: 0,date,company,ticker,text,year
0,2020-01-31,AMZN,1018724,"[10, k, 1, amzn, 20191231x10k, htm, 10, k, doc...",2020
1,2019-02-01,AMZN,1018724,"[10, k, 1, amzn, 20181231x10k, htm, 10, k, doc...",2019
2,2018-02-02,AMZN,1018724,"[10, k, 1, amzn, 20171231x10k, htm, 10, k, doc...",2018
3,2017-02-10,AMZN,1018724,"[10, k, 1, amzn, 20161231x10k, htm, form, 10, ...",2017
4,2016-01-29,AMZN,1018724,"[10, k, 1, amzn, 20151231x10k, htm, form, 10, ...",2016
