# Get 10-K data

In [1]:
pip install ratelimit



In [2]:
pip install w3lib



In [3]:
import nltk
import numpy as np
import pandas as pd
import pickle
import pprint
import project_helper

from tqdm import tqdm

In [4]:
# Fetch the NLTK corpora used later in this notebook: the English stop-word
# list and WordNet (the data behind WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Load the pipe-delimited CIK/ticker table and preview its first rows.
cik_df= pd.read_csv('cik_ticker.csv', sep="|")
cik_df.head()

Unnamed: 0,CIK,Ticker,Name,Exchange,SIC,Business,Incorporated,IRS
0,1090872,A,Agilent Technologies Inc,NYSE,3825.0,CA,DE,770518772.0
1,4281,AA,Alcoa Inc,NYSE,3350.0,PA,PA,250317820.0
2,1332552,AAACU,Asia Automotive Acquisition Corp,,6770.0,DE,DE,203022522.0
3,1287145,AABB,Asia Broadband Inc,OTC,8200.0,GA,NV,721569126.0
4,1024015,AABC,Access Anytime Bancorp Inc,,6035.0,NM,DE,850444597.0


In [6]:
# Map each ticker symbol to its CIK, built column-wise in one pass instead of
# materialising every row with .iloc. If a ticker appears more than once the
# last row wins, exactly as with a row-by-row assignment loop.
cik_map = dict(zip(cik_df['Ticker'], cik_df['CIK']))

In [7]:
# Load the list of ticker symbols to process (column 'Company') and preview it.
company_df = pd.read_csv('dis_comp.csv')

company_df.head()

Unnamed: 0,Company
0,A
1,AAMC
2,AAME
3,AAN
4,AAOI


In [8]:
# Every ticker listed in dis_comp.csv, in file order.
all_comp = company_df['Company'].tolist()

# Only one slice of the ticker list is processed per run.
# NOTE(review): presumably this keeps a single download batch small - adjust
# the bounds to process a different batch.
BATCH_START = 540
BATCH_END = 610
company = all_comp[BATCH_START:BATCH_END]

In [9]:
# Total number of tickers listed in dis_comp.csv.
print(len(all_comp))

3495


In [10]:
# Keep only the tickers of the current batch that have a known CIK;
# tickers missing from cik_map are silently skipped.
cik_lookup = {symbol: cik_map[symbol] for symbol in company if symbol in cik_map}

In [11]:
# Shared SecAPI client; its .get(url) method is used below to fetch the
# EDGAR filing indexes and the 10-K filings themselves.
sec_api = project_helper.SecAPI()

In [12]:
# Pull a list of filed 10-Ks from the API for each company
from bs4 import BeautifulSoup

def get_sec_data(cik, doc_type, start=0, count=60):
    """List the filings EDGAR reports for one company.

    Builds the EDGAR "getcompany" Atom feed URL for the given CIK and filing
    type, fetches it through the module-level ``sec_api`` client and parses
    the top-level ``<entry>`` elements of the feed.

    Args:
        cik: Company identifier placed in the ``CIK`` query parameter.
        doc_type: Filing type placed in the ``type`` query parameter.
        start: Value of the ``start`` query parameter.
        count: Value of the ``count`` query parameter.

    Returns:
        A list of ``(filing_href, filing_type, filing_date)`` string tuples,
        one per feed entry, in feed order.
    """
    query = '&CIK={}&type={}&start={}&count={}&owner=exclude&output=atom'.format(
        cik, doc_type, start, count)
    rss_url = 'https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany' + query

    # assumes sec_api.get returns the response body as str - TODO confirm
    response_text = sec_api.get(rss_url)
    feed = BeautifulSoup(response_text.encode('ascii'), 'xml').feed

    entries = []
    # recursive=False: only direct children of <feed> are considered.
    for entry in feed.find_all('entry', recursive=False):
        content = entry.content
        entries.append((
            content.find('filing-href').getText(),
            content.find('filing-type').getText(),
            content.find('filing-date').getText()))

    return entries

In [13]:
# Fetch the 10-K filing index for every ticker that has a known CIK.
# example_ticker is only referenced by the optional preview statements.
example_ticker = 'AMZN'
sec_data = {}

# Filled one ticker at a time so that results gathered so far are kept
# if a request fails part-way through.
for ticker in cik_lookup:
    sec_data[ticker] = get_sec_data(cik_lookup[ticker], '10-K')

# pprint.pprint(sec_data[example_ticker][:5])

In [14]:
# Number of tickers for which a filing list was retrieved.
print(len(sec_data))

50


In [None]:
# Download the full-text filing behind every 10-K index URL collected above.
# Result shape: {ticker: {filing date: raw filing text}}.
raw_fillings_by_ticker = {}

for ticker, data in sec_data.items():
    fillings_by_date = {}
    raw_fillings_by_ticker[ticker] = fillings_by_date

    progress = tqdm(data, desc='Downloading {} Fillings'.format(ticker), unit='filling')
    for index_url, file_type, file_date in progress:
        # Only exact '10-K' entries are downloaded; every other type is skipped.
        if file_type != '10-K':
            continue

        # Turn the index page URL into the .txt URL; the second replace
        # cleans up the '.txtl' left behind when the URL ended in '-index.html'.
        file_url = index_url.replace('-index.htm', '.txt').replace('.txtl', '.txt')
        fillings_by_date[file_date] = sec_api.get(file_url)


# print('Example Document:\n\n{}...'.format(next(iter(raw_fillings_by_ticker[example_ticker].values()))[:1000]))

Downloading CBG Fillings: 100%|██████████| 21/21 [00:16<00:00,  1.26filling/s]
Downloading CBK Fillings: 100%|██████████| 29/29 [00:07<00:00,  3.72filling/s]
Downloading CBLI Fillings: 100%|██████████| 15/15 [00:06<00:00,  2.15filling/s]
Downloading CBM Fillings: 0filling [00:00, ?filling/s]
Downloading CBMG Fillings: 100%|██████████| 18/18 [00:09<00:00,  1.82filling/s]
Downloading CBMX Fillings: 100%|██████████| 11/11 [00:04<00:00,  2.64filling/s]
Downloading CBPO Fillings: 100%|██████████| 20/20 [00:04<00:00,  4.37filling/s]
Downloading CBS Fillings: 100%|██████████| 29/29 [00:08<00:00,  3.56filling/s]
Downloading CBSH Fillings: 100%|██████████| 28/28 [00:12<00:00,  2.31filling/s]
Downloading CBT Fillings: 100%|██████████| 27/27 [00:08<00:00,  3.14filling/s]
Downloading CBZ Fillings: 100%|██████████| 6/6 [00:00<00:00,  7.45filling/s]
Downloading CCBG Fillings: 100%|██████████| 32/32 [00:12<00:00,  2.55filling/s]
Downloading CCC Fillings: 100%|██████████| 27/27 [00:05<00:00,  4.82fill

In [None]:
# Get documents from the fillings
import re

# To return a list of documents from a filling
def get_documents(text):
    """Split a raw filing into the text of its embedded documents.

    Each returned string is the text found between a ``<DOCUMENT>`` tag and a
    ``</DOCUMENT>`` tag (the tags themselves are excluded). Opening and
    closing tags are paired by position - first with first, second with
    second, and so on - and any surplus tags on either side are ignored.

    Args:
        text: Full text of one filing.

    Returns:
        A list of document bodies in the order they appear in ``text``.
    """
    open_tag = re.compile(r'<DOCUMENT>')
    close_tag = re.compile(r'</DOCUMENT>')

    # A body starts right after an opening tag and stops right before a closing tag.
    body_starts = (match.end() for match in open_tag.finditer(text))
    body_ends = (match.start() for match in close_tag.finditer(text))

    return [text[begin:finish] for begin, finish in zip(body_starts, body_ends)]

In [None]:
# Split every downloaded filing into its embedded documents.
# Result shape: {ticker: {filing date: [document text, ...]}}.
filling_documents_by_ticker = {}

for ticker, raw_fillings in raw_fillings_by_ticker.items():
    progress = tqdm(
        raw_fillings.items(),
        desc='Getting Documents from {} Fillings'.format(ticker),
        unit='filling')
    filling_documents_by_ticker[ticker] = {
        file_date: get_documents(filling) for file_date, filling in progress}


#print('\n\n'.join([
#    'Document {} Filed on {}:\n{}...'.format(doc_i, file_date, doc[:200])
#    for file_date, docs in filling_documents_by_ticker[example_ticker].items()
#    for doc_i, doc in enumerate(docs)][:3]))

In [None]:
# Define the function to get documents according to the type
def get_document_type(doc):
    """Return the lower-cased document type declared on a document's <TYPE> line.

    The type is everything after the first ``<TYPE>`` marker up to the end of
    that line, with surrounding whitespace removed so that a trailing carriage
    return or space does not break comparisons such as ``== '10-k'``.

    Args:
        doc: Text of a single document extracted from a filing.

    Returns:
        The document type in lower case, or an empty string when the document
        has no ``<TYPE>`` line (instead of raising ``IndexError``).
    """
    type_match = re.search(r'<TYPE>([^\n]+)', doc)
    if type_match is None:
        # No type header: report "no type" so callers simply skip the document.
        return ''

    return type_match.group(1).strip().lower()

In [None]:
# Keep only the documents whose declared type is exactly '10-k'.
# Result shape: {ticker: [{'cik': ..., 'file': ..., 'file_date': ...}, ...]}.
ten_ks_by_ticker = {}

for ticker, filling_documents in filling_documents_by_ticker.items():
    ten_ks_by_ticker[ticker] = [
        {
            'cik': cik_lookup[ticker],
            'file': document,
            'file_date': file_date}
        for file_date, documents in filling_documents.items()
        for document in documents
        if get_document_type(document) == '10-k']


# project_helper.print_ten_k_data(ten_ks_by_ticker[example_ticker][:5], ['cik', 'file', 'file_date'])

# Preprocess the Data

### Clean up - remove the html tags and lowercase all the text

In [None]:
from w3lib.html import remove_tags

def remove_html_tags(text):
    """Return ``text`` with its markup tags removed by w3lib's ``remove_tags``."""
    return remove_tags(text)

def clean_text(text):
    """Lower-case ``text`` and then strip its HTML tags.

    Args:
        text: Raw document text.

    Returns:
        The lower-cased text with tags removed.
    """
    lowered = text.lower()
    return remove_html_tags(lowered)

In [None]:
# Add a 'file_clean' entry (lower-cased, tag-free text) to every 10-K record.
for ticker, ten_ks in ten_ks_by_ticker.items():
    progress = tqdm(ten_ks, desc='Cleaning {} 10-Ks'.format(ticker), unit='10-K')
    for ten_k in progress:
        raw_text = ten_k['file']
        ten_k['file_clean'] = clean_text(raw_text)

# project_helper.print_ten_k_data(ten_ks_by_ticker[example_ticker][:5], ['file_clean'])

### Lemmatize
Lemmatization groups the inflected forms of a word together so that they can be analyzed as a single item, identified by the word's lemma.

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet


def lemmatize_words(words):
    """Lemmatize each word as a verb using WordNet.

    Args:
        words: Iterable of word strings.

    Returns:
        A list with the lemma of every input word, in the same order.
    """
    # Build the lemmatizer once per call instead of once per word.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word, 'v') for word in words]

In [None]:
# Raw string: '\w' is not a valid escape in a normal string literal and
# triggers a warning on recent Python versions. The compiled pattern is unchanged.
word_pattern = re.compile(r'\w+')

# Tokenize the cleaned text into word characters and store the lemmas
# under 'file_lemma' on every 10-K record.
for ticker, ten_ks in ten_ks_by_ticker.items():
    for ten_k in tqdm(ten_ks, desc='Lemmatize {} 10-Ks'.format(ticker), unit='10-K'):
        ten_k['file_lemma'] = lemmatize_words(word_pattern.findall(ten_k['file_clean']))

### Remove Stopwords

In [None]:
from nltk.corpus import stopwords

# Stop words are lemmatized the same way as the documents so that they match
# the tokens stored in 'file_lemma'.
lemma_english_stopwords = lemmatize_words(stopwords.words('english'))
# Set copy for constant-time membership tests; every token of every 10-K is
# checked against it, so scanning the list each time is needlessly slow.
lemma_english_stopword_set = set(lemma_english_stopwords)

for ticker, ten_ks in ten_ks_by_ticker.items():
    for ten_k in tqdm(ten_ks, desc='Remove Stop Words for {} 10-Ks'.format(ticker), unit='10-K'):
        ten_k['file_lemma'] = [
            word for word in ten_k['file_lemma']
            if word not in lemma_english_stopword_set]

Here, the keys of each `ten_k` dict are ['cik', 'file', 'file_date', 'file_clean', 'file_lemma'].

### Transform the Data Format - from Dict to DataFrame

In [None]:
# Flatten the per-ticker 10-K records into column lists and build a DataFrame.
# NOTE(review): the 'company' column holds the ticker symbol and the 'ticker'
# column holds the CIK - the names look swapped; confirm with whoever reads
# the exported CSV before renaming.
ten_ks_df_dict = {'date': [], 'company': [], 'ticker': [], 'doc': []}

for ticker, ten_ks in ten_ks_by_ticker.items():
    for ten_k in ten_ks:
        row = {
            'date': ten_k['file_date'],
            'company': ticker,
            'ticker': cik_lookup[ticker],
            # The lemma list is stored as one space-separated string.
            'doc': ' '.join(ten_k['file_lemma']),
        }
        for column, value in row.items():
            ten_ks_df_dict[column].append(value)

ten_ks_df = pd.DataFrame(ten_ks_df_dict)

ten_ks_df.head()

In [None]:
# Save the processed 10-K table to '1.csv' without the DataFrame index column.
ten_ks_df.to_csv('1.csv', index = False)