# Preamble

In [1]:
import requests
from bs4 import BeautifulSoup
from bs4 import NavigableString
from htmllaundry import sanitize
from htmllaundry.cleaners import LaundryCleaner
import htmllaundry.utils

import xmltodict

import re
import json

from pprint import pprint

import pandas as pd

from glob import glob

In [2]:
# Cached requests session to ease load on the server

from cachecontrol import CacheControl

sess = requests.session()
cach = CacheControl(sess)

In [3]:
# Helper for HTML popups

from IPython.display import HTML

def window(html):
    s = '<script type="text/javascript">'
    s += 'var win = window.open("", "", "toolbar=no, location=no, directories=no, status=no, menubar=no, scrollbars=yes, resizable=yes, width=780, height=200, top="+(screen.height-400)+", left="+(screen.width-840));'
    s += 'win.document.body.innerHTML = \'' + html.replace("\n",'\\n').replace("'", "\\'") + '\';'
    s += '</script>'
    return HTML(s)

In [4]:
# Override default empty tag behavior of htmllaundry, we want to retain some empty tags

if not 'old_remove_empty_tags' in locals(): old_remove_empty_tags = htmllaundry.utils.remove_empty_tags
htmllaundry.utils.remove_empty_tags = lambda doc : old_remove_empty_tags(doc, ['td', 'th'])

In [5]:
# Custom cleaner class

CustomCleaner = LaundryCleaner(
            page_structure=False,
            remove_unknown_tags=False,
            allow_tags=['blockquote', 'a', 'i', 'em', 'p', 'b', 'strong',
                        'h1', 'h2', 'h3', 'h4', 'h5', 
                        'ul', 'ol', 'li', 
                        'sub', 'sup',
                        'abbr', 'acronym', 'dl', 'dt', 'dd', 'cite',
                        'dft', 'br', 
                        'table', 'tr', 'td', 'th', 'thead', 'tbody', 'tfoot'],
            safe_attrs_only=True,
            add_nofollow=True,
            scripts=True,
            javascript=True,
            comments=True,
            style=True,
            links=False,
            meta=True,
            processing_instructions=False,
            frames=True,
            annoying_tags=False)

In [6]:
# Fix SEC encoding issues
# See: https://stackoverflow.com/questions/17392422/how-to-decode-cp1252-which-is-in-decimal-147-instead-of-x93

def reformat_cp1252(match):
    codePoint = int(match.group(1))
    if 128 <= codePoint <= 159:
        return bytes([codePoint])
    else:
        return match.group()

def clean_sec_content(binary):
    return re.sub(b'&#(\d+);', reformat_cp1252, binary, flags=re.I).decode("windows-1252").encode('utf-8').decode('utf-8')

In [7]:
# Normalize strings

def slugify(value):
    import unicodedata
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
    value = re.sub('[^\w\s\.\-]', '-', value).strip().lower()
    value = re.sub('[-\s]+', '-', value)
    return value
    
slugify('test/string.html')

'test-string.html'

# Basic SEC scraping

In [8]:
base_url = 'https://www.sec.gov{}'

## Get ticker list

In [9]:
# Used by the main search field at https://www.sec.gov/

tickers = cach.get(base_url.format('/files/company_tickers.json')).json()

In [10]:
list(tickers.values())[:3]

[{'cik_str': 320193,
  'title': 'Apple Inc.',
  'ticker': 'AAPL',
  'type': 'company',
  'total': 1493092},
 {'cik_str': 19617,
  'title': 'JPMorgan Chase & Co.',
  'ticker': 'JPM',
  'type': 'company',
  'total': 1462835},
 {'cik_str': 1534504,
  'title': 'PBF Energy Inc.',
  'ticker': 'PBF',
  'type': 'company',
  'total': 1449404}]

## Helper functions

We define a general `get_sec_table` method to fetch out tables from SEC pages, with support for pagination and custom URL parameters.

In [11]:
def pagination_provider_by_element_start_count(find_args, find_kwargs):
    def pagination_provider_by_element_start_count_wrapped(soup, params):
        if soup.find(*find_args, **find_kwargs) is None:
            return None
        params['start'] += params['count']
        return params
    return pagination_provider_by_element_start_count_wrapped

In [12]:
def params_provider_by_dict(params):
    return lambda : params

In [13]:
def table_provider_by_summary(summary, header=0, index_col=0):
    return lambda soup: pd.read_html(
        str(soup.find('table', summary=summary)).replace('<br>', '<br>\n'), header=header, index_col=index_col)[0]

In [21]:
def get_sec_table(url,
                  table_provider=None,
                  base_params={}, 
                  params_provider=None,
                  pagination_provider=None,
                  replace_links=True,
                  session=None):
    def return_data_frame(session, url, params, provider):
        request = session.get(url, params=params)
        soup = BeautifulSoup(request.text)
        if replace_links:
            for a in soup.find_all('a'):
                parent = a.find_parent('td')
                if parent: parent.string = a['href']
        df = provider(soup)
        return df, soup
    if session is None:
        session = cach
    if not url.startswith('http://') and not url.startswith('https://'):
        url = base_url.format(url)
    params = dict(base_params)
    if params_provider:
        if isinstance(params_provider, dict):
            params.update(params_provider)
        else:
            params.update(params_provider())
    if not pagination_provider:
        df, soup = return_data_frame(session, url, params, table_provider)
        return df
    else:
        data_frames = []
        page_params = dict(params)
        while True:
            df, soup = return_data_frame(session, url, page_params, table_provider)
            data_frames.append(df)
            # Make sure columns retain their names
            data_frames[-1].columns = data_frames[0].columns
            new_params = pagination_provider(soup, page_params)
            if not new_params:
                break
            else:
                page_params.update(new_params)
        return pd.concat(data_frames, sort=False, ignore_index=True)

## Search by name

Retrieve CIK numbers based on name search, as featured on the main page.

In [15]:
# We could do it this way:

def get_companies_by_name(name):
    soup = BeautifulSoup(sess.get(base_url.format('/cgi-bin/browse-edgar'),
             params={'company': name, 'owner': 'exclude', 'action': 'getcompany'}).text)
    return pd.read_html(str(soup.find('table', summary='Results')), header=0, index_col=0)[0]

get_companies_by_name('tesla').head(2)

Unnamed: 0_level_0,Company,State/Country
CIK,Unnamed: 1_level_1,Unnamed: 2_level_1
1728203,Tesla Auto Lease Trust 2018-A,CA
1345028,TESLA GROUP HOLDINGS PTY LTD,C3


In [22]:
# This is more generic, using our helper:

get_companies_by_name = lambda name : get_sec_table('/cgi-bin/browse-edgar', 
              params_provider={'company': name, 'owner': 'exclude', 'action': 'getcompany'},
              table_provider=table_provider_by_summary('Results'),
              replace_links=False)

get_companies_by_name('tesla').head(2)

Unnamed: 0_level_0,Company,State/Country
CIK,Unnamed: 1_level_1,Unnamed: 2_level_1
1728203,Tesla Auto Lease Trust 2018-A,CA
1345028,TESLA GROUP HOLDINGS PTY LTD,C3


## Get company filings

Retrieve overview of documents for a given CIK number. We need to paginate here.

In [23]:
get_company_filings = lambda cik, form_type = '' : get_sec_table('/cgi-bin/browse-edgar', 
              params_provider={'CIK': cik, 'owner': 'exclude', 'action': 'getcompany', 'datab': '', 
                               'start': 0, 'count': 100, 'type': form_type},
              table_provider=table_provider_by_summary('Results', index_col=None),
              pagination_provider=pagination_provider_by_element_start_count(('input',), {'value': 'Next 100'}))

filings = get_company_filings(1318605)

print(filings.shape)
filings.head(3)

(427, 5)


Unnamed: 0,Filings,Format,Description,Filing Date,File/Film Number
0,SC TO-T/A,/Archives/edgar/data/1318605/00011931251907581...,[Amend] Tender offer statement by Third PartyA...,2019-03-15,
1,S-4/A,/Archives/edgar/data/1318605/00011931251907580...,"[Amend] Registration of securities, business c...",2019-03-15,/cgi-bin/browse-edgar?action=getcompany&filenu...
2,425,/Archives/edgar/data/1318605/00011931251907575...,"Prospectuses and communications, business comb...",2019-03-15,


## Get company filings using RSS

In [24]:
def get_company_filings_rss(cik, type_=''):
    has_next = True
    dfs = []
    params = {'CIK': cik, 'type': type_, 'datab': '', 'owner': 'exclude', 'action': 'getcompany',
             'start': 0, 'count': 100, 'output': 'atom'}
    while has_next:
        xml = sess.get(base_url.format('/cgi-bin/browse-edgar'), params=params).text
        df = xmltodict.parse(xml)
        params['start'] += params['count']
        has_next = next((x for x in df['feed']['link'] if x.get('@rel') == 'next'), None) is not None
        dfs.append(df)
    return dfs

def company_filings_rss_to_pandas(filings_rss):
    # Flatten the rss feed based on keys in feed.entry
    return pd.DataFrame([ { '{}_{}'.format(k, ks) : (v[ks] if isinstance(v, dict) else v) \
        for k, v in entry.items() for ks in (v if isinstance(v, dict) else [k]) } \
        for page in filings_rss for entry in page['feed']['entry'] ])

In [25]:
filings_rss = get_company_filings_rss(1318605)
filings_df = company_filings_rss_to_pandas(filings_rss)

In [28]:
print(filings_df.shape)

filings_df.head(2)

(427, 26)


Unnamed: 0,category_@label,category_@scheme,category_@term,content_@type,content_accession-nunber,content_act,content_amend,content_file-number,content_file-number-href,content_filing-date,...,content_size,content_xbrl_href,id_id,link_@href,link_@rel,link_@type,summary_#text,summary_@type,title_title,updated_updated
0,form type,https://www.sec.gov/,SC TO-T/A,text/xml,0001193125-19-075818,,[Amend],,,2019-03-15,...,801 KB,,"urn:tag:sec.gov,2008:accession-number=00011931...",https://www.sec.gov/Archives/edgar/data/131860...,alternate,text/html,<b>Filed:</b> 2019-03-15 <b>AccNo:</b> 0001193...,html,SC TO-T/A [Amend] - Tender offer statement by...,2019-03-15T08:32:27-04:00
1,form type,https://www.sec.gov/,S-4/A,text/xml,0001193125-19-075806,33.0,[Amend],333-229749,https://www.sec.gov/cgi-bin/browse-edgar?actio...,2019-03-15,...,2 MB,,"urn:tag:sec.gov,2008:accession-number=00011931...",https://www.sec.gov/Archives/edgar/data/131860...,alternate,text/html,<b>Filed:</b> 2019-03-15 <b>AccNo:</b> 0001193...,html,"S-4/A [Amend] - Registration of securities, b...",2019-03-15T08:27:42-04:00


## Get documents for a filing

In [29]:
get_filing_documents = lambda url, summary = 'Document Format Files' : get_sec_table(url,
              table_provider=table_provider_by_summary(summary, index_col=None),
              pagination_provider=pagination_provider_by_element_start_count(('input',), {'value': 'Next 100'}))

In [30]:
documents = get_filing_documents('/Archives/edgar/data/1035443/0001047469-19-001263-index.html')

print(documents.shape)
documents.head(3)

(5, 5)


Unnamed: 0,Seq,Description,Document,Type,Size
0,1.0,424B5,/Archives/edgar/data/1035443/00010474691900126...,424B5,644742
1,2.0,G136386.JPG,/Archives/edgar/data/1035443/00010474691900126...,GRAPHIC,11919
2,3.0,G71556.JPG,/Archives/edgar/data/1035443/00010474691900126...,GRAPHIC,17325


In [31]:
documents = get_filing_documents('/Archives/edgar/data/1318605/000156459019003165/0001564590-19-003165-index.htm',
                                'Data Files')

print(documents.shape)
documents.head(3)

(6, 5)


Unnamed: 0,Seq,Description,Document,Type,Size
0,14,XBRL INSTANCE DOCUMENT,/Archives/edgar/data/1318605/00015645901900316...,EX-101.INS,7034413
1,15,XBRL TAXONOMY EXTENSION SCHEMA,/Archives/edgar/data/1318605/00015645901900316...,EX-101.SCH,168083
2,16,XBRL TAXONOMY EXTENSION CALCULATION LINKBASE,/Archives/edgar/data/1318605/00015645901900316...,EX-101.CAL,146696


## Get current events

In [32]:
# See https://www.sec.gov/edgar/searchedgar/currentevents.htm

def get_current_events(days_before=0, form_type=''):
    soup = BeautifulSoup(cach.get(base_url.format('/cgi-bin/current'), 
                            params={'q1': days_before, 'q2': 0, 'q3': form_type}).text)
    pre = soup.find('pre')
    ls = []
    # Remove horizontal rules and replace them with a new line
    # Then split on double spaces after removing links
    # And add in the links separately
    for line in str(pre).replace('<hr>', '\n').replace('<hr/>', '\n').split('\n'):
        bs_line = BeautifulSoup(line)
        clean_line = '  '.join(item.strip() for item in bs_line.find_all(text=True))
        split_line = [ x.strip() for x in clean_line.split('  ') if x.strip() ]
        split_line += [ a.get('href') for a in bs_line.find_all('a') ]
        if not all(x is None for x in split_line): ls.append(split_line)
    colnames = ls[0] + [ 'link_{}'.format(i) for i in range(max(len(l) for l in ls) - len(ls[0])) ]
    return pd.DataFrame(ls[1:], columns=colnames)

In [33]:
get_current_events().head(3)

Unnamed: 0,Date Filed,Form,CIK Code,Company Name,link_0,link_1
0,03-15-2019,10-K,1141807,1ST CONSTITUTION BANCORP,/Archives/edgar/data/1141807/0001141807-19-000...,browse-edgar?action=getcompany&CIK=1141807
1,03-15-2019,10-K,867665,ABRAXAS PETROLEUM CORP,/Archives/edgar/data/867665/0001437749-19-0050...,browse-edgar?action=getcompany&CIK=867665
2,03-15-2019,10-K,934549,ACACIA RESEARCH CORP,/Archives/edgar/data/934549/0000934549-19-0000...,browse-edgar?action=getcompany&CIK=934549


## Parse XBRL

In [34]:
doc = get_filing_documents('/Archives/edgar/data/1318605/000156459019003165/0001564590-19-003165-index.htm',
                  'Data Files')
doc.Document.values[0]

'/Archives/edgar/data/1318605/000156459019003165/tsla-20181231.xml'

In [35]:
from xbrl import XBRLParser, GAAP, GAAPSerializer
from io import StringIO

xml = sess.get(base_url.format(doc.Document.values[0])).text

xbrl_parser = XBRLParser()
xbrl = xbrl_parser.parse(StringIO(xml))

In [36]:
gaap_obj = xbrl_parser.parseGAAP(xbrl, ignore_errors=2)

In [37]:
print(gaap_obj.__dict__)

{'assets': 0, 'current_assets': 0, 'non_current_assets': 0, 'liabilities_and_equity': 0, 'liabilities': 0, 'current_liabilities': 0, 'noncurrentLiabilities': 0.0, 'commitments_and_contingencies': 0, 'redeemable_noncontrolling_interest': 0, 'temporary_equity': 0, 'equity': 0, 'equity_attributable_interest': 0, 'equity_attributable_parent': 0, 'stockholders_equity': 0, 'revenue': 0.0, 'cost_of_revenue': 0.0, 'gross_profit': 0, 'costs_and_expenses': 0, 'other_operating_income': 0, 'nonoperating_income_loss': 0, 'interest_and_debt_expense': 0, 'income_before_equity_investments': 0, 'income_from_equity_investments': 0, 'income_tax_expense_benefit': 0, 'net_income_shareholders': 0, 'extraordary_items_gain_loss': 0, 'income_loss': 0, 'preferred_stock_dividends': 0, 'net_income_loss_noncontrolling': 0, 'net_income_parent': 0.0, 'net_income_loss': 0, 'other_comprehensive_income': 0, 'comprehensive_income': 0, 'comprehensive_income_parent': 0, 'comprehensive_income_interest': 0, 'net_cash_flows_

In [38]:
df = xmltodict.parse(xml)

In [39]:
print(df['xbrl'].keys())

odict_keys(['@xmlns:utr', '@xmlns:iso4217', '@xmlns', '@xmlns:tsla', '@xmlns:xbrll', '@xmlns:xlink', '@xmlns:nonnum', '@xmlns:num', '@xmlns:xbrldt', '@xmlns:us-types', '@xmlns:srt-types', '@xmlns:us-gaap', '@xmlns:srt', '@xmlns:dei', '@xmlns:country', '@xmlns:currency', '@xmlns:exch', '@xmlns:invest', '@xmlns:stpr', '@xmlns:sic', '@xmlns:naics', '@xmlns:xbrldi', 'xbrll:schemaRef', 'unit', 'context', 'dei:DocumentType', 'dei:AmendmentFlag', 'dei:DocumentPeriodEndDate', 'dei:DocumentFiscalYearFocus', 'dei:DocumentFiscalPeriodFocus', 'dei:TradingSymbol', 'dei:EntityRegistrantName', 'dei:EntityCentralIndexKey', 'dei:CurrentFiscalYearEndDate', 'dei:EntityWellKnownSeasonedIssuer', 'dei:EntityCurrentReportingStatus', 'dei:EntityVoluntaryFilers', 'dei:EntityFilerCategory', 'dei:EntitySmallBusiness', 'dei:EntityEmergingGrowthCompany', 'dei:EntityShellCompany', 'dei:EntityCommonStockSharesOutstanding', 'dei:EntityPublicFloat', 'us-gaap:CashAndCashEquivalentsAtCarryingValue', 'us-gaap:RestrictedC

# Downloading and saving SEC documents

In [40]:
def download_sec_document(doc_link, directory='forms/'):
    contents = clean_sec_content(cach.get(base_url.format(doc_link)).content)
    name = slugify(doc_link)
    with open(directory + name, 'w') as f: f.write(contents)

In [41]:
# We'll scrape a collection of 424B5 forms

num_days = 5

for p in range(0, num_days):
    print('Scraping day-page:', p)
    forms = get_current_events(p, '424B5')
    for link in forms['link_0']:
        docs = get_filing_documents(base_url.format(link))
        doc_link = docs.loc[docs.Type == '424B5', 'Document'].values[0]
        download_sec_document(doc_link)

Scraping day-page: 0
Scraping day-page: 1
Scraping day-page: 2
Scraping day-page: 3
Scraping day-page: 4


# Extracting information

## Cleaning

In [43]:
def read_html(file):
    with open(file, 'r') as f: return f.read()

In [50]:
def clean_html(html):
    soup = BeautifulSoup(html)
    if not soup.find('p'):
        for div in soup.find_all('div'):
            div.name = 'p'
    for b in soup.find_all('b'):
        b.name = 'strong'
    for f in soup.find_all('font', style=re.compile('font-weight:\s*bold')):
        f.name = 'strong'
    for footer in soup.find_all(class_=['header', 'footer']): 
        try: footer.decompose()
        except: pass
    san = sanitize(str(soup), CustomCleaner)
    soup = BeautifulSoup(san)
    def decompose_parent(el, parent='p', not_grandparent='table'):
        try:
            parent = el.find_parent(parent)
        except: parent = None
        if not parent: return
        grandparent = parent.find_parent('table')
        if grandparent: return
        parent.decompose()
    for el in soup.find_all(text=lambda x: 'table of contents' == str(x).lower().strip()):
        decompose_parent(el, 'a')
    for el in soup.find_all(text=re.compile(r'^\s*S\-(\d+|[ivxlcdm]+)\s*$')): 
        decompose_parent(el, 'p')
    for el in soup.find_all(text=re.compile(r'^\s*\d+\s*$')): 
        decompose_parent(el, 'p')
    return soup

In [51]:
content = read_html('forms/-archives-edgar-data-319201-000119312519074988-d706100d424b5.htm')
cleaned = clean_html(content)

In [77]:
window(str(cleaned))

## Basic offering summary extraction

In [88]:
content = read_html('forms/-archives-edgar-data-319201-000119312519074988-d706100d424b5.htm')
cleaned = clean_html(content)

In [89]:
window(str(cleaned))

In [85]:
def extract_dual_tables(soup):
    dualrows = []
    for tr in soup.select("table tr"):
        row = [td.text.strip() for td in tr.find_all('td')]
        if len(row) != 2:
            continue
        if row[1].strip() == '':
            continue
        if all([row[x] == '' for x in range(0, len(row)-1)]):
            if len(dualrows) > 0 and len(row) == len(dualrows[-1]):
                dualrows[-1][-1] += ' ' + row[-1]
        else:
            dualrows.append(row)
    return dualrows

In [55]:
extract_dual_tables(cleaned)

[['(1)',
  'Calculated in accordance with Rule 457(r) of the Securities Act of 1933, as amended.'],
 ['(1)',
  'Plus accrued interest from March 20, 2019, if settlement occurs after that date'],
 ['Issuer', 'KLA-Tencor Corporation, a Delaware corporation.'],
 ['Securities offered',
  '$800,000,000\xa0aggregate principal amount of 2029\xa0notes and $400,000,000 aggregate principal amount of 2049\xa0notes.'],
 ['Maturity date',
  'The 2029\xa0notes will mature on March 15, 2029\xa0and the 2049\xa0notes will mature on\xa0March 15, 2049.'],
 ['Interest payment date',
  'March 15 and\xa0September 15 of each year, beginning\xa0September 15, 2019.'],
 ['Ranking',
  'The notes will be our unsecured senior obligations and will: As of December\xa031, 2018, we had $2.2\xa0billion of indebtedness for borrowed money, which includes current and non-current\nliabilities, none of which was secured indebtedness, and our subsidiaries had $526.3\xa0million of liabilities (including trade payables but exc

In [105]:
# Warning: the text parameter uses string internally and doesn't work in case of nested tags!
cleaned.find('p', text=re.compile(r'OFF'))

In [106]:
def match_by_name_and_regex(name, regex, lowercase=True):
    return lambda el : el.name == name and re.search(regex, el.text.lower() if lowercase else el.text) is not None

In [109]:
def get_offering_header_candidates(soup):
    return soup.find_all(match_by_name_and_regex('p', r'\s*offering\s*$'))

def get_after_offering_header_tables(header):
    tables = ''
    nextSibling = header.nextSibling
    table_seen = False
    while True:
        if nextSibling is None:
            break
        if type(nextSibling) == NavigableString:   
            if table_seen and str(nextSibling).strip() != '': break
            nextSibling = nextSibling.nextSibling
            continue
        if nextSibling.name != 'table':
            if table_seen and nextSibling.get_text(strip=True) != '': break
            nextSibling = nextSibling.nextSibling
            continue
        table_seen = True
        tables += str(nextSibling)
        if not nextSibling.nextSibling:
            print(nextSibling)
            print(nextSibling.nextSibling)
        nextSibling = nextSibling.nextSibling
    return tables

def extract_offering(soup):
    for header in get_offering_header_candidates(soup):
        tables = get_after_offering_header_tables(header)
        if tables:
            return extract_dual_tables(BeautifulSoup(tables))

        
extract_offering(cleaned)

[['Issuer', 'KLA-Tencor Corporation, a Delaware corporation.'],
 ['Securities offered',
  '$800,000,000\xa0aggregate principal amount of 2029\xa0notes and $400,000,000 aggregate principal amount of 2049\xa0notes.'],
 ['Maturity date',
  'The 2029\xa0notes will mature on March 15, 2029\xa0and the 2049\xa0notes will mature on\xa0March 15, 2049.'],
 ['Interest payment date',
  'March 15 and\xa0September 15 of each year, beginning\xa0September 15, 2019.'],
 ['Ranking',
  'The notes will be our unsecured senior obligations and will: As of December\xa031, 2018, we had $2.2\xa0billion of indebtedness for borrowed money, which includes current and non-current\nliabilities, none of which was secured indebtedness, and our subsidiaries had $526.3\xa0million of liabilities (including trade payables but excluding intercompany obligations and liabilities of a type not required to be reflected on a balance\nsheet of such subsidiaries in accordance with U.S. generally accepted accounting principles (“

## Spacy

In [110]:
import spacy
from spacy import displacy
import en_core_web_sm # python -m spacy download en

nlp = en_core_web_sm.load()

In [111]:
content = read_html('forms/-archives-edgar-data-319201-000119312519074988-d706100d424b5.htm')
cleaned = clean_html(content)

offering_table = extract_offering(cleaned)

In [112]:
for h, t in offering_table:
    doc = nlp(t)
    pprint([(X.text, X.label_) for X in doc.ents])

[('KLA-Tencor Corporation', 'ORG'), ('Delaware', 'GPE')]
[('800,000,000', 'MONEY'),
 ('\xa0', 'ORG'),
 ('2029', 'DATE'),
 ('\xa0', 'ORG'),
 ('400,000,000', 'MONEY'),
 ('2049', 'CARDINAL'),
 ('\xa0', 'ORG')]
[('2029', 'DATE'),
 ('\xa0', 'ORG'),
 ('March 15, 2029\xa0', 'DATE'),
 ('\xa0', 'PERSON'),
 ('\xa0March 15, 2049', 'DATE')]
[('March 15', 'DATE'),
 ('September 15 of each year', 'DATE'),
 ('\xa0', 'PERSON'),
 ('September 15, 2019', 'DATE')]
[('December\xa031, 2018', 'DATE'),
 ('2.2', 'MONEY'),
 ('\n', 'GPE'),
 ('526.3', 'MONEY'),
 ('\xa0million', 'CARDINAL'),
 ('\n', 'GPE'),
 ('U.S.', 'GPE'),
 ('Orbotech', 'ORG'),
 ('\n', 'GPE'),
 ('$900.0\xa0million', 'MONEY'),
 ('Revolving Credit Facility', 'ORG'),
 ('Orbotech', 'ORG'),
 ('Pro', 'ORG'),
 ('\n', 'GPE'),
 ('Capitalization', 'WORK_OF_ART'),
 ('$3.5\xa0billion', 'MONEY'),
 ('December\xa031, 2018', 'DATE')]
[('\n', 'GPE'),
 ('\n', 'GPE'),
 ('first', 'ORDINAL'),
 ('\n', 'GPE'),
 ('— Principal', 'PERSON')]
[]
[]
[('101%', 'PERCENT'), ('\

In [114]:
for h, t in offering_table:
    doc = nlp(t)
    print('================= ' + h + ' ==============')
    displacy.render(doc, jupyter=True, style='ent')















  "__main__", mod_spec)




  "__main__", mod_spec)






  "__main__", mod_spec)














See https://prodi.gy/ for a way how we could relatively easily train on SEC entities.

## Iterating over files

In [116]:
all_combinations = []

for html_file in glob('forms/*.htm'):
    print(html_file)
    content = read_html(html_file)
    cleaned = clean_html(content)
    off = extract_offering(cleaned)
    if off:
        all_combinations += off

forms\-archives-edgar-data-1035443-000104746919001118-a2237996z424b5.htm
forms\-archives-edgar-data-1035443-000104746919001263-a2238051z424b5.htm
forms\-archives-edgar-data-1042046-000114036119004640-s002575x1_424b5.htm
forms\-archives-edgar-data-1042046-000114036119004798-s002575x3_424b5.htm
forms\-archives-edgar-data-1044378-000156459019008062-bioc-424b5.htm
forms\-archives-edgar-data-1053507-000119312519071425-d636720d424b5.htm
forms\-archives-edgar-data-1066194-000104746919001242-a2238081z424b5.htm
forms\-archives-edgar-data-1090727-000119312519072761-d704691d424b5.htm
forms\-archives-edgar-data-1090727-000119312519075162-d704691d424b5.htm
forms\-archives-edgar-data-1099590-000114036119004743-s002725x2_424b5.htm
forms\-archives-edgar-data-1099590-000114036119004897-s002725x5_424b5.htm
forms\-archives-edgar-data-1126530-000092963819000366-a73006_424b5.htm
forms\-archives-edgar-data-1133818-000114420419013719-tv516099_424b5.htm
forms\-archives-edgar-data-1169987-000119312519073308-d7

In [None]:
window(str(cleaned))

In [117]:
set([ x[0] for x in all_combinations ])

{'(1)',
 '(2)',
 '(3)',
 '1',
 'ADSs to preferred share ratio',
 'Additional Amounts',
 'Bonds Offered',
 'Book-Entry Form Only',
 'Borrowed ADSs offered',
 'Certain Covenants',
 'Change of Control Offer',
 'Change of Control\xa0—\xa0Repurchase at the Option of the Holders',
 'Class\xa0A common stock offered by us',
 'Class\xa0A common stock outstanding after this offering',
 'Common Stock Offered By Us',
 'Common Stock to be Outstanding After this Offering',
 'Common shares outstanding prior to this offering',
 'Common shares to be offered from time to time',
 'Common stock offered by the selling stockholders',
 'Common stock offered by us',
 'Common stock offered by us in this\noffering',
 'Common stock offered by us:',
 'Common stock outstanding',
 'Common stock outstanding after this offering1',
 'Common stock to be outstanding\n        after this\noffering',
 'Common stock to be outstanding after this offering',
 'Common stock to be outstanding immediately after this offering',
 '

In [119]:
for entry in all_combinations:
    if 'symbol' in entry[0].lower():
        m1 = re.search(r'“([A-Z\.]{2,})”', entry[1])
        m2 = re.search(r'[A-Z]{2,}', entry[1])
        print(entry[1], '-->', m1.group(1) if m1 else m2.group(0))

HTGM --> HTGM
“ROKU” --> ROKU
Our common stock is listed for trading on The Nasdaq Capital Market and the TSX Venture Exchange under the symbol “WTER”. --> WTER
“TRTX” --> TRTX
“KRYS” --> KRYS
“KOPN.” --> KOPN.
“KOPN.” --> KOPN.
“DPW.” --> DPW.
“ABIO” --> ABIO
LXP --> LXP


In [120]:
for entry in all_combinations:
    if 'offer' in entry[0].lower():
        print('----------------------------')
        print(entry[1])
        print('----------------------------')
        doc = nlp(entry[1])
        print( 'MONEY:', [X.text for X in doc.ents if X.label_ == 'MONEY'] )
        print( 'CARDINAL:', [X.text for X in doc.ents if X.label_ == 'CARDINAL'] )
        print( 'DATE:', [X.text for X in doc.ents if X.label_ == 'DATE'] )
        print( 'PERCENT:', [X.text for X in doc.ents if X.label_ == 'PERCENT'] )

----------------------------
$                 aggregate principal amount of     % Senior Notes due 2024 and
$                 aggregate principal amount of     % Senior Notes due 2029.
----------------------------
MONEY: []
CARDINAL: []
DATE: ['2024', '2029']
PERCENT: ['\xa0\xa0\xa0\xa0%']
----------------------------
Following a Change of Control and Ratings Decline (each as defined herein), we will be required to offer to purchase all of the notes at a purchase price equal to 101% of the aggregate
principal amount of the notes repurchased, plus accrued and unpaid interest, if any, up to but not including the date of repurchase. See “Description of Notes—Repurchase of Notes Upon a Change of Control
Triggering Event.” The 2013 Credit Facility and the 2014 Credit Facility might restrict our ability to make such a payment.
----------------------------
MONEY: []
CARDINAL: []
DATE: ['2013', '2014']
PERCENT: ['101%']
----------------------------
Shares of our common stock having an aggrega

had 16,843,208 shares of our common stock outstanding as of December 31, 2018.
----------------------------
MONEY: ['50,000,000', '20.71']
CARDINAL: ['16,843,208']
DATE: ['March\xa011, 2019', 'December\xa031, 2018']
PERCENT: []
----------------------------
“At-the-market” offering that may be made from time to time through our sales agent, Cowen and Company, LLC. See “Plan of Distribution” on page S-12 of this prospectus supplement.
----------------------------
MONEY: []
CARDINAL: []
DATE: []
PERCENT: []
----------------------------
$             aggregate principal amount of 20     notes and
$             aggregate principal amount of 20     notes.
----------------------------
MONEY: []
CARDINAL: ['20', '20']
DATE: []
PERCENT: []
----------------------------
$800,000,000 aggregate principal amount of 2029 notes and $400,000,000 aggregate principal amount of 2049 notes.
----------------------------
MONEY: ['800,000,000', '400,000,000']
CARDINAL: ['2049']
DATE: ['2029']
PERCENT: []
----

In [122]:
def print_matches(typ, rgx, txt, gnum=0, bl=[]):
    matches = re.finditer(rgx, txt)
    if not matches: return
    for match in matches:
        match_text = match.group(gnum)
        if all([x not in match_text for x in bl]):
            print('-', typ, ':', match.group(gnum))

for entry in all_combinations:
    if not 'offer' in entry[0].lower(): continue
    print('----------------------------')
    print(entry[1])
    
    print_matches('money', r'\$([0-9,\.]+)\s*million', entry[1])
    print_matches('money', r'\$([0-9,\.]+\s*m*)', entry[1], 1, bl=['m'])
    print_matches('percent', r'([0-9\.,]+)\s*%', entry[1], 1)
    print_matches('cardinal', r'\$*\s*([0-9.,][0-9.,]+)\s*%*', entry[1], bl=['%', 'due', '$', '.'])
    print_matches('date', r'due\s*([0-9]+)', entry[1], 1)


----------------------------
$                 aggregate principal amount of     % Senior Notes due 2024 and
$                 aggregate principal amount of     % Senior Notes due 2029.
- cardinal :  2024 
- date : 2024
- date : 2029
----------------------------
Following a Change of Control and Ratings Decline (each as defined herein), we will be required to offer to purchase all of the notes at a purchase price equal to 101% of the aggregate
principal amount of the notes repurchased, plus accrued and unpaid interest, if any, up to but not including the date of repurchase. See “Description of Notes—Repurchase of Notes Upon a Change of Control
Triggering Event.” The 2013 Credit Facility and the 2014 Credit Facility might restrict our ability to make such a payment.
- percent : 101
- cardinal :  2013 
- cardinal :  2014 
----------------------------
Shares of our common stock having an aggregate offering price of up to $40,000,000.
- money : 40,000,000.
----------------------------
“At 

€300,000,000 aggregate principal amount of 0.000% notes due 2020.
- percent : 0.000
- cardinal : 300,000,000 
- date : 2020
----------------------------
shares.
----------------------------
$ per share
----------------------------
shares.
----------------------------
7,272,727 shares.
- cardinal : 7,272,727 
----------------------------
$1.10 per share.
- money : 1.10 
----------------------------
83,494,791 shares.
- cardinal : 83,494,791 
----------------------------
Shares of our common stock having an aggregate offering price of up to $100.0 million.
- money : $100.0 million
----------------------------
“At the market offering” that may be made from time to time through our sales agent, B. Riley FBR, Inc.  See “Plan of Distribution” on page S-7 of this prospectus supplement.
----------------------------
180,785 shares of our common stock.
- cardinal : 180,785 
----------------------------
6,482,073 shares of common stock
- cardinal : 6,482,073 
----------------------------
Shares o