In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re

from bs4 import BeautifulSoup as bs

import requests
from urllib.request import urlopen, Request 
from path import Path, getcwdu

import glob
import os
from pathlib import PurePath
import copy


from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, recall_score

In [2]:
train_file_list = [PurePath(file).name for file in glob.iglob('data/nc_training_filings/*')]
train_path_list = [PurePath(os.getcwd()).joinpath(file).as_posix() for file in glob.iglob('data/nc_training_filings/*')]
train_accession_ids = [PurePath(file).stem for file in train_file_list]
train_cik_nbrs = [x.split(sep='-')[0] for x in train_accession_ids]

In [3]:
val_accession_ids = train_accession_ids[300:]

Helper function for viewing paragraph text in training dataframe

In [10]:
def print_row_detail(df=srp_df, nrow=10, header_list = ['ticker', 'accession_number' ],
                    detail_list = ['data_key_friendly_name', 'text', 'paragraph_text'],
                    sortby=['accession_number', 'data_key_friendly_name'], ascending=True):
    df_sorted = df.sort_values(sortby, ascending=ascending).reset_index()
    nrow = min(len(df_sorted), nrow)
    for i in range(0, nrow):
        for h in header_list:
            print('-'*35  + ' ' +  str(df_sorted[h][i]) + ' ' + '-'*35)
        for d in detail_list:
            print(d + '  :' + str(df_sorted[d][i]))
            print('')

### Testing patterns and functions for creating "documents" from sections of html files

In [15]:
repurch_pat_list = [r"authorized the (re)?purchase",
r"repurchas(e|es|ed)( authoriz\w+)?",
r"[^0-9a-z]ASR",
r"Shares (re)Purchased",
r"Share buyback",
r"common stock purchase",
r"purchase an additional",
r"remaining share"]
repurch_pats = [re.compile(x, re.I) for x in repurch_pat_list]

In [16]:
repurch_pat = re.compile(r'authorized the purchase|repurchas(e|es|ed)|[^0-9a-z]ASR|Shares Purchased|Share buyback|common stock purchase|purchase an additional|remaining share', flags=re.I)

In [None]:
df_pat = re.compile(r'.*(authorized the purchase|repurchase|[^0-9a-z]ASR|Shares Purchased|Share buyback|common stock purchase|purchase an additional|remaining share).*', flags=re.I)

This table was identifed as a paragraph. The word sequence doesn't work. The div above it clearly talks about share repurchase and "following table." 

In [None]:
#https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=0000002969&type=10-K&dateb=20171031&owner=include&count=40

### Functions for extracting candidate documents from an html file

Function to create and index a dataframe from an html table. Empty cells are removed.

In [30]:
def df_from_html_tbl(table_tag):
    data_re = re.compile(r'[a-zA-Z0-9$().]+')
    repl = re.compile(r'[()$]')
    rows = []
    for row in table_tag.findChildren('tr'):
        row_list = []
        for s in row.strings:
            s = re.sub(repl, '', s.strip())
            if len(s) > 0:
                row_list.append(s)
        if len(row_list) > 1:
            rows.append(row_list)
    tbl_df = pd.DataFrame.from_records(rows)
    try:
        tbl_df = tbl_df.set_index( tbl_df.applymap(len).max().values.argmax()) 
    except: tbl_df = tbl_df.set_index(0)
    return tbl_df

In [31]:
def get_repurch_hits(filepath):
    try:
        with open(filepath, encoding="utf8") as file: 
            file_html = file.read()
            soup = bs(file_html, 'lxml')
    except: 
        with open(filepath) as file: 
            file_html = file.read()
            soup = bs(file_html, 'lxml')
    soup_repurch = soup.body.find_all(string=[repurch_pat])
    text_dict = {} 
    df_dict = {}
    html_dict = {}
    html_list = []
    for idx, item in enumerate(soup_repurch):
        if item.find_parent('table') is None:
            if len(item.parent.parent.text) < 2000: 
                if item.parent.parent not in html_list:
                    tag = copy.copy(item.parent.parent)
                    text_dict[idx] = tag.get_text(' ', strip=True)
                    html_dict[idx] = tag
                    html_list.append(tag)
            elif item.parent not in html_list:
                tag = copy.copy(item.parent)
                text_dict[idx] = tag.get_text(' ', strip=True)
                html_dict[idx] = tag
                html_list.append(tag)
        elif item.find_parent('table') not in html_list:
            tag = copy.copy(item.find_parent('table'))
            text_dict[idx] = tag.get_text(' ', strip=True)
            html_dict[idx] = tag
            html_list.append(tag)
            try:
                df_dict[idx] = df_from_html_tbl(tag)
            except:
                if len(item.parent.parent.text) < 2000: 
                    if item.parent.parent not in html_list:
                        tag = copy.copy(item.parent.parent)
                        text_dict[idx] = tag.get_text(' ', strip=True)
                        html_dict[idx] = tag
                        html_list.append(tag)
                elif item.parent not in html_list:
                    tag = copy.copy(item.parent)
                    text_dict[idx] = tag.get_text(' ', strip=True)
                    html_dict[idx] = tag
                    html_list.append(tag)
    doc_dict = {'text' : text_dict, 
                'html' : html_dict,
               'df' : df_dict}
    return doc_dict 

#### Errors in table parsing to work out

In [24]:
 with open(train_path_list[134], encoding="utf8") as file: 
            file_html = file.read()
            err_soup = bs(file_html, 'lxml')
err_soup_repurch = err_soup.find_all(string=[repurch_pats])

In [25]:
err_tbl_dict = {}
for idx, item in enumerate(err_soup_repurch):
    if item.find_parent('table') is not None:
        err_tbl_dict[idx] = item.find_parent('table')


In [26]:
for k, v in err_tbl_dict.items():
    try: 
        df_from_html_tbl(err_tbl_dict[k])
    except: 
        print("key number " + str(k) + " causes an error")

key number 3 causes an error
key number 4 causes an error
key number 12 causes an error
key number 13 causes an error
key number 14 causes an error
key number 15 causes an error
key number 18 causes an error
key number 19 causes an error
key number 20 causes an error


In [27]:
print(err_tbl_dict[4].prettify())

<table cellpadding="0" cellspacing="0" class="c115">
 <tr>
  <td colspan="5" valign="bottom">
  </td>
 </tr>
 <tr>
  <td class="c117" valign="bottom">
  </td>
  <td class="c118" valign="bottom">
  </td>
  <td class="c118" valign="bottom">
  </td>
  <td class="c118" valign="bottom">
  </td>
  <td class="c118" valign="bottom">
  </td>
 </tr>
 <tr>
  <td class="c8" colspan="5" valign="bottom">
   <div class="c114">
    ² In March 2018, our Board of Directors approved an increase in the stock repurchase
authorization for fiscal 2018 by $200 million to $1.075 billion, up from the previously available $875 million
as of February 3, 2018.
   </div>
  </td>
 </tr>
</table>



In [30]:
#get_repurch_hits(train_path_list[107])

{'df': {3:                                       1      2      3     4
  0                                                          
  2017                               2016   2015   None  None
  Shares Repurchased   Units in\nMillions     39     38    26
  Cost of Shares      Dollars in Millions  1,970  1,056   804,
  7:                                                                1  \
  0                                                                  
  Fourth Quarter                  Total Number of Shares Purchased   
  Beginning Balance                                      7,696,097   
  October 1 - October 31, 2017                             165,307   
  November 1 - November 30, 2017                         1,560,559   
  December 1 - December 31, 2017                         2,261,847   
  Ending Balance                                         3,987,713   
  
                                                             2  \
  0                                             

## Create dictionary of documents parsed from html files : `para_dict`

In [32]:
para_dict = {}
for acc_id, file in zip(train_accession_ids[:], train_path_list[:]):
    para_dict[acc_id] = get_repurch_hits(file)

In [41]:
len(para_dict.keys())

435

In [42]:
len(train_accession_ids)

435

Count of "text" entries that are actually tables, and that have corresponding "df" entries

In [43]:
df_count = 0
df_key_dict = {}
df_keys = {}
for k in para_dict.keys():
    df_key_list = []
    for k2 in para_dict[k]['text'].keys():
        if k2 in para_dict[k]['df'].keys():
            df_count +=1
            df_key_list.append(k2)
    df_key_dict[k] = df_key_list
print(df_count)


2807


### Test and refine regex patterns for flagging likely relevant documents

In [44]:
def check_regex_match(pattern, text_list):
    for idx, s in enumerate(text_list):
        mo = re.search(pattern, s)
        if mo:
            ms = mo.span()[1]
            print("------    " + str(idx) + "   Matched!    -----")
            print('str length  :' + str(len(s)) + '    match span  :' + str(ms))
            print(s[:ms])
            print('')
            print(s[ms:])
            print(re.search(pattern, s))
        else:
            print("------    " + str(idx) + "  NO MATCH    -----")
            print(s)

In [45]:
test_list = ["On December 11, 2017, we announced a new repurchase plan for up to $18 billion of common stock, replacing the plan previously authorized in 2016.", 
"On December 11, 2017, the Board approved a new repurchase plan for up to $18,000 of common stock", 
"In February 2018, the Company's Board of Directors approved a new standing share repurchase authorization, whereby the Company may repurchase up to 20 million shares of its common stock through December 31, 2020 .", 
"In June 2016, Applied's Board of Directors approved a common stock repurchase program authorizing up to $2.0 billion in repurchases, which followed the completion of a $3.0 billion common stock repurchase program approved in April 2015. In September 2017, Applied's Board of Directors approved an additional common stock repurchase program authorizing up to an additional $3.0 billion in repurchases. At October 29, 2017 , $3.6 billion remained available for future stock repurchases under these repurchase programs.", 
"On December 11, 2017, we announced a new repurchase plan for up to $18 billion of common stock", 
"On February 15, 2016, the Board of Directors of the Company approved an increase to the current authorization for the stock repurchase program by $600.0 million to $1.0 billion in the aggregate. In the aggregate, our Board of Directors has authorized us to repurchase $6.2 billion of our common stock under the program. ", 
"At December 31, 2017, the amount available under the share repurchase plan, announced on December 11, 2017 , totaled $18 billion"
]

#### Actual patterns currently being used to label documents as relevant

In [46]:
pat_list = [r"Board( of Directors)?( has)? authorized the repurchase", 
r"(on|in|at|as of) (jan\w+|feb\w+|march|april|may|june|july|august|sept\w+|novem\w+|decem\w+) (20)?[0-9]{1,2}, (20[1-2][0-9][,] )?((we|the|\w+[']s) ){1,2}(board( of directors)? )?(of (the company|\w+([']s))? )?(approved|announced|authorized) (((a|the) (new )?)|(an increase to the (current )?(authoriz\w+ )?(for the )?))((stock|common stock|standing share) )?((rep|p)urchase) (authorization|plan|program)?(,)? (by|for|authoriz\w+|whereby the Company may repurchase) (up to )?([$])?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{,4})( (m|b)illion )?", 
r"((we|the|\w+[']s) ){1,2}(board( of directors)? )?(of (the company|\w+([']s))? )?(approved|announced|authorized) (((a|the) (new )?)|(an increase to the (current )?(authoriz\w+ )?(for the )?))((stock|common stock|standing share) )?((rep|p)urchase) (authorization|plan|program)?(,)? (by|for|authoriz\w+|whereby the Company may repurchase) (up to )?([$])?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{,4})( (m|b)illion)?",       
r"Board( of Directors)?( has)? (approved|authorized) a( new)? (share|stock) repurchase program", 
r"Board( of Directors)? approved an( additional)? increase in the stock repurchase",
r"share repurchase authorization (remain|remained|by the board)",
r"shares (rep|p)urchased as Part of Public", 
r"accelerated share repurchase",     
r"(re)?purchase(d)?[,]? (up to )?(an aggregate |in aggregate, |a )?(total )?(of )?((up to|approximately) )?(([$])?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{0,4}) ((m|b)illion )?(shares )?)(and (([$])?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{0,4}) ((m|b)illion )?(shares )?))?(shares )?of ((our|its|the|/w+[']s) ){1,2}common stock",
r"Shares (remaining )?that May Yet Be Purchased", 
r"(authorized|approved) a share repurchase program", 
r"authorized the repurchase of (shares|up to)", 
r"authorized repurchases of up to ([$])?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{,4}) ((m|b)illion )shares",
r"authorized share repurchase program", 
r"authorization replace(d|s) (the|all|any) prior repurchase authorization", 
r"([$])?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{,4})( (m|b)illion)? shares were repurchased", 
r"(approximately )?[$](([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{0,4})( (m|b)illion)? to repurchase (approximately )?[$]?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{,4}) ((m|b)illion )?(shares )?of (\w+([']s)? )?Common Stock",
r"repurchased ([$])?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{,4}) (million )?(common )?shares", 
r"([$])?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{,4}) ((m|b)illion )? stock repurchase",
r"(Company|we) (repurchased|purchased) (approximately )?([$])?([0-9]{,4}[.]?[0-9]{,4}) million shares", 
r"we did not repurchase any shares", 
r"(shares|amount) ((available|remaining) )((for|under|the|share) ){1,5}repurchase ((\w+)[,]? ){0,4}share repurchase((\w+)[,]? ){1,6}([$])?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{0,4})( (m|b)illion)?( share(s)?)?",            
r"([$])?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{0,4}) ((m|b)illion )?(shares )?(of our common stock )?remain((s|ing|ed) )?under ((the|our|publicly announced) ){0,3}((authoriz/w+|program(s)?|share|repurchase)\s?){1,4}",
r"([$])?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{0,4}) ((m|b)illion )?(shares )?(of our common stock )?remain((s|ing|ed) )?to be repurchase",
r"million common shares remaining under the", 
r"remai(n|ned|ning) under (our|the) share repurchase (authorization|program)", 
r"The Company currently plans to (rep|p)urchase ([$]?)([0-9]{,4}[.]?[0-9]{,4}) ((m|b)illion )?(to ([$]?)([0-9]{,4}[.]?[0-9]{,4}) (m|b)illion )?(shares|of its common stock)",
r"(approximately )?([$]?)([0-9]{1,4}[.]?[0-9]{0,4}) (m|b)illion ((shares|of|our|common|stock) ){0,5}remai(ned|n) (available|under the authorization|authorized)",
r"ha(d|s) (approximately )([$]?)([0-9]{1,4}[.]?[0-9]{0,4}) (m|b)illion remaining under ((the|this ))?repurchase authorization",
        
           ]

reg_list = [re.compile(x, re.I) for x in pat_list]

In [47]:
#r"(approximately )?[$](([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{0,4})( (m|b)illion)? to repurchase (approximately )?[$]?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{,4}) ((m|b)illion )?(shares )?of AIG Common Stock"
check_regex_match(r"(approximately )?[$](([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{0,4})( (m|b)illion)? to repurchase (approximately )?[$]?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{,4}) ((m|b)illion )?(shares )?of (\w+([']s)? )?Common Stock",
                 ['• approximately $6.3 billion to repurchase approximately 100 million shares of AIG Common Stock; and', 
                 'repurchase programs approved by the Board of Directors in June 2016 and in September 2017, which authorized up to an aggregate of $5.0 billion in repurchases'])


------    0   Matched!    -----
str length  :100    match span  :95
• approximately $6.3 billion to repurchase approximately 100 million shares of AIG Common Stock

; and
<_sre.SRE_Match object; span=(2, 95), match='approximately $6.3 billion to repurchase approxim>
------    1  NO MATCH    -----
repurchase programs approved by the Board of Directors in June 2016 and in September 2017, which authorized up to an aggregate of $5.0 billion in repurchases


Patterns used to filter after initial regex

In [48]:
nonum_pats = [r"^[^\d]*$",
    r"^[^\d]*\d{1,2}[^\d]*$", 
    r"^([^\d]+(\d{1,2}[^\d]{2,}){1,5}\d{1,3}[^\d]*)$",
    r"^((\d{1,3}[^\d]{2,}){1,4}\d{1,2}[^\d]*)$"]
year_and_num = re.compile(r"20[0-2][0-9].*[0-9]{1,3}.*|[0-9]{1,3}.*20[0-2][0-9]", re.I)
nonum_regs = [re.compile(x) for x in nonum_pats]

In [49]:
# After reading back in, the html needs to be parsed again
#tbl_html_df = pd.read_csv('tbl_html_df.csv')
#tbl_html_df['tbl_html'] = tbl_html_df2.tbl_html.apply(lambda x: bs(x, 'lxml'))

### Assign documents to positive and negative lists based on regex; also clean text

In [50]:
pos_list = []; pos_key_list = []; neg_list = []; neg_key_list = [];
pos_keys = set(); neg_keys = set()

space_pat = re.compile(r"\s+")
para_dict_2 = {}
for idx, k in enumerate(para_dict.keys()):
    doc_dict = {}
    text_dict = {}
    for k2, v2 in para_dict[k]['text'].items():
        para = re.sub(space_pat, ' ', para_dict[k]['text'][k2].replace('\n', ' ')).replace(' ,',',')
        # Remove texts that match exclusion patterns
        if not any(reg.search(para) for reg in nonum_regs) and k2 not in para_dict[k]['df'].keys():
            text_dict[k2] = para
            if any(reg.search(para) for reg in reg_list):
                pos_list.append(para)
                pos_key_list.append(k)
                pos_keys.add(k)
            else:
                neg_list.append(para)
                neg_key_list.append(k)
                neg_keys.add(k)
    doc_dict['text'] = text_dict
    para_dict_2[k] = doc_dict
#    idx += 1
#    if idx >= 300:
#        break

In [51]:
pos_tup_train = [(pos_key_list[i], pos_list[i]) for i,v in enumerate(pos_key_list) if v not in val_accession_ids]
pos_tup_val = [(pos_key_list[i], pos_list[i]) for i,v in enumerate(pos_key_list) if v in val_accession_ids]
neg_tup_train = [(neg_key_list[i], neg_list[i]) for i,v in enumerate(neg_key_list) if v not in val_accession_ids]
neg_tup_val = [(neg_key_list[i], neg_list[i]) for i,v in enumerate(neg_key_list) if v  in val_accession_ids]

In [52]:
neg_key_list_train, neg_list_train = ([k[0] for k in neg_tup_train], [k[1] for k in neg_tup_train])
neg_key_list_val, neg_list_val = ([k[0] for k in neg_tup_val], [k[1] for k in neg_tup_val])
pos_key_list_train, pos_list_train = ([k[0] for k in pos_tup_train], [k[1] for k in pos_tup_train])
pos_key_list_val, pos_list_val = ([k[0] for k in pos_tup_val], [k[1] for k in pos_tup_val])

54 keys keys missed out of 300 covered  (11 if table texts are included) (2 only had table paragraphs)

In [53]:
print('total paragraphs with positive hits: ' + str(len(pos_list)))
print('training paragraphs with positive hits: ' + str(len(pos_list_train)))
print('training keys with positive hits: ' + str(len(set(pos_key_list_train))) + ' out of 300')
print('validation keys with positive hits: ' + str(len(set(pos_key_list_val))) + ' out of 135')

total paragraphs with positive hits: 1098
training paragraphs with positive hits: 764
training keys with positive hits: 246 out of 300
validation keys with positive hits: 112 out of 135


In [54]:
print('total paragraphs with no regex match: ' + str(len(neg_list)))
print('training paragraphs with no regex match: ' + str(len(neg_list_train)))
print('training keys with paragraphs labeled negative: ' + str(len(set(neg_key_list_train))) + ' out of 300')
print('validation keys with paragraphs labeled negative: ' + str(len(set(neg_key_list_val))) + ' out of 135')

total paragraphs with no regex match: 3161
training paragraphs with no regex match: 2240
training keys with paragraphs labeled negative: 294 out of 300
validation keys with paragraphs labeled negative: 133 out of 135


List of keys with no paragraphs flagged as relevant by the regex

In [55]:
missed_keys = [key for key in neg_keys if key not in pos_keys ]

Used for manually labeling documents as relevant or not

In [56]:
def print_docs_from_list(key_list: list=neg_key_list, doc_list: list=neg_list, start: int=0, ndocs: int=50):
    end = start + ndocs
    for idx, tup in enumerate(zip(key_list[start:end], doc_list[start:end])):
        print(str(idx + start) + '   ------   ' + str(tup[0]))
        print(tup[1])

In [57]:
neg_list_year_and_num = [s for s in neg_list[:100] if re.search(year_and_num, s)]

In [58]:
print_docs_from_list(pos_list, pos_key_list, 150, 2)

150   ------   During 2017, the Company repurchased 11.5 million shares of its common stock for total consideration of $900 million . In November 2016, the Board of Directors of the Company authorized the Company to repurchase up to $2.5 billion of the Company's common stock, which superseded any prior authorizations. The Company remains authorized to purchase additional shares of its common stock up to a value of approximately $1.5 billion . There is no time limit on the authorization. During 2016, the Company purchased 12.7 million shares of its common stock for total consideration of $800 million .
0000062709-18-000007
151   ------   In May 2017, our Board of Directors authorized the repurchase, for retirement, of up to $1.5 billion of shares of our common stock in open-market transactions or otherwise, replacing the previous Board of Directors authorization established in 2014. During 2017, we repurchased and retired 9.2 million shares of our common stock (including 0.9 million sha

35 files are included in the labeled negative documents

In [59]:
# Keys covered by first 200 entries
print(len(set(pos_key_list[:200])))
print(len(set(neg_key_list[:200])))

69
35


In [60]:
false_pos_indices = []
false_neg_indices = []

Make dataframe out of dictionaries, lists

In [39]:
#### Only keys with html tables that were converted to dataframes
tbl_keys_list = [k for k in para_dict.keys() if len(para_dict[k]['df'].keys()) > 0]

In [40]:
#### Table html 
tbl_html_list = [copy.copy(para_dict[k]['html'][k2]) for k in tbl_keys_list for k2 in para_dict[k]['df'].keys() ]
#### Corresponding key
tbl_key_list = [k for k in tbl_keys_list for k2 in para_dict[k]['df'].keys() ] 

In [41]:
#### Make dataframe
tbl_html_df = pd.DataFrame(data = { 'acc_id': tbl_key_list, 'tbl_html': tbl_html_list, 'split' : 'train' })
tbl_html_df.loc[tbl_html_df.acc_id.isin(val_accession_ids),'split'] = 'val'

In [42]:
#### Write to csv for later use
#tbl_html_df.to_csv('tbl_html_df.csv')

In [63]:
# After reading back in, the html needs to be parsed again
#tbl_html_df = pd.read_csv('data/tbl_html_df.csv')
tbl_html_df['tbl_html'] = tbl_html_df.tbl_html.apply(lambda x: bs(x, 'lxml'))

Change from markdown to code if (re)creating `paragraph_input_df` 

In [44]:
paragraph_input_dict = {'acc_id' : pos_key_list + neg_key_list, 
                       'para_text' : pos_list + neg_list, 
                        'len' : [len(p) for p in pos_list] + [len(n) for n in neg_list],
                       'label' : [1 for p in pos_list] + [0 for n in neg_list], 
                       'split' : 'train'}

paragraph_input_df = pd.DataFrame(paragraph_input_dict, columns=['acc_id', 'para_text','len','split', 'label'])

paragraph_input_df.loc[paragraph_input_df.acc_id.isin(val_accession_ids),'split'] = 'val'
#paragraph_input_df.to_csv('paragraph_input_df.csv')

In [45]:
#paragraph_input_df = pd.read_csv('paragraph_input_df.csv', index_col=0)

In [46]:
paragraph_input_df.head(2)

Unnamed: 0,acc_id,para_text,len,split,label
0,0000002969-17-000039,"On 15 September 2011, the Board of Directors a...",654,train,1
1,0000002969-17-000039,"On 15 September 2011, the Board of Directors a...",282,train,1


### EDA of paragraphs in para_dict_2

In [47]:
doc_count_2 = 0
for k in para_dict_2.keys():
    doc_count_2 +=  len(para_dict_2[k]['text'].keys())
print("There are " + str(len(para_dict_2.keys())) + " keys in para_dict_2.")
print("There are " + str(doc_count_2) + " paragraphs in para_dict_2.")     

There are 435 keys in para_dict_2.
There are 4259 paragraphs in para_dict_2.


Function to add summary dict about specific sub-dict (in this case, about text dictionaries for each file)

In [48]:
def add_subdict_summary(dict_name: dict, key: str, key_to_summarize: str ='text'):
    #Evaluate length of items in subdict
    length_dict = {}
    for k, t in dict_name[key][key_to_summarize].items():
        length_dict[k] = len(t)
    dict_name[key]['length'] = length_dict
    smry_dict = {}
    lengths = list(dict_name[key]['length'].values())
    if len(lengths) == 0:
        stats_lengths = [0]
    else: 
        stats_lengths = lengths
    smry_dict['id'] = key
    smry_dict['cnt'] = len(lengths)
    smry_dict['min'] = min(stats_lengths)
    smry_dict['median'] = np.median(stats_lengths)
    smry_dict['max'] = np.max(stats_lengths)
    smry_dict['mean'] = np.mean(stats_lengths)
    smry_key_list_name = key_to_summarize + '_keys'
    smry_dict[smry_key_list_name] = list(dict_name[key][key_to_summarize].keys())
    dict_name[key]['len_smry'] = smry_dict

In [49]:
for k in para_dict_2.keys():
    add_subdict_summary(para_dict_2, k)

In [50]:
len_cols = ['id','cnt',  'min', 'median', 'max', 'mean', 'text_keys']
len_df = pd.DataFrame([para_dict_2[x]['len_smry'] for x in para_dict_2.keys()], columns=len_cols)

Lengths of the text sections pulled from the html

In [51]:
len_df.describe()

Unnamed: 0,cnt,min,median,max,mean
count,435.0,435.0,435.0,435.0,435.0
mean,9.790805,233.602299,587.691954,1329.498851,651.723067
std,6.545826,181.975705,224.712044,614.957649,218.48146
min,0.0,0.0,0.0,0.0,0.0
25%,6.0,118.0,442.5,954.5,496.928571
50%,9.0,187.0,547.0,1232.0,615.285714
75%,12.0,288.5,691.5,1620.0,763.288889
max,64.0,1340.0,1510.5,5567.0,1510.5


#### Several files have way too many documents

In [52]:
len_df.sort_values('cnt', ascending=False).head(7)

Unnamed: 0,id,cnt,min,median,max,mean,text_keys
70,0000072971-18-000272,64,57,1258.0,2717,1221.5625,"[0, 2, 3, 21, 23, 24, 27, 29, 30, 32, 33, 35, ..."
272,0001104659-18-026815,63,62,578.0,3023,823.428571,"[0, 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14..."
66,0000070858-18-000009,34,83,579.5,1641,639.176471,"[2, 5, 10, 12, 13, 15, 20, 21, 22, 24, 25, 29,..."
137,0000750556-18-000079,32,128,610.5,1943,765.53125,"[2, 9, 11, 12, 15, 16, 19, 21, 24, 28, 30, 31,..."
170,0000859737-17-000023,32,165,721.5,1494,720.40625,"[0, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20..."
177,0000877890-18-000025,30,128,915.5,2727,950.166667,"[0, 1, 2, 4, 6, 9, 12, 18, 19, 20, 22, 23, 25,..."
194,0000912242-18-000004,28,219,671.0,1706,783.107143,"[0, 1, 2, 3, 10, 11, 12, 13, 17, 23, 28, 33, 3..."


A lot of long, irrelevant texts were pulled out of this filing

In [53]:
def print_subdict_items(dict_of_dicts: dict, dict_key: str, max_items: int=0, text_key: str='text'):
    for idx, v in enumerate(dict_of_dicts[dict_key][text_key].items()):
        if max_items:
            if idx >= max_items:
                break
        print(str(idx))
        print(dict_of_dicts[dict_key]['length'][v[0]])
        print(v[1])
        print('')

In [54]:
print_subdict_items(para_dict_2, '0000072971-18-000272',2)

0
722
DIVIDENDS The dividend restrictions discussions on page 3 of this report and in the 2017 Annual Report to Stockholders under "Financial Statements - Notes to Financial Statements - Note 3 (Cash, Loan and Dividend Restrictions)" are incorporated into this item by reference. REPURCHASES OF EQUITY SECURITIES In January 2016, our Board of Directors authorized the repurchase of 350 million shares of our common stock. The authorization covers shares repurchased to meet team member benefit plan requirements. The Company maintains a variety of retirement plans for its team members and typically is a net issuer of shares of common stock to these plans. From time to time, it also purchases shares of common stock from these

1
1066
plans to accommodate team member preferences. Share repurchases are subtracted from the Company's repurchase authority without offset for share issuances. Shares may be repurchased as part of employee stock option exercises, from the different benefit plans or in

Count of "text" entries that are actually tables, and that have corresponding "df" entries

In [55]:
df_count_2 = 0
df_key_dict_2 = {}
df_keys_2 = {}
for k in para_dict_2.keys():
    df_key_list = []
    for k2 in para_dict_2[k]['text'].keys():
        if k2 in para_dict[k]['df'].keys():
            df_count_2 +=1
            df_key_list.append(k2)
    df_key_dict_2[k] = df_key_list
print(df_count_2)


0


Add cleaned paragraph text to lists

### Dataframes showing which regex-document matches 

In [56]:

def make_reg_list_dict(reg_list=reg_list, pat_list=pat_list, pos_list=pos_list, neg_list=neg_list, n_per_class=0):
    reg_list_dict = {}
    if not n_per_class:
        n_pos = len(pos_list); n_neg = len(neg_list)
    else: 
        n_pos, n_neg = n_per_class
    reg_list_dict['hit'] = np.ones(len(pos_list[:n_pos])).tolist() + np.zeros(len(neg_list[:n_neg])).tolist()
    for idr, reg in enumerate(reg_list):
        reg_matches = np.zeros(len(pos_list[:n_pos] + neg_list[:n_neg]))
        for idx, para in enumerate(pos_list[:n_pos] + neg_list[:n_neg]):
            if reg.search(para):
                reg_matches[idx] = 1
        reg_list_dict[pat_list[idr]] = reg_matches
    reg_list_dict['para'] = pos_list[:n_pos] + neg_list[:n_neg]
    return reg_list_dict

In [57]:
reg_list_dict = make_reg_list_dict(pos_list=pos_list_train, neg_list=neg_list_train)

In [58]:
reg_df_cols = ['hit'] + [key for key in reg_list_dict.keys() if key not in ['hit', 'para']] + ['para']
reg_hit_df = pd.DataFrame(reg_list_dict, columns=reg_df_cols)
reg_hit_df['hit_sum'] = reg_hit_df[reg_hit_df.columns[1:-1]].apply(np.sum, axis=1)

reg_hit_df['fp'] = np.where(reg_hit_df.index.isin(false_pos_indices), 1, 0)
reg_hit_df['id'] = reg_hit_df.index.values
reg_hit_df['fn'] = np.where(reg_hit_df.index.isin([x + 200 for x in false_neg_indices]), 1, 0)

In [59]:
fp_cols = ['fp','hit' ,'shares of (its|our) common stock', 'paras']
for col in reg_hit_df.columns[1:-1]:
    print(reg_hit_df[reg_hit_df.hit_sum == 1].groupby('hit')[col].sum().to_frame())
    #print(str(reg_hit_df.groupby('hit')[col].sum().to_frame()) )


     Board( of Directors)?( has)? authorized the repurchase
hit                                                        
1.0                                               12.0     
     (on|in|at|as of) (jan\w+|feb\w+|march|april|may|june|july|august|sept\w+|novem\w+|decem\w+) (20)?[0-9]{1,2}, (20[1-2][0-9][,] )?((we|the|\w+[']s) ){1,2}(board( of directors)? )?(of (the company|\w+([']s))? )?(approved|announced|authorized) (((a|the) (new )?)|(an increase to the (current )?(authoriz\w+ )?(for the )?))((stock|common stock|standing share) )?((rep|p)urchase) (authorization|plan|program)?(,)? (by|for|authoriz\w+|whereby the Company may repurchase) (up to )?([$])?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{,4})( (m|b)illion )?
hit                                                                                                                                                                                                                                                                                     

In [50]:
reg_unique_hit_counts = reg_hit_df[reg_hit_df.hit_sum == 1].groupby('hit').sum()[reg_hit_df.columns[1:-5]].melt(
    var_name = 'regex', value_name = 'paras').sort_values('paras', ascending=False)

In [189]:
reg_unique_hit_counts[reg_unique_hit_counts.paras == 0].regex.tolist()

['The Company currently plans to (rep|p)urchase ([$]?)([0-9]{,4}[.]?[0-9]{,4}) ((m|b)illion )?(to ([$]?)([0-9]{,4}[.]?[0-9]{,4}) (m|b)illion )?(shares|of its common stock)',
 'million common shares remaining under the',
 'authorization replace(d|s) (the|all|any) prior repurchase authorization',
 '(shares|amount) ((available|remaining) )((for|under|the|share) ){1,5}repurchase ((\\w+)[,]? ){0,4}share repurchase((\\w+)[,]? ){1,6}([$])?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{0,4})( (m|b)illion)?( share(s)?)?',
 '([$])?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{,4}) ((m|b)illion )? stock repurchase',
 "(on|in|at|as of) (jan\\w+|feb\\w+|march|april|may|june|july|august|sept\\w+|novem\\w+|decem\\w+) (20)?[0-9]{1,2}, (20[1-2][0-9][,] )?((we|the|\\w+[']s) ){1,2}(board( of directors)? )?(of (the company|\\w+([']s))? )?(approved|announced|authorized) (((a|the) (new )?)|(an increase to the (current )?(authoriz\\w+ )?(for the )?))((stock|common stock|standing share) )?((rep|p)urchase) (authorization|plan|progra

In [199]:
fp_cols = ['fp','hit' ,'shares of (its|our) common stock', 'paras']
for col in reg_hit_df.columns[1:-1]:
    #print(reg_hit_df[reg_hit_df.hit_sum == 1].groupby('hit')[col].sum().to_frame())
    print(reg_hit_df[reg_hit_df.hit==1.0].groupby('hit')[col].sum() )


hit
1.0    95.0
Name: Board( of Directors)?( has)? authorized the repurchase, dtype: float64
hit
1.0    6.0
Name: (on|in|at|as of) (jan\w+|feb\w+|march|april|may|june|july|august|sept\w+|novem\w+|decem\w+) (20)?[0-9]{1,2}, (20[1-2][0-9][,] )?((we|the|\w+[']s) ){1,2}(board( of directors)? )?(of (the company|\w+([']s))? )?(approved|announced|authorized) (((a|the) (new )?)|(an increase to the (current )?(authoriz\w+ )?(for the )?))((stock|common stock|standing share) )?((rep|p)urchase) (authorization|plan|program)?(,)? (by|for|authoriz\w+|whereby the Company may repurchase) (up to )?([$])?(([0-9]{1,3},)*[0-9]{1,3}[.]?[0-9]{,4})( (m|b)illion )?, dtype: float64
hit
1.0    6.0
Name: ((we|the|\w+[']s) ){1,2}(board( of directors)? )?(of (the company|\w+([']s))? )?(approved|announced|authorized) (((a|the) (new )?)|(an increase to the (current )?(authoriz\w+ )?(for the )?))((stock|common stock|standing share) )?((rep|p)urchase) (authorization|plan|program)?(,)? (by|for|authoriz\w+|whereby the Co

In [1266]:
#reg_hit_df[reg_hit_df.hit_sum == 1][fp_cols].sort_values(['fp','hit'], ascending=False)
print_row_detail(df=reg_hit_df[(reg_hit_df.fn == 1) ], nrow=30, header_list=['id'], 
                 detail_list=['fp','hit' ,'accelerated share repurchase', 'para'], 
                 sortby=['fp','hit', 'accelerated share repurchase'], ascending=False)

----------------------------------- 276 -----------------------------------
fp  :0

hit  :0.0

accelerated share repurchase  :0.0

para  :Years Ended December 31, (in millions) 2017 2016 2015 (a) Aggregate repurchases of common stock $ 6,275 $ 11,460 $ 10,691 Total number of common shares repurchased 100 201 182 Aggregate repurchases of warrants $ 3 $ 309 $ - Total number of warrants repurchased (b) - 17 -

----------------------------------- 277 -----------------------------------
fp  :0

hit  :0.0

accelerated share repurchase  :0.0

para  :(a) The total number of shares of AIG Common Stock repurchased in 2015 includes (but the aggregate purchase price does not include) approximately 3.5 million shares of AIG Common Stock received in January 2015 upon the settlement of an accelerated stock repurchase (ASR) agreement executed in the fourth quarter of 2014.

----------------------------------- 288 -----------------------------------
fp  :0

hit  :0.0

accelerated share repurchase  :0.0

In [60]:
false_pos = []; false_neg = [];
false_pos = [pos_list[:200][i] for i in false_pos_indices  ]
false_neg = [neg_list[:200][i] for i in false_neg_indices  ]

In [61]:
pos_labeled = [x for x in pos_list[:200] if x not in false_pos] + false_neg
neg_labeled = [x for x in neg_list[:200] if x not in false_neg] + false_pos
train_labeled = pos_labeled + neg_labeled