In [1]:
import numpy as np
import pandas as pd
import re

from bs4 import BeautifulSoup as bs

from path import Path, getcwdu

import glob
import os
from pathlib import PurePath
import copy

import random
import gzip
import shutil

In [2]:
full_path_list = [PurePath(os.getcwd()).joinpath(file).as_posix() for file in glob.iglob('../employee_filings/*.gz')]
full_file_list = [PurePath(file).name for file in glob.iglob('../employee_filings/*.gz')]
full_accession_ids = [PurePath(file).stem.replace('.html', '') for file in full_file_list]
full_cik_nbrs = [x.split(sep='-')[0] for x in full_accession_ids]

In [3]:
train_accession_ids = pd.read_csv('../data/train_accession_ids.csv', names=['acc_id'])['acc_id'].tolist()
val_accession_ids = pd.read_csv('../data/val_accession_ids.csv', names=['acc_id'])['acc_id'].tolist()

labeled_df = pd.read_excel('../data/train_val_employee_count_paragraphs.xlsx')
subset_df = pd.read_excel('../data/subset_employee_count_paragraphs.xlsx')

In [4]:
subset_file_list = [PurePath(os.getcwd()).joinpath('../employee_filings/').joinpath(file) for file in full_file_list if PurePath(file).stem.replace('.html', '') in subset_df.accession_number.unique().tolist()]

In [5]:
# After reading back in, the html needs to be parsed again
tbl_html_df = pd.read_csv('data/tbl_html_df.csv')
tbl_html_df['tbl_html'] = tbl_html_df.tbl_html.apply(lambda x: bs(x, 'lxml'))

In [6]:

p_dtypes = {'acc_id' : 'category', 
'len' : 'int', 
'emp_header' : 'bool', 
'first_emp_head_block' : 'bool', 
'split' : 'category', 
'label' : 'category'}
paragraph_input_df = pd.read_csv('data/paragraph_input_df.csv', dtype=p_dtypes, index_col=0)

train_df = paragraph_input_df[paragraph_input_df.split == 'train']

In [7]:
paragraph_input_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4817 entries, 0 to 4816
Data columns (total 9 columns):
acc_id                  4817 non-null category
para_text               4817 non-null object
len                     4817 non-null int32
emp_header              4817 non-null bool
first_emp_head_block    4817 non-null bool
para_text_orig          4817 non-null object
para_tag                4817 non-null object
split                   4817 non-null category
label                   4817 non-null category
dtypes: bool(2), category(3), int32(1), object(3)
memory usage: 207.6+ KB


In [8]:
def print_row_detail(df=train_df, nrow=10, header_list = ['acc_id'],
                    detail_list = [ 'len', 'first_emp_head_block', 'split' ,'para_text'],
                    sortby=['acc_id', 'first_emp_head_block', 'len'], ascending=False):
    df_sorted = df.sort_values(sortby, ascending=ascending).reset_index()
    nrow = min(len(df_sorted), nrow)
    for i in range(0, nrow):
        for h in header_list:
            print('-'*35  + ' ' +  str(df_sorted[h][i]) + ' ' + '-'*35)
        for d in detail_list:
            print(d + '  :' + str(df_sorted[d][i]))
            print('')

In [9]:
print_row_detail()

----------------------------------- 0001193125-17-107561 -----------------------------------
len  :179

first_emp_head_block  :True

split  :train

para_text  :3 Italia Dirigenti Employees means the 19 dirigenti employees of the 3 Italia Group that have been identified by the 3 Italia Group as being entitled to receive a Retention Bonus;

----------------------------------- 0001193125-17-107561 -----------------------------------
len  :992

first_emp_head_block  :False

split  :train

para_text  :VimpelCom is rebranding to VEON and has changed its name to VEON, effective as of March 30, 2017. VEON is an international communications and technology company, headquartered in Amsterdam, and driven by a vision to unlock new opportunities for customers as they navigate the digital world. Present in some of the world's most dynamic markets, VEON provides more than 200 million customers with voice, fixed broadband, data and digital services. VEON offers services to customers in 12 countries in

### Test and refine regex patterns for flagging likely relevant documents

In [10]:
def check_regex_match(pattern, text_list):
    for idx, s in enumerate(text_list):
        mo = re.search(pattern, s)
        if mo:
            ms = mo.span()[1]
            print("------    " + str(idx) + "   Matched!    -----")
            print('str length  :' + str(len(s)) + '    match span  :' + str(ms))
            print(s[:ms])
            print('')
            print(s[ms:])
            print(re.search(pattern, s))
        else:
            print("------    " + str(idx) + "  NO MATCH    -----")
            print(s)

In [13]:
flag_pat_list = [r"employ((ed|s)?) (approximately )?([0-9]{1,3},)*[0-9]{1,3} ((permanent|full-time|part-time|temporary) )?(employees|people|team members|members|persons|associates)",
#r"([0-9]{1,3},)*[0-9]{1,3} [^.]+ ((employ\w*|head([ -])?count|member(s)?|person(s)?|people|staff|team|workforce))",
#r"((employ\w*|head([ -])?count|member(s)?|person(s)?|people|staff|team|workforce) )+[^.]+ *([0-9]{1,3},)*[0-9]{1,3}",
r"([0-9]{1,3},)*[0-9]{1,3} ((permanent|full-time|part-time|temporary|total) )*(employees|team members)"]
#r"((permanent|full|part|time|full-time|part-time|temporary) )+(employees|team members|associates)",
#r"employed( approximately)?$", 
#r"Total workforce",

#r"^((permanent|full|part|time|full-time|part-time|temporary|total) )*(employees|team members|associates)"
                 
flag_pats = [re.compile(x, re.I) for x in flag_pat_list]

In [15]:
para_list = train_df.para_text.tolist()

In [26]:
pos_ids = set()

In [27]:
for i, para in enumerate(para_list):
    #if not any(reg.search(para) for reg in nonum_regs) and k2 not in para_dict[k]['df'].keys():
    for reg in flag_pats:
        if reg.search(para):
            pos_ids.append(train_df.index[i])
            break
        

In [30]:
pos_ids = set(pos_ids)

In [33]:
train_df.loc[train_df.index.isin(pos_ids), 'label'] = 1

ValueError: Cannot setitem on a Categorical with a new category, set the categories first

In [17]:
train_df.loc[train_df.index.isin(pos_ids)]

Unnamed: 0,acc_id,para_text,len,emp_header,first_emp_head_block,para_text_orig,para_tag,split,label
2,0000004127-16-000068,"As of September 30, 2016, we employed approxim...",245,True,True,"As of September 30, 2016,\r\nwe employed appro...","<div class=""c80""><span class=""c32"">As of</span...",train,0


In [18]:
train_df.iloc[0:2,]

Unnamed: 0,acc_id,para_text,len,emp_header,first_emp_head_block,para_text_orig,para_tag,split,label
2,0000004127-16-000068,"As of September 30, 2016, we employed approxim...",245,True,True,"As of September 30, 2016,\r\nwe employed appro...","<div class=""c80""><span class=""c32"">As of</span...",train,0
3,0000004127-16-000068,EMPLOYEES,9,False,False,EMPLOYEES,"<div class=""c90"">EMPLOYEES</div>",train,0
