In [1]:
# library import
import os
import re
import pdfplumber
import numpy as np
import pandas as pd 
import json

import scrape   # custom file

In [2]:
# Enable multiple outputs per cell: with ast_node_interactivity set to "all",
# IPython displays the value of every top-level expression in a cell instead
# of only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### Definition Lists
Grab full list of definition names from two files

In [4]:
# Source PDFs and the page range to pull from each one. Every
# (file, start, end) triple is handed to scrape.pdf_extract_pages.
# NOTE(review): whether the page bounds are inclusive and 0- or 1-based is
# decided by that helper -- confirm before changing these numbers.

# Vibrant VIII indenture
file_name_vib = 'Vibrant VIII - Indenture(133849278_1).pdf'
start_page_vib, end_page_vib = 8, 74

# OHA XII indenture
file_name_oh = 'OHA XII Indenture.pdf'
start_page_oh, end_page_oh = 8, 85

In [5]:
# Extract the configured page range from each PDF, then combine each
# document's pages into one object (doc_str_*) that the later regex passes
# search, slice and measure with len().
# NOTE(review): the return type of scrape.pdf_extract_pages (presumably one
# text entry per page) is not visible here -- confirm in the scrape module.
oh_pages = scrape.pdf_extract_pages(file_name_oh, start_page_oh, end_page_oh)
doc_str_oh = scrape.combine_strings(oh_pages) 
vib_pages = scrape.pdf_extract_pages(file_name_vib, start_page_vib, end_page_vib)
doc_str_vib = scrape.combine_strings(vib_pages)

In [12]:
# Patterns for a definition heading: a quoted term immediately followed by a
# colon. The OH pattern uses curly quotes and the Vibrant pattern straight
# quotes, so each document gets its own pattern.
# `.*` is greedy, so a heading that holds several quoted terms is captured as
# a single match running to the last closing-quote-plus-colon it can reach.
# NOTE(review): `.` does not cross line breaks unless the helper passes
# re.DOTALL -- confirm which flags scrape.text_match_str2list uses.
# NOTE(review): name_range_* presumably hold the matched heading strings,
# still including the quotes and colon -- confirm in the scrape module.
regex_oh = r'“.*”:'
regex_vib = r'".*":'
name_range_oh = scrape.text_match_str2list(regex_oh, doc_str_oh)
name_range_vib = scrape.text_match_str2list(regex_vib, doc_str_vib)

Store the lists of definition names in a dictionary, then export them through a DataFrame

In [7]:
# Strip the opening quote and the closing quote + colon from each matched
# heading, leaving only the definition name.
# The lists are reassigned to themselves, so an unguarded slice would chop
# three more characters off every name each time this cell is re-run. The
# delimiter check makes the cell idempotent: names that were already
# stripped pass through unchanged.
name_range_oh = [
    x[1:-2] if x.startswith('“') and x.endswith('”:') else x
    for x in name_range_oh
]
name_range_vib = [
    x[1:-2] if x.startswith('"') and x.endswith('":') else x
    for x in name_range_vib
]

In [8]:
# Definition-name lists keyed by a short label for each source document.
doc_def_dict = {
    'OH': name_range_oh,
    'Vib': name_range_vib,
}

In [9]:
# Put each document's definition names in its own single-column frame, then
# lay the two frames side by side.
# Building from a {column: values} mapping names the column at construction
# time, so an empty name list yields an empty column instead of the
# length-mismatch ValueError raised by assigning `.columns` on a frame that
# has no columns.
df1 = pd.DataFrame({'OH': doc_def_dict['OH']})
df2 = pd.DataFrame({'Vib': doc_def_dict['Vib']})
# concat(axis=1) aligns on the positional index and pads the shorter column
# with NaN. Rows are NOT matched definitions across the two documents; the
# columns are simply two independent lists shown next to each other.
df = pd.concat([df1, df2], axis=1)
df.head()

Unnamed: 0,OH,Vib
0,Acceleration Event,17g-5 Information Provider
1,Acceleration Priority of Payments,17g-5 Information Provider's Website
2,Account Agreement,Acceleration Event
3,Accountants’ Effective Date Comparison AUP Report,Accountants' Report
4,Accountants’ Effective Date Recalculation AUP ...,Accounts


### Table of Definitions: Raw definitions (no table/bullet processing) 
1. Process full file: start with OH
2. Grab the match locations and map them to fields

In [17]:
# Locate every OH definition heading in the combined text, then collect the
# text belonging to each heading as its raw description.
# NOTE(review): def_locs_oh is presumably a list of (start, end) character
# offsets into doc_str_oh (its entries are indexed as x[0] / x[1] when
# doc_str_oh is sliced in the following cell) -- confirm in the scrape module.
# NOTE(review): len(doc_str_oh) is passed as the bound argument, presumably
# the end limit for the last definition -- TODO confirm.
def_locs_oh = scrape.text_match_2list(regex_oh, doc_str_oh)
def_descr_oh = scrape.list_betweenloc_to_string_bounded(def_locs_oh, doc_str_oh, len(doc_str_oh))

In [52]:
# One row per OH definition: its name and its raw description text.
# The name is sliced straight out of the document text, skipping the opening
# quote (+1) and the closing quote and colon (-2).
# NOTE(review): assumes the first two items of each def_locs_oh entry are the
# start and end offsets of a heading match -- confirm against
# scrape.text_match_2list.
df_defs = pd.DataFrame(
    [doc_str_oh[loc[0] + 1:loc[1] - 2] for loc in def_locs_oh]
).assign(description=def_descr_oh)
df_defs.columns = ['def_name', 'def_description']
df_defs.head()

Unnamed: 0,def_name,def_description
0,Acceleration Event,The meaning specified in the Acceleration Pr...
1,Acceleration Priority of Payments,The meaning specified in \nSection 11.1...
2,Account Agreement,An agreement in substantially the form of Ex...
3,Accountants’ Effective Date Comparison AUP Report,A report of agreed \nupon procedures perform...
4,Accountants’ Effective Date Recalculation AUP ...,A report of agreed \nupon procedures perfor...


### Detect bullet points / lists
1. Detect the different list styles: permutations of roman-numeral, letter, and x/y bullets
2. Detection within lists

In [58]:
# Flag definitions whose description contains what looks like a roman-numeral
# bullet: a whitespace character followed by a parenthesised run of lowercase
# i / v / x, e.g. " (ii)".
# NOTE(review): "(i)", "(v)" and "(x)" are also valid letter bullets, so this
# flag can fire on a lettered list -- confirm whether that overlap matters.
regex = r"[\n\s]\([vix]+\)"
# One re.Match (or None) per description, in row order.
match_reg = list(map(re.compile(regex).search, df_defs['def_description']))
# 1 when the pattern was found anywhere in the description, otherwise 0.
roman_bullet_flag = [int(hit is not None) for hit in match_reg]
df_defs['roman_bullets_flag'] = roman_bullet_flag
df_defs.iloc[11:16]

Unnamed: 0,def_name,def_description,roman_bullets_flag
11,Adjusted Collateral Principal Amount,As of any date of determination: \n(a) the ...,1
12,Adjusted Coupon,As of any date of determination and with res...,0
13,Adjusted Spread,As of any date of determination and with res...,0
14,Administration Agreement,An agreement between the Administrator and t...,0
15,Administrative Expense Cap,An amount equal on any Payment Date (when \n...,1


In [59]:
# Flag definitions whose description contains a lettered list, detected by
# its first bullet: a whitespace character followed by "(a)".
regex = r"[\n\s]\(a\)"
# One re.Match (or None) per description, in row order.
match_reg = list(map(re.compile(regex).search, df_defs['def_description']))
# 1 when "(a)" was found anywhere in the description, otherwise 0.
letter_bullet_flag = [int(hit is not None) for hit in match_reg]
df_defs['letter_bullets_flag'] = letter_bullet_flag
df_defs.iloc[0:16]

Unnamed: 0,def_name,def_description,roman_bullets_flag,letter_bullets_flag
0,Acceleration Event,The meaning specified in the Acceleration Pr...,0,0
1,Acceleration Priority of Payments,The meaning specified in \nSection 11.1...,0,0
2,Account Agreement,An agreement in substantially the form of Ex...,0,0
3,Accountants’ Effective Date Comparison AUP Report,A report of agreed \nupon procedures perform...,0,0
4,Accountants’ Effective Date Recalculation AUP ...,A report of agreed \nupon procedures perfor...,0,0
5,Accountants’ Report,An agreed upon procedure report or reports o...,0,0
6,Accounts,(i) The Payment Account; (ii) the Coll...,1,0
7,Accredited Investor,An accredited investor as defined in Regulat...,0,0
8,Act” and “Act of Holders,The meanings specified in Section 14.2 (Acts...,0,0
9,Additional Notes,Any Notes issued pursuant to Section 2.4(a) ...,0,0
