In [1]:
# library import
import os
import re
import pdfplumber
import numpy as np
import pandas as pd 
import json

import scrape   # custom file

In [2]:
# Enable multiple output per cell. 
# With "all", every bare expression statement in a cell is rendered, not just the
# last one; several cells below rely on this to show two results at once.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### Definition Lists
Grab full list of definition names from two files

In [3]:
# Vibrant indenture: source PDF and the page range handed to the extractor.
# NOTE(review): whether these page numbers are 0- or 1-based, and whether the end
# page is inclusive, is decided inside scrape.pdf_extract_pages — confirm there.
file_name_vib = 'Vibrant VIII - Indenture(133849278_1).pdf'
start_page_vib = 8
end_page_vib = 74

# OHA indenture: source PDF and the page range handed to the extractor.
file_name_oh = 'OHA XII Indenture.pdf'
start_page_oh = 8
end_page_oh = 85

In [4]:
# extract pages from both documents
# pdf_extract_pages and combine_strings come from the local `scrape` module.
# doc_str_oh is later sliced by character offset and passed to len(), so the
# combined result is handled as one string per document.
oh_pages = scrape.pdf_extract_pages(file_name_oh, start_page_oh, end_page_oh)
doc_str_oh = scrape.combine_strings(oh_pages) 
vib_pages = scrape.pdf_extract_pages(file_name_vib, start_page_vib, end_page_vib)
doc_str_vib = scrape.combine_strings(vib_pages)

In [9]:
# isolate ranges for both files
# A definition heading is a quoted name followed by a colon. The OHA pattern uses
# curly quotes and the Vibrant pattern uses straight quotes.
# `.*` is greedy, so one match can span several quoted terms that share a single
# colon (the OHA results include the name 'Act” and “Act of Holders').
regex_oh = r'“.*”:'
regex_vib = r'".*":'
name_range_oh = scrape.text_match_str2list(regex_oh, doc_str_oh)
name_range_vib = scrape.text_match_str2list(regex_vib, doc_str_vib)

Store the lists of definition names in a dictionary, then export them through a DataFrame.

In [10]:
# Strip the opening quote and the closing quote + colon from each matched heading.
# This cell overwrites its own inputs, so the strip is guarded on the wrapper still
# being present: re-running the cell leaves already-cleaned names untouched instead
# of trimming two more characters off them.
name_range_oh = [
    x[1:-2] if x.startswith('“') and x.endswith('”:') else x
    for x in name_range_oh
]
name_range_vib = [
    x[1:-2] if x.startswith('"') and x.endswith('":') else x
    for x in name_range_vib
]

In [11]:
# Definition names keyed by document: 'OH' for the OHA indenture, 'Vib' for Vibrant.
doc_def_dict = {
    'OH': name_range_oh,
    'Vib': name_range_vib,
}

In [12]:
# Side-by-side table of definition names, one column per document. The two lists
# can differ in length; concat along axis=1 aligns on the row index and pads the
# shorter column with NaN.
# The column name is given at construction time so that an empty name list yields an
# empty, correctly-labelled column instead of a length-mismatch error when renaming.
df1 = pd.DataFrame(doc_def_dict['OH'], columns=['OH'])
df2 = pd.DataFrame(doc_def_dict['Vib'], columns=['Vib'])
df = pd.concat([df1, df2], axis=1)
df.head()

Unnamed: 0,OH,Vib
0,Acceleration Event,17g-5 Information Provider
1,Acceleration Priority of Payments,17g-5 Information Provider's Website
2,Account Agreement,Acceleration Event
3,Accountants’ Effective Date Comparison AUP Report,Accountants' Report
4,Accountants’ Effective Date Recalculation AUP ...,Accounts


### Table of Definitions: Raw definitions (no table/bullet processing) 
1. Process full file: start with OH
2. grab locations and map fields

In [13]:
# Locations of every OHA definition heading in the combined text. Each entry is
# indexed as x[0] / x[1] in the next cell, i.e. used as a (start, end) offset pair.
def_locs_oh = scrape.text_match_2list(regex_oh, doc_str_oh)
# Description text for each heading.
# NOTE(review): presumably the text between one heading and the next, with the last
# one bounded by the document length passed here — confirm in scrape.
def_descr_oh = scrape.list_betweenloc_to_string_bounded(def_locs_oh, doc_str_oh, len(doc_str_oh))

In [14]:
# Definitions table. Each name is sliced out of the document text from the heading
# offsets, skipping the first character and the last two (opening quote, closing
# quote and colon); the descriptions are attached as a second column.
df_defs = pd.DataFrame(
    [doc_str_oh[loc[0] + 1:loc[1] - 2] for loc in def_locs_oh]
)
df_defs['description'] = def_descr_oh
df_defs.columns = ['def_name', 'def_description']
df_defs.head()

Unnamed: 0,def_name,def_description
0,Acceleration Event,The meaning specified in the Acceleration Pr...
1,Acceleration Priority of Payments,The meaning specified in \nSection 11.1...
2,Account Agreement,An agreement in substantially the form of Ex...
3,Accountants’ Effective Date Comparison AUP Report,A report of agreed \nupon procedures perform...
4,Accountants’ Effective Date Recalculation AUP ...,A report of agreed \nupon procedures perfor...


### Detect bullet points / lists
1. Detection of different lists: different permutations of roman numeral, letter, and x/y bullets.
   - Add their locations to table. 
2. Detection within lists: Create sub tables around different bullet types. 
3. merge back in sub table. 


In [15]:
# Find the first lowercase roman bullet "(i)" (preceded by whitespace) in each
# description and record the offset where the match starts.
regex = r"[\n\s]\(i\)"
match_reg = [re.search(regex, descr) for descr in df_defs['def_description']]
# 0 is stored when there is no match. NOTE(review): a match that starts at offset 0
# also produces 0, so such a row is indistinguishable from "no bullet" downstream.
roman_bullet_flag = [0 if found is None else found.start() for found in match_reg]
df_defs['roman_bullets_loc'] = roman_bullet_flag
df_defs.iloc[10:16]

Unnamed: 0,def_name,def_description,roman_bullets_loc
10,Additional Notes Closing Date,The closing date for the issuance of...,0
11,Adjusted Collateral Principal Amount,As of any date of determination: \n(a) the ...,526
12,Adjusted Coupon,As of any date of determination and with res...,0
13,Adjusted Spread,As of any date of determination and with res...,0
14,Administration Agreement,An agreement between the Administrator and t...,0
15,Administrative Expense Cap,An amount equal on any Payment Date (when \n...,419


In [16]:
# Find the first lowercase letter bullet "(a)" (preceded by whitespace) in each
# description and record the offset where the match starts.
regex = r"[\n\s]\(a\)"
match_reg = [re.search(regex, descr) for descr in df_defs['def_description']]
# 0 is stored when there is no match. NOTE(review): a match that starts at offset 0
# also produces 0, so such a row is indistinguishable from "no bullet" downstream.
letter_bullet_flag = [0 if found is None else found.start() for found in match_reg]
df_defs['letter_bullets_loc'] = letter_bullet_flag
df_defs.iloc[6:12]

Unnamed: 0,def_name,def_description,roman_bullets_loc,letter_bullets_loc
6,Accounts,(i) The Payment Account; (ii) the Coll...,3,0
7,Accredited Investor,An accredited investor as defined in Regulat...,0,0
8,Act” and “Act of Holders,The meanings specified in Section 14.2 (Acts...,0,0
9,Additional Notes,Any Notes issued pursuant to Section 2.4(a) ...,0,0
10,Additional Notes Closing Date,The closing date for the issuance of...,0,0
11,Adjusted Collateral Principal Amount,As of any date of determination: \n(a) the ...,526,35


### Process roman numerals showing up first 
Process roman numeral bullets that show up first: these rows are marked by the roman bullet location being lower than the letter bullet location (or by there being no letter bullet at all). 
1. Isolate the rows of the table where roman numeral bullets come before letter bullets
2. Create substrings starting from each roman numeral to the end of the string <br> 
Note: Only processing <font color=salmon> lower </font> case bullet points. Regex lists for upper case are created as well, but most bullet points in OH were lower case. 

In [17]:
# "Roman numerals first" subset: descriptions that contain a "(i)" bullet which
# either precedes the first "(a)" bullet or is the only bullet type present
# (a location of 0 is the "not found" marker written by the detection cells).
# .copy() makes the subset an independent frame, so adding a column to it later
# does not raise SettingWithCopyWarning.
df_defs_sub1 = df_defs[
    (df_defs.roman_bullets_loc != 0)
    & (
        (df_defs.roman_bullets_loc < df_defs.letter_bullets_loc)
        | (df_defs.letter_bullets_loc == 0)
    )
].copy()
df_defs_sub1.iloc[2]

def_name                                        Administrative Expenses
def_description         Fees, expenses (including indemnities) and o...
roman_bullets_loc                                                   452
letter_bullets_loc                                                    0
Name: 16, dtype: object

### Create regex lists for roman numerals. 
1. Explore Administrative Expenses definitions: shows (x) before, but not in middle. 
2. Look at Affiliates. shows (x) between roman numerals. 

In [1]:
# Regex patterns for roman-numeral bullets "(i)" .. "(xxx)", each requiring one
# leading whitespace character. The lists are in numeric order, lowercase first and
# the uppercase equivalents second.
roman_regex_list = [
    r"[\n\s]\(" + numeral + r"\)"
    for numeral in (
        "i ii iii iv v vi vii viii ix x "
        "xi xii xiii xiv xv xvi xvii xviii xix xx "
        "xxi xxii xxiii xxiv xxv xxvi xxvii xxviii xxix xxx"
    ).split()
]

roman_upper_regex_list = [
    r"[\n\s]\(" + numeral + r"\)"
    for numeral in (
        "I II III IV V VI VII VIII IX X "
        "XI XII XIII XIV XV XVI XVII XVIII XIX XX "
        "XXI XXII XXIII XXIV XXV XXVI XXVII XXVIII XXIX XXX"
    ).split()
]

Admin Expenses: row 2 of the "roman numerals first" subset <br> <br> 
Affiliates: row 18 of the full definitions table

In [19]:
# Show Example bullet point processing 
adminexp_def_list = scrape.list_btn_loc_regexlist_keepbullet(df_defs_sub1['def_description'].iloc[2], len(df_defs_sub1['def_description'].iloc[2]), roman_regex_list)
aff_def_list = scrape.list_btn_loc_regexlist_keepbullet(df_defs['def_description'].iloc[18], len(df_defs['def_description'].iloc[18]), roman_regex_list)
print("Admin Expenses \n")
adminexp_def_list
print("\n Affiliate Definitions \n")
aff_def_list

Admin Expenses 



[' (i) the  Independent  accountants,  agents  (other  than  the  Portfolio  Manager)  and \ncounsel of the Issuer, any Issuer Subsidiary (for fees, expenses and any taxes or government fees \nof such Issuer Subsidiary) and to any Person for any costs of Tax Account Reporting Rules \nCompliance',
 ' (ii) the Rating Agencies for fees and expenses (including surveillance fees) in \nconnection with any rating of the Secured Notes or any Collateral Obligations',
 ' (iii) any Person in \nrespect of Petition Expenses',
 ' (iv) the Portfolio Manager under this Indenture and the Portfolio \nManagement  Agreement,  including,  without  limitation,  expenses  of  the  Portfolio  Manager \n(including fees for its accountants, agents and counsel) incurred in connection with the purchase \nor  sale  of  any  Collateral  Obligations,  any  other  expenses  incurred  in  connection  with  the \nCollateral Obligations and amounts payable pursuant to Sections 9(c) and 11 of the Portfolio \nManagement A


 Affiliate Definitions 



[' (i) of such \nPerson',
 ' (ii) of any subsidiary or parent company of such Person o',
 ' (iii) of any Person described \nin clause (a) above.  For the purposes of this definition, control of a Person means the power, \ndirect or indirect, (x) to vote more than 50% of the securities having ordinary voting power for \nthe election of directors of such Persons or (y) to direct or cause the direction of the management \nand policies of such Person whether by contract or otherwise.  For purposes of this definition, \n(i) no entity shall be deemed an Affiliate of the Issuer or the Co-Issuer solely because the \nAdministrator or any of its Affiliates acts as administrator or share trustee for such entity, (ii) no \nentity to which the Portfolio Manager provides investment management or advisory services \nshall be deemed an Affiliate of the Portfolio Manager solely because the Portfolio Manager acts \nin such capacity and (iii) a Person shall not be deemed to be an “Affiliate” of an Person

Apply the same logic to all rows in the <font color = aqua> Roman Numerals First </font> sub-table, then add the result back to the sub-table as a new column. 

In [20]:
# Split every description in the "roman numerals first" subset into its list of
# roman-numeral bullets (one list of strings per row).
rom_def_strings = [
    scrape.list_btn_loc_regexlist_keepbullet(descr, len(descr), roman_regex_list)
    for descr in df_defs_sub1['def_description']
]
# assign() builds a new frame with the extra column rather than writing into what
# may be a slice of df_defs, so no SettingWithCopyWarning is raised and re-running
# the cell simply replaces the column.
df_defs_sub1 = df_defs_sub1.assign(roman_bullets=rom_def_strings)
df_defs_sub1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,def_name,def_description,roman_bullets_loc,letter_bullets_loc,roman_bullets
6,Accounts,(i) The Payment Account; (ii) the Coll...,3,0,"[ (i) The Payment Account;, (ii) the Colle..."
15,Administrative Expense Cap,An amount equal on any Payment Date (when \n...,419,497,[ (i) upon the occurrence and continuation of ...
16,Administrative Expenses,"Fees, expenses (including indemnities) and o...",452,0,"[ (i) the Independent accountants, agents ..."
32,Balance,"On any date, with respect to Cash or Eligibl...",112,0,"[ (i) current balance of Cash, demand deposits..."
38,Business Day,Any day other than (i) a Saturday or a Sunda...,20,0,"[ (i) a Saturday or a Sunday o, (ii) a day on..."


### Process <font color=cyan> Letter </font> Bullet Points showing up first 

In [21]:
# "Letters first" subset: descriptions that contain an "(a)" bullet which either
# precedes the first "(i)" bullet or is the only bullet type present
# (a location of 0 is the "not found" marker written by the detection cells).
# .copy() makes the subset an independent frame, so adding a column to it later
# does not raise SettingWithCopyWarning.
df_defs_sub2 = df_defs[
    (df_defs.letter_bullets_loc != 0)
    & (
        (df_defs.roman_bullets_loc > df_defs.letter_bullets_loc)
        | (df_defs.roman_bullets_loc == 0)
    )
].copy()
df_defs_sub2.head()
len(df_defs_sub2)

Unnamed: 0,def_name,def_description,roman_bullets_loc,letter_bullets_loc
11,Adjusted Collateral Principal Amount,As of any date of determination: \n(a) the ...,526,35
18,Affiliate” or “Affiliated,"With respect to a Person, (a) any other Pers...",240,27
20,Aggregate Excess Spread,"As of any date of determination, an amount e...",269,71
35,Benefit Plan Investor,(a) an employee benefit plan (as defined in ...,0,1
72,Clean-up Call Redemption Price,An amount at least equal to the sum of (a) t...,0,40


56

**Regex List for Letter Bullets** 

In [22]:
# Regex patterns for letter bullets "(a)" .. "(z)" followed by "(aa)" and "(bb)",
# each requiring one leading whitespace character. Lowercase list first, then the
# uppercase equivalents.
loweralpha_regex_list = [
    r"[\n\s]\(" + label + r"\)"
    for label in [chr(code) for code in range(ord("a"), ord("z") + 1)] + ["aa", "bb"]
]

upperalpha_regex_list = [
    r"[\n\s]\(" + label + r"\)"
    for label in [chr(code) for code in range(ord("A"), ord("Z") + 1)] + ["AA", "BB"]
]

In [None]:
# Split every description in the "letters first" subset into its list of
# lowercase-letter bullets (one list of strings per row).
lowalpha_def_strings = [
    scrape.list_btn_loc_regexlist_keepbullet(descr, len(descr), loweralpha_regex_list)
    for descr in df_defs_sub2['def_description']
]
# assign() builds a new frame with the extra column rather than writing into what
# may be a slice of df_defs, so no SettingWithCopyWarning is raised and re-running
# the cell simply replaces the column.
df_defs_sub2 = df_defs_sub2.assign(alpha_bullets=lowalpha_def_strings)
df_defs_sub2.head()

### Merge alphabet and roman numeral bullet point:
Map 1st layer bullet points back to original definitions table. 

In [24]:
# Left-merge the roman bullet lists onto the full definitions table by name.
# NOTE(review): the next cell rebuilds df_def_comb from df_defs, repeating this
# merge and adding the letter bullets, so the value assigned here is overwritten.
df_def_comb = pd.merge(df_defs, df_defs_sub1[['def_name','roman_bullets']], on='def_name', how='left')

In [25]:
# Attach both first-level bullet lists to the full definitions table by name.
# Left merges keep every definition; rows that are not in a subset get NaN in the
# corresponding bullet column.
df_def_comb = (
    df_defs
    .merge(df_defs_sub1[['def_name', 'roman_bullets']], on='def_name', how='left')
    .merge(df_defs_sub2[['def_name', 'alpha_bullets']], on='def_name', how='left')
)
df_def_comb

Unnamed: 0,def_name,def_description,roman_bullets_loc,letter_bullets_loc,roman_bullets,alpha_bullets
0,Acceleration Event,The meaning specified in the Acceleration Pr...,0,0,,
1,Acceleration Priority of Payments,The meaning specified in \nSection 11.1...,0,0,,
2,Account Agreement,An agreement in substantially the form of Ex...,0,0,,
3,Accountants’ Effective Date Comparison AUP Report,A report of agreed \nupon procedures perform...,0,0,,
4,Accountants’ Effective Date Recalculation AUP ...,A report of agreed \nupon procedures perfor...,0,0,,
...,...,...,...,...,...,...
483,Weighted Average Moody’s Rating Factor,The number (rounded up to the \nnearest whol...,84,65,,[ (a) the product of (i) the Principal Balance...
484,Weighted Average Moody’s Recovery Rate,"As of any date of determination, \nthe numb...",302,0,"[ (i) the Target Balance \nan, (ii) Aggregate...",
485,Weighted Average Rating Adjusted Cov-Lite ...,Prior to the \nsatisfaction of the Contr...,0,0,,
486,Yield Adjusted Collateral Obligation,Any Collateral Obligation (other than a \nDi...,267,0,[ (i) it is acquired by the Issuer for a purch...,


### Definition Dictionary
Set up dictionary to allow for several sublayers in each definition to handle different bullet points, tables, etc. 
1. **Keys** will be definition names 
2. Values: <br>  
- Top Level: roman and letter bullet point locations 
  - if no bullet points, use full definition 
  - Depending on if roman or alphabet bullet point shows up first, use roman or alphabet bullet point list from **Definitions** dataframe. 
  - For bullet points: split out bullet points from rest of text within definition. 

In [26]:
# Build the definition dictionary: {definition name: entry}, where each entry holds
#   'roman_loc1'  - offset of the first "(i)" bullet (0 = none found)
#   'letter_loc1' - offset of the first "(a)" bullet (0 = none found)
#   'def_values'  - the full description when there are no bullets, otherwise a
#                   dict with the text before the first bullet ('opening_sec') and
#                   the list of first-level bullets ('bullet_level1').
def_dict = {}
for _, row in df_def_comb.iterrows():
    roman_loc1 = row['roman_bullets_loc']
    letter_loc1 = row['letter_bullets_loc']
    key_vals = {'roman_loc1': roman_loc1, 'letter_loc1': letter_loc1}

    has_roman = roman_loc1 != 0
    has_letter = letter_loc1 != 0

    if not has_roman and not has_letter:
        # Definitions WITHOUT bullet points: keep the raw description.
        key_vals['def_values'] = row['def_description']
    elif has_roman and (not has_letter or roman_loc1 < letter_loc1):
        # Definitions with bullets: lowercase roman comes first.
        key_vals['def_values'] = {
            'opening_sec': row['def_description'][0:roman_loc1],
            'bullet_level1': row['roman_bullets'],
        }
    elif has_letter and (not has_roman or letter_loc1 < roman_loc1):
        # Definitions with bullets: lowercase letter comes first.
        key_vals['def_values'] = {
            'opening_sec': row['def_description'][0:letter_loc1],
            'bullet_level1': row['alpha_bullets'],
        }
    # Remaining case: both bullet types reported at the same non-zero offset. The
    # entry is stored without 'def_values', exactly as before; the old placeholder
    # value 5000 was overwritten on the very next line and has been dropped.

    def_dict[row['def_name']] = key_vals

In [27]:
# Sample of definitions with roman and letters coming first 
def_dict['Accounts']           # roman numerals first
def_dict['Clean-up Call Redemption Price']        # letters first 

{'roman_loc1': 3,
 'letter_loc1': 0,
 'def_values': {'opening_sec': '   ',
  'bullet_level1': [' (i) The  Payment  Account;',
   ' (ii) the  Collection  Account;',
   ' (iii) the \nRamp-Up Account',
   ' (iv) the Revolver Funding Account',
   ' (v) each Hedge Account (to the extent \npermitted under the related Hedge Agreement)',
   ' (vi) the Expense Reserve Account',
   ' (vii) the \nCustodial Account',
   ' (viii) the Ongoing Expense Maintenance Account',
   ' (ix) the Supplemental \nReserve Account',
   ' (x) the Contribution Account; an',
   ' (xi) the Tax Reserve Account. ']}}

{'roman_loc1': 0,
 'letter_loc1': 40,
 'def_values': {'opening_sec': '  An amount at least equal to the sum of',
  'bullet_level1': [' (a) the \nRedemption Price of the Secured Notes, plu',
   ' (b) the aggregate of all other amounts owing by the \nIssuer on the date of such redemption that are payable in accordance with the Priority of \nPayments prior to distributions in respect of the Subordinated Notes, including any amounts \npayable  in  respect  of  any  Hedge  Agreement  and  all  expenses  incurred  in  connection  with \neffecting the Clean-up Call Redemption. ']}}

In [31]:
# Flatten the definition dictionary into a table: with orient='index' each
# definition name becomes a row label and the entry keys become the columns.
# NOTE(review): this rebinds `df`, which earlier held the OH/Vib name comparison
# table; that earlier frame is no longer reachable under this name.
df = pd.DataFrame.from_dict(data=def_dict, orient='index')
df.head() 

Unnamed: 0,roman_loc1,letter_loc1,def_values
Acceleration Event,0,0,The meaning specified in the Acceleration Pr...
Acceleration Priority of Payments,0,0,The meaning specified in \nSection 11.1...
Account Agreement,0,0,An agreement in substantially the form of Ex...
Accountants’ Effective Date Comparison AUP Report,0,0,A report of agreed \nupon procedures perform...
Accountants’ Effective Date Recalculation AUP Report,0,0,A report of agreed \nupon procedures perfor...
