In [1]:
from bs4 import BeautifulSoup
import re
import csv

In [11]:
# Input: the HTML export of the document; it is opened with cp1252 encoding
# and parsed with BeautifulSoup in the loading cell.
html_path = 'NOXSOXPM-ISA-FINAL-SEP-2020.html'
# Output of the extraction cell (one row per citation link); this raw table is
# later passed to reformat() as its input.
csv_path = 'nox_2020_citation_table_raw.csv'

In [12]:
# Read the HTML export (Windows-1252 encoded) and parse it into a tree.
# The context manager guarantees the handle is closed even if reading fails.
with open(html_path, 'r', encoding='cp1252') as f:
    contents = f.read()

soup = BeautifulSoup(contents, 'html.parser')

# NOTE: an earlier version called soup.prettify(formatter=...) here to turn
# non-breaking spaces into plain spaces. prettify() only *returns* a formatted
# string and does not modify the parsed tree, and the return value was
# discarded, so the call had no effect and has been removed. Non-breaking
# spaces are still handled downstream: str.split() and the regex class \s both
# treat U+00A0 as whitespace.

In [13]:
def is_heading(s):
    """Return True when the raw text ``s`` looks like a section heading.

    A string counts as a heading when any of the following holds:

    * it starts with ``IS.``, ``ES.``, ``PREFACE`` or ``APPENDIX``;
    * it starts with ``EXECUTIVE`` and contains ``SUMMARY``;
    * it starts with ``INTEGRATED`` and contains ``SYNTHESIS``;
    * it starts with digits followed by a dot and, later on, contains a run of
      at least five whitespace characters.

    The check is whitespace-sensitive, so pass the unnormalized text.
    """
    fixed_prefixes = ('IS.', 'ES.', 'PREFACE', 'APPENDIX')
    if s.startswith(fixed_prefixes):
        return True

    # Two-word front-matter titles: first word anchors, second may follow anywhere.
    for first_word, second_word in (('EXECUTIVE', 'SUMMARY'), ('INTEGRATED', 'SYNTHESIS')):
        if s.startswith(first_word) and second_word in s:
            return True

    # Numbered heading, e.g. "1.2 Title" followed by a wide whitespace gap.
    numbered = re.search(r'^\d+\..+\s{5,7}', s)
    return numbered is not None

In [14]:
def process_text(t):
    """Normalize typographic characters and whitespace in ``t``.

    Curly quotes become straight quotes, the dash-like characters listed below
    become ``-``, the multiplication sign becomes ``x``, and the middle dot is
    removed. Afterwards every run of whitespace is collapsed to one space and
    the result is stripped at both ends.
    """
    # (character to find, replacement) pairs, applied in this order.
    substitutions = (
        ('‘', "'"),
        ('’', "'"),
        ('“', '"'),
        ('”', '"'),
        ('‑', '-'),
        ('·', ''),
        ('־', '-'),
        ('−', '-'),
        ('×', 'x'),
    )
    for found, replacement in substitutions:
        t = t.replace(found, replacement)

    collapsed = re.sub(r'\s+', ' ', t)
    return collapsed.strip()

In [18]:
# Walk every paragraph/heading tag in document order, keep track of the current
# heading hierarchy, and write one CSV row per HERO citation link.
#
# Fixes relative to the earlier version of this cell:
#   * subtag_text is computed BEFORE it is used to build anchor_text (it used
#     to be assigned after the row logic, so the first citation raised
#     NameError on a fresh kernel and later ones used the previous link's text).
#   * For PREFACE all sub-levels are now really written as '-'; previously the
#     Level3..Level6 cascade ran unconditionally and overwrote that reset.
#   * The output file is managed by a `with` block so it is always closed.

levels = ['-'] * 6  # levels[0] is Level1 ... levels[5] is Level6; '-' means "not set"
i = 0  # running instance counter; incremented for every HERO reference link

with open(csv_path, 'w', encoding='utf-8', newline='') as f:
    csv_r = csv.writer(f)
    csv_r.writerow(['INSTANCE_ID', 'REFERENCE_ID', 'Level1', 'Level2', 'Level3',
                    'Level4', 'Level5', 'Level6', 'AnchorText', 'ContextParagraph'])

    for tag in soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'h8']):
        tag_text = process_text(tag.text)

        # Heading depth = number of dot-separated parts in the first token of
        # the raw text ("PREFACE" -> 1, "1.2" -> 2, "1.2.3" -> 3, ...).
        # Depths above 6 are ignored.
        if is_heading(tag.text):
            depth = len(tag.text.split()[0].split('.'))
            if depth <= 6:
                levels[depth - 1] = tag_text
        level1, level2, level3, level4, level5, level6 = levels

        # Per-paragraph state used to merge consecutive links to the same reference.
        pre_hero_id = ""
        pre_tag_text = ""
        anchor_text = ""

        for subtag in tag.descendants:  # write out all citations in the current paragraph
            if subtag.name != 'a' or 'href' not in subtag.attrs:
                continue
            href = subtag.attrs['href']
            if 'hero' not in href or 'reference_id' not in href:
                continue

            i += 1
            hero_id = href.split('=')[-1]
            if re.search(r'\d+', hero_id) is None:
                continue  # no numeric reference id in the link

            subtag_text = process_text(subtag.text)
            if pre_hero_id == hero_id and tag_text == pre_tag_text:
                # Same reference as the previous link in this paragraph:
                # extend the anchor text instead of starting over.
                anchor_text += subtag_text
            else:
                anchor_text = subtag_text

            if level1 == 'PREFACE':
                # Chapter, section, and subsections were from the TOC before PREFACE.
                shown = ['-'] * 5
            else:
                # shown[0] is Level2 ... shown[4] is Level6. A deeper level is
                # reported only while its text starts with the number token of
                # its parent; the first mismatch blanks that level and all
                # levels below it.
                shown = [level2, '-', '-', '-', '-']
                for k in range(1, 5):
                    parent, child = levels[k], levels[k + 1]
                    if not child.startswith(parent.split()[0]):
                        break
                    shown[k] = child

            csv_r.writerow([str(i), hero_id, level1] + shown + [anchor_text, tag_text])
            pre_hero_id = hero_id
            pre_tag_text = tag_text

## Reformat: remove superseded partial-anchor rows and split heading numbers from heading text

In [19]:
def reformat(input_path, output_path):
    """Rewrite the raw citation CSV into the final citation table.

    Reads ``input_path`` (UTF-8 CSV with a header row and ten columns:
    instance id, reference id, Level1..Level6, anchor text, context paragraph)
    and writes ``output_path`` with:

    * rows dropped when the *following* row has the same reference id, an
      anchor text that starts with this row's anchor text, and the same first
      20 characters of context (i.e. the following row supersedes this one);
    * instance ids renumbered from 1 over the rows that are kept;
    * each heading split into a number column and a text column, plus a
      ``level_all_num`` column holding the number token of the deepest heading
      level that is set.

    Unset levels are written as ``'.'`` (number) and ``''`` (text).

    Fixes relative to the earlier version:

    * the last row is no longer compared against stale "next row" values (which
      made it look like a duplicate of itself, so it was always dropped, and
      raised NameError for a single-row input);
    * ``level_all_num`` defaults to ``'.'`` when no level is set instead of
      raising NameError or reusing the previous row's value;
    * both files are closed even when an error occurs.
    """
    header = ['INSTANCE_ID', 'REFERENCE_ID']
    header += ['level_all_num', 'level1_num', 'level2_num', 'level3_num', 'level4_num', 'level5_num', 'level6_num']
    header += ['level1_char', 'level2_char', 'level3_char', 'level4_char', 'level5_char', 'level6_char']
    header += ['ANCHOR_TEXT', 'CONTEXT_PARAGRAPH']

    with open(input_path, 'r', encoding='utf-8', newline='') as in_f:
        reader = csv.reader(in_f)
        next(reader, None)  # skip header
        rows = list(reader)

    with open(output_path, 'w', encoding='utf-8', newline='') as out_f:
        csv_w = csv.writer(out_f)
        csv_w.writerow(header)

        i = 1  # instance id for the next row that is kept
        for index, row in enumerate(rows):
            _, reference_id, *levels, anchor_text, context = row
            if len(levels) != 6:
                raise ValueError('expected 10 columns, got %d' % len(row))

            # Only compare with the following row when there is one.
            if index + 1 < len(rows):
                following = rows[index + 1]
                next_ref_id, next_anchor, next_context = following[1], following[8], following[9]
                if (reference_id == next_ref_id
                        and next_anchor.startswith(anchor_text)
                        and context[:20] == next_context[:20]):
                    continue

            nums = ['.'] * 6
            txts = [''] * 6
            section_num = '.'

            level1 = levels[0]
            if level1 != '-':
                if level1 in ('PREFACE', 'EXECUTIVE SUMMARY', 'INTEGRATED SYNTHESIS'):
                    nums[0] = '0'
                    txts[0] = level1
                else:
                    # e.g. "APPENDIX 1 TITLE" -> number "1", text "APPENDIX 1 | TITLE"
                    parts = level1.split(' ', 2)
                    nums[0] = parts[1]
                    txts[0] = ' '.join(parts[:2] + ['|'] + parts[2:])
                section_num = nums[0]

            # Level2..Level6 look like "<dotted number> <title>"; the level's own
            # number is the component at the same position as its depth.
            for depth in range(1, 6):
                heading = levels[depth]
                if heading == '-':
                    continue
                number, title = heading.split(' ', 1)
                nums[depth] = number.split('.')[depth]
                txts[depth] = title
                section_num = number  # deepest level set so far wins

            csv_w.writerow([str(i), reference_id, section_num] + nums + txts + [anchor_text, context])
            i += 1

In [20]:
# Path of the final citation table produced from the raw table at csv_path.
new_csv_path = 'nox_2020_citation_table.csv'
# Convert the raw extraction output into the final table (writes new_csv_path).
reformat(csv_path, new_csv_path)