In [1]:
import re

In [2]:
html_file = 'antonkulaga.vcf.longevity_combined.html'

In [3]:
# load code of original report
with open(html_file, encoding='UTF-8') as fh:
    st_all = fh.read()

<br />

In [4]:
# extract fragment of HTML-code with table "Longevity significant variations"
table_1_begin = st_all.find('		<tbody id="longTBody">')
table_2_begin = st_all.find('		<tbody id="cancTBody">')
st_1 = st_all[table_1_begin:table_2_begin]

In [5]:
# cleaning of the table "Longevity significant variations" from duplicates
def process_table_1(st: str):
    fragments = []                    # collection of unique fragments
    end_of_preliminary_fragment = -1  # position of end of fragment of HTML code before fragments with records
    start_of_closing_fragment = -1    # position of start of fragment of HTML code after fragments with records
    position_begin = 0                # position of start of current fragment
    position_end = 0                  # position of end of current fragment

    i = 0
    while position_begin != -1:
        position_begin = st.find('			<tr class="clickable" onclick="doOnClick(this)">', position_end)  # find position of start of current fragment
        
        i += 1
        
        if end_of_preliminary_fragment == -1:
            end_of_preliminary_fragment = position_begin
            
        if position_begin == -1:
            start_of_closing_fragment = position_end
            break
        
        position_end = st.find('			</td></tr>', position_begin) + 13   # find position of end of current fragment
        
        found_fragment = st[position_begin:position_end] # extract fragment
        if found_fragment not in fragments:    # if fragment is absent in collection, add it to collection
            fragments.append(found_fragment)

    print(f"end_of_preliminary_fragment: {end_of_preliminary_fragment}, start_of_closing_fragment: {start_of_closing_fragment}")
    print(f"number of fragments in original: {i}, number of fragments after processing: {len(fragments)}")
    st = st[:end_of_preliminary_fragment] + '\n'.join(fragments) + st[start_of_closing_fragment:]  # assemble new table from fragments
    return st        


print(f"Length of fragment before processing: {len(st_1)}")
st_1 = process_table_1(st_1)
print(f"Length of fragment after processing : {len(st_1)}")
# print(st_1[:1000])

Length of fragment before processing: 1058968
end_of_preliminary_fragment: 25, start_of_closing_fragment: 1058501
number of fragments in original: 725, number of fragments after processing: 186
Length of fragment after processing : 274824


<br />

In [6]:
# extract fragment of HTML-code with table "Cancer Report"
st_2 = st_all[table_2_begin:]

In [7]:
# cleaning of the table "Cancer Report" from duplicates
def process_table_2(st: str):
    fragments = []                    # collection of unique fragments
    end_of_preliminary_fragment = -1  # position of end of fragment of HTML code before fragments with records
    start_of_closing_fragment = -1    # position of start of fragment of HTML code after fragments with records
    position_begin = 0                # position of start of current fragment
    position_end = 0                  # position of end of current fragment

    i = 0
    while position_begin != -1:
        # find position of start of current fragment
        position_begin_obj = re.search('			<tr class="clickable" onclick="doOnClick\(this\)"><td>\+</td><td>\d+</td><td>', st_2[position_end:])
        if position_begin_obj:
            position_begin = position_begin_obj.span()[0] + position_end
        else:
            start_of_closing_fragment = position_end
            break

        i += 1

        if end_of_preliminary_fragment == -1:
            end_of_preliminary_fragment = position_begin            
        
        position_end = st.find(']</td></tr>\n', position_begin) + 12    # find position of end of current fragment
        
        found_fragment = st[st.find('</td><td>chr', position_begin):position_end]  # extract fragment, reducint it, - so that it did not contain number of record

        if found_fragment not in fragments:    # if fragment is absent in collection, add it to collection
            fragments.append(found_fragment)

    print(f"end_of_preliminary_fragment: {end_of_preliminary_fragment}, start_of_closing_fragment: {start_of_closing_fragment}")
    print(f"number of fragments in original: {i}, number of fragments after processing: {len(fragments)}")

    for i in range(len(fragments)):   # restore fragments till full form but with new numbers of records
        fragments[i] = '			<tr class="clickable" onclick="doOnClick(this)"><td>+</td><td>' + str(i+1) + fragments[i]
    
    st = st[:end_of_preliminary_fragment] + '\n'.join(fragments) + st[start_of_closing_fragment:]    # assemble new table from fragments
    return st


print(f"Length of fragment before processing: {len(st_2)}")
st_2 = process_table_2(st_2)
print(f"Length of fragment after processing : {len(st_2)}")

Length of fragment before processing: 138511
end_of_preliminary_fragment: 25, start_of_closing_fragment: 138463
number of fragments in original: 117, number of fragments after processing: 37
Length of fragment after processing : 43732


In [8]:
# write cleaned report to new html-file
with open(html_file[:-5] + ' (cleaned)' + html_file[-5:], 'w', encoding='UTF-8') as fh:
    fh.write(st_all[:table_1_begin] + st_1 + st_2)