In [None]:
from bs4 import BeautifulSoup
import json
import re

from google.colab import drive

# Mount Google Drive at /content/drive/ so the corpus files kept on Drive can be read and written.
# NOTE(review): the paths used later are relative ("drive/MyDrive/..."), so this presumably
# relies on the working directory being /content — confirm.
drive.mount("/content/drive/")

In [None]:
# Input collection: one JSON list of documents per line (read by the HTML fix-up cell).
original_file_path = "drive/MyDrive/paragraph/corpus/full_technote_collection.txt"
# Copy of the collection with the known HTML problems patched (written by the fix-up cell, read by the section-extraction cell).
fixed_file_path = "drive/MyDrive/paragraph/corpus/full_technote_collection_fixed.txt"
# Documents with their <h2>-based sections attached (written by the section-extraction cell).
partial_out_path = "drive/MyDrive/paragraph/corpus/custom_sections.json"
# Final output: same documents, with long sections split into chunks of at most 512 words.
final_out_path = "drive/MyDrive/paragraph/corpus/custom_sections_512.json"

In [None]:
# Characters that receive a backslash prefix in escape() and lose it again in remove_escape()
TO_ESCAPE = ["(", ")", "+", "*", "?", "[", "]"]


# Helpers for escaping / de-escaping special characters

def escape(s):
    """Return `s` with backslashes doubled and the TO_ESCAPE characters backslash-prefixed.

    Before escaping, a single space directly after or directly before a
    backslash is dropped (in that order).
    """
    without_padding = s.replace("\\ ", "\\").replace(" \\", "\\")
    escaped = without_padding.replace("\\", "\\\\")
    for special in TO_ESCAPE:
        escaped = escaped.replace(special, "\\" + special)
    return escaped


def remove_escape(s):
    """Strip the backslash prefix from every TO_ESCAPE character, then halve doubled backslashes."""
    unescaped = s
    for special in TO_ESCAPE:
        unescaped = unescaped.replace("\\" + special, special)
    return unescaped.replace("\\\\", "\\")

In [None]:
# Resolving issues caused by wrong structure of some html pages
to_skip = ["swg21634896"]

# <h2> openings (keyed by document id) that are never closed in the page's HTML.
# A closing </h2> is appended right after each of them.
unclosed_h2 = {
    "isg3T1000214": '<h2 class=" ibm-h4 ibm-bold"><a name="1.8">Deliver the testcase via FTP</a>',
    "swg27011656": '<h2 class="ibm-rule ibm-h4 ibm-bold">DB2 9 for z/OS Product Documentation <br> <br>',
    "isg3T1000260": '<h2 class=" ibm-h4 ibm-bold"> <a name="4"></a> The shell script',
    "isg3T1000192": '<h2 class=" ibm-h4 ibm-bold"><a name="related">Related documentation</a>',
    "swg21298716": '<h2 class=" ibm-h4 ibm-bold">If you did not find your question about DB2 listed here then please use the &apos;Rate this page&apos; section below to send the team a question to add to the DB2 Fequently Asked Question list.',
}

# Ordered (old, new) literal replacements applied to document["content"], keyed by document id.
content_replacements = {
    "swg21672099": [
        ("&lt;/H2&gt;]", "</h2>"),
    ],
    "swg21651211": [
        ("<tt>&#xA0;", ""),
        ("</tt></h2>", "</h2>"),
    ],
    "swg22016151": [
        (
            '<h2 class=" ibm-h4 ibm-bold"><b>Data&#xA0;</b><b>r</b><b>equest&#xA0;</b><b>overview</b></h2>',
            '<h2 class=" ibm-h4 ibm-bold"><b>Datarequestoverview</b></h2>',
        ),
        (
            '<h2 class=" ibm-h4 ibm-bold"><b>Set</b><b>ting</b><b>&#xA0;up the&#xA0;</b><b>s</b><b>ystem</b></h2>',
            '<h2 class=" ibm-h4 ibm-bold"><b>Settingup thesystem</b></h2>',
        ),
        (
            '<h2 class=" ibm-h4 ibm-bold"><b>Test&#xA0;</b><b>your d</b><b>ata&#xA0;</b><b>r</b><b>equest</b></h2>',
            '<h2 class=" ibm-h4 ibm-bold"><b>Testyour datarequest</b></h2>',
        ),
    ],
}

# Ordered (old, new) literal replacements applied to document["text"], keyed by document id.
text_replacements = {
    "swg21652675": [
        ("* Windows", "* WINDOWS"),
        ("* Linux and UNIX", "* LINUX AND UNIX"),
    ],
}


def _apply_known_fixes(document):
    """Patch `document` in place with the hand-written fixes registered for its id.

    Documents whose id appears in none of the tables are returned untouched.
    Returns the same dict for convenience.
    """
    doc_id = document["id"]

    for old, new in text_replacements.get(doc_id, []):
        document["text"] = document["text"].replace(old, new)

    if doc_id in unclosed_h2:
        opening = unclosed_h2[doc_id]
        document["content"] = document["content"].replace(opening, opening + "</h2>")

    for old, new in content_replacements.get(doc_id, []):
        document["content"] = document["content"].replace(old, new)

    return document


# The output is opened with "w" rather than "a": re-running this cell must regenerate
# the fixed collection, not append a second copy of every document to it.
# The with-statement closes both files, so no explicit close() calls are needed.
with open(original_file_path, "r") as file, open(fixed_file_path, "w") as file_out:
    for line in file:
        line_data = [
            _apply_known_fixes(document)
            for document in json.loads(line)
            if document["id"] not in to_skip
        ]
        file_out.write(json.dumps(line_data) + "\n")

In [None]:
# Function for creating a section
def create_section(key, text, start_index, end_index):
    """Build the section dict for heading `key` covering text[start_index:end_index].

    The title is `key` passed through remove_escape(); "start" and "end" are
    the slice bounds that produced "text".
    """
    return {
        "title": remove_escape(key),
        "text": text[start_index:end_index],
        "start": start_index,
        "end": end_index,
    }

# Pattern for removing special chars with regex
pattern = r"\xa0"

with open(partial_out_path, "w") as file_out:
    with open(fixed_file_path, "r") as file:
        for line in file:
            line_data = []
            line_documents = json.loads(line)
            for document in line_documents:
                print("id:", document["id"])

                soup = BeautifulSoup(document["content"], "html.parser")
                # Replace all <br> with blank spaces
                for br in soup.find_all("br"):
                    br.replace_with(" ")

                document["content"] = str(soup)

                # Convert <a href="link.com"> link </a> --> link [link.com]
                # Anchors with no href, href "#", or an href without "/" are left as they are.
                for a_href in soup.find_all("a"):
                    href = a_href.get("href")
                    if href is None or href == "#" or "/" not in href:
                        continue
                    document["content"] = document["content"].replace(str(a_href), a_href.text + " [" + href + "]")

                keys = []
                # Re-parse: the string was edited above, so the first soup is stale
                soup = BeautifulSoup(document["content"], "html.parser")
                h2s = soup.find_all("h2")

                # Convert <h2> TITLE\n name   <b>of</b>  \tthe  (title) </h2> --> TITLE NAME OF THE \(TITLE\)
                for h2 in h2s:
                    h2_text = h2.get_text(separator = "")
                    h2_text = re.sub(pattern, " ", h2_text)
                    h2_text = re.sub(r"\s+", " ", h2_text)
                    h2_text = escape(h2_text)
                    h2_text = h2_text.strip()
                    h2_text = h2_text.upper()
                    if h2_text == "":
                        continue
                    keys.append(h2_text)

                if "SUBSCRIBE" in keys:
                    keys.remove("SUBSCRIBE")

                # Documents without any usable <h2> heading are dropped from the output
                if len(keys) == 0:
                    continue

                # Delete content after processing for saving memory
                del document["content"]
                document["text"] = re.sub(r"\s+", " ", document["text"])

                # Start of section creation part
                # Section i runs from where heading i was located (0 for the first one)
                # up to where heading i + 1 is located; the last one runs to the end of the text.
                sections = []
                start_index = 0
                for i in range(len(keys) - 1):
                    next_key = keys[i + 1]
                    # Search for occurrence of the next key, if not found search using regex
                    end_index = document["text"].find(next_key, start_index + 1)
                    if end_index == -1:
                        match = re.search(next_key, document["text"][start_index:], re.IGNORECASE)
                        if match is None:
                            # Fail with the document and heading instead of an
                            # AttributeError on None, so the page can be patched upstream.
                            raise ValueError(
                                "heading %r not found in the text of document %r"
                                % (next_key, document["id"])
                            )
                        # The match offset is relative to the slice; shift it back to the full text
                        end_index = match.start() + start_index

                    section = create_section(keys[i], document["text"], start_index, end_index)
                    start_index = end_index
                    sections.append(section)

                end_index = len(document["text"])
                section = create_section(keys[-1], document["text"], start_index, end_index)
                sections.append(section)

                document["sections"] = sections

                line_data.append(document)

            file_out.write(json.dumps(line_data) + "\n")


In [None]:
# Count the lines of the fixed collection and the documents they contain

tot_docs = 0
tot_lines = 0
with open(fixed_file_path, "r") as file:
    # enumerate keeps tot_lines equal to the number of lines read so far
    for tot_lines, line in enumerate(file, start=1):
        tot_docs += len(json.loads(line))

print("Tot docs: " + str(tot_docs))
print("Tot lines: " + str(tot_lines))

In [None]:
# Count the sections, how many of them need to be truncated (> 512 words),
# and the length in words of the longest one among those

max_len = 512
tot_sections = 0
len_section_truncated = []
with open(partial_out_path, "r") as file:
    for line in file:
        for document in json.loads(line):
            for section in document["sections"]:
                tot_sections += 1
                # Whitespace-split the text to get its word count
                n_words = len(section["text"].split())
                if n_words > max_len:
                    len_section_truncated.append(n_words)

cont_section_truncated = len(len_section_truncated)
# Only over-length sections are tracked, so this stays 0 when none exceeds max_len
len_longest_section = max(len_section_truncated, default=0)

print("cont_section_truncated: ", str(cont_section_truncated))
print("len_longest_section: ", str(len_longest_section))
print("tot_sections: ", str(tot_sections))

In [None]:
def create_section_512(original_title, title_cont, char_start_index, char_end_index, text, offset):
    """Build one chunk of a split section.

    The title is "<original_title>-<title_cont>", "text" is
    text[char_start_index:char_end_index], and "start"/"end" are those two
    indexes shifted by `offset`.
    """
    return {
        "title": "-".join((original_title, str(title_cont))),
        "start": offset + char_start_index,
        "end": offset + char_end_index,
        "text": text[char_start_index:char_end_index],
    }

In [None]:
max_word_len = 512


def _chunk_end(words, char_start_index, text_length):
    """Return the index just past `words` when they start at `char_start_index`.

    Each word is counted with one trailing separator. The result is capped at
    `text_length`, because the last word of a text may have no trailing space
    and the raw sum would then point one character past the end.
    """
    char_end_index = char_start_index + sum(len(word) + 1 for word in words)
    return min(char_end_index, text_length)


def _split_section(section, max_words, min_tail_words=10):
    """Split `section` into chunks of at most `max_words` words.

    A section that already fits is returned unchanged, as a one-element list.
    Otherwise the words are cut every `max_words`; when the leftover tail would
    hold fewer than `min_tail_words` words, the cutting stops one chunk earlier
    and the remaining words are divided into two halves instead.
    Chunk titles are "<title>-<n>" and start/end are offsets in the document text.

    NOTE(review): character offsets are derived from word lengths, which assumes
    the words are separated by exactly one whitespace character — confirm the
    upstream whitespace normalisation always ran.
    """
    section_text = section["text"]
    words = section_text.split()
    n_words = len(words)
    if n_words <= max_words:
        return [section]

    n_chunks = n_words // max_words + 1
    tail_too_short = n_words % max_words < min_tail_words
    n_regular_chunks = n_chunks - 2 if tail_too_short else n_chunks

    word_groups = [
        words[k * max_words:(k + 1) * max_words]
        for k in range(n_regular_chunks)
    ]
    if tail_too_short:
        # Halve whatever is left (the last full chunk plus the short tail)
        remaining = words[n_regular_chunks * max_words:]
        cut_point = len(remaining) // 2
        word_groups.append(remaining[:cut_point])
        word_groups.append(remaining[cut_point:])

    chunks = []
    text_length = len(section_text)
    char_start_index = 0
    for title_cont, group in enumerate(word_groups):
        char_end_index = _chunk_end(group, char_start_index, text_length)
        chunks.append(
            create_section_512(section["title"], title_cont, char_start_index, char_end_index, section_text, section["start"])
        )
        char_start_index = char_end_index
    return chunks


with open(final_out_path, "w") as file_out:
    with open(partial_out_path, "r") as file:
        for line in file:
            line_documents = json.loads(line)
            for document in line_documents:
                sections = []
                for section in document["sections"]:
                    # Divide sections in chunks of max_word_len words max
                    sections.extend(_split_section(section, max_word_len))
                document["sections"] = sections

            file_out.write(json.dumps(line_documents) + "\n")

In [None]:
# Count the sections in the final (chunked) output
tot_sections = 0
with open(final_out_path, "r") as file:
    for line in file:
        tot_sections += sum(len(document["sections"]) for document in json.loads(line))

print("tot_sections: ", str(tot_sections))