In [None]:
import os
import re
import json
import random
import requests
from io import StringIO

import yaml
from markdown import Markdown
from bs4 import BeautifulSoup
import tiktoken
from tqdm import tqdm
import plotly.express as px
import seaborn as sns

In [None]:
# Pattern for a "<N> min read" marker, optionally followed by the literal
# " - updated few hours ago" suffix.
regex = re.compile(r"\d+ min read(?: - updated few hours ago)?")
# Sanity check: the whole sample string matches, so this evaluates to ''.
regex.sub('', '3 min read - updated few hours ago')

In [None]:
def pprint_json(text: object) -> None:
    """Print ``text`` serialized as JSON with a 4-space indent.

    Despite the parameter name, the argument is not limited to strings: it is
    handed straight to ``json.dumps``, and this notebook calls the function
    with the list returned by ``create_tree``.

    Args:
        text: Any value ``json.dumps`` can serialize.

    Raises:
        TypeError: If ``text`` contains a value ``json.dumps`` cannot serialize.
    """
    print(json.dumps(text, indent=4))

def delete_html_tags(text):
    """Replace every ``<...>`` span with a space, then collapse runs of spaces.

    Only the space character is collapsed; newlines and tabs are left alone.
    The tag pattern does not cross line breaks, so a tag split over several
    lines is not removed.
    """
    without_tags = re.compile(r'<.*?>').sub(' ', text)
    # Splitting on a single space yields '' between consecutive spaces;
    # dropping those empties is what squeezes the runs.
    words = filter(None, without_tags.split(' '))
    return ' '.join(words)

def delete_html_comments(text):
    """Replace every ``<!-- ... -->`` comment with a space and collapse spaces.

    Comments may span several lines. Only runs of the space character are
    collapsed afterwards; newlines are preserved.

    Args:
        text: String that may contain HTML comments.

    Returns:
        ``text`` without comments and without repeated/leading/trailing spaces.
    """
    # re.DOTALL lets '.' match newlines, which is what multi-line comments
    # need. (re.MULTILINE only changes '^'/'$' and has no effect on this
    # pattern, and a '(.|\n)' alternation is a slower way to spell DOTALL.)
    comment_regex = re.compile(r'<!--.*?-->', re.DOTALL)
    t = comment_regex.sub(' ', text)
    return ' '.join(s for s in t.split(' ') if s)

def get_text_from_md_link(text):
    """Strip simple markdown markup, keeping the label of inline links.

    ``[label](target)`` becomes ``label``; every ``__``, ``*`` and ``#`` is
    removed wherever it occurs (not only when used as emphasis or headers).
    """
    md_markup = re.compile(r'__|\*|\#|(?:\[([^\]]*)\]\([^)]*\))')
    # Group 1 is only set for links; for the other alternatives the
    # back-reference expands to an empty string, i.e. the match is deleted.
    return md_markup.sub(r'\1', text)

def clear_text(text):
    """Strip HTML comments, HTML tags and simple markdown markup from ``text``.

    After stripping, every line is trimmed and empty lines are dropped.

    Args:
        text: Raw markdown/HTML string.

    Returns:
        The cleaned text with one non-empty, trimmed line per output line.
    """
    # Start from the ``text`` argument; the first step previously read the
    # not-yet-assigned local ``t`` and raised UnboundLocalError on every call.
    # Comments are removed before tags so that a '>' inside a comment cannot
    # make the tag pattern cut the comment in half and leave a dangling '-->'.
    t = delete_html_comments(text)
    t = delete_html_tags(t)
    t = get_text_from_md_link(t)
    trimmed_lines = (line.strip() for line in t.split('\n'))
    return '\n'.join(line for line in trimmed_lines if line)

In [None]:
# Load the preprocessed documentation objects. The file is read explicitly as
# UTF-8 so the result does not depend on the platform's default encoding.
with open('../preprocessed_data/docs_objects.json', encoding='utf-8') as f:
    docs = json.load(f)

In [None]:
# Preview only the first few documents; dumping the whole corpus floods the
# cell output and bloats the saved notebook.
print(json.dumps(docs[:3], indent=4))

In [None]:
# Tokenizer that tiktoken associates with the "gpt-3.5-turbo" model; used in
# the next cell to count tokens per document.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

In [None]:
# Encode each document's 'text_content' and record how many tokens it yields.
tokenized_docs = []
for doc in tqdm(docs):
    tokenized_docs.append(encoding.encode(doc['text_content']))
token_lengths = [len(doc_tokens) for doc_tokens in tokenized_docs]

In [None]:
# Distribution of document lengths in tokens. Labels make the figure readable
# on its own; the trailing ';' hides the return value of Axes.set() in the
# cell output.
sns.histplot(token_lengths).set(
    xlabel='Tokens per document',
    ylabel='Number of documents',
    title='Document length distribution',
);

In [None]:
# Pick one document to inspect by hand. The commented-out line is the random
# alternative; the active line pins a specific page by its 'doc_url'.
# The trailing [0] raises IndexError if no document has that URL.
# doc = random.choice(docs)
doc = [doc for doc in docs if doc['doc_url'] == 'https://docs.revenuegrid.com/ri/fast/articles/Frequently-Asked-Questions'][0]
doc

In [None]:
# Render the selected document's markdown ('full_content') to HTML, parse it
# and pretty-print the tree to inspect its structure.
# NOTE(review): no parser is passed to BeautifulSoup, so the parsed tree
# presumably depends on which parser is installed — confirm.
soup = BeautifulSoup(Markdown(output_format='html').convert(doc['full_content']))
print(soup.prettify())

In [None]:
def split_doc_by_sections(doc: dict):
    """Split a document's markdown into a flat list of header-delimited sections.

    ``doc['full_content']`` is rendered to HTML and parsed. Every header tag
    (``h`` followed by one digit) that is a direct child of the parsed body
    starts a section; its text is the stripped text of the following siblings
    up to the next header, one sibling per line.

    Content that precedes the first header is not collected, and headers
    nested inside other elements do not start sections.

    Args:
        doc: Mapping with a ``'full_content'`` markdown string.

    Returns:
        List of ``{'sec_level': int, 'sec_title': str, 'sec_text': str}``
        dicts in document order; empty when there are no top-level headers.
    """
    html = Markdown(output_format='html').convert(doc['full_content'])
    # NOTE(review): no parser is named, so bs4 picks whichever one is
    # installed; kept as-is to avoid changing how existing docs are parsed.
    soup = BeautifulSoup(html)
    # Not every parse yields a <body> (e.g. empty content, or a parser that
    # does not add one); fall back to the document root instead of iterating
    # over None.
    container = soup.find('body')
    if container is None:
        container = soup
    header_regex = re.compile(r'h\d')

    def is_header(node):
        # Text nodes have no tag name and can never be headers.
        return node.name is not None and header_regex.fullmatch(node.name) is not None

    doc_sections = []
    for el in container.children:
        if not is_header(el):
            continue
        text_parts = []
        for sib in el.next_siblings:
            if is_header(sib):
                break
            if sib.name in ('style', 'comment', 'script'):
                continue
            sib_text = sib.text.strip()
            if sib_text:
                text_parts.append(sib_text + '\n')
        doc_sections.append({'sec_level': int(el.name[-1]),
                             'sec_title': el.text,
                             'sec_text': ''.join(text_parts)})
    return doc_sections

In [None]:
# Split every document into its list of header-delimited sections
# (one list per document, in the same order as `docs`).
sections = [split_doc_by_sections(doc) for doc in docs]
len(sections)

In [None]:
# Count docs whose first section is an h2 with no body text.
# `secs and` skips docs that produced no sections instead of raising IndexError.
sum(1 for secs in sections
    if secs and secs[0]['sec_level'] == 2 and secs[0]['sec_text'] == '')

In [None]:
# Count docs whose second section is an h1.
sum(1 for secs in sections if len(secs) > 1 and secs[1]['sec_level'] == 1)

In [None]:
# List docs whose second section is an h1 while the first section is not an h2.
[secs for secs in sections if len(secs) > 1 and secs[1]['sec_level'] == 1 and secs[0]['sec_level'] != 2]

In [None]:
# List docs whose first section is an h2 that does carry body text.
# `secs and` skips docs that produced no sections instead of raising IndexError.
[secs for secs in sections
 if secs and secs[0]['sec_level'] == 2 and secs[0]['sec_text'] != '']

In [None]:
# Count docs that start with an h1 (before the h2/h1 merge below).
# `secs and` skips docs that produced no sections instead of raising IndexError.
sum(1 for secs in sections if secs and secs[0]['sec_level'] == 1)

In [None]:
# NOTE(review): hard-coded numbers, presumably copied from the outputs of the
# counting cells above — confirm, and prefer computing the sum from variables.
142+196

In [None]:
def merge_first_h2_with_h1(sections: list):
    """Fold the leading section's text into the section that follows it.

    The first section is dropped; its ``'sec_text'`` is prepended (followed by
    a newline) to the text of the second section. Neither the input list nor
    its dicts are modified.

    Args:
        sections: Section dicts with at least a ``'sec_text'`` key.

    Returns:
        A new list starting with the merged second section, followed by the
        remaining sections. With fewer than two sections there is nothing to
        merge into, so a shallow copy of the input is returned.
    """
    if len(sections) < 2:
        return list(sections)
    lead, target = sections[0], sections[1]
    # Copy before editing so the caller's section dict is left untouched.
    merged = dict(target)
    merged['sec_text'] = lead['sec_text'] + '\n' + target['sec_text']
    return [merged] + list(sections[2:])

In [None]:
# Fold a leading h2 into the section after it. The length check keeps docs
# with fewer than two sections as they are (there is nothing to merge into).
# NOTE: this rebinds `sections`, so re-running the cell merges again for any
# doc whose new first section is an h2.
sections = [
    merge_first_h2_with_h1(secs) if len(secs) > 1 and secs[0]['sec_level'] == 2 else secs
    for secs in sections
]

In [None]:
# Spot-check one randomly chosen document's sections after the merge.
# No seed is set in the visible cells, so the pick changes between runs.
random.choice(sections)

In [None]:
# Count docs that contain at least one section with empty text.
sum(any(s['sec_text'] == '' for s in sec) for sec in sections)

In [None]:
# Count docs that start with an h1 (after the h2/h1 merge above).
# `secs and` skips docs that have no sections instead of raising IndexError.
sum(1 for secs in sections if secs and secs[0]['sec_level'] == 1)

In [None]:
def create_tree(nodes):
    """Nest a flat, ordered list of sections into a tree by header level.

    Each section becomes a child of the closest preceding section with a
    strictly smaller ``'sec_level'``; sections with no such predecessor become
    top-level entries.

    Args:
        nodes: Section dicts with ``'sec_level'``, ``'sec_title'`` and
            ``'sec_text'`` keys, in document order.

    Returns:
        List of top-level nodes. Every node is a new dict with ``'sec_level'``,
        ``'sec_title'``, ``'sec_text'`` and a ``'children'`` list.
    """
    root = {'children': []}
    # Stack of (node, level) for the currently open ancestors; the root
    # sentinel sits at level 0 and must never be popped.
    stack = [(root, 0)]

    for node in nodes:
        level = node['sec_level']
        # Close every open section that is not shallower than this one. The
        # length guard keeps the root in place even for a level <= 0.
        while len(stack) > 1 and stack[-1][1] >= level:
            stack.pop()

        parent = stack[-1][0]
        child = {
            # Kept on the node because merge_nodes compares sibling levels.
            'sec_level': level,
            'sec_title': node['sec_title'],
            'sec_text': node['sec_text'],
            'children': [],
        }
        parent['children'].append(child)
        stack.append((child, level))

    return root['children']

In [None]:
# Flat sections of the second document; the next cell turns them into a tree.
sections[1]

In [None]:
# Build the nested section tree for the second document and pretty-print it.
pprint_json(create_tree(sections[1]))

In [None]:
def merge_nodes(tree, threshold):
    """Merge sibling sections bottom-up when all of them are short.

    Children are processed first. Then, if a sibling list has more than one
    node, all nodes share the same ``'sec_level'`` and every ``'sec_text'`` is
    at most ``threshold`` characters long, the siblings are replaced by one
    node that keeps the first title, joins the texts with a space and adopts
    the (already merged) children of all siblings in order.

    The nodes of ``tree`` are updated in place (their ``'children'`` lists are
    replaced).

    Args:
        tree: List of node dicts with ``'sec_title'``, ``'sec_text'`` and
            ``'children'``; ``'sec_level'`` is optional and treated as equal
            across nodes when absent.
        threshold: Maximum ``len(sec_text)`` for a node to be mergeable.

    Returns:
        The merged list of nodes, or ``None`` when ``tree`` is empty or None.
    """
    if not tree:
        return None

    # Recursively merge child nodes first. `or []` keeps leaf 'children' as a
    # list instead of overwriting it with the None returned for empty input.
    for node in tree:
        node['children'] = merge_nodes(node.get('children'), threshold) or []

    # Check if all siblings can be merged into one node.
    if len(tree) > 1:
        # .get(): some node dicts do not carry 'sec_level'; a plain lookup
        # would raise KeyError on them.
        first_level = tree[0].get('sec_level')
        all_same_level = all(node.get('sec_level') == first_level for node in tree)
        all_mergeable = all(len(node['sec_text']) <= threshold for node in tree)
        if all_same_level and all_mergeable:
            merged_node = {
                'sec_title': tree[0]['sec_title'],
                'sec_text': ' '.join(node['sec_text'] for node in tree),
                # Carry the subsections over; dropping them would silently
                # lose their text.
                'children': [child for node in tree for child in node['children']],
            }
            if first_level is not None:
                merged_node['sec_level'] = first_level
            return [merged_node]

    return tree

In [None]:
subdocs = []
for doc in docs:
    subdoc = {
        'doc_title': doc['title'],
        'site_name': doc['site_name'],
        'short_site_name': doc['short_site_name'],
        'doc_url': doc['doc_url'],
        'doc_filepath': doc['doc_filepath'],
        'nav_keys_list': doc['nav_keys_list']
    }
    if not doc['subtopics']:
        subdoc['topic_title'] = doc['title']