In [None]:
import os
import re
import json

from bs4 import BeautifulSoup
from markdown import Markdown
import langchain as lc
import tiktoken
import seaborn as sns
import requests

In [None]:
# Tokenizer that tiktoken associates with the "gpt-3.5-turbo" model; len_func
# below uses it to measure text length in tokens.
gpt_turbo_encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

def delete_redundant_newlines(text: str):
    """Trim every line of ``text`` and drop the lines that end up empty.

    The input is split on newline characters, each piece is stripped of
    surrounding whitespace, pieces that are empty after stripping are
    discarded, and the rest is joined back with single newlines.
    """
    trimmed_lines = (line.strip() for line in text.split('\n'))
    return '\n'.join(filter(None, trimmed_lines))

def text_from_html(html_text):
    """Return the text content of an HTML string as compact ASCII lines.

    The markup is parsed with the lxml parser, named explicitly (as the other
    parser-qualified BeautifulSoup call in this notebook does) so that bs4
    does not have to guess a parser and the result does not depend on which
    parsers happen to be installed.

    Args:
        html_text: HTML markup to extract text from.

    Returns:
        The extracted text with non-ASCII characters removed, every line
        stripped, and empty lines dropped.
    """
    soup = BeautifulSoup(html_text, 'lxml')
    # errors='ignore' silently drops every character that is not ASCII.
    ascii_text = soup.get_text().encode('ascii', 'ignore').decode()
    return delete_redundant_newlines(ascii_text)

def len_func(text):
    """Count the tokens in the plain text extracted from an HTML snippet.

    ``text`` is first reduced to plain text with ``text_from_html``; that
    text is encoded with ``gpt_turbo_encoding`` and the number of resulting
    tokens is returned.
    """
    plain_text = text_from_html(text)
    token_ids = gpt_turbo_encoding.encode(plain_text)
    return len(token_ids)

In [None]:
# Load the document objects. The encoding is passed explicitly because open()
# otherwise falls back to the platform's locale encoding, which is not
# guaranteed to be UTF-8.
with open('../preprocessed_data/docs_objects.json', encoding='utf-8') as f:
    docs = json.load(f)

In [None]:
# Use the first document whose content contains a '<style' tag as the sample.
# Indexing with [0] raises IndexError when no document matches.
docs_with_style = [doc for doc in docs if '<style' in doc['full_content']]
d = docs_with_style[0]

# Render the document's Markdown to HTML, parse it, and show the text.
markdown_html = Markdown(output_format='html').convert(d['full_content'])
html_doc = BeautifulSoup(markdown_html, 'lxml')
print(html_doc.get_text())

In [None]:
# Fetch the live page of the sample document, sending a browser-like
# User-Agent header with the request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}
# Without a timeout, requests waits indefinitely on an unresponsive server.
r = requests.get(d['doc_url'], headers=headers, timeout=30)
print(d['doc_url'])
print(r.status_code)
# On any status other than 200, html_doc keeps the value it already has.
if r.status_code == 200:
    # Name the parser explicitly so bs4 does not have to guess one.
    article = BeautifulSoup(r.content.decode(), 'lxml').find('article')
    if article is not None:
        html_doc = article
    else:
        # find() returns None when the page has no <article>; do not let that
        # overwrite html_doc and break the cells that use it afterwards.
        print('No <article> element found; keeping the existing html_doc')

In [None]:
# Show the text content of the current html_doc.
print(html_doc.get_text())

In [None]:
# Parse the fetched page (parser named explicitly so bs4 does not guess one)
# and show its <title> element; find() returns None if there is none.
BeautifulSoup(r.content.decode(), 'lxml').find('title')

In [None]:
# Parse the fetched page (parser named explicitly so bs4 does not guess one)
# and show the first <h1> inside its <article> element.
BeautifulSoup(r.content.decode(), 'lxml').find('article').find('h1')

In [None]:
# html_doc is a bs4 object, while len_func expects HTML markup text, so
# serialise it back to an HTML string before measuring its token length.
len_func(str(html_doc))

In [None]:
# Split the HTML on heading tags. The first argument is the separator list;
# chunk_size is measured with len_func, i.e. in tokens of the extracted text.
splitter = lc.text_splitter.RecursiveCharacterTextSplitter(
    ["<h1", "<h2", "<h3", "<h4", "<h5", "<h6"],
    chunk_size=400,
    chunk_overlap=0,
    length_function=len_func,
    keep_separator=True,
)
# splitter = lc.text_splitter.RecursiveCharacterTextSplitter.from_language('html', chunk_size=400, chunk_overlap=0, length_function=len_func, keep_separator=True)

# The splitter works on strings, but html_doc is a bs4 object: serialise it to
# HTML markup first.
html_chunks = splitter.split_text(str(html_doc))
# NOTE(review): _merge_splits is a private langchain method; presumably it
# re-packs adjacent pieces, joined with '\n', up to chunk_size. Verify against
# the installed langchain version.
merged_chunks = splitter._merge_splits(html_chunks, '\n')

# Reduce every HTML chunk to plain text.
splits = list(map(text_from_html, merged_chunks))
splits

In [None]:
# Token length of every chunk produced by the splitter.
[len_func(chunk) for chunk in splits]

In [None]:
from tqdm import tqdm

# Request every document URL and report the ones that do not answer with 200.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36'}
for d in tqdm(docs):
    try:
        # Without a timeout a single unresponsive server would stall the loop.
        r = requests.get(d['doc_url'], headers=headers, timeout=30)
    except requests.RequestException as exc:
        # Report the failure and carry on, so one unreachable URL does not
        # abort the scan of the remaining documents.
        print(d['title'], d['doc_url'], repr(exc))
        continue
    if r.status_code != 200:
        print(d['title'], d['doc_url'], r.status_code)

In [None]:
# Load the processed documents (this rebinds `docs`). The encoding is passed
# explicitly because open() otherwise falls back to the platform's locale
# encoding, which is not guaranteed to be UTF-8.
with open('../preprocessed_data/documents.json', encoding='utf-8') as f:
    docs = json.load(f)

# Histogram of the 'section_token_length' metadata value across documents,
# labelled so the figure can be read on its own.
ax = sns.histplot([d['metadata']['section_token_length'] for d in docs])
ax.set(xlabel='section_token_length', title='Distribution of section_token_length');

In [None]:
# Number of entries in `docs`, which the previous cell reloaded from
# documents.json.
len(docs)