In [1]:
import pandas as pd
import tiktoken

# Extracting statistics and comparing datasets

In [4]:
# Locations of the two CSV exports compared below (reduced text, then raw HTML)
path_text_success = '../../data/retrieved_text_success.csv'
path_retrieved_html_success = '../../data/retrieved_html_success.csv'

In [5]:
# Load each CSV export into its own DataFrame
df_retrieved_html_success = pd.read_csv(filepath_or_buffer=path_retrieved_html_success)
df_text_success = pd.read_csv(filepath_or_buffer=path_text_success)

In [6]:
# Row counts of both frames, printed one per line
print(df_retrieved_html_success.shape[0])
print(df_text_success.shape[0])

6590
6590


In [None]:
def get_token_average(df, column='html_content', encoding_name="cl100k_base"):
    """
    Returns the average number of tokens per row for a text column of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text to tokenize.
    column : str, default 'html_content'
        Name of the column whose values are tokenized.
    encoding_name : str, default "cl100k_base"
        Name of the tiktoken encoding used for tokenization.

    Returns
    -------
    float
        Total token count divided by ``len(df)``. Rows whose value is not a
        string contribute zero tokens but are still counted in the
        denominator. Returns 0.0 for an empty DataFrame.
    """
    # An empty frame has no meaningful average; avoid a ZeroDivisionError.
    row_count = len(df)
    if row_count == 0:
        return 0.0

    encoder = tiktoken.get_encoding(encoding_name)
    total_tokens = 0
    # Iterate the column directly: only one value per row is needed, so the
    # per-row Series that iterrows() builds is unnecessary overhead.
    for value in df[column]:
        if not isinstance(value, str):
            continue
        # disallowed_special=() makes special-token literals that happen to
        # appear in the text (e.g. "<|endoftext|>") count as ordinary text
        # instead of raising ValueError.
        total_tokens += len(encoder.encode(value, disallowed_special=()))

    return total_tokens / row_count

In [None]:
# Average token count per row of the original DataFrame ('html_content' column),
# tokenized by the tiktoken library with the cl100k_base encoding

get_token_average(df=df_retrieved_html_success)

76188.75629742033

In [None]:
# Average token count per row of the reduced DataFrame ('html_content' column),
# tokenized by the tiktoken library with the cl100k_base encoding

get_token_average(df=df_text_success)

2173.9285280728377

# Manual Extraction

In [18]:
def get_token_count(text, encoding_name="cl100k_base"):
    """
    Returns the number of tokens in a given text.

    Parameters
    ----------
    text : str
        Text to tokenize.
    encoding_name : str, default "cl100k_base"
        Name of the tiktoken encoding used for tokenization.

    Returns
    -------
    int
        Number of tokens the encoder produces for ``text``.
    """
    encoder = tiktoken.get_encoding(encoding_name)
    # disallowed_special=() makes special-token literals that happen to appear
    # in the text (e.g. "<|endoftext|>") count as ordinary text instead of
    # raising ValueError.
    return len(encoder.encode(text, disallowed_special=()))

In [22]:
# Sample files for the manual comparison: manual text, bs4 text, raw HTML
path_text_manual = '../data/text_manual.txt'
path_text_bs4 = '../data/text_bs4.txt'
path_html = '../data/html_dirty.txt'

In [23]:
# Raw HTML sample (text mode is open()'s default, so no explicit 'r')
with open(path_html, encoding='utf-8') as file:
    html_content = file.read()

# Text extracted with bs4
with open(path_text_bs4, encoding='utf-8') as file:
    text_bs4 = file.read()

# Text extracted manually
with open(path_text_manual, encoding='utf-8') as file:
    text_manual = file.read()

In [24]:
# Token counts for the raw HTML, the bs4 text and the manual text
num_tokens_html = get_token_count(html_content)
num_tokens_bs4 = get_token_count(text_bs4)
num_tokens_manual = get_token_count(text_manual)

# Report the three counts in the same order
print(f"Number of tokens in HTML content: {num_tokens_html}")
print(f"Number of tokens in bs4 text: {num_tokens_bs4}")
print(f"Number of tokens in manual text: {num_tokens_manual}")

Number of tokens in HTML content: 30897
Number of tokens in bs4 text: 1746
Number of tokens in manual text: 1987


In [25]:
# Percentage of tokens removed relative to the original HTML,
# for the bs4 text and for the manual text
reduction_bs4 = 100 * ((num_tokens_html - num_tokens_bs4) / num_tokens_html)
reduction_manual = 100 * ((num_tokens_html - num_tokens_manual) / num_tokens_html)

# Report both reductions with two decimal places
print(f"Reduction in tokens from HTML to bs4 text: {reduction_bs4:.2f}%")
print(f"Reduction in tokens from HTML to manual text: {reduction_manual:.2f}%")

Reduction in tokens from HTML to bs4 text: 94.35%
Reduction in tokens from HTML to manual text: 93.57%
