In [None]:
import random
from pathlib import Path

import datasets
import pandas as pd
from bs4 import BeautifulSoup
from nltk import sent_tokenize
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


# PG-19

In [2]:
# Load the "test" split of the "mrsndmn/pg19" dataset; later cells read each row's "text" field.
# NOTE(review): the CSV written further down is named "pg19_valid_1k_chunks.csv" although the
# split loaded here is "test" - confirm which split is intended.
original_ds = datasets.load_dataset("mrsndmn/pg19", split="test")

In [3]:
def get_random_chunk(
    text: str,
    min_chunk_size: int,
    min_words: int = 12000,
    max_words: int = 25000,
    max_attempts: int = 100_000,
) -> str:
    """Return a random run of consecutive sentences taken from ``text``.

    All whitespace in ``text`` is collapsed to single spaces and the result is
    split into sentences with ``sent_tokenize``. If there are fewer than
    ``min_chunk_size`` sentences, every sentence is returned (joined by single
    spaces) and no word-count check is applied. Otherwise random windows of at
    least ``min_chunk_size`` sentences are drawn until one contains strictly
    more than ``min_words`` and strictly fewer than ``max_words`` words.

    Args:
        text: Source text to sample from.
        min_chunk_size: Minimum number of sentences in a sampled window.
        min_words: Exclusive lower bound on the chunk's word count.
        max_words: Exclusive upper bound on the chunk's word count.
        max_attempts: Maximum number of random windows to try.

    Returns:
        The selected sentences joined by single spaces.

    Raises:
        ValueError: If the whole text does not contain more than ``min_words``
            words, so that no window could ever qualify.
        RuntimeError: If no qualifying window was drawn within
            ``max_attempts`` tries.
    """
    # Collapse every run of whitespace (including newlines) into one space
    text = " ".join(text.split())
    # Split into sentences
    sentences = sent_tokenize(text)
    if len(sentences) < min_chunk_size:
        return " ".join(sentences)

    # word_offsets[i] is the number of words in sentences[:i], so the word
    # count of sentences[start:end] is word_offsets[end] - word_offsets[start].
    # This equals len(" ".join(sentences[start:end]).split()) without having
    # to build and re-split the chunk on every attempt.
    word_offsets = [0]
    for sentence in sentences:
        word_offsets.append(word_offsets[-1] + len(sentence.split()))

    if word_offsets[-1] <= min_words:
        # Previously this case looped forever: no window can exceed min_words.
        raise ValueError(
            f"text has {word_offsets[-1]} words; more than {min_words} are required"
        )

    max_start = len(sentences) - min_chunk_size
    # Iterate until an appropriate chunk is found (bounded, so it cannot hang)
    for _ in range(max_attempts):
        start = random.randint(0, max_start)
        end = random.randint(start + min_chunk_size, len(sentences))
        if min_words < word_offsets[end] - word_offsets[start] < max_words:
            return " ".join(sentences[start:end])

    raise RuntimeError(
        f"no chunk with between {min_words} and {max_words} words found "
        f"in {max_attempts} attempts"
    )

In [4]:
# Seed the RNG so that re-running the notebook on the same dataset draws the same chunks
random.seed(144)

texts = []
min_chunk_size = 10  # Set the minimum chunk size (in sentences)
min_book_words = 12000  # get_random_chunk only accepts chunks with MORE words than this
for _ in tqdm(range(1000)):
    n_words = 0
    # Draw books until one is long enough. The comparison is `<=` because a book
    # with exactly `min_book_words` words can never yield a chunk that is strictly
    # longer than that, which would leave get_random_chunk without a valid result.
    while n_words <= min_book_words:
        text = random.choice(original_ds)["text"]
        n_words = len(text.split())
    chunk = get_random_chunk(text, min_chunk_size)
    texts.append(chunk)

# Preview the first few samples
for text in texts[:10]:
    print(text[:100])
    print(f"Number of words in the sample: {len(text.split())}")
    print('----------')

100%|██████████| 1000/1000 [01:01<00:00, 16.32it/s]

Page SOME ANIMAL PROPENSITIES. 81 THE PETRIFIED FERN. 83 WATER AND ANIMALS. 84 THE HERRING GULL. 87 
Number of words in the sample: 12043
----------
"N-no." "Then I shan't!" She looked up quickly, her blue eyes very persuasive. "I don't very often h
Number of words in the sample: 12585
----------
I do not preach next Sunday! SIR TRISTRAM. You'd better not! No, I'm here for the races. THE DEAN. T
Number of words in the sample: 15013
----------
It may have been the instinct of despair that led that Prince to appeal again to Gordon, but the Dar
Number of words in the sample: 20289
----------
The piece on Gouverneur Morris's Oration on Hamilton and that on the Louisiana Memorial are the last
Number of words in the sample: 18681
----------
[12] Chapter XVII. A much more impressive disaster, both in its dramatic features and as illustratin
Number of words in the sample: 24342
----------
She did not seek to combat her love; to what purpose should she do so? No one would ever know it. He
Numbe




In [8]:
# Save locally
dst_path = Path("../datasets/pg19_valid_1k_chunks.csv")
# to_csv does not create missing directories, so make sure the target folder exists
dst_path.parent.mkdir(parents=True, exist_ok=True)

df = pd.DataFrame({"text": texts})
df.to_csv(dst_path)

# Read the file back (first CSV column is the index written by to_csv) and preview it
df = pd.read_csv(dst_path, index_col=0)
df.head()

Unnamed: 0,text
0,Page SOME ANIMAL PROPENSITIES. 81 THE PETRIFIE...
1,"""N-no."" ""Then I shan't!"" She looked up quickly..."
2,I do not preach next Sunday! SIR TRISTRAM. You...
3,It may have been the instinct of despair that ...
4,The piece on Gouverneur Morris's Oration on Ha...


In [9]:
# Save to hugging face hub
# Build a `datasets.Dataset` from the DataFrame that was re-read from the CSV above;
# preserve_index=False is passed so the pandas index is not carried over as a column.
dataset = datasets.Dataset.from_pandas(df, preserve_index=False)
# Upload to the "LarryLovestein/pg19_1k" dataset repository.
# NOTE(review): presumably requires being logged in to the Hub with write access - confirm.
dataset.push_to_hub("LarryLovestein/pg19_1k")

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  2.02ba/s]
Processing Files (1 / 1): 100%|██████████| 57.7MB / 57.7MB, 12.0MB/s  
New Data Upload: 100%|██████████| 57.7MB / 57.7MB, 12.0MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:06<00:00,  6.96s/ shards]


CommitInfo(commit_url='https://huggingface.co/datasets/LarryLovestein/pg19_1k/commit/aaeb326aa0c5b3983c42fc0663f84dc66c3ab3d5', commit_message='Upload dataset', commit_description='', oid='aaeb326aa0c5b3983c42fc0663f84dc66c3ab3d5', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/LarryLovestein/pg19_1k', endpoint='https://huggingface.co', repo_type='dataset', repo_id='LarryLovestein/pg19_1k'), pr_revision=None, pr_num=None)

# Fanfics

In [2]:
# Read the URL list (presumably one URL per line), stripping surrounding whitespace
with Path("./fanfics_urls.txt").open() as file:
    fanfics_urls = [line.strip() for line in file]

In [14]:
def extract_preface_info(html_file):
    """
    Read one work's HTML file and pull its preface metadata.

    The lookup happens inside <div id="workskin"> / <div class="preface group">:
      - title:   text of <h2 class="title heading">
      - author:  text of the <a rel="author"> link
      - summary: text of the "userstuff" blockquote in <div class="summary module">
      - notes:   text of the "userstuff" blockquote in <div class="notes module">

    Returns a dict with the keys "title", "author", "summary" and "notes".
    A value is None when its element is missing; all four are None when the
    workskin or preface container cannot be found.
    """
    with open(html_file, "r", encoding="utf-8") as handle:
        document = BeautifulSoup(handle, "html.parser")

    info = {"title": None, "author": None, "summary": None, "notes": None}

    work = document.find("div", id="workskin")
    preface = work.find("div", class_="preface group") if work else None
    if not preface:
        return info

    def module_text(module_class):
        # Text of the "userstuff" blockquote inside the given preface module, if any
        module = preface.find("div", class_=module_class)
        if not module:
            return None
        quote = module.find("blockquote", class_="userstuff")
        if not quote:
            return None
        return quote.get_text("\n", strip=True)

    heading = preface.find("h2", class_="title heading")
    if heading:
        info["title"] = heading.get_text(strip=True)

    byline = preface.find("a", rel="author")
    if byline:
        info["author"] = byline.get_text(strip=True)

    info["summary"] = module_text("summary module")
    info["notes"] = module_text("notes module")
    return info


def extract_book_text(html_file):
    """
    Extract chapter headings and story paragraphs from a work's HTML file.

    Inside every <div> carrying the class "meta", the first <p> whose string
    contains "Chapter Notes" and the first <blockquote class="userstuff"> are
    removed. The remaining document is then walked in document order and two
    kinds of text are kept:
      - <h2> tags with the class "heading": stripped text, wrapped in newlines
      - <p> tags whose direct parent is a <div> with the class "userstuff":
        text as-is (not stripped)

    Returns the kept pieces joined with newlines.
    """
    with open(html_file, 'r', encoding='utf-8') as source:
        soup = BeautifulSoup(source.read(), 'html.parser')

    # Drop the chapter-notes label and its blockquote from each "meta" div.
    # The label is removed before the blockquote is looked up, as in the
    # original statement order.
    for meta in soup.find_all("div", class_="meta"):
        label = meta.find("p", string=lambda text: text and "Chapter Notes" in text)
        if label:
            label.decompose()

        quote = meta.find("blockquote", class_="userstuff")
        if quote:
            quote.decompose()

    def wanted_text(tag):
        # Text to keep for this tag, or None when the tag is not of interest
        if tag.name == "h2" and "heading" in tag.get("class", []):
            title = tag.get_text(strip=True)
            return f'\n{title}\n' if title else None
        if tag.name != "p":
            return None
        parent = tag.parent
        if not parent or parent.name != "div":
            return None
        if "userstuff" not in parent.get("class", []):
            return None
        return tag.get_text(strip=False)

    pieces = [piece for piece in map(wanted_text, soup.find_all()) if piece]
    return "\n".join(pieces)

In [16]:
# Convert HTML to formatted text files
fanfics_path = Path("../datasets/fanfics/")
# parents=True also creates "../datasets" when it is missing; without it mkdir
# raises FileNotFoundError on a fresh checkout.
fanfics_path.mkdir(parents=True, exist_ok=True)
fanfics_clean_path = Path("../datasets/fanfics_clean/")
fanfics_clean_path.mkdir(parents=True, exist_ok=True)
for fanfic_path in fanfics_path.glob('*.html'):
    print("Path:", fanfic_path)
    book_text = extract_book_text(fanfic_path)
    preface = extract_preface_info(fanfic_path)
    # Output goes to the clean directory under the same stem with a ".txt" suffix.
    # NOTE(review): the file is written with the platform default encoding, and the
    # cell that later reads these files back also uses the default; if one side is
    # switched to an explicit encoding, the other must be changed with it.
    with open(f"{fanfics_clean_path / fanfic_path.stem}.txt", "w") as file:
        # Preface fields that were found come first, one "Header: value" block each
        for header in ["Title", "Author", "Summary", "Notes"]:
            value = preface[header.lower()]
            if value:
                file.write(f"{header}: {value}\n\n")
        file.write(book_text)

Path: ../datasets/fanfics/A_Great_Eye_lidless.html
Path: ../datasets/fanfics/Adagio.html
Path: ../datasets/fanfics/Children_of_the_Desert.html
Path: ../datasets/fanfics/Christmas_and.html
Path: ../datasets/fanfics/Creatures_of_Truth.html
Path: ../datasets/fanfics/Grogu_Tells_Stories.html
Path: ../datasets/fanfics/Harry_Potter_and_the.html
Path: ../datasets/fanfics/I_Know_Where_the_Stars.html
Path: ../datasets/fanfics/In_Which_Harry_and_Ladon.html
Path: ../datasets/fanfics/Jay_Baby.html
Path: ../datasets/fanfics/Laws_of_the_Sea.html
Path: ../datasets/fanfics/Mirror_Prism.html
Path: ../datasets/fanfics/People_Stained_With.html
Path: ../datasets/fanfics/Shattered_Pieces_of_the.html
Path: ../datasets/fanfics/Sweet_Creatures.html
Path: ../datasets/fanfics/The_Bot_the_World_Forgot.html
Path: ../datasets/fanfics/The_Last_of_the_Jedi.html
Path: ../datasets/fanfics/The_Resurrection_of.html
Path: ../datasets/fanfics/The_Silmarillion_Simplified.html
Path: ../datasets/fanfics/The_Sith_Strikes_Back

In [17]:
# NOTE(review): a function with this name is also defined earlier in the notebook;
# consider keeping a single definition so the two cannot drift apart.
def get_random_chunk(
    text: str,
    min_chunk_size: int,
    min_words: int = 12000,
    max_words: int = 25000,
    max_attempts: int = 100_000,
) -> str:
    """Return a random run of consecutive sentences taken from ``text``.

    All whitespace in ``text`` is collapsed to single spaces and the result is
    split into sentences with ``sent_tokenize``. If there are fewer than
    ``min_chunk_size`` sentences, every sentence is returned (joined by single
    spaces) and no word-count check is applied. Otherwise random windows of at
    least ``min_chunk_size`` sentences are drawn until one contains strictly
    more than ``min_words`` and strictly fewer than ``max_words`` words.

    Args:
        text: Source text to sample from.
        min_chunk_size: Minimum number of sentences in a sampled window.
        min_words: Exclusive lower bound on the chunk's word count.
        max_words: Exclusive upper bound on the chunk's word count.
        max_attempts: Maximum number of random windows to try.

    Returns:
        The selected sentences joined by single spaces.

    Raises:
        ValueError: If the whole text does not contain more than ``min_words``
            words, so that no window could ever qualify.
        RuntimeError: If no qualifying window was drawn within
            ``max_attempts`` tries.
    """
    # Collapse every run of whitespace (including newlines) into one space
    text = " ".join(text.split())
    # Split into sentences
    sentences = sent_tokenize(text)
    if len(sentences) < min_chunk_size:
        return " ".join(sentences)

    # word_offsets[i] is the number of words in sentences[:i], so the word
    # count of sentences[start:end] is word_offsets[end] - word_offsets[start].
    # This equals len(" ".join(sentences[start:end]).split()) without having
    # to build and re-split the chunk on every attempt.
    word_offsets = [0]
    for sentence in sentences:
        word_offsets.append(word_offsets[-1] + len(sentence.split()))

    if word_offsets[-1] <= min_words:
        # Previously this case looped forever: no window can exceed min_words.
        raise ValueError(
            f"text has {word_offsets[-1]} words; more than {min_words} are required"
        )

    max_start = len(sentences) - min_chunk_size
    # Iterate until an appropriate chunk is found (bounded, so it cannot hang)
    for _ in range(max_attempts):
        start = random.randint(0, max_start)
        end = random.randint(start + min_chunk_size, len(sentences))
        if min_words < word_offsets[end] - word_offsets[start] < max_words:
            return " ".join(sentences[start:end])

    raise RuntimeError(
        f"no chunk with between {min_words} and {max_words} words found "
        f"in {max_attempts} attempts"
    )

In [18]:
# Load every cleaned fanfic. The paths are sorted so that the order of `data`
# (and therefore the seeded sampling below) does not depend on filesystem order.
data = []
for fanfic_path in sorted(fanfics_clean_path.glob("*.txt")):
    # NOTE(review): read with the platform default encoding, matching how the
    # conversion cell writes these files.
    with open(fanfic_path) as file:
        data.append(file.read())

# Seed the RNG so that re-running the notebook on the same files draws the same chunks
random.seed(144)

texts = []
min_chunk_size = 10  # Set the minimum chunk size (in sentences)
min_book_words = 12000  # get_random_chunk only accepts chunks with MORE words than this
for _ in tqdm(range(1000)):
    n_words = 0
    # Draw fanfics until one is long enough. The comparison is `<=` because a text
    # with exactly `min_book_words` words can never yield a chunk that is strictly
    # longer than that, which would leave get_random_chunk without a valid result.
    while n_words <= min_book_words:
        text = random.choice(data)
        n_words = len(text.split())
    chunk = get_random_chunk(text, min_chunk_size)
    texts.append(chunk)

# Preview the first few samples
for text in texts[:5]:
    print(text[:100])
    print(f"Number of words in the sample: {len(text.split())}")
    print('----------')

  0%|          | 0/1000 [00:00<?, ?it/s]

100%|██████████| 1000/1000 [01:00<00:00, 16.43it/s]

Do you really wish for such control over living beings?" "I don't," replied Aule. "I had just wanted
Number of words in the sample: 12313
----------
“We know.” Cesta turned to Maeglin and held out her hand. He quickly crossed to her side, not notici
Number of words in the sample: 13245
----------
However hard it was, however long it took, Harry would master it. * “BOLLOCKS!” Harry blinked and lo
Number of words in the sample: 23271
----------
In Sauron's dungeon, Beren and Finrod were the only ones still alive out of their companions. Sauron
Number of words in the sample: 15344
----------
Harry smiled and turned back to the book in front of him, pulling out a clean set of parchment and a
Number of words in the sample: 22452
----------





In [19]:
# Persist the sampled fanfic chunks as a one-column CSV, then reload them from disk
dst_path = Path("../datasets/fanfics_1k_chunks.csv")
pd.DataFrame({"text": texts}).to_csv(dst_path)

# index_col=0: the first CSV column holds the row index written by to_csv
df = pd.read_csv(dst_path, index_col=0)
df.head()

Unnamed: 0,text
0,Do you really wish for such control over livin...
1,“We know.” Cesta turned to Maeglin and held ou...
2,"However hard it was, however long it took, Har..."
3,"In Sauron's dungeon, Beren and Finrod were the..."
4,Harry smiled and turned back to the book in fr...


In [20]:
# Save to hugging face hub
# Build a `datasets.Dataset` from the DataFrame that was re-read from the CSV above;
# preserve_index=False is passed so the pandas index is not carried over as a column.
dataset = datasets.Dataset.from_pandas(df, preserve_index=False)
# Upload to the "LarryLovestein/fanfics_1k" dataset repository.
# NOTE(review): presumably requires being logged in to the Hub with write access - confirm.
dataset.push_to_hub("LarryLovestein/fanfics_1k")

Creating parquet from Arrow format: 100%|██████████| 2/2 [00:00<00:00,  3.92ba/s]
Processing Files (1 / 1): 100%|██████████| 57.8MB / 57.8MB, 12.1MB/s  
New Data Upload: 100%|██████████| 57.8MB / 57.8MB, 12.1MB/s  
Uploading the dataset shards: 100%|██████████| 1/1 [00:07<00:00,  7.04s/ shards]


CommitInfo(commit_url='https://huggingface.co/datasets/LarryLovestein/fanfics_1k/commit/bc46a1efd6a8c5e02e8e5e20fcff0dac976552b3', commit_message='Upload dataset', commit_description='', oid='bc46a1efd6a8c5e02e8e5e20fcff0dac976552b3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/LarryLovestein/fanfics_1k', endpoint='https://huggingface.co', repo_type='dataset', repo_id='LarryLovestein/fanfics_1k'), pr_revision=None, pr_num=None)