In [None]:
import random
import os

# Sampling configuration (all sizes are counted in whitespace-separated words).
# Reference size of a training sample; the two derived constants below depend on it.
TARGET_WORDS = 100_000
# Size of the held-out dev split: 20% of TARGET_WORDS, truncated to an int.
DEV_WORDS = int(0.2 * TARGET_WORDS)
# Length of each contiguous snippet cut from a corpus file.
SNIPPET_SIZE = 10_000
# Number of SNIPPET_SIZE-word snippets covering TARGET_WORDS + DEV_WORDS words
# (floor division, so any remainder smaller than one snippet is dropped).
NUM_SNIPPETS = (TARGET_WORDS + DEV_WORDS) // SNIPPET_SIZE

def sample_from_single_file(file_name, target_words):
    """Write a random train/dev sample drawn from a single corpus file.

    Reads ``train_100M/<file_name>``, splits it on whitespace and draws
    ``target_words + DEV_WORDS`` words as randomly placed, non-overlapping
    snippets of ``SNIPPET_SIZE`` words each (one snippet is shorter when the
    total is not a multiple of ``SNIPPET_SIZE``). The snippets are concatenated
    in random order; the first ``target_words`` words go to
    ``train_100k/<file_name>`` and the remaining ``DEV_WORDS`` words to
    ``train_100k/<file_name>_dev.train``.

    Args:
        file_name: Name of the corpus file inside the ``train_100M`` directory.
        target_words: Number of words to write to the train file.

    Returns:
        None. If the file does not contain more than
        ``target_words + DEV_WORDS`` words, a message is printed and nothing
        is written.
    """
    output_dir = "../datasets/BabyLM_dataset/train_100k"

    with open(f"../datasets/BabyLM_dataset/train_100M/{file_name}", "r", encoding="utf-8") as f:
        words = f.read().split()
    total_words = len(words)

    words_needed = target_words + DEV_WORDS
    if total_words <= words_needed:
        print(f"File {file_name} has only {total_words} words, not enough to sample {target_words} words.")
        return

    # Size the sample from the requested target_words (not the module-level
    # TARGET_WORDS), rounding up so a partial last snippet is still covered.
    num_snippets = (words_needed + SNIPPET_SIZE - 1) // SNIPPET_SIZE
    lengths = [SNIPPET_SIZE] * (num_snippets - 1)
    lengths.append(words_needed - SNIPPET_SIZE * (num_snippets - 1))

    # Place the snippets so they can never overlap; overlapping snippets would
    # duplicate text and leak train words into the dev split.
    # `slack` is the number of words left unsampled. Drawing distinct sorted
    # picks and subtracting the rank gives a non-decreasing gap in [0, slack];
    # adding i * SNIPPET_SIZE leaves room for the i full snippets before it.
    slack = total_words - words_needed
    picks = sorted(random.sample(range(slack + num_snippets), num_snippets))
    windows = [
        (pick - i + i * SNIPPET_SIZE, length)
        for i, (pick, length) in enumerate(zip(picks, lengths))
    ]
    # Shuffle so train and dev are not tied to a position in the file.
    random.shuffle(windows)

    sampled_words = [
        word
        for start, length in windows
        for word in words[start:start + length]
    ]

    train_words = sampled_words[:target_words]
    dev_words = sampled_words[target_words:]

    # Create the output directory on first use instead of failing on open().
    os.makedirs(output_dir, exist_ok=True)
    with open(f"{output_dir}/{file_name}", "w", encoding="utf-8") as f:
        f.write(" ".join(train_words))
    with open(f"{output_dir}/{file_name}_dev.train", "w", encoding="utf-8") as f:
        f.write(" ".join(dev_words))


In [None]:

def sample_proportions (output_name, no_words, bnc_spoken, childes, gutenberg, open_subtitles, simple_wiki, switchboard):
    """Build a mixed train/dev sample from several corpora in given proportions.

    For each corpus file, ``int((no_words + DEV_WORDS) * proportion)`` words
    are drawn as randomly placed, non-overlapping snippets of ``SNIPPET_SIZE``
    words (one snippet is shorter when the amount is not a multiple of
    ``SNIPPET_SIZE``). The first ``int(no_words * proportion)`` of those words
    are added to the train set and the rest to the dev set. The combined sets
    are written to ``books_context/<output_name>.train`` and
    ``books_context/<output_name>_dev.train``.

    Args:
        output_name: Base name (without extension) of the two output files.
        no_words: Total number of train words requested across all corpora.
        bnc_spoken, childes, gutenberg, open_subtitles, simple_wiki, switchboard:
            Share of the sample taken from the corresponding ``.train`` file.

    Raises:
        ValueError: If the proportions do not sum to 1 (within 1e-9).

    Returns:
        None. A corpus that has fewer words than its share requires is
        reported with a message and skipped.
    """
    files = ["bnc_spoken.train", "childes.train", "gutenberg.train", "open_subtitles.train", "simple_wiki.train", "switchboard.train"]
    proportions = [bnc_spoken, childes, gutenberg, open_subtitles, simple_wiki, switchboard]

    # Compare with a tolerance: sums of binary floats such as
    # 0.1 + 0.2 + 0.3 + 0.1 + 0.2 + 0.1 are not guaranteed to equal 1 exactly.
    if abs(sum(proportions) - 1) > 1e-9:
        raise ValueError("Proportions must sum to 1.")

    output_dir = "../datasets/BabyLM_dataset/books_context"
    train_words = []
    dev_words = []

    for file, proportion in zip(files, proportions):
        total_words_needed = int((no_words + DEV_WORDS) * proportion)
        train_words_needed = int(no_words * proportion)

        if total_words_needed <= 0:
            continue

        with open(f"../datasets/BabyLM_dataset/train_100M/{file}", "r", encoding="utf-8") as f:
            words = f.read().split()
        total_words = len(words)

        if total_words < total_words_needed:
            print(f"File {file} has only {total_words} words, not enough to sample {total_words_needed} words.")
            continue

        # Ceiling division: full snippets plus one shorter snippet for the
        # remainder, so exactly total_words_needed words are drawn.
        snippets_needed = (total_words_needed + SNIPPET_SIZE - 1) // SNIPPET_SIZE
        lengths = [SNIPPET_SIZE] * (snippets_needed - 1)
        lengths.append(total_words_needed - SNIPPET_SIZE * (snippets_needed - 1))

        # Non-overlapping placement that also works for files shorter than
        # SNIPPET_SIZE. `slack` is the number of words left unsampled; distinct
        # sorted picks minus their rank give a non-decreasing gap in
        # [0, slack], and i * SNIPPET_SIZE leaves room for the i full snippets
        # placed before snippet i.
        slack = total_words - total_words_needed
        picks = sorted(random.sample(range(slack + snippets_needed), snippets_needed))
        windows = [
            (pick - i + i * SNIPPET_SIZE, length)
            for i, (pick, length) in enumerate(zip(picks, lengths))
        ]
        # Shuffle so train and dev are not tied to a position in the file.
        random.shuffle(windows)

        file_words = [
            word
            for start, length in windows
            for word in words[start:start + length]
        ]

        # file_words holds exactly total_words_needed words, so everything
        # after the train share is this file's dev share.
        train_words.extend(file_words[:train_words_needed])
        dev_words.extend(file_words[train_words_needed:])

    print(f"Train words: {len(train_words)}")
    print(f"Dev words: {len(dev_words)}")

    # Create the output directory on first use instead of failing on open().
    os.makedirs(output_dir, exist_ok=True)
    with open(f"{output_dir}/{output_name}.train", "w", encoding="utf-8") as f:
        f.write(" ".join(train_words))
    with open(f"{output_dir}/{output_name}_dev.train", "w", encoding="utf-8") as f:
        f.write(" ".join(dev_words))


In [None]:


def sample_percentage_of_books (percentage, target_words, filename):
    """Sample train/dev text from the leading part of the first Gutenberg books.

    Books in ``train_100M/gutenberg.train`` are delimited by header lines that
    start with ``"= = = "``. The first ``int(num_books * percentage)`` books
    are visited in file order and the first ``percentage`` of each book's
    lines (header excluded) is collected, until ``int(1.2 * target_words)``
    words have been gathered. The first ``target_words`` words are written to
    ``books_context/<filename>.train`` and the following words, up to the
    1.2x limit, to ``books_context/<filename>_dev.train``.

    Args:
        percentage: Fraction in [0, 1] used both for the share of books
            visited and for the share of each book's lines that is read.
        target_words: Number of words to write to the train file.
        filename: Base name (without extension) of the two output files.

    Returns:
        None. If the selected books do not provide enough words, a message is
        printed and nothing is written.
    """
    with open("../datasets/BabyLM_dataset/train_100M/gutenberg.train", "r", encoding="utf-8") as f:
        lines = f.read().split("\n")

    # split up the books: index of every header line, plus an end-of-file
    # sentinel so the last book has an end boundary too.
    beginning_indices = [i for i, line in enumerate(lines) if line.startswith("= = = ")]
    book_bounds = beginning_indices + [len(lines)]

    target_with_dev = int(1.2 * target_words)
    books_to_visit = min(int(len(beginning_indices) * percentage), len(beginning_indices))

    # sample the first percentage of books; words are kept as a list so that
    # text from consecutive books is never glued together into one word.
    sampled_words = []
    books_used = 0
    for i in range(books_to_visit):
        first_line = book_bounds[i] + 1  # skip the header line itself
        number_of_lines = book_bounds[i + 1] - first_line
        # End-exclusive slice bound covering int(percentage * number_of_lines) lines.
        last_line = first_line + int(percentage * number_of_lines)
        for line in lines[first_line:last_line]:
            sampled_words.extend(line.split())
        books_used = i + 1
        if len(sampled_words) >= target_with_dev:
            break

    if len(sampled_words) < target_with_dev:
        print(f"Only {len(sampled_words)} words available from {books_used} books, not enough to sample {target_with_dev} words.")
        return

    print(f"Sampled {len(sampled_words)} words from {books_used} books.")
    train_split = sampled_words[:target_words]
    print(f"Train text length: {len(train_split)} words.")
    dev_split = sampled_words[target_words:target_with_dev]
    print(f"Dev text length: {len(dev_split)} words.")

    output_dir = "../datasets/BabyLM_dataset/books_context"
    # Create the output directory on first use instead of failing on open().
    os.makedirs(output_dir, exist_ok=True)
    with open(f"{output_dir}/{filename}.train", "w", encoding="utf-8") as f:
        f.write(" ".join(train_split))
    with open(f"{output_dir}/{filename}_dev.train", "w", encoding="utf-8") as f:
        f.write(" ".join(dev_split))

# Build the 1M-word Gutenberg samples, one per book/line fraction; the label
# in the output name is the same fraction written as a percentage.
for pct_label, book_fraction in (("25", 0.25), ("50", 0.5), ("75", 0.75), ("100", 1)):
    sample_percentage_of_books(book_fraction, 1_000_000, f"gutenberg_1M_{pct_label}pct_books")

In [10]:
# Draw a 100k-word train sample (plus dev split) from each single-source corpus.
for corpus_file in ("gutenberg.train", "simple_wiki.train", "open_subtitles.train"):
    sample_from_single_file(corpus_file, 100_000)

FileNotFoundError: [Errno 2] No such file or directory: 'gutenberg.train'