Load all texts from Spanish Wikipedia

In [1]:
# Unpickle the list of Spanish Wikipedia article texts.
# NOTE: pickle can run arbitrary code while loading, so only open trusted files.
import pickle

with open("wiki_texts_list.pkl", "rb") as texts_file:
    texts = pickle.load(texts_file)

Cut texts to the first 2000 articles, so that the training does not take prohibitively long

In [2]:
# Keep only the leading articles so the later chunking/training stays tractable.
MAX_ARTICLES = 2000
texts = texts[:MAX_ARTICLES]

Chunk the Wikipedia texts into chunks that fit a training sequence length of 1024 tokens (using the tokenizer specified in `model_name`) and dump them to a pickle file for later use

In [3]:
from tqdm import tqdm


def create_ebae_chunks(
    texts,
    tokenizer,
    train_seq_len=1024,
    *,
    reserve_tokens=100,
    validation_slack=20,
    min_chunk_tokens=500,
    sentence_joiner=" ",
):
    """Pack the sentences of each text into chunks that fit a training sequence.

    Every text is split on "." and the resulting pieces are accumulated in
    order until adding the next piece would push the running token count above
    ``train_seq_len - reserve_tokens``; the buffered pieces are then joined into
    one chunk and a new buffer is started. Chunks never span two texts.
    Afterwards each chunk is tokenized as a whole and discarded when it has more
    than ``train_seq_len - validation_slack`` tokens or fewer than
    ``min_chunk_tokens`` tokens.

    Args:
        texts: Iterable of strings to chunk.
        tokenizer: Tokenizer that can be called with a string or a list of
            strings and returns a mapping containing ``"input_ids"`` (the
            notebook passes a ``transformers`` ``AutoTokenizer``).
        train_seq_len: Sequence length the chunks are prepared for.
        reserve_tokens: Tokens subtracted from ``train_seq_len`` to leave room
            for text generated by the model.
        validation_slack: Tokens subtracted from ``train_seq_len`` to obtain
            the hard upper limit used by the final validation pass.
        min_chunk_tokens: Chunks with fewer tokens than this are discarded.
        sentence_joiner: String placed between the pieces of a chunk.
            NOTE(review): ``str.split(".")`` removes the periods, so with the
            default ``" "`` the chunks contain no "." at all; pass ``"."`` to
            restore the periods between the sentences of a chunk.

    Returns:
        list of str: The chunks that passed the length validation.

    Side effects:
        Prints one line per skipped sentence and per validated chunk. Adds
        ``"[PAD]"`` as padding token only when the tokenizer has none, so an
        existing pad token is no longer overwritten.
    """
    pre_seq_length = train_seq_len - reserve_tokens
    max_chunk_tokens = train_seq_len - validation_slack

    # Token counting below does not need padding; a pad token is only added
    # for tokenizers that lack one so later padded calls keep working.
    if getattr(tokenizer, "pad_token", None) is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    chunks = []

    for text in tqdm(texts):
        sentences = text.split(".")  # NOTE: the periods themselves are dropped here

        # Tokenize the whole article in one call, unpadded: each entry is just
        # that sentence's ids, so its length is the token count. This avoids
        # building a dense tensor padded to the longest sentence.
        sentence_ids = tokenizer(sentences, truncation=True)["input_ids"]

        input_buffer = []
        token_count = 0

        for sentence, ids in zip(sentences, sentence_ids):
            current_token_length = len(ids)

            # A sentence that alone exceeds the budget can never be placed.
            if current_token_length > pre_seq_length:
                print("Skipping sentence as it exceeds seq_length: ...")
                continue

            if token_count + current_token_length > pre_seq_length:
                # Budget exhausted: emit the buffered chunk and start over.
                if input_buffer:
                    chunks.append(sentence_joiner.join(input_buffer))
                input_buffer = []
                token_count = 0

            input_buffer.append(sentence)
            token_count += current_token_length

        # Emit whatever is left of this text.
        if input_buffer:
            chunks.append(sentence_joiner.join(input_buffer))

    print(f"Info: {len(chunks)} chunks created.")

    # The budget above sums per-sentence counts and ignores the joiner, so the
    # joined chunk is re-tokenized to enforce the real limits. Building a new
    # list replaces repeated list.pop(idx), which was quadratic.
    valid_chunks = []
    for chunk in chunks:
        chunk_tokens = len(tokenizer(chunk)["input_ids"])
        idx = len(valid_chunks)  # position the chunk gets if it is kept
        if chunk_tokens > max_chunk_tokens:
            print(f"Chunk {idx} is too long after processing ({chunk_tokens} tokens).")
        elif chunk_tokens < min_chunk_tokens:
            print(f"Chunk {idx} is too short after processing ({chunk_tokens} tokens).")
        else:
            print(f"Info: Chunk {idx} ({chunk_tokens} tokens).")
            valid_chunks.append(chunk)

    return valid_chunks

In [4]:
from transformers import AutoTokenizer

# Tokenizer whose vocabulary is used to measure chunk lengths.
model_name = 'Qwen/Qwen2.5-0.5B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Pack the articles into chunks prepared for a training sequence length of 1024.
chunks = create_ebae_chunks(texts, tokenizer, train_seq_len=1024)
print(f"Number of chunks: {len(chunks)}")

# Store the chunks in a pickle file for later use.
import pickle

with open("wiki_chunks_list_ebae.pkl", "wb") as chunk_file:
    pickle.dump(chunks, chunk_file)

  from .autonotebook import tqdm as notebook_tqdm
  2%|▏         | 44/2000 [00:01<01:08, 28.65it/s]

Skipping sentence as it exceeds seq_length: ...


  8%|▊         | 152/2000 [00:03<01:12, 25.41it/s]

Skipping sentence as it exceeds seq_length: ...


  8%|▊         | 157/2000 [00:03<01:15, 24.37it/s]

Skipping sentence as it exceeds seq_length: ...


  8%|▊         | 165/2000 [00:04<01:39, 18.35it/s]

Skipping sentence as it exceeds seq_length: ...


 33%|███▎      | 667/2000 [00:17<00:32, 40.58it/s]

Skipping sentence as it exceeds seq_length: ...


 35%|███▌      | 702/2000 [00:18<00:34, 37.59it/s]

Skipping sentence as it exceeds seq_length: ...


 38%|███▊      | 768/2000 [00:19<00:37, 33.24it/s]

Skipping sentence as it exceeds seq_length: ...


 42%|████▏     | 833/2000 [00:21<00:39, 29.62it/s]

Skipping sentence as it exceeds seq_length: ...


 46%|████▌     | 910/2000 [00:23<00:13, 78.82it/s]

Skipping sentence as it exceeds seq_length: ...


 47%|████▋     | 942/2000 [00:24<00:27, 39.08it/s]

Skipping sentence as it exceeds seq_length: ...


 50%|█████     | 1000/2000 [00:25<00:22, 44.66it/s]

Skipping sentence as it exceeds seq_length: ...


 55%|█████▌    | 1102/2000 [00:28<00:26, 34.43it/s]

Skipping sentence as it exceeds seq_length: ...


 60%|█████▉    | 1199/2000 [00:31<00:12, 66.00it/s]

Skipping sentence as it exceeds seq_length: ...
Skipping sentence as it exceeds seq_length: ...


 76%|███████▌  | 1510/2000 [00:38<00:16, 29.40it/s]

Skipping sentence as it exceeds seq_length: ...


 89%|████████▉ | 1786/2000 [00:50<00:05, 37.42it/s]

Skipping sentence as it exceeds seq_length: ...


 91%|█████████ | 1811/2000 [00:52<00:09, 20.93it/s]

Skipping sentence as it exceeds seq_length: ...


 94%|█████████▍| 1889/2000 [00:54<00:04, 26.86it/s]

Skipping sentence as it exceeds seq_length: ...


100%|██████████| 2000/2000 [00:59<00:00, 33.72it/s]


Info: 12431 chunks created.
Info: Chunk 0 (889 tokens).
Info: Chunk 1 (927 tokens).
Info: Chunk 2 (877 tokens).
Info: Chunk 3 (789 tokens).
Info: Chunk 4 (905 tokens).
Info: Chunk 5 (877 tokens).
Info: Chunk 6 (914 tokens).
Info: Chunk 7 (922 tokens).
Info: Chunk 8 (899 tokens).
Info: Chunk 9 (929 tokens).
Info: Chunk 10 (916 tokens).
Info: Chunk 11 (876 tokens).
Info: Chunk 12 (906 tokens).
Info: Chunk 13 (886 tokens).
Info: Chunk 14 (911 tokens).
Info: Chunk 15 (899 tokens).
Info: Chunk 16 (923 tokens).
Info: Chunk 17 (899 tokens).
Chunk 18 is too short after processing (284 tokens).
Info: Chunk 18 (907 tokens).
Info: Chunk 19 (910 tokens).
Info: Chunk 20 (826 tokens).
Info: Chunk 21 (877 tokens).
Info: Chunk 22 (876 tokens).
Info: Chunk 23 (911 tokens).
Info: Chunk 24 (915 tokens).
Info: Chunk 25 (866 tokens).
Info: Chunk 26 (884 tokens).
Info: Chunk 27 (914 tokens).
Info: Chunk 28 (907 tokens).
Info: Chunk 29 (843 tokens).
Chunk 30 is too short after processing (117 tokens).
Info: 