Load all texts from Spanish Wikipedia

In [None]:
# Read the pickled list of Spanish Wikipedia article texts from disk.
# NOTE(review): unpickling can execute arbitrary code; only load files you created yourself.
import pickle

with open("wiki_texts_list.pkl", "rb") as wiki_file:
    texts = pickle.load(wiki_file)

Keep only the first 1000 texts so that training does not take prohibitively long

In [None]:
# Cap the number of articles that are processed in the following cells.
MAX_TEXTS = 1000
texts = texts[:MAX_TEXTS]

Split the Wikipedia texts into chunks that fit a training sequence length of 1024 tokens (using the tokenizer specified in `model_name`) and dump them to a pickle file for later use

In [None]:
from tqdm import tqdm


def create_ebae_chunks(texts, tokenizer, pre_seq_length=1000, train_seq_len=1024, reserved_tokens=10):
    """Greedily pack the sentences of each text into chunks bounded by a token budget.

    Each text is split after every "." (the period stays attached to its
    sentence), the sentences are tokenized in one unpadded batch, and
    consecutive sentences are concatenated until adding the next one would
    exceed ``pre_seq_length`` tokens. Chunks never span two texts. Afterwards
    every chunk is re-tokenized as a whole and dropped if it is longer than
    ``train_seq_len - reserved_tokens`` tokens.

    The tokenizer is only called, never modified: no padding token is added,
    because sentence lengths are taken from the unpadded ``input_ids``.

    Args:
        texts: Iterable of strings to chunk.
        tokenizer: Callable that accepts a string or a list of strings and
            returns a mapping whose ``"input_ids"`` entry holds the token ids
            (one sequence per input string for a list input).
        pre_seq_length: Token budget used while packing sentences. Sentences
            that exceed it on their own are skipped (a message is printed).
        train_seq_len: Sequence length the chunks must fit into.
        reserved_tokens: Headroom subtracted from ``train_seq_len`` in the
            final check (presumably room for tokens added at training time --
            TODO confirm). Defaults to 10.

    Returns:
        list[str]: The chunks that passed the final length check, in order.
    """
    max_chunk_tokens = train_seq_len - reserved_tokens
    chunks = []

    for text in tqdm(texts):
        # Keep each period with the sentence it terminates so that joining the
        # sentences with "" reproduces the original text instead of dropping
        # all periods.
        *terminated, tail = text.split(".")
        sentences = [part + "." for part in terminated]
        if tail.strip():
            sentences.append(tail)
        if not sentences:
            # Empty / whitespace-only text: nothing to chunk.
            continue

        # No padding and no truncation: the length of each id sequence is the
        # sentence's real token count.
        token_ids = tokenizer(sentences)["input_ids"]

        buffer = []
        buffer_tokens = 0
        for sentence, ids in zip(sentences, token_ids):
            n_tokens = len(ids)

            # Skip sentences that are too long individually
            if n_tokens > pre_seq_length:
                print(f"Skipping sentence as it exceeds seq_length: {sentence[:50]}...")
                continue

            # Close the current chunk when the next sentence would overflow it.
            if buffer and buffer_tokens + n_tokens > pre_seq_length:
                chunks.append("".join(buffer).strip())
                buffer, buffer_tokens = [], 0

            buffer.append(sentence)
            buffer_tokens += n_tokens

        # Handle leftover sentences in the buffer
        if buffer:
            chunks.append("".join(buffer).strip())

    # Re-tokenize every assembled chunk: the per-sentence counts are only an
    # estimate of the length of the concatenated text. Building a new list
    # avoids repeated list.pop() calls; the printed index refers to the
    # position in the unfiltered chunk list.
    valid_chunks = []
    for idx, chunk in enumerate(chunks):
        n_tokens = len(tokenizer(chunk)["input_ids"])
        if n_tokens > max_chunk_tokens:
            print(f"Error: Chunk {idx} is too long after processing ({n_tokens} tokens).")
        else:
            valid_chunks.append(chunk)

    return valid_chunks

In [None]:
import pickle

from transformers import AutoTokenizer

# Tokenizer whose token counts define the chunk boundaries.
model_name = 'Qwen/Qwen2.5-0.5B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Pack the loaded texts into chunks: 900-token budget while packing,
# validated against a training sequence length of 1024.
chunks = create_ebae_chunks(
    texts,
    tokenizer,
    pre_seq_length=900,
    train_seq_len=1024,
)
print(f"Number of chunks: {len(chunks)}")

# Persist the chunk list with pickle for later use.
with open("wiki_chunks_list_ebae.pkl", "wb") as chunk_file:
    pickle.dump(chunks, chunk_file)