In [1]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Read the Hugging Face token from Kaggle's secret store so it never appears
# in the notebook source.
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("HF_TOKEN")

# Non-interactive login: the token is passed straight through (nothing is
# prompted for). It is handed over as-is rather than via an f-string, which
# would silently turn a non-string value such as None into the text "None".
# write_permission=True is kept from the original call; this notebook later
# pushes a dataset to the Hub, which needs a write-capable token.
login(token=secret_value_0, write_permission=True)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
import time
import numpy as np
from datasets import load_dataset, Dataset
from transformers import MarianMTModel, MarianTokenizer
from tqdm.auto import tqdm
import torch

def process_dataset_in_batches(ds, model, tokenizer, batch_size=32,
                               checkpoint_every=1000,
                               repo_id="tinycrops/mmlu-lojban"):
    """Translate every question of ``ds['test']`` and publish the result.

    The test split is walked in batches of ``batch_size`` rows. Each batch of
    questions is tokenized, translated with ``model.generate`` (beam search,
    5 beams) and decoded. The original question, its translation, the subject,
    the answer and the four answer choices are collected, turned into a
    ``Dataset`` and pushed to the Hugging Face Hub.

    Args:
        ds: Mapping with a ``'test'`` entry. Each row must provide
            ``'question'``, ``'subject'``, ``'answer'`` and ``'choices'``
            (indexable, at least four entries).
        model: Translation model; inputs are moved to the device of its first
            parameter before generation.
        tokenizer: Tokenizer matching ``model``.
        batch_size: Number of questions translated per ``generate`` call.
        checkpoint_every: Write a checkpoint via ``save_checkpoint`` each time
            at least this many additional examples have been processed.
            ``None`` or a non-positive value disables checkpointing.
        repo_id: Hub repository the finished dataset is pushed to.

    Returns:
        The ``Dataset`` with columns ``question``, ``translation``,
        ``subject``, ``answer`` and ``choice_A`` .. ``choice_D``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    device = next(model.parameters()).device
    test_data = ds['test']
    total = len(test_data)

    # Accumulators for the columns of the final dataset.
    all_questions = []
    all_translations = []
    all_subject = []
    all_answers = []
    all_choices = []

    # Number of processed examples at which the next checkpoint is due.
    # Tracking a threshold (instead of testing ``start % 1000 == 0``) matters
    # because batch start offsets advance in steps of ``batch_size`` and
    # almost never land on an exact multiple of the checkpoint interval.
    checkpointing = bool(checkpoint_every) and checkpoint_every > 0
    next_checkpoint = checkpoint_every if checkpointing else None

    for start in tqdm(range(0, total, batch_size), desc="Processing batches"):
        # Fetch every row of the batch exactly once and reuse it for all
        # columns, rather than re-reading the same row for each field.
        rows = [test_data[j] for j in range(start, min(start + batch_size, total))]
        batch_questions = [row['question'] for row in rows]

        # Translate batch
        encoded = tokenizer(batch_questions, return_tensors="pt", padding=True, truncation=True)
        encoded = {k: v.to(device) for k, v in encoded.items()}

        with torch.no_grad():
            outputs = model.generate(
                **encoded,
                max_length=200,  # Increased for longer questions
                num_beams=5,
                length_penalty=1.0,
                early_stopping=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
            )

        batch_translations = tokenizer.batch_decode(outputs.cpu(), skip_special_tokens=True)

        # Store all data
        all_questions.extend(batch_questions)
        all_translations.extend(batch_translations)
        all_subject.extend(row['subject'] for row in rows)
        all_answers.extend(row['answer'] for row in rows)
        all_choices.extend({
            'choice_A': row['choices'][0],
            'choice_B': row['choices'][1],
            'choice_C': row['choices'][2],
            'choice_D': row['choices'][3]
        } for row in rows)

        # Periodic checkpoint. The saved index is the number of examples
        # contained in the checkpoint, so it always matches the list lengths.
        processed = len(all_questions)
        if checkpointing and processed >= next_checkpoint:
            save_checkpoint(processed, all_questions, all_translations, all_subject,
                            all_answers, all_choices)
            # Jump to the first interval boundary beyond the current position
            # (also correct when batch_size exceeds checkpoint_every).
            next_checkpoint = (processed // checkpoint_every + 1) * checkpoint_every

    # Create dictionary for dataset
    dataset_dict = {
        'question': all_questions,
        'translation': all_translations,
        'subject': all_subject,
        'answer': all_answers,
        'choice_A': [choices['choice_A'] for choices in all_choices],
        'choice_B': [choices['choice_B'] for choices in all_choices],
        'choice_C': [choices['choice_C'] for choices in all_choices],
        'choice_D': [choices['choice_D'] for choices in all_choices]
    }

    # Create and save dataset
    translated_dataset = Dataset.from_dict(dataset_dict)
    translated_dataset.push_to_hub(
        repo_id,
        private=False  # Public dataset; set to True to keep the repo private
    )

    return translated_dataset

def save_checkpoint(index, questions, translations, subjects, answers, choices):
    """Persist the translation progress gathered so far.

    All arguments are bundled into one dictionary and serialized with
    ``torch.save`` to ``translation_checkpoint_<index>.pt`` in the current
    working directory.
    """
    target_path = f'translation_checkpoint_{index}.pt'
    payload = dict(
        index=index,
        questions=questions,
        translations=translations,
        subjects=subjects,
        answers=answers,
        choices=choices,
    )
    torch.save(payload, target_path)

# Main execution: fetch the data and the model, translate, then report.
print("Loading dataset and model...")
ds = load_dataset("cais/mmlu", "all")

# Prefer the GPU when one is present; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model_name = "woctordho/lojban-translation"
tokenizer = MarianTokenizer.from_pretrained(model_name)
# Load the weights and place them on the chosen device in one step.
model = MarianMTModel.from_pretrained(model_name).to(device)
print(f"Using device: {device}")

print("\nStarting translation process...")
translated_dataset = process_dataset_in_batches(
    ds,
    model,
    tokenizer,
)

# process_dataset_in_batches pushes the dataset before it returns.
print("\nTranslation complete! Dataset uploaded to Hugging Face Hub.")
print("Dataset stats:", translated_dataset)

Loading dataset and model...


README.md:   0%|          | 0.00/53.2k [00:00<?, ?B/s]

dataset_infos.json:   0%|          | 0.00/138k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/3.50M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/408k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/76.5k [00:00<?, ?B/s]

auxiliary_train-00000-of-00001.parquet:   0%|          | 0.00/47.5M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/14042 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1531 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/285 [00:00<?, ? examples/s]

Generating auxiliary_train split:   0%|          | 0/99842 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/308 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/806k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/805k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.75M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.54k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/310M [00:00<?, ?B/s]

Using device: cuda

Starting translation process...


Processing batches:   0%|          | 0/439 [00:00<?, ?it/s]