In [12]:
pip install transformers==4.47.1



In [13]:
from transformers import MT5ForConditionalGeneration, MT5Tokenizer, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import Dataset, load_dataset
import pandas as pd


In [14]:
from datasets import load_dataset, Dataset

# Load dataset directly from JSON file
# Make sure your file is uploaded to Colab (or in the same folder in Jupyter)
dataset = load_dataset("json", data_files="/dataset_pastho_all2.json")

# Preview dataset
dataset


Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'answer', 'difficulty'],
        num_rows: 2620
    })
})

In [15]:
# 80% train, 10% validation, 10% test
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
train_valid = dataset['train'].train_test_split(test_size=0.1, seed=42)

train_dataset = train_valid['train']
valid_dataset = train_valid['test']
test_dataset  = dataset['test']

print("Train size:", len(train_dataset))
print("Validation size:", len(valid_dataset))
print("Test size:", len(test_dataset))


Train size: 1886
Validation size: 210
Test size: 524


In [16]:
model_name = "google/mt5-base"
tokenizer = MT5Tokenizer.from_pretrained(model_name, legacy=False)
model = MT5ForConditionalGeneration.from_pretrained(model_name)


In [17]:
def tokenize(example):
    inputs = tokenizer(example["context"], padding="max_length", truncation=True, max_length=512)
    targets = tokenizer(example["question"], padding="max_length", truncation=True, max_length=128)
    inputs["labels"] = [(l if l != tokenizer.pad_token_id else -100) for l in targets["input_ids"]]
    return inputs

train_dataset = dataset["train"].map(tokenize, batched=True, remove_columns=dataset["train"].column_names)
# valid_dataset = dataset["validation"].map(tokenize, batched=True, remove_columns=dataset["validation"].column_names)
test_dataset  = dataset["test"].map(tokenize, batched=True, remove_columns=dataset["test"].column_names)


Map:   0%|          | 0/2096 [00:00<?, ? examples/s]

Map:   0%|          | 0/524 [00:00<?, ? examples/s]

In [18]:
import os
os.environ["WANDB_DISABLED"] = "true"


In [19]:
import torch  # Make sure this is imported

from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google/mt5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-base")

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./mt5_pashto_qg_base",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    logging_steps=100,
    save_steps=500,
    evaluation_strategy="epoch",  # requires up-to-date transformers
    fp16=True if torch.cuda.is_available() else False,
    save_total_limit=2,
    predict_with_generate=True
)

# Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Seq2SeqTrainer(


In [21]:
trainer.train()


RuntimeError: torch_xla/csrc/tensor.cpp:191 : Check failed: data()->tensor_data 
*** Begin stack trace ***
	tsl::CurrentStackTrace[abi:cxx11]()
	torch_xla::XLATensor::shape() const
	torch_xla::XLATensorImpl::SetupSizeProperties()
	torch_xla::XLATensorImpl::dim_custom() const
	
	
	
	_PyObject_GenericSetAttrWithDict
	PyObject_SetAttr
	_PyEval_EvalFrameDefault
	
	_PyEval_EvalFrameDefault
	
	PySequence_Tuple
	_PyEval_EvalFrameDefault
	
	
	_PyObject_Call
	
	_PyObject_MakeTpCall
	PyObject_Vectorcall
	_PyEval_EvalFrameDefault
	PyEval_EvalCode
	
	_PyEval_EvalFrameDefault
	
	
	_PyEval_EvalFrameDefault
	
	
	_PyObject_Call
	_PyEval_EvalFrameDefault
	
	
	
	
	
	
	
	_PyEval_EvalFrameDefault
	PyEval_EvalCode
	
	
	PyObject_Vectorcall
	_PyEval_EvalFrameDefault
	
	Py_RunMain
	Py_BytesMain
	
	__libc_start_main
	_start
*** End stack trace ***


In [None]:
trainer.save_model("./mt5_pashto_qg_base")
tokenizer.save_pretrained("./mt5_pashto_qg_base")


In [None]:
def generate_question(context):
    input_text = f"generate question: {context}"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    output_ids = model.generate(
        **inputs,
        max_length=64,
        num_beams=4,
        early_stopping=True,
        no_repeat_ngram_size=2
    )

    return tokenizer.decode(output_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

# Example usage
print(generate_question("د پښتو ژبې د ویونکو شمېر ډېر دی."))
