In [None]:
!pip3 install transformers
!pip3 install datasets
!pip3 install evaluate
from datasets import load_dataset
from evaluate import load
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

# Read the three JSON-lines splits. Every file is loaded on its own, which is
# why each result is indexed with "train".
train_data, val_data, test_data = (
    load_dataset("json", data_files=jsonl_path)["train"]
    for jsonl_path in ("urdu_train.jsonl", "urdu_val.jsonl", "urdu_test.jsonl")
)

# Fetch the pretrained checkpoint: tokenizer first, then the seq2seq model.
# NOTE(review): confirm that this checkpoint's vocabulary covers Urdu script;
# a multilingual checkpoint may be required.
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

# Tokenization function
def preprocess_function(examples):
    """Tokenize a batch of documents together with their reference summaries.

    Args:
        examples: Batch mapping with a "text" column (source documents) and a
            "summary" column (targets). Update the keys if the dataset uses
            different column names.

    Returns:
        The tokenizer output for the documents (truncated to 512 tokens) with
        an extra "labels" entry holding the token ids of the summaries
        (truncated to 128 tokens).
    """
    documents = list(examples["text"])
    summaries = list(examples["summary"])

    encoded = tokenizer(documents, max_length=512, truncation=True)
    encoded_targets = tokenizer(summaries, max_length=128, truncation=True)

    # The trainer reads the target token ids from the "labels" key.
    encoded["labels"] = encoded_targets["input_ids"]
    return encoded

# Run the same batched preprocessing over the train, validation and test
# splits (in that order).
tokenized_train, tokenized_val, tokenized_test = (
    split.map(preprocess_function, batched=True)
    for split in (train_data, val_data, test_data)
)

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # `evaluation_strategy` was renamed to `eval_strategy`; current
    # transformers releases reject the old keyword with a TypeError.
    # (Releases older than 4.41 only know the old name.)
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    save_total_limit=3,  # Keep at most three checkpoints on disk
    num_train_epochs=3,
    predict_with_generate=True,  # Evaluate/predict with generate(), not logits
    logging_dir='./logs',  # Directory for logs
)

# Trainer
# Examples in a batch have inputs and labels of different lengths. The default
# collator pads only the tokenizer's own fields, so batching fails on the
# ragged "labels". DataCollatorForSeq2Seq pads inputs and labels per batch and
# fills label padding with -100 so it is ignored by the loss.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    # NOTE(review): newer transformers releases prefer `processing_class=`
    # over `tokenizer=` — switch if a deprecation warning appears.
    tokenizer=tokenizer,
)

# Train model
trainer.train()

# Evaluate
rouge = load("rouge")
predictions = trainer.predict(tokenized_test)

# The generated ids may arrive wrapped in a tuple, and sequences padded while
# batches are gathered can contain -100, which the tokenizer cannot decode.
# Work on a copy and replace every negative id with the pad token id.
generated_ids = predictions.predictions
if isinstance(generated_ids, tuple):
    generated_ids = generated_ids[0]
generated_ids = generated_ids.copy()
generated_ids[generated_ids < 0] = tokenizer.pad_token_id

# Decode predictions and references
decoded_preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
decoded_labels = [
    tokenizer.decode(label, skip_special_tokens=True)
    for label in tokenized_test["labels"]
]

# Compute ROUGE
# The metric's default tokenizer keeps only [a-z0-9] characters, which would
# erase Urdu text and score every summary as 0; split on whitespace instead.
results = rouge.compute(
    predictions=decoded_preds,
    references=decoded_labels,
    tokenizer=str.split,
)
print(results)


ModuleNotFoundError: No module named 'datasets'