Import libraries

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import Trainer, TrainingArguments
from datasets import DatasetDict, load_dataset

  from .autonotebook import tqdm as notebook_tqdm


Load Dataset and Model

In [2]:
dataset = load_dataset("json", data_files={
    "train": "mahasquad\\train.json",
    "validation": "mahasquad\\val.json",
    "test": "mahasquad\\test.json"
})

In [3]:
# Load the XLM-R tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("l3cube-pune/marathi-roberta")
model = AutoModelForQuestionAnswering.from_pretrained("l3cube-pune/marathi-roberta")

Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at l3cube-pune/marathi-roberta and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preprocessing function

In [4]:
print(dataset)
print(dataset["train"][0])

DatasetDict({
    train: Dataset({
        features: ['data'],
        num_rows: 118516
    })
    validation: Dataset({
        features: ['data'],
        num_rows: 11873
    })
    test: Dataset({
        features: ['data'],
        num_rows: 11803
    })
})
{'data': {'answers': {'answer_start': [], 'text': []}, 'context': 'प्रेस्बिटेरियनिझम हा उत्तर आयर्लंडमधील सर्वात मोठा प्रोटेस्टंट संप्रदाय आहे आणि आयर्लंड बेटावर (आयर्लंडच्या अँग्लिकन चर्च नंतर) दुसरा सर्वात मोठा संप्रदाय आहे, [उद्धरण आवश्यक आहे] आणि स्कॉटिश वृक्षारोपण स्थायिकांनी अल्स्टरमध्ये आणले होते ज्यांना जेम्स व्ही द्वारे स्थलांतर करण्यास जोरदार प्रोत्साहन दिले होते. स्कॉटलंडचा, नंतर इंग्लंडचा जेम्स पहिला. अंदाजे १००,००० स्कॉटिश प्रेस्बिटेरियन्स १६०७ आणि १६९० मध्ये बॉयनच्या लढाईदरम्यान आयर्लंडच्या उत्तरेकडील काउंटीजमध्ये स्थलांतरित झाले. अल्स्टर आणि उर्वरित आयर्लंडमधील रोमन कॅथोलिकांसह प्रेस्बिटेरियन, १९व्या शतकाच्या सुरुवातीस ते मागे घेण्यापर्यंत भेदभाव करणाऱ्या दंड कायद्यांतर्गत त्रास सहन करावा लागला. आयर्लंडमधील प्रेस्

In [16]:
def preprocess_data(examples):
    questions = [item["question"] for item in examples["data"]]
    contexts = [item["context"] for item in examples["data"]]
    answers = [item["answers"] for item in examples["data"]]

    # Tokenize the inputs with truncation and padding
    inputs = tokenizer(
        questions,
        contexts,
        max_length=384,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
    )

    start_positions = []
    end_positions = []

    for i, ans in enumerate(answers):
        if ans["answer_start"]:
            start = ans["answer_start"][0]
            end = start + len(ans["text"][0])
            # Adjust start and end based on tokenized input
            start_positions.append(start)
            end_positions.append(end)
        else:
            start_positions.append(0)
            end_positions.append(0)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


Take subset of dataset

In [17]:
# Set a fraction of the dataset to use
fraction = 0.01  # Use 1% of the dataset

# Shuffle and select a subset of the training, validation, and test sets
train_subset = dataset["train"].shuffle(seed=42).select(range(int(len(dataset["train"]) * fraction)))
validation_subset = dataset["validation"].shuffle(seed=42).select(range(int(len(dataset["validation"]) * fraction)))
test_subset = dataset["test"].shuffle(seed=42).select(range(int(len(dataset["test"]) * fraction)))

# Create a new DatasetDict with the subsets
subset_dataset = DatasetDict({
    "train": train_subset,
    "validation": validation_subset,
    "test": test_subset
})

# Check the sizes of the subsets to confirm
print(f"Train subset size: {len(subset_dataset['train'])}")
print(f"Validation subset size: {len(subset_dataset['validation'])}")
print(f"Test subset size: {len(subset_dataset['test'])}")

Train subset size: 1185
Validation subset size: 118
Test subset size: 118


In [18]:
# Proceed with tokenization and training using subset_dataset
tokenized_subset = subset_dataset.map(preprocess_data, batched=True)



[A[A

[A[A

Map: 100%|██████████| 1185/1185 [00:01<00:00, 893.41 examples/s]


[A[A

Map: 100%|██████████| 118/118 [00:00<00:00, 756.30 examples/s]


[A[A

Map: 100%|██████████| 118/118 [00:00<00:00, 700.56 examples/s]


In [19]:
print(tokenized_subset)

DatasetDict({
    train: Dataset({
        features: ['data', 'input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 1185
    })
    validation: Dataset({
        features: ['data', 'input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 118
    })
    test: Dataset({
        features: ['data', 'input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 118
    })
})


Evaluation parameter

In [20]:
import evaluate
import numpy as np

# Load evaluation metrics
metric = evaluate.load("squad_v2")

def compute_metrics(eval_pred):
    predictions, references = eval_pred
    start_logits, end_logits = predictions
    start_predictions = np.argmax(start_logits, axis=1)
    end_predictions = np.argmax(end_logits, axis=1)
    return metric.compute(predictions={"id": start_predictions, "end": end_predictions}, references=references)

Training Arguments and Trainer

In [22]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./qa_subset_maharoberta_1_results",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    log_level='debug',
    logging_steps=10,
)

In [23]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_subset["train"],
    eval_dataset=tokenized_subset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Training the model

In [24]:
# Train the model
trainer.train()

  0%|          | 0/447 [3:40:09<?, ?it/s]
Currently training with a batch size of: 8
The following columns in the training set don't have a corresponding argument in `XLMRobertaForQuestionAnswering.forward` and have been ignored: data. If data are not expected by `XLMRobertaForQuestionAnswering.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1,185
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 447
  Number of trainable parameters = 277,454,594
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
                                                  
[A                                                  

  2%|▏         | 10/447 [09:23<7:17:43, 60.10s/it]
[A

{'loss': 3.4466, 'grad_norm': 14.770986557006836, 'learning_rate': 2.9328859060402686e-05, 'epoch': 0.07}


                                                  
[A                                                  

  4%|▍         | 20/447 [19:55<7:33:42, 63.75s/it]
[A

{'loss': 3.5451, 'grad_norm': 5.7172417640686035, 'learning_rate': 2.8657718120805368e-05, 'epoch': 0.13}


                                                  
[A                                                  

  7%|▋         | 30/447 [29:58<7:08:24, 61.64s/it]
[A

{'loss': 3.7508, 'grad_norm': 3.39996600151062, 'learning_rate': 2.7986577181208053e-05, 'epoch': 0.2}


                                                  
[A                                                  

  9%|▉         | 40/447 [40:30<7:16:32, 64.36s/it]
[A

{'loss': 3.7231, 'grad_norm': 3.0733251571655273, 'learning_rate': 2.731543624161074e-05, 'epoch': 0.27}


                                                  
[A                                                  

 11%|█         | 50/447 [50:42<6:41:43, 60.71s/it]
[A

{'loss': 3.3397, 'grad_norm': 6.282351970672607, 'learning_rate': 2.6644295302013424e-05, 'epoch': 0.34}


                                                    
[A                                                  

 13%|█▎        | 60/447 [1:00:52<6:31:39, 60.72s/it]
[A

{'loss': 3.039, 'grad_norm': 2.734901189804077, 'learning_rate': 2.5973154362416106e-05, 'epoch': 0.4}


                                                    
[A                                                  

 16%|█▌        | 70/447 [1:11:21<6:41:51, 63.96s/it]
[A

{'loss': 2.8886, 'grad_norm': 5.130919456481934, 'learning_rate': 2.530201342281879e-05, 'epoch': 0.47}


                                                    
[A                                                  

 18%|█▊        | 80/447 [1:21:38<6:03:57, 59.50s/it]
[A

{'loss': 2.9961, 'grad_norm': 3.7754063606262207, 'learning_rate': 2.4630872483221476e-05, 'epoch': 0.54}


                                                    
[A                                                  

 20%|██        | 90/447 [1:32:19<6:13:53, 62.84s/it]
[A

{'loss': 2.9761, 'grad_norm': 6.0506591796875, 'learning_rate': 2.3959731543624162e-05, 'epoch': 0.6}


                                                     
[A                                                  

 22%|██▏       | 100/447 [1:42:43<5:47:56, 60.16s/it]
[A

{'loss': 3.4547, 'grad_norm': 7.94872522354126, 'learning_rate': 2.3288590604026844e-05, 'epoch': 0.67}


                                                     
[A                                                  

 25%|██▍       | 110/447 [1:53:11<5:47:24, 61.85s/it]
[A

{'loss': 3.4749, 'grad_norm': 4.710449695587158, 'learning_rate': 2.261744966442953e-05, 'epoch': 0.74}


                                                     
[A                                                  

 27%|██▋       | 120/447 [2:03:33<5:40:40, 62.51s/it]
[A

{'loss': 2.788, 'grad_norm': 6.004735946655273, 'learning_rate': 2.1946308724832218e-05, 'epoch': 0.81}


                                                     
[A                                                  

 29%|██▉       | 130/447 [2:13:57<5:30:20, 62.53s/it]
[A

{'loss': 2.9261, 'grad_norm': 10.497357368469238, 'learning_rate': 2.1275167785234903e-05, 'epoch': 0.87}


                                                     
[A                                                  

 31%|███▏      | 140/447 [2:24:20<5:18:10, 62.18s/it]
[A

{'loss': 4.2114, 'grad_norm': 4.02404260635376, 'learning_rate': 2.0604026845637585e-05, 'epoch': 0.94}


 33%|███▎      | 149/447 [2:33:15<4:27:09, 53.79s/it]The following columns in the evaluation set don't have a corresponding argument in `XLMRobertaForQuestionAnswering.forward` and have been ignored: data. If data are not expected by `XLMRobertaForQuestionAnswering.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 118
  Batch size = 8


[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

KeyError: 0

Saving the model

In [None]:
# Save the fine-tuned model
model.save_pretrained("marathi-qa-20-mahasquad")
tokenizer.save_pretrained("marathi-qa-20-mahasquad")