Import libraries

In [1]:
import torch
from transformers import XLMRobertaTokenizer, XLMRobertaForQuestionAnswering
from transformers import Trainer, TrainingArguments
from datasets import DatasetDict, load_dataset

  from .autonotebook import tqdm as notebook_tqdm


Load Dataset and Model

In [2]:
dataset = load_dataset("json", data_files={
    "train": "mahasquad\\train.json",
    "validation": "mahasquad\\val.json",
    "test": "mahasquad\\test.json"
})

In [3]:
# Load the XLM-R tokenizer and model
tokenizer = XLMRobertaTokenizer.from_pretrained("deepset/xlm-roberta-large-squad2")
model = XLMRobertaForQuestionAnswering.from_pretrained("deepset/xlm-roberta-large-squad2")

Some weights of the model checkpoint at deepset/xlm-roberta-large-squad2 were not used when initializing XLMRobertaForQuestionAnswering: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Preprocessing function

In [4]:
print(dataset)
print(dataset["train"][0])

DatasetDict({
    train: Dataset({
        features: ['data'],
        num_rows: 118516
    })
    validation: Dataset({
        features: ['data'],
        num_rows: 11873
    })
    test: Dataset({
        features: ['data'],
        num_rows: 11803
    })
})
{'data': {'answers': {'answer_start': [], 'text': []}, 'context': 'प्रेस्बिटेरियनिझम हा उत्तर आयर्लंडमधील सर्वात मोठा प्रोटेस्टंट संप्रदाय आहे आणि आयर्लंड बेटावर (आयर्लंडच्या अँग्लिकन चर्च नंतर) दुसरा सर्वात मोठा संप्रदाय आहे, [उद्धरण आवश्यक आहे] आणि स्कॉटिश वृक्षारोपण स्थायिकांनी अल्स्टरमध्ये आणले होते ज्यांना जेम्स व्ही द्वारे स्थलांतर करण्यास जोरदार प्रोत्साहन दिले होते. स्कॉटलंडचा, नंतर इंग्लंडचा जेम्स पहिला. अंदाजे १००,००० स्कॉटिश प्रेस्बिटेरियन्स १६०७ आणि १६९० मध्ये बॉयनच्या लढाईदरम्यान आयर्लंडच्या उत्तरेकडील काउंटीजमध्ये स्थलांतरित झाले. अल्स्टर आणि उर्वरित आयर्लंडमधील रोमन कॅथोलिकांसह प्रेस्बिटेरियन, १९व्या शतकाच्या सुरुवातीस ते मागे घेण्यापर्यंत भेदभाव करणाऱ्या दंड कायद्यांतर्गत त्रास सहन करावा लागला. आयर्लंडमधील प्रेस्

In [5]:
def preprocess_data(examples):
    # Access fields inside 'data'
    questions = [item["question"] for item in examples["data"]]
    contexts = [item["context"] for item in examples["data"]]
    answers = [item["answers"] for item in examples["data"]]
    
    # Tokenize the inputs with truncation and padding
    inputs = tokenizer(
        questions,
        contexts,
        max_length=384,
        truncation=True,
        padding="max_length"
    )
    
    # Prepare start and end positions for the answers
    start_positions = [ans["answer_start"][0] if ans["answer_start"] else 0 for ans in answers]
    end_positions = [
        (ans["answer_start"][0] + len(ans["text"][0])) if ans["answer_start"] and ans["text"] else 0 
        for ans in answers
    ]
    
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

Take subset of dataset

In [6]:
# Set a fraction of the dataset to use
fraction = 0.01  # Use 1% of the dataset

# Shuffle and select a subset of the training, validation, and test sets
train_subset = dataset["train"].shuffle(seed=42).select(range(int(len(dataset["train"]) * fraction)))
validation_subset = dataset["validation"].shuffle(seed=42).select(range(int(len(dataset["validation"]) * fraction)))
test_subset = dataset["test"].shuffle(seed=42).select(range(int(len(dataset["test"]) * fraction)))

# Create a new DatasetDict with the subsets
subset_dataset = DatasetDict({
    "train": train_subset,
    "validation": validation_subset,
    "test": test_subset
})

# Check the sizes of the subsets to confirm
print(f"Train subset size: {len(subset_dataset['train'])}")
print(f"Validation subset size: {len(subset_dataset['validation'])}")
print(f"Test subset size: {len(subset_dataset['test'])}")

Train subset size: 1185
Validation subset size: 118
Test subset size: 118


In [7]:
# Proceed with tokenization and training using subset_dataset
tokenized_subset = subset_dataset.map(preprocess_data, batched=True)

Map:   0%|          | 0/1185 [00:00<?, ? examples/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returne

Evaluation parameter

In [8]:
import evaluate

# Load evaluation metrics
metric = evaluate.load("squad_v2")

def compute_metrics(eval_pred):
    predictions, references = eval_pred
    return metric.compute(predictions=predictions, references=references)

Training Arguments and Trainer

In [10]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./qa_subset-1percent_results",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

In [11]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_subset["train"],
    eval_dataset=tokenized_subset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Training the model

In [12]:
# Train the model
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmokshada-sable-btech2021[0m ([33mmokshada-sable-btech2021-symbiosis-international[0m). Use [1m`wandb login --relogin`[0m to force relogin


  2%|▏         | 10/447 [41:22<30:18:24, 249.67s/it]

{'loss': 4.3499, 'grad_norm': 42.578826904296875, 'learning_rate': 2.9328859060402686e-05, 'epoch': 0.07}


  4%|▍         | 20/447 [1:22:11<29:35:54, 249.54s/it]

{'loss': 4.1274, 'grad_norm': 15.165018081665039, 'learning_rate': 2.8657718120805368e-05, 'epoch': 0.13}


  7%|▋         | 30/447 [2:04:22<29:34:05, 255.26s/it]

{'loss': 4.0995, 'grad_norm': 23.07839012145996, 'learning_rate': 2.7986577181208053e-05, 'epoch': 0.2}


  9%|▉         | 40/447 [2:40:22<20:01:47, 177.17s/it]

{'loss': 4.1841, 'grad_norm': 42.93212127685547, 'learning_rate': 2.731543624161074e-05, 'epoch': 0.27}


 11%|█         | 50/447 [3:04:02<15:30:05, 140.57s/it]

{'loss': 3.8351, 'grad_norm': 32.82088851928711, 'learning_rate': 2.6644295302013424e-05, 'epoch': 0.34}


 13%|█▎        | 60/447 [3:28:45<16:11:54, 150.68s/it]

{'loss': 3.5754, 'grad_norm': 41.09687805175781, 'learning_rate': 2.5973154362416106e-05, 'epoch': 0.4}


 16%|█▌        | 70/447 [3:52:38<15:01:56, 143.54s/it]

{'loss': 3.3027, 'grad_norm': 23.20370864868164, 'learning_rate': 2.530201342281879e-05, 'epoch': 0.47}


 18%|█▊        | 80/447 [4:17:32<15:09:48, 148.74s/it]

{'loss': 3.1027, 'grad_norm': 6.150061130523682, 'learning_rate': 2.4630872483221476e-05, 'epoch': 0.54}


 20%|██        | 90/447 [4:41:44<14:45:46, 148.87s/it]

{'loss': 3.0806, 'grad_norm': 177.53421020507812, 'learning_rate': 2.3959731543624162e-05, 'epoch': 0.6}


 22%|██▏       | 100/447 [5:05:42<14:11:16, 147.19s/it]

{'loss': 3.7146, 'grad_norm': 5.070763111114502, 'learning_rate': 2.3288590604026844e-05, 'epoch': 0.67}


 25%|██▍       | 110/447 [5:28:08<10:43:43, 114.61s/it]

{'loss': 3.6101, 'grad_norm': 1.2407060861587524, 'learning_rate': 2.261744966442953e-05, 'epoch': 0.74}


 27%|██▋       | 120/447 [5:46:05<9:34:41, 105.45s/it] 

{'loss': 3.1926, 'grad_norm': 200.64088439941406, 'learning_rate': 2.1946308724832218e-05, 'epoch': 0.81}


 29%|██▉       | 130/447 [6:03:37<9:18:43, 105.75s/it]

{'loss': 3.0438, 'grad_norm': 12.773775100708008, 'learning_rate': 2.1275167785234903e-05, 'epoch': 0.87}


 31%|███▏      | 140/447 [6:21:31<8:52:08, 104.00s/it]

{'loss': 4.463, 'grad_norm': 230.42543029785156, 'learning_rate': 2.0604026845637585e-05, 'epoch': 0.94}


 33%|███▎      | 149/447 [6:35:36<6:45:15, 81.60s/it] 

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

Saving the model

In [None]:
# Save the fine-tuned model
model.save_pretrained("marathi-qa-20-mahasquad")
tokenizer.save_pretrained("marathi-qa-20-mahasquad")