Import libraries

In [21]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import Trainer, TrainingArguments
from datasets import DatasetDict, load_dataset

Load Dataset and Model

In [22]:
dataset = load_dataset("json", data_files={
    "train": "mahasquad\\train.json",
    "validation": "mahasquad\\val.json",
    "test": "mahasquad\\test.json"
})

In [23]:
# Load the XLM-R tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("l3cube-pune/marathi-roberta")
model = AutoModelForQuestionAnswering.from_pretrained("l3cube-pune/marathi-roberta")

Some weights of XLMRobertaForQuestionAnswering were not initialized from the model checkpoint at l3cube-pune/marathi-roberta and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Preprocessing function

In [24]:
print(dataset)
print(dataset["train"][0])

DatasetDict({
    train: Dataset({
        features: ['data'],
        num_rows: 118516
    })
    validation: Dataset({
        features: ['data'],
        num_rows: 11873
    })
    test: Dataset({
        features: ['data'],
        num_rows: 11803
    })
})
{'data': {'answers': {'answer_start': [], 'text': []}, 'context': 'प्रेस्बिटेरियनिझम हा उत्तर आयर्लंडमधील सर्वात मोठा प्रोटेस्टंट संप्रदाय आहे आणि आयर्लंड बेटावर (आयर्लंडच्या अँग्लिकन चर्च नंतर) दुसरा सर्वात मोठा संप्रदाय आहे, [उद्धरण आवश्यक आहे] आणि स्कॉटिश वृक्षारोपण स्थायिकांनी अल्स्टरमध्ये आणले होते ज्यांना जेम्स व्ही द्वारे स्थलांतर करण्यास जोरदार प्रोत्साहन दिले होते. स्कॉटलंडचा, नंतर इंग्लंडचा जेम्स पहिला. अंदाजे १००,००० स्कॉटिश प्रेस्बिटेरियन्स १६०७ आणि १६९० मध्ये बॉयनच्या लढाईदरम्यान आयर्लंडच्या उत्तरेकडील काउंटीजमध्ये स्थलांतरित झाले. अल्स्टर आणि उर्वरित आयर्लंडमधील रोमन कॅथोलिकांसह प्रेस्बिटेरियन, १९व्या शतकाच्या सुरुवातीस ते मागे घेण्यापर्यंत भेदभाव करणाऱ्या दंड कायद्यांतर्गत त्रास सहन करावा लागला. आयर्लंडमधील प्रेस्

In [25]:
def preprocess_data(examples):
    # Access fields inside 'data'
    questions = [item["question"] for item in examples["data"]]
    contexts = [item["context"] for item in examples["data"]]
    answers = [item["answers"] for item in examples["data"]]
    
    # Tokenize the inputs with truncation and padding
    inputs = tokenizer(
        questions,
        contexts,
        max_length=384,
        truncation=True,
        padding="max_length"
    )
    
    # Prepare start and end positions for the answers
    start_positions = [ans["answer_start"][0] if ans["answer_start"] else 0 for ans in answers]
    end_positions = [
        (ans["answer_start"][0] + len(ans["text"][0])) if ans["answer_start"] and ans["text"] else 0 
        for ans in answers
    ]
    
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

Take subset of dataset

In [26]:
# Set a fraction of the dataset to use
fraction = 0.05  # Use 5% of the dataset

# Shuffle and select a subset of the training, validation, and test sets
train_subset = dataset["train"].shuffle(seed=42).select(range(int(len(dataset["train"]) * fraction)))
validation_subset = dataset["validation"].shuffle(seed=42).select(range(int(len(dataset["validation"]) * fraction)))
test_subset = dataset["test"].shuffle(seed=42).select(range(int(len(dataset["test"]) * fraction)))

# Create a new DatasetDict with the subsets
subset_dataset = DatasetDict({
    "train": train_subset,
    "validation": validation_subset,
    "test": test_subset
})

# Check the sizes of the subsets to confirm
print(f"Train subset size: {len(subset_dataset['train'])}")
print(f"Validation subset size: {len(subset_dataset['validation'])}")
print(f"Test subset size: {len(subset_dataset['test'])}")

Train subset size: 5925
Validation subset size: 593
Test subset size: 590


In [27]:
# Proceed with tokenization and training using subset_dataset
tokenized_subset = subset_dataset.map(preprocess_data, batched=True)




[A[A[A


Map: 100%|██████████| 593/593 [00:00<00:00, 1626.18 examples/s]


Evaluation parameter

In [28]:
import evaluate

# Load evaluation metrics
metric = evaluate.load("squad_v2")

def compute_metrics(eval_pred):
    predictions, references = eval_pred
    return metric.compute(predictions=predictions, references=references)

Training Arguments and Trainer

In [29]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./qa_subset_maharoberta_results",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

In [30]:
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_subset["train"],
    eval_dataset=tokenized_subset["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Training the model

In [31]:
# Train the model
trainer.train()

 33%|███▎      | 741/2223 [47:50:39<95:41:18, 232.44s/it]

                                                  


[A[A[A                                      
  1%|▏         | 10/741 [02:05<2:30:00, 12.31s/it] 

[A[A

{'loss': 5.9294, 'grad_norm': 34.9787712097168, 'learning_rate': 2.959514170040486e-05, 'epoch': 0.01}


                                                  


[A[A[A                                         
  3%|▎         | 20/741 [04:08<2:27:06, 12.24s/it]  

[A[A

{'loss': 5.6452, 'grad_norm': 14.557489395141602, 'learning_rate': 2.9190283400809717e-05, 'epoch': 0.03}


                                                  


[A[A[A                                         
  4%|▍         | 30/741 [06:11<2:25:17, 12.26s/it]  

[A[A

{'loss': 4.8172, 'grad_norm': 14.264787673950195, 'learning_rate': 2.8785425101214575e-05, 'epoch': 0.04}


                                                  


[A[A[A                                         
  5%|▌         | 40/741 [08:12<2:21:11, 12.08s/it]  

[A[A

{'loss': 4.1411, 'grad_norm': 13.462058067321777, 'learning_rate': 2.8380566801619436e-05, 'epoch': 0.05}


                                                  


[A[A[A                                         
  7%|▋         | 50/741 [10:15<2:21:35, 12.29s/it]  

[A[A

{'loss': 4.5092, 'grad_norm': 17.228219985961914, 'learning_rate': 2.7975708502024294e-05, 'epoch': 0.07}


                                                  


[A[A[A                                         
  8%|▊         | 60/741 [12:18<2:18:56, 12.24s/it]  

[A[A

{'loss': 3.4294, 'grad_norm': 18.45244026184082, 'learning_rate': 2.7570850202429152e-05, 'epoch': 0.08}


                                                  


[A[A[A                                         
  9%|▉         | 70/741 [14:21<2:17:11, 12.27s/it]  

[A[A

{'loss': 4.4159, 'grad_norm': 8.203121185302734, 'learning_rate': 2.716599190283401e-05, 'epoch': 0.09}


                                                  


[A[A[A                                         
 11%|█         | 80/741 [16:22<2:13:39, 12.13s/it]  

[A[A

{'loss': 3.7611, 'grad_norm': 6.780584335327148, 'learning_rate': 2.6761133603238868e-05, 'epoch': 0.11}


                                                  


[A[A[A                                         
 12%|█▏        | 90/741 [18:24<2:12:02, 12.17s/it]  

[A[A

{'loss': 3.8515, 'grad_norm': 6.43194055557251, 'learning_rate': 2.6356275303643726e-05, 'epoch': 0.12}


                                                   


[A[A[A                                         
 13%|█▎        | 100/741 [20:25<2:09:06, 12.09s/it] 

[A[A

{'loss': 4.5823, 'grad_norm': 9.319701194763184, 'learning_rate': 2.595141700404858e-05, 'epoch': 0.13}


                                                   


[A[A[A                                         
 15%|█▍        | 110/741 [22:27<2:07:26, 12.12s/it] 

[A[A

{'loss': 3.9181, 'grad_norm': 6.308564186096191, 'learning_rate': 2.5546558704453442e-05, 'epoch': 0.15}


                                                   


[A[A[A                                         
 16%|█▌        | 120/741 [24:29<2:04:40, 12.05s/it] 

[A[A

{'loss': 3.8045, 'grad_norm': 11.673052787780762, 'learning_rate': 2.51417004048583e-05, 'epoch': 0.16}


                                                   


[A[A[A                                         
 18%|█▊        | 130/741 [26:32<2:05:23, 12.31s/it] 

[A[A

{'loss': 4.3177, 'grad_norm': 4.451017379760742, 'learning_rate': 2.4736842105263158e-05, 'epoch': 0.18}


                                                   


[A[A[A                                         
 19%|█▉        | 140/741 [28:34<2:02:41, 12.25s/it] 

[A[A

{'loss': 3.0025, 'grad_norm': 2.1464571952819824, 'learning_rate': 2.4331983805668016e-05, 'epoch': 0.19}


                                                   


[A[A[A                                         
 20%|██        | 150/741 [30:37<2:01:40, 12.35s/it] 

[A[A

{'loss': 3.8401, 'grad_norm': 3.460672378540039, 'learning_rate': 2.3927125506072874e-05, 'epoch': 0.2}


                                                   


[A[A[A                                         
 22%|██▏       | 160/741 [32:39<1:57:40, 12.15s/it] 

[A[A

{'loss': 3.436, 'grad_norm': 4.845217227935791, 'learning_rate': 2.352226720647773e-05, 'epoch': 0.22}


                                                   


[A[A[A                                         
 23%|██▎       | 170/741 [34:41<1:55:24, 12.13s/it] 

[A[A

{'loss': 3.437, 'grad_norm': 4.741342544555664, 'learning_rate': 2.3117408906882593e-05, 'epoch': 0.23}


                                                   


[A[A[A                                         
 24%|██▍       | 180/741 [36:43<1:54:53, 12.29s/it] 

[A[A

{'loss': 2.84, 'grad_norm': 11.40686321258545, 'learning_rate': 2.271255060728745e-05, 'epoch': 0.24}


                                                   


[A[A[A                                         
 26%|██▌       | 190/741 [38:45<1:50:49, 12.07s/it] 

[A[A

{'loss': 2.662, 'grad_norm': 7.097522735595703, 'learning_rate': 2.230769230769231e-05, 'epoch': 0.26}


                                                   


[A[A[A                                         
 27%|██▋       | 200/741 [40:47<1:49:06, 12.10s/it] 

[A[A

{'loss': 3.977, 'grad_norm': 5.229985237121582, 'learning_rate': 2.1902834008097167e-05, 'epoch': 0.27}


                                                   


[A[A[A                                         
 28%|██▊       | 210/741 [42:48<1:47:04, 12.10s/it] 

[A[A

{'loss': 3.4583, 'grad_norm': 17.998348236083984, 'learning_rate': 2.1497975708502025e-05, 'epoch': 0.28}


                                                   


[A[A[A                                         
 30%|██▉       | 220/741 [44:49<1:44:56, 12.09s/it] 

[A[A

{'loss': 3.1371, 'grad_norm': 3.5172858238220215, 'learning_rate': 2.1093117408906883e-05, 'epoch': 0.3}


                                                   


[A[A[A                                         
 31%|███       | 230/741 [46:51<1:43:26, 12.15s/it] 

[A[A

{'loss': 3.0434, 'grad_norm': 5.408641338348389, 'learning_rate': 2.068825910931174e-05, 'epoch': 0.31}


                                                   


[A[A[A                                         
 32%|███▏      | 240/741 [48:54<1:42:29, 12.27s/it] 

[A[A

{'loss': 3.7267, 'grad_norm': 9.230902671813965, 'learning_rate': 2.0283400809716602e-05, 'epoch': 0.32}


                                                   


[A[A[A                                         
 34%|███▎      | 250/741 [50:57<1:39:44, 12.19s/it] 

[A[A

{'loss': 3.8894, 'grad_norm': 19.39901351928711, 'learning_rate': 1.987854251012146e-05, 'epoch': 0.34}


                                                   


[A[A[A                                         
 35%|███▌      | 260/741 [53:00<1:37:53, 12.21s/it] 

[A[A

{'loss': 3.7136, 'grad_norm': 1.5628939867019653, 'learning_rate': 1.9473684210526318e-05, 'epoch': 0.35}


                                                   


[A[A[A                                         
 36%|███▋      | 270/741 [55:02<1:35:51, 12.21s/it] 

[A[A

{'loss': 2.8779, 'grad_norm': 1.9027701616287231, 'learning_rate': 1.9068825910931176e-05, 'epoch': 0.36}


                                                   


[A[A[A                                         
 38%|███▊      | 280/741 [57:04<1:34:05, 12.25s/it] 

[A[A

{'loss': 2.8502, 'grad_norm': 2.3432440757751465, 'learning_rate': 1.8663967611336034e-05, 'epoch': 0.38}


                                                   


[A[A[A                                         
 39%|███▉      | 290/741 [59:06<1:31:30, 12.17s/it] 

[A[A

{'loss': 3.5009, 'grad_norm': 1.692151665687561, 'learning_rate': 1.8259109311740892e-05, 'epoch': 0.39}


                                                     


[A[A[A                                         
 40%|████      | 300/741 [1:01:05<1:26:15, 11.73s/it]

[A[A

{'loss': 3.4486, 'grad_norm': 3.841914653778076, 'learning_rate': 1.7854251012145746e-05, 'epoch': 0.4}


                                                     


[A[A[A                                         
 42%|████▏     | 310/741 [1:03:03<1:24:20, 11.74s/it]

[A[A

{'loss': 3.4456, 'grad_norm': 2.548811912536621, 'learning_rate': 1.7449392712550608e-05, 'epoch': 0.42}


                                                     


[A[A[A                                         
 43%|████▎     | 320/741 [1:05:00<1:21:55, 11.68s/it]

[A[A

{'loss': 3.7787, 'grad_norm': 39.665443420410156, 'learning_rate': 1.7044534412955466e-05, 'epoch': 0.43}


                                                     


[A[A[A                                         
 45%|████▍     | 330/741 [1:06:57<1:19:51, 11.66s/it]

[A[A

{'loss': 3.7552, 'grad_norm': 4.138166904449463, 'learning_rate': 1.6639676113360324e-05, 'epoch': 0.45}


                                                     


[A[A[A                                         
 46%|████▌     | 340/741 [1:08:55<1:18:33, 11.75s/it]

[A[A

{'loss': 3.759, 'grad_norm': 3.9456961154937744, 'learning_rate': 1.6234817813765182e-05, 'epoch': 0.46}


                                                     


[A[A[A                                         
 47%|████▋     | 350/741 [1:11:05<1:26:53, 13.33s/it]

[A[A

{'loss': 3.5762, 'grad_norm': 33.89286422729492, 'learning_rate': 1.582995951417004e-05, 'epoch': 0.47}


                                                     


[A[A[A                                         
 49%|████▊     | 360/741 [1:13:19<1:24:57, 13.38s/it]

[A[A

{'loss': 3.6244, 'grad_norm': 4.015892505645752, 'learning_rate': 1.5425101214574898e-05, 'epoch': 0.49}


                                                     


[A[A[A                                         
 50%|████▉     | 370/741 [1:15:36<1:25:24, 13.81s/it]

[A[A

{'loss': 3.8859, 'grad_norm': 2.9784164428710938, 'learning_rate': 1.5020242914979756e-05, 'epoch': 0.5}


                                                     


[A[A[A                                         
 51%|█████▏    | 380/741 [1:17:50<1:21:07, 13.48s/it]

[A[A

{'loss': 3.1384, 'grad_norm': 2.2424581050872803, 'learning_rate': 1.4615384615384615e-05, 'epoch': 0.51}


                                                     


[A[A[A                                         
 53%|█████▎    | 390/741 [1:20:04<1:18:20, 13.39s/it]

[A[A

{'loss': 3.0236, 'grad_norm': 1.4628390073776245, 'learning_rate': 1.4210526315789473e-05, 'epoch': 0.53}


                                                     


[A[A[A                                         
 54%|█████▍    | 400/741 [1:22:18<1:15:24, 13.27s/it]

[A[A

{'loss': 3.8594, 'grad_norm': 3.2678351402282715, 'learning_rate': 1.3805668016194333e-05, 'epoch': 0.54}


                                                     


[A[A[A                                         
 55%|█████▌    | 410/741 [1:24:32<1:14:06, 13.43s/it]

[A[A

{'loss': 3.3081, 'grad_norm': 2.7665789127349854, 'learning_rate': 1.3400809716599191e-05, 'epoch': 0.55}


                                                     


[A[A[A                                         
 57%|█████▋    | 420/741 [1:26:47<1:12:16, 13.51s/it]

[A[A

{'loss': 3.0654, 'grad_norm': 3.095036745071411, 'learning_rate': 1.2995951417004049e-05, 'epoch': 0.57}


                                                     


[A[A[A                                         
 58%|█████▊    | 430/741 [1:29:02<1:09:51, 13.48s/it]

[A[A

{'loss': 3.0594, 'grad_norm': 3.287097692489624, 'learning_rate': 1.2591093117408908e-05, 'epoch': 0.58}


                                                     


[A[A[A                                         
 59%|█████▉    | 440/741 [1:31:16<1:07:02, 13.36s/it]

[A[A

{'loss': 3.2509, 'grad_norm': 12.232487678527832, 'learning_rate': 1.2186234817813765e-05, 'epoch': 0.59}


                                                     


[A[A[A                                         
 61%|██████    | 450/741 [1:33:31<1:05:02, 13.41s/it]

[A[A

{'loss': 2.8454, 'grad_norm': 2.5101535320281982, 'learning_rate': 1.1781376518218623e-05, 'epoch': 0.61}


                                                     


[A[A[A                                         
 62%|██████▏   | 460/741 [1:35:41<57:43, 12.33s/it] 

[A[A

{'loss': 3.0525, 'grad_norm': 1.5209912061691284, 'learning_rate': 1.1376518218623482e-05, 'epoch': 0.62}


                                                   


[A[A[A                                         
 63%|██████▎   | 470/741 [1:37:38<52:48, 11.69s/it] 

[A[A

{'loss': 2.9681, 'grad_norm': 1.8912134170532227, 'learning_rate': 1.097165991902834e-05, 'epoch': 0.63}


                                                   


[A[A[A                                         
 65%|██████▍   | 480/741 [1:39:36<51:16, 11.79s/it] 

[A[A

{'loss': 3.2745, 'grad_norm': 3.222249984741211, 'learning_rate': 1.0566801619433198e-05, 'epoch': 0.65}


                                                   


[A[A[A                                         
 66%|██████▌   | 490/741 [1:41:33<49:04, 11.73s/it] 

[A[A

{'loss': 3.8298, 'grad_norm': 3.6237523555755615, 'learning_rate': 1.0161943319838056e-05, 'epoch': 0.66}


                                                   


[A[A[A                                         
 67%|██████▋   | 500/741 [1:43:30<47:05, 11.72s/it] 

[A[A

{'loss': 3.9289, 'grad_norm': 2.308835983276367, 'learning_rate': 9.757085020242916e-06, 'epoch': 0.67}


                                                   


[A[A[A                                         
 69%|██████▉   | 510/741 [1:45:34<45:08, 11.73s/it] 

[A[A

{'loss': 2.8549, 'grad_norm': 68.28762817382812, 'learning_rate': 9.352226720647774e-06, 'epoch': 0.69}


                                                   


[A[A[A                                         
 70%|███████   | 520/741 [1:47:31<42:37, 11.57s/it] 

[A[A

{'loss': 3.5086, 'grad_norm': 2.3901920318603516, 'learning_rate': 8.947368421052632e-06, 'epoch': 0.7}


                                                   


[A[A[A                                         
 72%|███████▏  | 530/741 [1:49:27<40:33, 11.53s/it] 

[A[A

{'loss': 4.021, 'grad_norm': 2.019343614578247, 'learning_rate': 8.542510121457492e-06, 'epoch': 0.72}


                                                   


[A[A[A                                         
 73%|███████▎  | 540/741 [1:51:24<38:49, 11.59s/it] 

[A[A

{'loss': 3.6781, 'grad_norm': 2.2249510288238525, 'learning_rate': 8.137651821862348e-06, 'epoch': 0.73}


                                                   


[A[A[A                                         
 74%|███████▍  | 550/741 [1:53:21<37:21, 11.74s/it] 

[A[A

{'loss': 3.507, 'grad_norm': 3.19053316116333, 'learning_rate': 7.732793522267206e-06, 'epoch': 0.74}


                                                   


[A[A[A                                         
 76%|███████▌  | 560/741 [1:55:17<35:05, 11.63s/it] 

[A[A

{'loss': 3.1895, 'grad_norm': 2.5257112979888916, 'learning_rate': 7.327935222672065e-06, 'epoch': 0.76}


                                                   


[A[A[A                                         
 77%|███████▋  | 570/741 [1:57:15<33:10, 11.64s/it] 

[A[A

{'loss': 3.4429, 'grad_norm': 1.2766791582107544, 'learning_rate': 6.923076923076923e-06, 'epoch': 0.77}


                                                   


[A[A[A                                         
 78%|███████▊  | 580/741 [1:59:11<31:00, 11.56s/it] 

[A[A

{'loss': 3.2059, 'grad_norm': 2.9516022205352783, 'learning_rate': 6.518218623481781e-06, 'epoch': 0.78}


                                                   


[A[A[A                                         
 80%|███████▉  | 590/741 [2:01:09<29:31, 11.73s/it] 

[A[A

{'loss': 3.2437, 'grad_norm': 1.817737340927124, 'learning_rate': 6.11336032388664e-06, 'epoch': 0.8}


                                                   


[A[A[A                                         
 81%|████████  | 600/741 [2:03:05<27:07, 11.54s/it] 

[A[A

{'loss': 3.7542, 'grad_norm': 1.8076229095458984, 'learning_rate': 5.708502024291498e-06, 'epoch': 0.81}


                                                   


[A[A[A                                         
 82%|████████▏ | 610/741 [2:05:03<25:30, 11.68s/it] 

[A[A

{'loss': 3.4334, 'grad_norm': 0.930919885635376, 'learning_rate': 5.303643724696356e-06, 'epoch': 0.82}


                                                   


[A[A[A                                         
 84%|████████▎ | 620/741 [2:06:59<23:13, 11.52s/it] 

[A[A

{'loss': 3.2013, 'grad_norm': 2.8235113620758057, 'learning_rate': 4.898785425101214e-06, 'epoch': 0.84}


                                                   


[A[A[A                                         
 85%|████████▌ | 630/741 [2:08:56<21:35, 11.67s/it] 

[A[A

{'loss': 4.1396, 'grad_norm': 3.4405410289764404, 'learning_rate': 4.493927125506073e-06, 'epoch': 0.85}


                                                   


[A[A[A                                         
 86%|████████▋ | 640/741 [2:10:52<19:37, 11.66s/it] 

[A[A

{'loss': 3.0305, 'grad_norm': 1.6741453409194946, 'learning_rate': 4.089068825910932e-06, 'epoch': 0.86}


                                                   


[A[A[A                                         
 88%|████████▊ | 650/741 [2:12:50<17:44, 11.70s/it] 

[A[A

{'loss': 2.7672, 'grad_norm': 0.9945653080940247, 'learning_rate': 3.6842105263157892e-06, 'epoch': 0.88}


                                                   


[A[A[A                                         
 89%|████████▉ | 660/741 [2:14:47<15:49, 11.73s/it] 

[A[A

{'loss': 3.7953, 'grad_norm': 1.9481979608535767, 'learning_rate': 3.279352226720648e-06, 'epoch': 0.89}


                                                   


[A[A[A                                         
 90%|█████████ | 670/741 [2:16:43<13:43, 11.60s/it] 

[A[A

{'loss': 3.6186, 'grad_norm': 23.444591522216797, 'learning_rate': 2.8744939271255064e-06, 'epoch': 0.9}


                                                   


[A[A[A                                         
 92%|█████████▏| 680/741 [2:18:41<11:51, 11.67s/it] 

[A[A

{'loss': 2.7495, 'grad_norm': 1.1903486251831055, 'learning_rate': 2.4696356275303644e-06, 'epoch': 0.92}


                                                   


[A[A[A                                         
 93%|█████████▎| 690/741 [2:20:38<09:54, 11.66s/it] 

[A[A

{'loss': 3.6075, 'grad_norm': 1.758362054824829, 'learning_rate': 2.0647773279352228e-06, 'epoch': 0.93}


                                                   


[A[A[A                                         
 94%|█████████▍| 700/741 [2:22:35<07:56, 11.63s/it] 

[A[A

{'loss': 2.8626, 'grad_norm': 5.043227672576904, 'learning_rate': 1.6599190283400812e-06, 'epoch': 0.94}


                                                   


[A[A[A                                         
 96%|█████████▌| 710/741 [2:24:33<06:02, 11.71s/it] 

[A[A

{'loss': 3.4461, 'grad_norm': 0.0847030058503151, 'learning_rate': 1.2550607287449393e-06, 'epoch': 0.96}


                                                   


[A[A[A                                         
 97%|█████████▋| 720/741 [2:26:30<04:03, 11.58s/it] 

[A[A

{'loss': 3.5856, 'grad_norm': 3.320694923400879, 'learning_rate': 8.502024291497976e-07, 'epoch': 0.97}


                                                   


[A[A[A                                         
 99%|█████████▊| 730/741 [2:28:26<02:07, 11.57s/it] 

[A[A

{'loss': 3.4159, 'grad_norm': 2.4425880908966064, 'learning_rate': 4.4534412955465585e-07, 'epoch': 0.99}


                                                   


[A[A[A                                         
100%|█████████▉| 740/741 [2:30:23<00:11, 11.61s/it] 

[A[A

{'loss': 3.5965, 'grad_norm': 7.399904727935791, 'learning_rate': 4.048582995951417e-08, 'epoch': 1.0}


100%|██████████| 741/741 [2:30:32<00:00, 10.65s/it]
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A
[A

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

Saving the model

In [32]:
# Save the fine-tuned model
model.save_pretrained("marathi-qa-20-mahasquad")
tokenizer.save_pretrained("marathi-qa-20-mahasquad")

('marathi-qa-20-mahasquad\\tokenizer_config.json',
 'marathi-qa-20-mahasquad\\special_tokens_map.json',
 'marathi-qa-20-mahasquad\\sentencepiece.bpe.model',
 'marathi-qa-20-mahasquad\\added_tokens.json',
 'marathi-qa-20-mahasquad\\tokenizer.json')