In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so that files
# stored on Drive can be read and written through ordinary paths below.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import json

file_path = "/content/drive/MyDrive/HnM_BerT/qa_pairs.json"

# Parse the QA records straight from the file handle. The encoding is given
# explicitly so the result does not depend on the platform default.
with open(file_path, "r", encoding="utf-8") as file:
    full_qa_data = json.load(file)

# Keep only the first tenth of the records (integer division, rounded down)
# so that the rest of the notebook runs quickly.
tenp_index = len(full_qa_data) // 10
qa_data = full_qa_data[:tenp_index]


In [3]:
!pip install torch


!pip install datasets


!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:


import json
import torch
from datasets import Dataset
from transformers import BertForQuestionAnswering, BertTokenizerFast, TrainingArguments, Trainer


In [5]:


# Convert 'answer' field to a string
def convert_answers_to_str(data):
    """Make sure every record's 'answer' value is a string.

    The records are edited in place: any 'answer' that is not already a
    ``str`` is replaced by ``str(answer)``. String answers are left untouched.

    Args:
        data: iterable of dict-like records, each holding an 'answer' key.

    Returns:
        The very same ``data`` object that was passed in.
    """
    for record in data:
        current = record['answer']
        if isinstance(current, str):
            continue
        record['answer'] = str(current)
    return data


In [6]:



# Normalise every 'answer' to a string. convert_answers_to_str edits the
# records in place and returns the list it was given, so str_qa_data and
# qa_data refer to the same object after this call.
str_qa_data = convert_answers_to_str(qa_data)



In [7]:

# Convert the list of QA records into a column-oriented Hugging Face Dataset.
# The column names are taken from the first record, so every record is
# expected to carry the same keys.
dataset = Dataset.from_dict({k: [d[k] for d in str_qa_data] for k in str_qa_data[0].keys()})

# Hold out 10% for validation. The seed is fixed so the split is identical on
# every run, and the splits are selected by name rather than by the order in
# which the returned mapping happens to yield its values.
dataset_splits = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = dataset_splits["train"]
val_dataset = dataset_splits["test"]


In [8]:

# NOTE(review): this definition is repeated verbatim in the next cell, which
# is also where `tokenizer` is first created. Nothing calls the function
# before that point, so this copy is redundant and could be removed.
def tokenize_function(examples):
    """Tokenize question/context pairs with the module-level `tokenizer`.

    Reads the "question" and "context" entries of `examples` and encodes them
    as pairs, truncating and padding every encoding to exactly 384 tokens.
    Returns whatever the tokenizer call returns.
    """
    return tokenizer(
        examples["question"],
        examples["context"],
        truncation=True,
        padding="max_length",
        max_length=384
    )



In [9]:

from transformers import AutoTokenizer

# Use the uncased checkpoint: the model fine-tuned further down is loaded from
# "bert-base-uncased", and token ids produced with the cased vocabulary would
# not correspond to the entries of that model's embedding table.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    """Encode question/context pairs into fixed-length model inputs.

    Args:
        examples: mapping with "question" and "context" entries (a batch of
            examples when used with ``Dataset.map(..., batched=True)``).

    Returns:
        The result of calling the module-level ``tokenizer`` on the pairs,
        truncated and padded to exactly 384 tokens each.
    """
    questions = examples["question"]
    contexts = examples["context"]
    # Fixed-length encodings: everything is cut or padded to 384 tokens.
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 384,
    }
    return tokenizer(questions, contexts, **encoding_options)

# Tokenize both splits with tokenize_function. With batched=True the function
# receives a batch of examples per call rather than a single example.
# Each name is rebound to the mapped dataset returned by .map().
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)



Map:   0%|          | 0/1347 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

In [10]:


def _find_sublist(sequence, pattern, start=0):
    """Return the first index >= start where pattern occurs in sequence, or -1."""
    if not pattern:
        return -1
    width = len(pattern)
    for idx in range(start, len(sequence) - width + 1):
        if sequence[idx:idx + width] == pattern:
            return idx
    return -1


# Set the start and end token positions for the answers in the context
def add_token_positions(batch):
    """Add "start_positions" / "end_positions" labels to a tokenized batch.

    For every example the answer is tokenized on its own (without special
    tokens) and located as one contiguous run of token ids inside the context
    part of "input_ids", i.e. after the first separator token. Searching only
    the context keeps the labels from landing on the question, and matching
    the whole run guarantees end >= start.

    If the answer cannot be found (for instance because it is empty, was
    truncated away, or tokenizes differently inside the context), both
    positions are set to 0, the index of the leading special token, instead
    of raising.

    Args:
        batch: mapping with "answer" (list of str) and "input_ids" (list of
            lists of token ids), as produced by the tokenization step.

    Returns:
        The same ``batch`` with "start_positions" and "end_positions" added.
    """
    start_positions, end_positions = [], []
    sep_id = getattr(tokenizer, "sep_token_id", None)

    for input_ids, answer in zip(batch["input_ids"], batch["answer"]):
        # Encode the answer once and reuse it for both boundaries.
        answer_ids = tokenizer.encode(answer, add_special_tokens=False)

        # The context segment starts right after the first separator; if no
        # separator is present, search the whole sequence.
        try:
            context_start = input_ids.index(sep_id) + 1
        except ValueError:
            context_start = 0

        span_start = _find_sublist(input_ids, answer_ids, context_start)
        if span_start < 0:
            start_positions.append(0)
            end_positions.append(0)
        else:
            start_positions.append(span_start)
            end_positions.append(span_start + len(answer_ids) - 1)

    batch["start_positions"] = start_positions
    batch["end_positions"] = end_positions
    return batch


In [11]:



# Run add_token_positions over both splits (one batch of examples per call)
# so each gains the "start_positions" and "end_positions" columns it sets.
train_dataset = train_dataset.map(add_token_positions, batched=True)
val_dataset = val_dataset.map(add_token_positions, batched=True)



Map:   0%|          | 0/1347 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

In [13]:
pip install --upgrade transformers




In [19]:

# Training configuration for a short CPU-only run.
# NOTE(review): evaluation, checkpointing and logging are all set to every 100
# steps. With about 1,350 training examples, a batch size of 32 and 0.1
# epochs, the run is presumably only a handful of optimizer steps, so none of
# these would trigger and load_best_model_at_end would have no checkpoint to
# pick from - confirm and lower the intervals if that is not intended.
training_args = TrainingArguments(
    # Write checkpoints next to the input data on the mounted Drive rather
    # than to a path under a local macOS user directory.
    output_dir="/content/drive/MyDrive/HnM_BerT",
    do_train=True,
    num_train_epochs=0.1,  # Only a tenth of one pass over the training data
    per_device_train_batch_size=32,  # Examples per device per training step
    save_steps=100,
    save_total_limit=2,  # Keep at most two checkpoints on disk
    evaluation_strategy="steps",
    eval_steps=100,
    logging_steps=100,
    learning_rate=2e-5,
    fp16=False,
    no_cuda=True,  # Train on CPU even if a GPU is available
    seed=42,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,  # Lower eval_loss is better
)



In [21]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# One checkpoint name for both objects so tokenizer and model cannot diverge.
MODEL_CHECKPOINT = "bert-base-uncased"

# NOTE(review): this rebinds `tokenizer`. The datasets were already tokenized
# by whichever tokenizer was bound when the earlier cells ran - verify that it
# was loaded from this same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Pretrained BERT weights with a question-answering head on top. The loading
# log reports weights that are not initialised from the checkpoint, which is
# why the model has to be fine-tuned before use.
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_CHECKPOINT)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForQuestionAnswering: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased a

In [24]:
from transformers import default_data_collator, TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import torch

# Use the library's default collator to assemble batches. No dynamic padding
# is needed because tokenize_function already pads every example to the same
# length (max_length=384).
data_collator = default_data_collator

In [25]:

# Create the Trainer that ties everything together: the QA model, the
# training configuration, the tokenized train/validation splits (which carry
# the start/end position labels) and the batch collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

In [None]:
# Train the model: fine-tune `model` on train_dataset using the settings in
# training_args (0.1 epochs, batch size 32, CPU only).
trainer.train()



In [None]:




# Decreased the number of training epochs to 0.1, so the model trains on only about a tenth of one pass over the training data.
# Increased per_device_train_batch_size to 32 so that more examples are processed in parallel, which can speed up training.
# Note: reducing the training time may hurt the performance of the model.





# Save the fine-tuned model and tokenizer to the mounted Google Drive, next to
# the input data, rather than to a path under a local macOS user directory.
# NOTE(review): the directory names say "distilbert", but the checkpoint
# loaded in this notebook is bert-base-uncased. The names are kept unchanged
# so that anything loading from them keeps working - consider renaming.
model.save_pretrained("/content/drive/MyDrive/HnM_BerT/pt_distilbert_model")
tokenizer.save_pretrained("/content/drive/MyDrive/HnM_BerT/pt_distilbert_tokenizer")



