In [15]:
import json

file_path = 'train-v2.0.json'
with open(file_path, 'r') as file:
    dataset = json.load(file)
clean_dataset = {}
for data in dataset['data']:
    clean_dataset[data['title']] = []
    for paragraph in data['paragraphs']:
        context_dict = {
            'context': paragraph['context'],
            'qas': []
        }
        for qa in paragraph['qas']:
            q_and_a = {
                'question': qa['question'],
                'answers': [answer['text'] for answer in qa['answers']],
                'is_impossible':bool(qa['is_impossible'])
            }
            context_dict['qas'].append(q_and_a)
        clean_dataset[data['title']].append(context_dict)

keys = list(clean_dataset.keys())
print(keys[0:10])

# def write_chunks_to_files(data):
#     for key in data:
#         filename = f'{key}.txt'.replace(' ', '_').replace('/', '_')  # Replace spaces and slashes with underscores
#         with open(filename, 'w') as file:
#             json.dump({key: clean_dataset[key]}, file, indent=4)

# write_chunks_to_files(keys)
print(json.dumps(clean_dataset['Frédéric_Chopin'],indent=4))

['Beyoncé', 'Frédéric_Chopin', 'Sino-Tibetan_relations_during_the_Ming_dynasty', 'IPod', 'The_Legend_of_Zelda:_Twilight_Princess', 'Spectre_(2015_film)', '2008_Sichuan_earthquake', 'New_York_City', 'To_Kill_a_Mockingbird', 'Solar_energy']
[
    {
        "context": "Fr\u00e9d\u00e9ric Fran\u00e7ois Chopin (/\u02c8\u0283o\u028ap\u00e6n/; French pronunciation: \u200b[f\u0281e.de.\u0281ik f\u0281\u0251\u0303.swa \u0283\u0254.p\u025b\u0303]; 22 February or 1 March 1810 \u2013 17 October 1849), born Fryderyk Franciszek Chopin,[n 1] was a Polish and French (by citizenship and birth of father) composer and a virtuoso pianist of the Romantic era, who wrote primarily for the solo piano. He gained and has maintained renown worldwide as one of the leading musicians of his era, whose \"poetic genius was based on a professional technique that was without equal in his generation.\" Chopin was born in what was then the Duchy of Warsaw, and grew up in Warsaw, which after 1815 became part of Congress P

In [27]:
total_samples=0
total_contexts=0
no_ans_ques=0
ans_ques=0
for key in list(clean_dataset.keys()):
    for item in clean_dataset[key]:
        total_contexts+=1
        for qa in item['qas']:
            if qa['is_impossible']:
                no_ans_ques+=1
            else:
                ans_ques+=1
            total_samples+=1
print(total_contexts,total_samples,no_ans_ques,ans_ques)

19035 130319 43498 86821


In [111]:
import random
import copy
original_dataset = []
for key in list(clean_dataset.keys()):
    data = clean_dataset[key]
    sample_size = len(data) // 3  
    sampled_data = random.sample(data, sample_size) 
    original_dataset += copy.deepcopy(sampled_data)

Data Augmentation

In [115]:
import json
import copy
from collections import defaultdict
from rank_bm25 import BM25Okapi
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
def get_bert_embedding(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.last_hidden_state.mean(dim=1)
    return embeddings
dataset_copy=copy.deepcopy(original_dataset)
# for key in list(clean_dataset.keys()):
#     dataset_copy=dataset_copy+copy.deepcopy(clean_dataset[key])
# dataset_copy = copy.deepcopy(clean_dataset['Beyoncé'])

contexts = [entry['context'] for entry in dataset_copy]
limited_contexts = contexts

tokenized_contexts = [context.split() for context in limited_contexts]
bm25 = BM25Okapi(tokenized_contexts)

similarity = {}

for i, entry in enumerate(dataset_copy):
    context_questions = entry['qas']
    question_relevance = defaultdict(list)
    bm25_scores_for_context = []
    context_embedding_i = get_bert_embedding(limited_contexts[i])

    for qa in context_questions:
        question = qa['question']
        question_tokens = question.split()
        bm25_scores = bm25.get_scores(question_tokens)

        bm25_scores[i] = 0
        
        most_relevant_context_idx = np.argmax(bm25_scores)
        
        question_relevance[most_relevant_context_idx].append((question, bm25_scores[most_relevant_context_idx]))
        
        bm25_scores_for_context.append(bm25_scores)

    context_scores = {}
    for context_idx, question_scores in question_relevance.items():
        avg_bm25_score = np.mean([score for _, score in question_scores])
        
        context_embedding_most_relevant = get_bert_embedding(limited_contexts[context_idx])

        cosine_sim = cosine_similarity(context_embedding_i.numpy(), context_embedding_most_relevant.numpy())[0][0]
        
        weighted_score = (2 * avg_bm25_score * cosine_sim) / (avg_bm25_score + cosine_sim)

        context_scores[context_idx] = weighted_score

    best_context_idx = max(context_scores, key=context_scores.get)
    best_context_text = limited_contexts[best_context_idx]

    valid_answers = 0
    for qa in context_questions:
        qa['is_impossible']=False
        answer_found = False
        for answer in qa['answers']:
            if answer.lower() in best_context_text.lower():
                answer_found = True
                valid_answers += 1
                break

        if not answer_found:
            qa['answers'] = ["no answer"]  
            qa['is_impossible']=True

    similarity[i] = {
        'cont1': limited_contexts[i],
        'bm25': {
            'top_related_context': best_context_text,
            'average_bm25_score': context_scores[best_context_idx],
            'qa': context_questions   
        }
    }






In [129]:
final_data = []
for i, entry in similarity.items():
    new_context = entry['bm25']['top_related_context']  
    qas = entry['bm25']['qa']  
    final_data.append({
        'context': new_context, 
        'qas': qas
    })

# print(json.dumps(final_data, indent=4))
# new_f_data=[]
# for i, entry in similarity.items():
#     new_context = entry['bm25']['top_related_context']  
#     qas = entry['bm25']['qa'] 
#     qss=[]
#     for j in qas:
#         qss.append(j['question'])
#     new_f_data.append({
#         'context': new_context, 
#         'qas': qss
#     })

File Path

In [None]:
file_path_augment = 'similarity_results_subset.txt'
file_path_original = 'original_results_subset.txt'

Write data into another file

In [134]:
with open(file_path_augment, 'w') as f:
    json.dump(final_data, f, indent=4)
with open(file_path_original, 'w') as f:
    json.dump(original_dataset, f, indent=4)

Read Data from file

In [4]:
with open(file_path_augment, 'r') as file:
    final_data = json.load(file)

with open(file_path_original, 'r') as file:
    original_dataset = json.load(file)
print(json.dumps(original_dataset[0:10],indent=4))

[
    {
        "context": "In July 2002, Beyonc\u00e9 continued her acting career playing Foxxy Cleopatra alongside Mike Myers in the comedy film, Austin Powers in Goldmember, which spent its first weekend atop the US box office and grossed $73 million. Beyonc\u00e9 released \"Work It Out\" as the lead single from its soundtrack album which entered the top ten in the UK, Norway, and Belgium. In 2003, Beyonc\u00e9 starred opposite Cuba Gooding, Jr., in the musical comedy The Fighting Temptations as Lilly, a single mother whom Gooding's character falls in love with. The film received mixed reviews from critics but grossed $30 million in the U.S. Beyonc\u00e9 released \"Fighting Temptation\" as the lead single from the film's soundtrack album, with Missy Elliott, MC Lyte, and Free which was also used to promote the film. Another of Beyonc\u00e9's contributions to the soundtrack, \"Summertime\", fared better on the US charts.",
        "qas": [
            {
                "question": "W

Evaluation matrix

In [6]:
from difflib import SequenceMatcher
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import unicodedata
import re

def normalize_text(text):
    text = unicodedata.normalize('NFC', text).lower().strip()
    text = re.sub(r'\s+', '', text)  # Remove all spaces
    return text
    
def is_correct_prediction(predicted_answer, actual_answers):
    predicted_answer = normalize_text(predicted_answer)
    actual_answers = [normalize_text(ans) for ans in actual_answers]
    
    if not predicted_answer:
        return False
    
    for ans in actual_answers:
        if predicted_answer in ans or ans in predicted_answer:
            return True
    
    return False

def evaluate(predictions):
    y_true = []
    y_pred = []
    
    for item in predictions:
        actual_answers = item['actual_answers']
        predicted_answer = item['predicted_answer']
        
        is_correct = is_correct_prediction(predicted_answer, actual_answers)
        y_true.append(1 if is_correct else 0)
        y_pred.append(1 if predicted_answer else 0)
    
    accuracy = accuracy_score(y_true, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='binary', zero_division=1)
    print(accuracy,precision,recall,f1)
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }



In [91]:
import torch
import random
import copy
from transformers import BertTokenizer, BertForQuestionAnswering
import json
import unicodedata
import re
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Reset model and free memory
def reset_model():
    global model, tokenizer
    del model
    torch.cuda.empty_cache()
    
    tokenizer = BertTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
    
    # Suppress warnings about unused weights
    model = BertForQuestionAnswering.from_pretrained(
        "bert-large-uncased-whole-word-masking-finetuned-squad",
        ignore_mismatched_sizes=True  # Ignore unused weights
    )

# Load pre-trained BERT model and tokenizer
reset_model()

# Use pre-processed dataset
predictions = []

def predict_answer(context, question):
    inputs = tokenizer(question, context, return_tensors='pt', truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    
    answer_start_scores, answer_end_scores = outputs.start_logits, outputs.end_logits
    answer_start = torch.argmax(answer_start_scores)
    answer_end = torch.argmax(answer_end_scores) + 1
    predicted_answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(inputs['input_ids'][0][answer_start:answer_end]))
    return predicted_answer

# Iterate through the sampled dataset and make predictions
for paragraph in original_dataset:
    context = paragraph['context']
    for qa in paragraph['qas']:
        question = qa['question']
        actual_answers = qa['answers']
        predicted_answer = predict_answer(context, question)
        predictions.append({
            'context': context,
            'question': question,
            'actual_answers': actual_answers,
            'predicted_answer': predicted_answer,
            'is_impossible': qa['is_impossible']
        })

# Save predictions to a JSON file
with open('bert_predictions.json', 'w') as file:
    json.dump(predictions, file, indent=4)


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. 

In [92]:
with open('bert_predictions.json', 'w') as file:
    json.dump(predictions, file, indent=4)
evaluation_results = evaluate(predictions)
print("Evaluation Results:", evaluation_results)
print("Predictions saved to bert_predictions.json")

Evaluation Results: {'accuracy': 0.657550140891762, 'precision': 0.6538120885697187, 'recall': 1.0, 'f1': 0.7906727651690597}
Predictions saved to bert_predictions.json


In [101]:
def process_and_evaluate():
    with open('bert_predictions.json', 'r') as file:
        predictions = json.load(file)
    
    answerable_questions = [p for p in predictions if not p['is_impossible']]
    unanswerable_questions = [p for p in predictions if p['is_impossible']]
    
    print("Evaluating answerable questions...")
    answerable_results = evaluate(answerable_questions)
    print("Answerable Questions Evaluation:", answerable_results)
    
    print("Evaluating unanswerable questions...")
    unanswerable_results = evaluate(unanswerable_questions)
    print("Unanswerable Questions Evaluation:", unanswerable_results)

process_and_evaluate()

Evaluating answerable questions...
Answerable Questions Evaluation: {'accuracy': 0.9703548912658497, 'precision': 0.9702664298401421, 'recall': 1.0, 'f1': 0.9849088581576907}
Evaluating unanswerable questions...
Unanswerable Questions Evaluation: {'accuracy': 0.02657712366935772, 'precision': 0.0, 'recall': 1.0, 'f1': 0.0}


In [9]:
from sklearn.model_selection import train_test_split

final_combined_data=final_data+original_dataset

original_first_half = original_dataset[:len(original_dataset) // 2]

final_first_half = final_data[:len(final_data) // 2]

final_data_train, final_data_test = train_test_split(final_first_half, test_size=0.2, random_state=42)

original_data_train, original_data_test = train_test_split(original_first_half, test_size=0.2, random_state=42)

combined_data_train = final_data_train + original_data_train

combined_data_test = final_data_test + original_data_test

In [35]:
# Print the sizes of train and test datasets
# print(f"Size of original_data_train: {len(original_data_train)}")
# print(f"Size of final_data_test: {len(final_data_test)}")
# print(f"Size of original_data_test: {len(original_data_test)}")

# Function to count "No Answer" and "Answered" questions
def count_answers(dataset):
    no_answer_count = 0
    total_questions = 0
    for item in dataset:
        qas = item['qas']
        for qa in qas:
            total_questions += 1
            if bool(qa['is_impossible']) == True:
                no_answer_count += 1
    answered_count = total_questions - no_answer_count
    return no_answer_count, answered_count, total_questions

print(f"Size of Original Dataset: {len(original_first_half)}")

no_answer_test_org, answered_test_org, total_questions = count_answers(original_first_half+final_first_half)
print(f"Combined Data Train - Total Questions: {total_questions}, No Answer: {no_answer_test_org}, Answered: {answered_test_org}")

no_answer_test_org, answered_test_org, total_questions = count_answers(original_data_test)

no_answer_test_fin, answered_test_fin, total_questions = count_answers(final_data_test)

print(f"Final Data Test - No Answer: {no_answer_test_fin}, Answered: {answered_test_fin}")
print(f"Original Data Test - No Answer: {no_answer_test_org}, Answered: {answered_test_org}")


Size of Original Dataset: 3095
Combined Data Train - Total Questions: 40038, No Answer: 24955, Answered: 15083
Final Data Test - No Answer: 3783, Answered: 167
Original Data Test - No Answer: 1183, Answered: 2767


In [10]:
print(json.dumps(original_data_test[:10],indent=4))

[
    {
        "context": "The Early Cretaceous spans from 145 million to 100 million years ago. The Early Cretaceous saw the expansion of seaways, and as a result, the decline and extinction of sauropods (except in South America). Many coastal shallows were created, and that caused Ichthyosaurs to die out. Mosasaurs evolved to replace them as head of the seas. Some island-hopping dinosaurs, like Eustreptospondylus, evolved to cope with the coastal shallows and small islands of ancient Europe. Other dinosaurs rose up to fill the empty space that the Jurassic-Cretaceous extinction left behind, such as Carcharodontosaurus and Spinosaurus. Of the most successful would be the Iguanodon which spread to every continent. Seasons came back into effect and the poles got seasonally colder, but dinosaurs still inhabited this area like the Leaellynasaura which inhabited the polar forests year-round, and many dinosaurs migrated there during summer like Muttaburrasaurus. Since it was too cold for c

In [12]:
from datasets import Dataset, load_dataset, concatenate_datasets

train_dataset = Dataset.from_dict({
    'context': [d['context'] for d in combined_data_train],
    'qas': [d['qas'] for d in combined_data_train]
})

test_dataset = Dataset.from_dict({
    'context': [d['context'] for d in combined_data_test],
    'qas': [d['qas'] for d in combined_data_test]
})

  from .autonotebook import tqdm as notebook_tqdm


Fine Tuning

In [17]:
import json
import torch
from transformers import BertTokenizerFast, BertForQuestionAnswering, Trainer, TrainingArguments
from transformers import DefaultDataCollator
from datasets import Dataset, load_dataset, concatenate_datasets
import evaluate
import numpy as np

# Use a smaller model to reduce memory usage
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')

def preprocess_data(examples):
    contexts = examples['context']
    qas_list = examples['qas']
    questions = []
    all_contexts = []
    answers = []

    for i in range(len(contexts)):
        context = contexts[i]
        qas = qas_list[i]
        for qa in qas:
            question = qa['question']
            answer_text = qa['answers'][0] if qa['answers'] else "no answer"
            questions.append(question)
            all_contexts.append(context)
            if answer_text != "no answer" and answer_text in context:
                answer_start = context.find(answer_text)
                answers.append({"text": answer_text, "answer_start": answer_start})
            else:
                answers.append({"text": "", "answer_start": 0})

    tokenized_data = tokenizer(
        questions,
        all_contexts,
        truncation=True,
        padding='max_length',
        max_length=512,
        return_offsets_mapping=True,
        return_tensors='pt'
    )

    start_positions = []
    end_positions = []

    for i, answer in enumerate(answers):
        if answer['text'] == "":
            start_positions.append(0)
            end_positions.append(0)
        else:
            start_char = answer['answer_start']
            end_char = start_char + len(answer['text'])
            offsets = tokenized_data['offset_mapping'][i].numpy()

            start_token, end_token = None, None
            for j, (offset_start, offset_end) in enumerate(offsets):
                if offset_start <= start_char < offset_end:
                    start_token = j
                if offset_start < end_char <= offset_end:
                    end_token = j
                    break

            if start_token is None:
                start_token = tokenizer.model_max_length
            if end_token is None:
                end_token = tokenizer.model_max_length

            start_positions.append(start_token)
            end_positions.append(end_token)

    tokenized_data = {key: value.numpy().tolist() for key, value in tokenized_data.items()}
    tokenized_data['start_positions'] = start_positions
    tokenized_data['end_positions'] = end_positions

    return tokenized_data

# Create the dataset from the provided data
# dataset = Dataset.from_dict({
#     'context': [d['context'] for d in final_combined_data],
#     'qas': [d['qas'] for d in final_combined_data]
# })

# Split the dataset into training and testing sets
# train_test_split = dataset.train_test_split(test_size=0.2)
# train_dataset = train_test_split['train']
# test_dataset = train_test_split['test']

# Preprocess the datasets
processed_train_dataset = train_dataset.map(preprocess_data, batched=True, remove_columns=['context', 'qas'])
processed_test_dataset = test_dataset.map(preprocess_data, batched=True, remove_columns=['context', 'qas'])

# Load metrics for evaluation
metric_accuracy = evaluate.load("accuracy")
metric_f1 = evaluate.load("f1")
metric_recall = evaluate.load("recall")

def compute_metrics(p):
    start_preds = np.argmax(p.predictions[0], axis=1)
    end_preds = np.argmax(p.predictions[1], axis=1)
    start_true = p.label_ids[0]
    end_true = p.label_ids[1]

    start_acc = metric_accuracy.compute(predictions=start_preds, references=start_true)
    end_acc = metric_accuracy.compute(predictions=end_preds, references=end_true)

    f1_start = metric_f1.compute(predictions=start_preds, references=start_true, average='weighted')
    f1_end = metric_f1.compute(predictions=end_preds, references=end_true, average='weighted')

    recall_start = metric_recall.compute(predictions=start_preds, references=start_true, average='weighted')
    recall_end = metric_recall.compute(predictions=end_preds, references=end_true, average='weighted')

    f1_score = (f1_start['f1'] + f1_end['f1']) / 2
    recall = (recall_start['recall'] + recall_end['recall']) / 2
    accuracy = (start_acc['accuracy'] + end_acc['accuracy']) / 2

    return {
        'accuracy': accuracy,
        'f1': f1_score,
        'recall': recall
    }


training_args = TrainingArguments(
    output_dir="finetune-BERT-squad",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,  
    per_device_eval_batch_size=4,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='./logs',
    gradient_accumulation_steps=2,
    save_total_limit=2,
    load_best_model_at_end=True,
    gradient_checkpointing=True,  # Enable gradient checkpointing
    fp16=False,  # Disable mixed precision to avoid memory issues
    no_cuda=True  # Force the training to use CPU if necessary
)

data_collator = DefaultDataCollator()

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=concatenate_datasets([processed_train_dataset, processed_test_dataset]),
    eval_dataset=processed_test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

# Evaluate the model
metrics = trainer.evaluate()
print(metrics)


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████████████████████| 4952/4952 [00:36<00:00, 137.40 examples/s]
Map: 100%|██████████████████████████| 1238/1238 [00:08<00:00, 148.31 examples/s]
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1,Recall
1,1.0294,0.795998,0.798101,0.782454,0.798101


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.795998215675354, 'eval_accuracy': 0.7981012658227848, 'eval_f1': 0.7824542352532943, 'eval_recall': 0.7981012658227848, 'eval_runtime': 3024.2122, 'eval_samples_per_second': 2.612, 'eval_steps_per_second': 0.653, 'epoch': 1.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [148]:
pip install --upgrade torch transformers datasets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting transformers
  Downloading transformers-4.48.2-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m922.9 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.0-cp39-abi3-macosx_10_12_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.48.2-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hDownloading tokenizers-0.21.0-cp39-abi3-macosx_10_12_x86_64.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m24.4 M

Test Trained Model on old dataset

In [84]:
import torch
import json

def evaluate_on_new_data(model, tokenizer, dataset):
    # Use the fine-tuned model for prediction on the new dataset (e.g., Beyoncé data)
    # beyonce_data = dataset['Beyoncé']
    beyonce_data = dataset
    predictions = []

    # Make predictions for each question in the dataset
    for entry in beyonce_data:
        context = entry['context']
        context_questions = entry['qas']

        for qa in context_questions:
            question = qa['question']
            actual_answers = qa['answers']

            # Tokenize the input context and question
            inputs = tokenizer.encode_plus(
                question, 
                context, 
                return_tensors='pt', 
                max_length=512, 
                truncation=True, 
                padding='max_length'
            )

            # Perform inference with the fine-tuned model
            with torch.no_grad():
                outputs = model(**inputs)
                start_logits = outputs.start_logits
                end_logits = outputs.end_logits

            # Get the best start and end positions
            start_index = torch.argmax(start_logits)
            end_index = torch.argmax(end_logits) + 1

            # Convert token indices back to a string
            input_ids = inputs['input_ids'][0]
            predicted_answer = tokenizer.convert_tokens_to_string(
                tokenizer.convert_ids_to_tokens(input_ids[start_index:end_index])
            ).strip()

            # If predicted answer is empty or contains special tokens, mark it as "no answer"
            if not predicted_answer or '[CLS]' in predicted_answer or '[SEP]' in predicted_answer:
                predicted_answer = "no answer"

            # Store the prediction results
            predictions.append({
                'context': context,
                'question': question,
                'actual_answers': actual_answers,
                'predicted_answer': predicted_answer
            })

    # Evaluate predictions using the provided calculate_metrics function
    # accuracy, precision, recall, f1 = calculate_metrics(predictions)
    return predictions

    # print(json.dumps(predictions[:5], indent=4))

original_predictions = evaluate_on_new_data(model, tokenizer, original_data_test)

metrics = evaluate(original_predictions)

accuracy = metrics["accuracy"]*100
precision = metrics["precision"]*100
recall = metrics["recall"]*100
f1 = metrics["f1"]*100

print(f"Accuracy: {float(accuracy):.2f}")
print(f"Precision: {float(precision):.2f}")
print(f"Recall: {float(recall):.2f}")
print(f"F1 Score: {float(f1):.2f}")


0.4835443037974684 0.4835443037974684 1.0 0.6518771331058021
Accuracy: 48.35
Precision: 48.35
Recall: 100.00
F1 Score: 65.19


Test Trained Model on new dataset

In [86]:
final_predictions = evaluate_on_new_data(model, tokenizer, final_data_test)
metrics = evaluate(final_predictions)

accuracy = metrics["accuracy"]*100
precision = metrics["precision"]*100
recall = metrics["recall"]*100
f1 = metrics["f1"]*100

print(f"Accuracy: {float(accuracy):.2f}")
print(f"Precision: {float(precision):.2f}")
print(f"Recall: {float(recall):.2f}")
print(f"F1 Score: {float(f1):.2f}")

0.9529113924050633 0.9529113924050633 1.0 0.9758879958516982
Accuracy: 95.29
Precision: 95.29
Recall: 100.00
F1 Score: 97.59
