In [None]:
import json
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizerFast, Trainer, TrainingArguments
from transformers import pipeline
from datasets import Dataset
import torch
from torch.quantization import quantize_dynamic
from datasets import load_metric
from sklearn.model_selection import train_test_split

# Directory that the (currently commented-out) training cell saves the model
# and tokenizer to.
pathToSaveModel = "./trained_model_1"

# JSON files with the question / context / answer records used for training
# and for the final test evaluation.
trainSet = "FinalDataset/train_How_has_this_trial_helped.json"
testSet = "FinalDataset/test_How_has_this_trial_helped.json"

In [None]:
# Pre-trained DistilBERT checkpoint shared by the tokenizer and the QA model.
# (A BERT alternative would pair BertForQuestionAnswering with BertTokenizerFast
# on 'bert-base-uncased'; those classes are not imported in this notebook.)
# NOTE(review): this is the base checkpoint, not a fine-tuned one -- confirm that
# evaluating it further down (with the training cell commented out) is intended.
checkpoint_name = 'distilbert-base-uncased'

tokenizer = DistilBertTokenizerFast.from_pretrained(checkpoint_name)
model = DistilBertForQuestionAnswering.from_pretrained(checkpoint_name)


In [None]:
def _load_json(path):
    """Read and parse one JSON file.

    The file is decoded explicitly as UTF-8 so the result does not depend on the
    platform's default text encoding.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Raw records of the training set and of the held-out test set.
train_data = _load_json(trainSet)
test_data = _load_json(testSet)

# Prepare the datasets in the correct format
def prepare_data(data):
    """Convert raw QA records into a `Dataset` with SQuAD-like columns.

    Args:
        data: Iterable of mappings that each provide 'question', 'context' and
            'answer'.

    Returns:
        `Dataset.from_dict` of three parallel columns: 'question', 'context' and
        'answer'. Every answer has the form
        {'text': [answer], 'answer_start': [index]}, where index is what
        `str.find` reports for the answer inside the context (-1 when the answer
        does not occur in it).
    """
    rows = [(record['question'], record['context'], record['answer']) for record in data]

    columns = {
        'question': [question for question, _, _ in rows],
        'context': [context for _, context, _ in rows],
        'answer': [
            {'text': [answer], 'answer_start': [context.find(answer)]}
            for _, context, answer in rows
        ],
    }
    return Dataset.from_dict(columns)

In [None]:
# Convert to Dataset objects
full_train_dataset = prepare_data(train_data)
test_dataset = prepare_data(test_data)

# Hold out 20% of the training examples for validation; the fixed random_state
# keeps the split identical between runs.
train_size = 0.8
train_indices, val_indices = train_test_split(
    list(range(len(full_train_dataset))),
    train_size=train_size,
    random_state=42,
)

train_dataset = full_train_dataset.select(train_indices)
eval_dataset = full_train_dataset.select(val_indices)

# `model` and `tokenizer` are created once in the model-loading cell near the top
# of the notebook. They are deliberately not reloaded here: a second
# from_pretrained() call only repeats the load and would throw away any
# fine-tuned weights if this cell were re-run after training.

# Preprocessing function
def _find_answer_span(offsets, sequence_ids, start_char, end_char):
    """Map the character span [start_char, end_char) of the context to token indices.

    Returns (start_token, end_token), or (0, 0) when the span is unusable: no
    start position, an empty span, no context tokens at all, or an answer that is
    not fully covered by the context tokens (for example because it was truncated).
    """
    if start_char is None or start_char < 0 or end_char <= start_char:
        return 0, 0
    if 1 not in sequence_ids:
        return 0, 0

    # First and last token that belong to the context (sequence id 1).
    context_start = sequence_ids.index(1)
    context_end = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

    # The answer must lie completely inside the tokenized context window.
    if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
        return 0, 0

    # Last context token that starts at or before the answer start.
    start_idx = context_start
    while start_idx <= context_end and offsets[start_idx][0] <= start_char:
        start_idx += 1

    # First context token (scanning from the right) that ends at or after the answer end.
    end_idx = context_end
    while end_idx >= context_start and offsets[end_idx][1] >= end_char:
        end_idx -= 1

    return start_idx - 1, end_idx + 1


def preprocess_function(examples):
    """Tokenize a batch of question/context pairs and attach answer-span labels.

    Uses the module-level `tokenizer`.

    Args:
        examples: Batch dict with parallel lists under 'question', 'context' and
            'answer'; each answer is {'text': [str], 'answer_start': [int]} as
            built by `prepare_data`.

    Returns:
        The tokenizer output (with 'offset_mapping' removed) extended with
        'start_positions' and 'end_positions', the token indices of the answer.
        Examples whose answer is missing (empty text, or answer_start < 0 because
        the answer was not found in the context) or is not fully contained in the
        possibly truncated context are labelled (0, 0), i.e. the first token of
        the sequence.
    """
    inputs = tokenizer(
        examples['question'],
        examples['context'],
        truncation=True,
        padding='max_length',
        max_length=384,
        return_offsets_mapping=True
    )

    # Offsets are only needed to derive the labels; they are not a model input.
    offset_mapping = inputs.pop("offset_mapping")
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        answer = examples["answer"][i]
        answer_text = answer["text"][0] or ""
        start_char = answer["answer_start"][0]
        # A missing start position has no meaningful end; the helper rejects it.
        end_char = None if start_char is None else start_char + len(answer_text)
        if end_char is None:
            start_token, end_token = 0, 0
        else:
            start_token, end_token = _find_answer_span(
                offsets, inputs.sequence_ids(i), start_char, end_char
            )
        start_positions.append(start_token)
        end_positions.append(end_token)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

def _tokenize_split(split):
    """Run `preprocess_function` over one split in batches, dropping the split's original columns."""
    return split.map(preprocess_function, batched=True, remove_columns=split.column_names)


# Tokenized, labelled versions of the train / validation / test splits.
tokenized_train_dataset = _tokenize_split(train_dataset)
tokenized_eval_dataset = _tokenize_split(eval_dataset)
tokenized_test_dataset = _tokenize_split(test_dataset)

In [None]:
# # Set training arguments
# training_args = TrainingArguments(
#     output_dir="./results",
#     evaluation_strategy="epoch",  # Evaluate at the end of each epoch
#     learning_rate=2e-5,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     num_train_epochs=3,
#     weight_decay=0.01,
#     save_strategy="epoch",
# )

# # Initialize Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_train_dataset,
#     eval_dataset=tokenized_eval_dataset,
#     tokenizer=tokenizer,
# )

# # Train the model
# trainer.train()

# # Save the model
# model.save_pretrained(pathToSaveModel)
# tokenizer.save_pretrained(pathToSaveModel)

In [None]:
# Final evaluation on the test dataset
def get_predictions(question, context):
    """Return the answer text the model extracts for one question/context pair.

    Uses the module-level `tokenizer` and `model`.

    Args:
        question: Question string.
        context: Passage the answer is extracted from.

    Returns:
        The decoded answer span, or "" when there are no context tokens, when the
        best end position precedes the best start position, or when the decoded
        span is blank.
    """
    # A single example needs no padding; truncation still caps the length at 384.
    inputs = tokenizer(
        question,
        context,
        truncation=True,
        max_length=384,
        return_tensors="pt"
    )

    # Only context tokens (sequence id 1) are legal answer positions; question
    # and special tokens must never be selected as span boundaries.
    context_mask = torch.tensor(
        [sequence_id == 1 for sequence_id in inputs.sequence_ids(0)],
        dtype=torch.bool,
    )
    if not context_mask.any():
        return ""

    # Make inference deterministic (no dropout) and run on the model's device.
    model.eval()
    model_inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**model_inputs)

    context_mask = context_mask.to(outputs.start_logits.device)
    start_logits = outputs.start_logits[0].masked_fill(~context_mask, float("-inf"))
    end_logits = outputs.end_logits[0].masked_fill(~context_mask, float("-inf"))

    start_index = torch.argmax(start_logits).item()
    end_index = torch.argmax(end_logits).item()

    if end_index < start_index:
        return ""

    tokens = inputs['input_ids'][0][start_index:end_index + 1]
    answer = tokenizer.decode(tokens, skip_special_tokens=True)

    return answer if answer.strip() else ""

# Generate predictions on the original test dataset (before tokenization).
# `predictions` and `references` are built in lockstep: an example either
# contributes one entry to both lists or is skipped entirely.
predictions = []
references = []

for i, example in enumerate(test_dataset):
    question = example['question']
    context = example['context']
    actual_answer = example['answer']['text'][0]

    # Validate the reference BEFORE predicting, so a missing or blank answer can
    # never leave a prediction without its matching reference.
    if actual_answer is None or not actual_answer.strip():
        print(f"Skipping example {i} due to missing answer.")
        continue

    pred = get_predictions(question, context)

    if pred == "":
        print(f"Prediction is empty for example {i}.")

    predictions.append({"id": str(i), "prediction_text": pred})
    references.append({
        "id": str(i),
        "answer": {
            "text": [actual_answer],
            "answer_start": [context.find(actual_answer)]
        }
    })

# Sanity checks: both lists describe the same examples in the same order.
assert len(predictions) == len(references)
assert all(p["id"] == r["id"] for p, r in zip(predictions, references))

empty_prediction_ids = [p["id"] for p in predictions if p["prediction_text"] == ""]
print(f"{len(empty_prediction_ids)} of {len(predictions)} predictions are empty.")


In [None]:
# Keep only the examples with a non-empty prediction. Each prediction is matched
# to its reference through the shared "id" rather than by list position, so the
# pairing stays correct even if the two lists differ in length.
# NOTE(review): dropping empty predictions removes the model's failures from the
# metrics below -- confirm that this is the intended evaluation protocol.
references_by_id = {ref["id"]: ref["answer"]["text"][0] for ref in references}

_predictions = []
_references = []
for pred in predictions:
    prediction_text = pred.get('prediction_text')
    if prediction_text != "" and pred["id"] in references_by_id:
        _predictions.append(prediction_text)
        _references.append(references_by_id[pred["id"]])

print(len(_predictions))

In [None]:
import numpy as np
import re
import string
from collections import Counter
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

def normalize_answer(s):
    """Canonicalise an answer string before comparing it.

    Steps, in order: lowercase, delete every character in `string.punctuation`,
    replace the standalone words a / an / the with a space, collapse all
    whitespace runs into single spaces and trim the ends.
    """
    lowered = s.lower()
    unpunctuated = lowered.translate(str.maketrans('', '', string.punctuation))
    without_articles = re.sub(r'\b(a|an|the)\b', ' ', unpunctuated)
    return ' '.join(without_articles.split())

def f1_score(prediction, ground_truth):
    """Token-level F1 between a prediction and a reference answer.

    Both strings are normalized with `normalize_answer` and split on whitespace.
    The overlap is the multiset intersection of the two token lists. Returns 0
    when no token is shared, otherwise the harmonic mean of precision and recall.
    """
    predicted = normalize_answer(prediction).split()
    expected = normalize_answer(ground_truth).split()

    overlap = sum((Counter(predicted) & Counter(expected)).values())
    if not overlap:
        return 0

    precision = 1.0 * overlap / len(predicted)
    recall = 1.0 * overlap / len(expected)
    return (2 * precision * recall) / (precision + recall)

def exact_match_score(prediction, ground_truth):
    """Return True when both strings are identical after `normalize_answer`."""
    normalized_prediction = normalize_answer(prediction)
    normalized_truth = normalize_answer(ground_truth)
    return normalized_prediction == normalized_truth

def bleu_score(prediction, references):
    """Smoothed sentence-level BLEU of a prediction against reference answers.

    Every string is normalized with `normalize_answer` and split on whitespace;
    the token lists go to NLTK's `sentence_bleu` with `SmoothingFunction().method4`
    as the smoothing function.
    """
    tokenized_references = []
    for ref in references:
        tokenized_references.append(normalize_answer(ref).split())

    hypothesis = normalize_answer(prediction).split()

    return sentence_bleu(
        tokenized_references,
        hypothesis,
        smoothing_function=SmoothingFunction().method4,
    )

# Lazily created scorer shared by every call. Building a RougeScorer (with its
# stemmer) once instead of once per prediction/reference pair avoids repeated setup.
_ROUGE_SCORER = None


def rouge_scores(prediction, reference):
    """Compute ROUGE-1, ROUGE-2 and ROUGE-L for one prediction/reference pair.

    Both strings are normalized with `normalize_answer`. The reference is passed
    as the scorer's first argument and the prediction as its second.

    Returns:
        Whatever `RougeScorer.score` returns: a mapping keyed by 'rouge1',
        'rouge2' and 'rougeL' (callers read `.fmeasure` from each value).
    """
    global _ROUGE_SCORER
    if _ROUGE_SCORER is None:
        _ROUGE_SCORER = rouge_scorer.RougeScorer(
            ['rouge1', 'rouge2', 'rougeL'], use_stemmer=True
        )
    return _ROUGE_SCORER.score(normalize_answer(reference), normalize_answer(prediction))

def calculate_metrics(predictions, references):
    """Average EM, F1, BLEU and ROUGE over prediction/reference pairs.

    Args:
        predictions: Sequence of predicted answer strings.
        references: Sequence of reference answer strings, aligned with
            `predictions` by position.

    Returns:
        Dict with the keys "Exact Match", "F1 Score", "BLEU Score", "ROUGE-1",
        "ROUGE-2" and "ROUGE-L", each a percentage (mean score * 100). The mean
        is taken over the pairs actually scored, i.e. the shorter of the two
        inputs. With no pairs at all every metric is 0.0.
    """
    totals = {
        "Exact Match": 0.0,
        "F1 Score": 0.0,
        "BLEU Score": 0.0,
        "ROUGE-1": 0.0,
        "ROUGE-2": 0.0,
        "ROUGE-L": 0.0,
    }

    n = 0
    for prediction, reference in zip(predictions, references):
        n += 1
        totals["Exact Match"] += exact_match_score(prediction, reference)
        totals["F1 Score"] += f1_score(prediction, reference)
        totals["BLEU Score"] += bleu_score(prediction, [reference])
        rouge = rouge_scores(prediction, reference)
        totals["ROUGE-1"] += rouge['rouge1'].fmeasure
        totals["ROUGE-2"] += rouge['rouge2'].fmeasure
        totals["ROUGE-L"] += rouge['rougeL'].fmeasure

    # Nothing to average (e.g. every prediction was filtered out): report zeros
    # instead of dividing by zero.
    if n == 0:
        return totals

    return {name: 100.0 * total / n for name, total in totals.items()}

# Score the retained predictions against their references and show the summary.
metrics = calculate_metrics(predictions=_predictions, references=_references)
print(metrics)
