In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
import nltk
from nltk.tokenize import sent_tokenize
import re
from tqdm import tqdm
import json
import torch
from transformers import (
    DistilBertTokenizer,
    DistilBertTokenizerFast,
    DistilBertForQuestionAnswering,
    TrainingArguments,
    Trainer,
)
from transformers import BertTokenizer, BertForQuestionAnswering
import wandb

In [None]:
# Mount Google Drive at /content/drive; the dataset JSON files and the saved
# fine-tuned model used by later cells are read from paths under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Train Data Cleaning

# Flatten the answerable questions of the SQuAD v2.0 training file into one row
# per (question, context) pair, keep the result in `train`, and cache it as CSV.
# NOTE: both handlers below only print, so after a failure `train` is left
# undefined and later cells that use it will raise NameError.
try:
    dataset_path = '/content/drive/MyDrive/10701_Project/train-v2.0.json'
    # JSON text is UTF-8 by specification; decode it explicitly instead of
    # relying on the platform's default locale encoding.
    with open(dataset_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    records = []
    for article in data["data"]:
        for p in article["paragraphs"]:
            # Collapse every run of whitespace to a single space.
            context = re.sub(r"\s+", " ", p["context"]).strip()
            for qa in p["qas"]:
                if qa.get("is_impossible"):
                    continue  # skip unanswerable
                for ans in qa.get("answers", []):
                    records.append({
                        "title": article["title"],
                        "context": context,
                        "question": re.sub(r"\s+", " ", qa["question"]).strip(),
                        "answer_text": ans["text"].strip(),
                        # NOTE(review): copied unchanged from the JSON although
                        # `context` was whitespace-normalized above, so this offset
                        # may no longer index the cleaned context -- verify before
                        # using it as a span label.
                        "answer_start": ans["answer_start"]
                    })

    # drop_duplicates keeps the first answer recorded for each (question, context).
    train = pd.DataFrame(records).drop_duplicates(subset=["question", "context"])
    train.to_csv("cleaned_squad_train.csv", index=False)
except FileNotFoundError:
    print(f"Error: Dataset not found at {dataset_path}. Please check the path.")
except Exception as e:
    print(f"An error occurred while loading the dataset: {e}")

In [None]:
#Test Data Cleaning

# Flatten the answerable questions of the SQuAD v2.0 dev file into one row per
# (question, context) pair, keep the result in `test`, and cache it as CSV.
# NOTE: both handlers below only print, so after a failure `test` is left
# undefined and later cells that use it will raise NameError.
try:
    dataset_path = '/content/drive/MyDrive/10701_Project/dev-v2.0.json'
    # JSON text is UTF-8 by specification; decode it explicitly instead of
    # relying on the platform's default locale encoding.
    with open(dataset_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    records = []
    for article in data["data"]:
        for p in article["paragraphs"]:
            # Collapse every run of whitespace to a single space.
            context = re.sub(r"\s+", " ", p["context"]).strip()
            for qa in p["qas"]:
                if qa.get("is_impossible"):
                    continue  # skip unanswerable
                for ans in qa.get("answers", []):
                    records.append({
                        "title": article["title"],
                        "context": context,
                        "question": re.sub(r"\s+", " ", qa["question"]).strip(),
                        "answer_text": ans["text"].strip(),
                        # NOTE(review): copied unchanged from the JSON although
                        # `context` was whitespace-normalized above, so this offset
                        # may no longer index the cleaned context -- verify before
                        # using it as a span label.
                        "answer_start": ans["answer_start"]
                    })

    # drop_duplicates keeps the first answer recorded for each (question, context).
    test = pd.DataFrame(records).drop_duplicates(subset=["question", "context"])
    test.to_csv("cleaned_squad_test.csv", index=False)
except FileNotFoundError:
    print(f"Error: Dataset not found at {dataset_path}. Please check the path.")
except Exception as e:
    print(f"An error occurred while loading the dataset: {e}")

# Task 1 - Sentence-level Retrieval

To compare different retrieval methods, we evaluate their ability to select the sentence in the context that contains the ground-truth answer span.
This task does not require the model to predict the exact start and end positions; instead, we only check whether the method can pick the correct sentence.

Given a question and its context, we first split the context into sentences.
Each retrieval method then assigns a score to every sentence:

- TF-IDF computes cosine distance between the question vector and each sentence vector.

- BM25 computes relevance scores based on term frequency.

- Sentence Transformer encodes the question and each sentence into fixed-length vectors and ranks sentences by cosine similarity.

- BERT-based QA models assign a score to each sentence by combining the maximum start and end logits from the QA head. After completing Task 2, we also compare the sentence-level retrieval performance of these models before and after fine-tuning.

The sentence with the highest score (or, for the distance-based TF-IDF retriever, the smallest distance) is selected.
A prediction is counted as correct if the selected sentence contains the ground-truth answer text.

This gives a sentence-level accuracy that measures how well the method can locate the relevant sentence before span extraction.


## TF-IDF Retriever

In [None]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_distances
import unicodedata

# nltk.download returns False when the resource could not be fetched, so only
# report success when the download actually succeeded.
if nltk.download('punkt_tab'):
    print("NLTK 'punkt_tab' resource downloaded.")
else:
    print("Failed to download NLTK 'punkt_tab' resource.")

NLTK 'punkt_tab' resource downloaded.


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
def normalize_text(text):
    """Return `text` lowercased and reduced to plain ASCII.

    The string is lowercased, decomposed with Unicode NFKD so that accented
    letters split into a base letter plus combining marks, and every character
    that is not ASCII is then dropped.

    Args:
        text (str): Text to normalize.

    Returns:
        str: Lowercase, ASCII-only version of `text`.
    """
    decomposed = unicodedata.normalize('NFKD', text.lower())
    ascii_text = decomposed.encode('ascii', 'ignore').decode('ascii')
    # Lowercase once more: some uncased compatibility characters only turn into
    # uppercase ASCII letters during NFKD (for example the trade mark sign
    # becomes "TM"), so the first lower() cannot catch them.
    return ascii_text.lower()


def retrieve_tfidf(question, context, vectorizer):
    """Pick the context sentence closest to the question in TF-IDF space.

    Args:
        question (str): Question text.
        context (str): Paragraph that is split into sentences with sent_tokenize.
        vectorizer: Vectorizer exposing `transform`; it must already be fitted,
            because only `transform` is called here.

    Returns:
        tuple: (best_sentence, cosine_distance, sentence_index). When the
        context yields no sentences the result is ("", inf, -1).
    """
    sentences = sent_tokenize(context)
    if not sentences:
        # Nothing to rank: return a sentinel instead of failing on an empty matrix.
        return "", float('inf'), -1

    corpus = [question] + sentences
    tfidf_matrix = vectorizer.transform(corpus)

    # Row 0 is the question; the remaining rows are the candidate sentences.
    question_vec = tfidf_matrix[0]
    sentence_vecs = tfidf_matrix[1:]

    distances = cosine_distances(question_vec, sentence_vecs).flatten()
    # Plain int index, matching what the other retrievers in this notebook return.
    min_idx = int(np.argmin(distances))

    return sentences[min_idx], distances[min_idx], min_idx


def evaluate_tfidf(df, vectorizer):
    """Score TF-IDF sentence retrieval over every row of `df`.

    Each question, context and answer is passed through normalize_text, the
    best sentence is chosen with retrieve_tfidf, and a row counts as a hit when
    the answer text occurs inside the retrieved sentence.

    Args:
        df (pd.DataFrame): Frame with 'question', 'context' and 'answer_text' columns.
        vectorizer: Fitted vectorizer forwarded to retrieve_tfidf.

    Returns:
        pd.DataFrame: One row per example, including the 'answer_found' flag.
    """
    total = len(df)
    rows = []

    for seen, (_, example) in enumerate(df.iterrows(), start=1):
        question = normalize_text(example['question'])
        passage = normalize_text(example['context'])
        answer = normalize_text(example['answer_text'])

        sentence, dist, position = retrieve_tfidf(question, passage, vectorizer)

        rows.append({
            'question': question,
            'context': passage,
            'answer_text': answer,
            'retrieved_sentence': sentence,
            'distance': dist,
            'sentence_index': position,
            # Case-insensitive substring match of the answer in the sentence.
            'answer_found': answer.lower() in sentence.lower()
        })

        # Progress line every 500 examples.
        if seen % 500 == 0:
            print(f"Processed {seen}/{total} examples...")

    results_df = pd.DataFrame(rows)
    print(f"\nTF-IDF Sentence Retrieval Accuracy: {results_df['answer_found'].mean():.2%}")

    return results_df



In [None]:
#TF-IDF fit
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize

# Fit the TF-IDF vocabulary and IDF weights on sentences from the training contexts.
vectorizer = TfidfVectorizer(stop_words='english')

# evaluate_tfidf() runs normalize_text() on every question and context before
# calling vectorizer.transform, so fit on text normalized the same way;
# otherwise accented tokens learned here could never match at query time.
# NOTE(review): `train` has one row per question, so a context shared by several
# questions contributes its sentences several times to the IDF statistics --
# confirm whether that weighting is intended.
train_sentences = []
for context in train["context"]:
    train_sentences.extend(sent_tokenize(normalize_text(context)))

#fit on training data
vectorizer.fit(train_sentences)

In [None]:
#evaluation

import time

# Wall-clock timing of the full TF-IDF evaluation on the cleaned dev set (`test`).
start = time.time()

# Prints progress and the accuracy; per-example results are kept in `results_df`.
results_df = evaluate_tfidf(test, vectorizer)

end = time.time()

print(f"TF-IDF Evaluation Time: {end - start:.2f} seconds")



# results_df = evaluate_tfidf(test, vectorizer)
# # results_df.to_csv("tfidf_results_full_test.csv", index=False)

Processed 500/5923 examples...
Processed 1000/5923 examples...
Processed 1500/5923 examples...
Processed 2000/5923 examples...
Processed 2500/5923 examples...
Processed 3000/5923 examples...
Processed 3500/5923 examples...
Processed 4000/5923 examples...
Processed 4500/5923 examples...
Processed 5000/5923 examples...
Processed 5500/5923 examples...

TF-IDF Sentence Retrieval Accuracy: 75.25%
TF-IDF Evaluation Time: 9.19 seconds


In [None]:
# model = TfidfVectorizer(stop_words='english')

# print("\nEvaluating")
# results_df = evaluate_tfidf(df, sample_size=500, random_state=42)

# output_file = "TFIDF_retrieval_results.csv"
# results_df.to_csv(output_file, index=False)


# print(f"Total examples evaluated: {len(results_df)}")
# print(f"Examples with answer found: {results_df['answer_found'].sum()}")
# print(f"Retrieval accuracy: {results_df['answer_found'].mean():.2%}")
# print(f"Average distance: {results_df['distance'].mean():.4f}")
# print(f"Median distance: {results_df['distance'].median():.4f}")
# print(f"Min distance: {results_df['distance'].min():.4f}")
# print(f"Max distance: {results_df['distance'].max():.4f}")

## BM25

In [None]:
!pip install rank-bm25

Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2


In [None]:
from rank_bm25 import BM25Okapi
from nltk.tokenize import sent_tokenize, word_tokenize

def retrieve_bm25(question, context):
    """Pick the context sentence with the highest BM25 score for the question.

    A BM25Okapi index is built over the sentences of this one context only.

    Args:
        question (str): Question text.
        context (str): Paragraph that is split into sentences with sent_tokenize.

    Returns:
        tuple: (best_sentence, negated_score, sentence_index). The score is
        negated so that, like a distance, smaller means a better match. When
        the context yields no sentences the result is ("", inf, -1).
    """
    #Split context into sentences
    sentences = sent_tokenize(context)
    if not sentences:
        # BM25Okapi cannot be built from an empty corpus; return a sentinel instead.
        return "", float('inf'), -1

    #Tokenize sentences
    tokenized_corpus = [word_tokenize(s.lower()) for s in sentences]

    #Build BM25 index
    bm25 = BM25Okapi(tokenized_corpus)

    #Tokenize question
    query_tokens = word_tokenize(question.lower())

    #Compute BM25 scores
    scores = bm25.get_scores(query_tokens)

    #Find the best matching sentence
    best_idx = int(np.argmax(scores))
    best_sentence = sentences[best_idx]
    best_score = scores[best_idx]

    return best_sentence, -best_score, best_idx

def evaluate_bm25(df):
    """Score BM25 sentence retrieval over every row of `df`.

    Each question, context and answer is passed through normalize_text, the
    best sentence is chosen with retrieve_bm25, and a row counts as a hit when
    the answer text occurs inside the retrieved sentence.

    Args:
        df (pd.DataFrame): Frame with 'question', 'context' and 'answer_text' columns.

    Returns:
        pd.DataFrame: One row per example, including the 'answer_found' flag.
    """
    total = len(df)
    rows = []

    for seen, (_, example) in enumerate(df.iterrows(), start=1):
        question = normalize_text(example['question'])
        passage = normalize_text(example['context'])
        answer = normalize_text(example['answer_text'])

        sentence, dist, position = retrieve_bm25(question, passage)

        rows.append({
            'question': question,
            'context': passage,
            'answer_text': answer,
            'retrieved_sentence': sentence,
            'distance': dist,
            'sentence_index': position,
            # Case-insensitive substring match of the answer in the sentence.
            'answer_found': answer.lower() in sentence.lower()
        })

        # Progress line every 500 examples.
        if seen % 500 == 0:
            print(f"Processed {seen}/{total} examples...")

    results_df = pd.DataFrame(rows)
    print(f"\nBM25 Sentence Retrieval Accuracy: {results_df['answer_found'].mean():.2%}")

    return results_df


# def evaluate_bm25(df, sample_size=None, random_state=42):
#     if sample_size:
#         df = df.sample(n=min(sample_size, len(df)), random_state=random_state)

#     results = []

#     for count, (idx, row) in enumerate(df.iterrows(), 1):
#         Q = normalize_text(row['question'])
#         C = normalize_text(row['context'])
#         A = normalize_text(row['answer_text'])

#         best_sentence, distance, sent_idx = retrieve_bm25(Q, C)
#         found = A.lower() in best_sentence.lower() if best_sentence else False

#         results.append({
#             'question': Q,
#             'context': C,
#             'answer_text': A,
#             'retrieved_sentence': best_sentence,
#             'distance': distance,
#             'sentence_index': sent_idx,
#             'answer_found': found
#         })

#         if count % 100 == 0:
#             print(f"Processed {count}/{len(df)} examples...")

#     results_df = pd.DataFrame(results)
#     accuracy = results_df['answer_found'].mean()
#     print(f"\nBM25 Retrieval Accuracy: {accuracy:.2%}")

#     return results_df



In [None]:
# Wall-clock timing of the full BM25 evaluation on the cleaned dev set (`test`).
# `time` is imported in the TF-IDF evaluation cell above.
start = time.time()

# Prints progress and the accuracy; per-example results are kept in `bm25_results`.
bm25_results = evaluate_bm25(test)

end = time.time()

print(f"BM25 Evaluation Time: {end - start:.2f} seconds")


# print(f"Total examples evaluated: {len(results_bm25)}")
# print(f"Examples with answer found: {results_bm25['answer_found'].sum()}")
# print(f"Retrieval accuracy: {results_bm25['answer_found'].mean():.2%}")
# print(f"Average distance: {results_bm25['distance'].mean():.4f}")
# print(f"Median distance: {results_bm25['distance'].median():.4f}")
# print(f"Min distance: {results_bm25['distance'].min():.4f}")
# print(f"Max distance: {results_bm25['distance'].max():.4f}")

# results_bm25.to_csv("bm25_retrieval_results.csv", index=False)
# print("Results saved to bm25_retrieval_results.csv")


Processed 500/5923 examples...
Processed 1000/5923 examples...
Processed 1500/5923 examples...
Processed 2000/5923 examples...
Processed 2500/5923 examples...
Processed 3000/5923 examples...
Processed 3500/5923 examples...
Processed 4000/5923 examples...
Processed 4500/5923 examples...
Processed 5000/5923 examples...
Processed 5500/5923 examples...

BM25 Sentence Retrieval Accuracy: 78.73%
BM25 Evaluation Time: 6.45 seconds


In [None]:
# def evaluate_bert_sentence(df, model, tokenizer, sample_size=None, random_state=42):
#     """
#     Evaluate sentence-level retrieval using a fine-tuned BERT/ALBERT model.
#     df: DataFrame with columns 'question', 'context', 'answer_text'
#     model: HuggingFace QA model
#     tokenizer: corresponding tokenizer
#     """
#     if sample_size:
#         df = df.sample(n=min(sample_size, len(df)), random_state=random_state)

#     results = []

#     for count, (idx, row) in enumerate(df.iterrows(), 1):
#         Q = normalize_text(row['question'])
#         C = normalize_text(row['context'])
#         A = normalize_text(row['answer_text'])


#         best_sentence, score, sent_idx = retrieve_sentence_with_bert(model, tokenizer, Q, C)


#         found = A.lower() in best_sentence.lower() if best_sentence else False

#         results.append({
#             'question': Q,
#             'context': C,
#             'answer_text': A,
#             'retrieved_sentence': best_sentence,
#             'score': score,
#             'sentence_index': sent_idx,
#             'answer_found': found
#         })

#         if count % 100 == 0:
#             print(f"Processed {count}/{len(df)} examples...")

#     results_df = pd.DataFrame(results)
#     accuracy = results_df['answer_found'].mean()
#     print(f"\nBERT Sentence Retrieval Accuracy: {accuracy:.2%}")

#     return results_df

## Sentence Transformer (all-MiniLM-L6-v2)

In [None]:
from sentence_transformers import SentenceTransformer, util

# Encoder for this section; later cells rebind `st_model` to other checkpoints.
st_model = SentenceTransformer('all-MiniLM-L6-v2')


def retrieve_sentence_st(model, question, context):
    """Pick the context sentence whose embedding is most similar to the question.

    Args:
        model: Encoder exposing `encode(..., convert_to_tensor=True)`.
        question (str): Question text.
        context (str): Paragraph that is split into sentences with sent_tokenize.

    Returns:
        tuple: (best_sentence, cosine_similarity, sentence_index). When the
        context yields no sentences the result is ("", -inf, -1).
    """
    sentences = sent_tokenize(context)
    if not sentences:
        # Nothing to encode or rank: return a sentinel instead of failing below.
        return "", float('-inf'), -1

    # encode question and sentences
    q_emb = model.encode(question, convert_to_tensor=True)
    s_embs = model.encode(sentences, convert_to_tensor=True)

    # compute similarity; row 0 holds the question-vs-sentence scores
    scores = util.cos_sim(q_emb, s_embs)[0]

    best_idx = int(scores.argmax())
    best_sentence = sentences[best_idx]
    best_score = float(scores[best_idx])

    return best_sentence, best_score, best_idx

def evaluate_sentence_transformer(df, model):
    """Score sentence-embedding retrieval over every row of `df`.

    Each question, context and answer is passed through normalize_text, the
    best sentence is chosen with retrieve_sentence_st, and a row counts as a
    hit when the answer text occurs inside the retrieved sentence.

    Args:
        df (pd.DataFrame): Frame with 'question', 'context' and 'answer_text' columns.
        model: Encoder forwarded to retrieve_sentence_st.

    Returns:
        pd.DataFrame: One row per example, including the 'answer_found' flag.
    """
    total = len(df)
    rows = []

    for seen, (_, example) in enumerate(df.iterrows(), start=1):
        question = normalize_text(example["question"])
        passage = normalize_text(example["context"])
        answer = normalize_text(example["answer_text"])

        sentence, similarity, position = retrieve_sentence_st(model, question, passage)

        rows.append({
            "question": question,
            "retrieved_sentence": sentence,
            "answer_text": answer,
            "score": similarity,
            # Case-insensitive substring match of the answer in the sentence.
            "answer_found": answer.lower() in sentence.lower(),
            "sentence_index": position
        })

        # Progress line every 500 examples.
        if seen % 500 == 0:
            print(f"Processed {seen}/{total} examples...")

    df_out = pd.DataFrame(rows)
    print(f"\nSentence Transformer Retrieval Accuracy: {df_out['answer_found'].mean():.2%}")
    return df_out

# Wall-clock timing of the all-MiniLM-L6-v2 evaluation on the cleaned dev set (`test`).
start = time.time()
st_results    = evaluate_sentence_transformer(test, st_model)
end = time.time()
print(f"Sentence Transformer (all-MiniLM-L6-v2) Evaluation Time: {end - start:.2f} seconds")

Processed 500/5923 examples...
Processed 1000/5923 examples...
Processed 1500/5923 examples...
Processed 2000/5923 examples...
Processed 2500/5923 examples...
Processed 3000/5923 examples...
Processed 3500/5923 examples...
Processed 4000/5923 examples...
Processed 4500/5923 examples...
Processed 5000/5923 examples...
Processed 5500/5923 examples...

Sentence Transformer Retrieval Accuracy: 78.86%
Sentence Transformer (all-MiniLM-L6-v2) Evaluation Time: 80.82 seconds


## Sentence Transformer ('all-mpnet-base-v2')

In [None]:
from sentence_transformers import SentenceTransformer, util

# Rebind `st_model` to the all-mpnet-base-v2 checkpoint for this section.
st_model = SentenceTransformer('all-mpnet-base-v2')


# Wall-clock timing of the evaluation; `st_results` from the previous section is overwritten.
start = time.time()
st_results    = evaluate_sentence_transformer(test, st_model)
end = time.time()
print(f"Sentence Transformer (all-mpnet-base-v2) Evaluation Time: {end - start:.2f} seconds")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Processed 500/5923 examples...
Processed 1000/5923 examples...
Processed 1500/5923 examples...
Processed 2000/5923 examples...
Processed 2500/5923 examples...
Processed 3000/5923 examples...
Processed 3500/5923 examples...
Processed 4000/5923 examples...
Processed 4500/5923 examples...
Processed 5000/5923 examples...
Processed 5500/5923 examples...

Sentence Transformer Retrieval Accuracy: 80.35%
Sentence Transformer (all-MiniLM-L6-v2) Evaluation Time: 151.66 seconds


## Sentence Transformer ('multi-qa-MiniLM-L6-cos-v1')

In [None]:
# Rebind `st_model` to the multi-qa-MiniLM-L6-cos-v1 checkpoint for this section.
st_model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

# Wall-clock timing of the evaluation; `st_results` from the previous section is overwritten.
start = time.time()
st_results    = evaluate_sentence_transformer(test, st_model)
end = time.time()
print(f"Sentence Transformer ('multi-qa-MiniLM-L6-cos-v1') Evaluation Time: {end - start:.2f} seconds")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Processed 500/5923 examples...
Processed 1000/5923 examples...
Processed 1500/5923 examples...
Processed 2000/5923 examples...
Processed 2500/5923 examples...
Processed 3000/5923 examples...
Processed 3500/5923 examples...
Processed 4000/5923 examples...
Processed 4500/5923 examples...
Processed 5000/5923 examples...
Processed 5500/5923 examples...

Sentence Transformer Retrieval Accuracy: 77.93%
Sentence Transformer ('multi-qa-MiniLM-L6-cos-v1') Evaluation Time: 82.73 seconds


## Sentence Transformer ('distilbert-base-nli-stsb-mean-tokens')

In [None]:
# Load the pre-trained SentenceTransformer model into the global `model`.
# retrieve_answer_sentence() reads this global directly, and the "Untuned Bert"
# cell later rebinds `model` to a BertForQuestionAnswering instance, so
# retrieve_answer_sentence()/evaluate_retriever() must be run before that cell.
model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
# model = SentenceTransformer('all-mpnet-base-v1')

# print("SentenceTransformer model 'distilbert-base-nli-stsb-mean-tokens' loaded successfully.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/555 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

SentenceTransformer model 'distilbert-base-nli-stsb-mean-tokens' loaded successfully.


In [None]:
def retrieve_answer_sentence(question, context, encoder=None):
    """
    Retrieves the sentence from context that has minimal distance to the question.

    Args:
        question (str): The question to answer
        context (str): The context paragraph containing the answer
        encoder: Optional object exposing `encode(..., convert_to_tensor=False)`.
            Defaults to the notebook-level global `model`, which keeps existing
            two-argument calls working. Pass it explicitly when the global has
            been rebound to something that is not a sentence encoder.

    Returns:
        tuple: (best_sentence, distance, sentence_index); (None, inf, -1) when
        the context yields no sentences.
    """
    if encoder is None:
        # Backward-compatible fallback to the global set in the loading cell.
        encoder = model

    # NOTE(review): this checks for 'punkt', while other cells download
    # 'punkt_tab' -- confirm which resource the installed NLTK actually needs.
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
    # Split context into sentences
    sentences = sent_tokenize(context)

    if not sentences:
        return None, float('inf'), -1

    # Encode question
    question_embedding = encoder.encode(question, convert_to_tensor=False)

    # Encode all sentences
    sentence_embeddings = encoder.encode(sentences, convert_to_tensor=False)

    # Cosine distance (1 - cosine similarity) between the question and each sentence
    distances = [cosine(question_embedding, sent_emb) for sent_emb in sentence_embeddings]

    # Find sentence with minimal distance; plain int index like the other retrievers
    min_idx = int(np.argmin(distances))
    best_sentence = sentences[min_idx]
    min_distance = distances[min_idx]

    return best_sentence, min_distance, min_idx

In [None]:
def evaluate_retriever(df, sample_size=None):
    """
    Run retrieve_answer_sentence over a frame of QA pairs and score the hits.

    Args:
        df (pd.DataFrame): Frame with 'question', 'context' and 'answer_text' columns
        sample_size (int): When truthy, evaluate a random sample of at most this
            many rows (drawn with random_state=42) instead of the whole frame

    Returns:
        pd.DataFrame: One row per example with the retrieved sentence, its
            distance and index, and an 'answer_found' flag
    """
    if sample_size:
        df = df.sample(n=min(sample_size, len(df)), random_state=42)

    total = len(df)
    print(f"Processing {total} question-context pairs...")

    collected = []
    for seen, (_, example) in enumerate(df.iterrows(), start=1):
        question = example['question']
        context = example['context']
        answer = example['answer_text']

        sentence, dist, position = retrieve_answer_sentence(question, context)

        # A missing or empty retrieved sentence can never contain the answer.
        hit = bool(sentence) and answer.lower() in sentence.lower()

        collected.append({
            'question': question,
            'context': context,
            'answer_text': answer,
            'retrieved_sentence': sentence,
            'distance': dist,
            'sentence_index': position,
            'answer_found': hit
        })

        # Progress line every 100 examples.
        if seen % 100 == 0:
            print(f"Processed {seen}/{total} examples...")

    results_df = pd.DataFrame(collected)

    # Summary: share of hits and mean distance of the selected sentences.
    print(f"\nRetrieval Accuracy: {results_df['answer_found'].mean():.2%}")
    print(f"Average Distance: {results_df['distance'].mean():.4f}")

    return results_df

In [None]:
import nltk
# nltk.download returns False when the resource could not be fetched, so only
# report success when the download actually succeeded.
if nltk.download('punkt_tab'):
    print("NLTK 'punkt_tab' resource downloaded.")
else:
    print("Failed to download NLTK 'punkt_tab' resource.")

NLTK 'punkt_tab' resource downloaded.


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Evaluate on a sample (use smaller sample for faster testing)
# `test` is the cleaned dev-set frame built in the "Test Data Cleaning" cell. The
# name `df` that was passed here before is not defined in any earlier cell, so
# the cell failed on a fresh kernel.
# NOTE(review): confirm the original sample was drawn from the dev set.
print("\nEvaluating on sample of 500 examples...")
results_df = evaluate_retriever(test, sample_size=500)

# Save results
output_file = "semantic_retrieval_results.csv"
results_df.to_csv(output_file, index=False)
print(f"\nResults saved to {output_file}")

# Show some statistics
print("\n" + "="*80)
print("STATISTICS")
print("="*80)
print(f"Total examples evaluated: {len(results_df)}")
print(f"Examples with answer found: {results_df['answer_found'].sum()}")
print(f"Retrieval accuracy: {results_df['answer_found'].mean():.2%}")
print(f"Average distance: {results_df['distance'].mean():.4f}")
print(f"Median distance: {results_df['distance'].median():.4f}")
print(f"Min distance: {results_df['distance'].min():.4f}")
print(f"Max distance: {results_df['distance'].max():.4f}")


Evaluating on sample of 500 examples...
Processing 500 question-context pairs...


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Processed 100/500 examples...
Processed 200/500 examples...
Processed 300/500 examples...
Processed 400/500 examples...
Processed 500/500 examples...

Retrieval Accuracy: 71.20%
Average Distance: 0.4393

Results saved to semantic_retrieval_results.csv

STATISTICS
Total examples evaluated: 500
Examples with answer found: 356
Retrieval accuracy: 71.20%
Average distance: 0.4393
Median distance: 0.4333
Min distance: 0.0914
Max distance: 0.9163


In [None]:
# Rebind `st_model` to the distilbert-base-nli-stsb-mean-tokens checkpoint.
st_model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')


# Wall-clock timing of the evaluation on the full cleaned dev set (`test`);
# `st_results` from the previous section is overwritten.
start = time.time()
st_results    = evaluate_sentence_transformer(test, st_model)
end = time.time()
print(f"Sentence Transformer ('distilbert-base-nli-stsb-mean-tokens') Evaluation Time: {end - start:.2f} seconds")


Processed 500/5923 examples...
Processed 1000/5923 examples...
Processed 1500/5923 examples...
Processed 2000/5923 examples...
Processed 2500/5923 examples...
Processed 3000/5923 examples...
Processed 3500/5923 examples...
Processed 4000/5923 examples...
Processed 4500/5923 examples...
Processed 5000/5923 examples...
Processed 5500/5923 examples...

Sentence Transformer Retrieval Accuracy: 71.01%
Sentence Transformer ('distilbert-base-nli-stsb-mean-tokens') Evaluation Time: 76.68 seconds


## Untuned BERT (base-uncased)

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
import nltk
from nltk.tokenize import sent_tokenize
import re
from tqdm import tqdm
import json
import torch
from transformers import (
    DistilBertTokenizer,
    DistilBertTokenizerFast,
    BertTokenizerFast,
    DistilBertForQuestionAnswering,
    TrainingArguments,
    Trainer,
)
from transformers import BertTokenizer, BertForQuestionAnswering
import wandb
import matplotlib.pyplot as plt
import string
import collections
import os
from google.colab import drive

In [None]:
def bert_sentence_scores_batch(model, tokenizer, question, sentences, max_length=384):
    """
    Return 1 score per sentence using a single batched forward pass.

    The question is paired with every sentence, each pair is truncated to
    `max_length` tokens and padded to a common length, and the score of a pair
    is its largest start logit plus its largest end logit.

    Args:
        model: QA model whose output exposes `start_logits` and `end_logits`.
        tokenizer: Callable tokenizer that accepts paired text lists.
        question (str): Question text, repeated once per sentence.
        sentences (list[str]): Candidate sentences; callers pass a non-empty list.
        max_length (int): Truncation length passed to the tokenizer.

    Returns:
        numpy.ndarray: One score per sentence, in input order.
    """
    # Run on whichever device the model lives on rather than assuming a GPU.
    device = getattr(model, "device", None)
    if device is None:
        device = next(model.parameters()).device

    enc = tokenizer(
        [question] * len(sentences),
        sentences,
        truncation=True,
        max_length=max_length,
        padding=True,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        out = model(**enc)

    scores = out.start_logits.max(dim=1).values + out.end_logits.max(dim=1).values
    return scores.cpu().numpy()

def retrieve_sentence_with_bert(model, tokenizer, question, context):
    """Pick the context sentence with the highest QA-head score.

    Args:
        model: QA model forwarded to bert_sentence_scores_batch.
        tokenizer: Tokenizer forwarded to bert_sentence_scores_batch.
        question (str): Question text.
        context (str): Paragraph that is split into sentences with sent_tokenize.

    Returns:
        tuple: (best_sentence, score, sentence_index); ("", -1e9, -1) when the
        context yields no sentences.
    """
    candidates = sent_tokenize(context)
    if not candidates:
        return "", -1e9, -1

    sentence_scores = bert_sentence_scores_batch(model, tokenizer, question, candidates)
    top = int(sentence_scores.argmax())

    return candidates[top], float(sentence_scores[top]), top

def evaluate_bert_sentence(df, model, tokenizer):
    """Score BERT-based sentence retrieval over every row of `df`.

    The raw question and context are passed to retrieve_sentence_with_bert; a
    row counts as a hit when the normalize_text form of the answer occurs in
    the normalize_text form of the retrieved sentence.

    Args:
        df (pd.DataFrame): Frame with 'question', 'context' and 'answer_text' columns.
        model: QA model forwarded to retrieve_sentence_with_bert.
        tokenizer: Tokenizer forwarded to retrieve_sentence_with_bert.

    Returns:
        pd.DataFrame: One row per example, including the 'answer_found' flag.
        An empty input yields an empty frame with the same columns.
    """
    results = []
    hits = 0
    n = len(df)

    for i, (idx, row) in enumerate(df.iterrows(), 1):
        q = row["question"]
        c = row["context"]
        a = row["answer_text"]

        best_sentence, score, s_idx = retrieve_sentence_with_bert(model, tokenizer, q, c)
        found = normalize_text(a) in normalize_text(best_sentence)
        hits += int(found)

        results.append({
            "question": q,
            "context": c,
            "answer_text": a,
            "retrieved_sentence": best_sentence,
            "score": score,
            "sentence_index": s_idx,
            "answer_found": found
        })

        # Progress line every 200 examples.
        if i % 200 == 0:
            print(f"Processed {i}/{n} examples...")

    # An empty frame has no accuracy; report NaN instead of dividing by zero.
    acc = hits / n if n else float("nan")
    print(f"\nBERT Sentence-Level Retrieval Accuracy: {acc:.2%}")

    # Naming the columns keeps the schema stable even when there are no rows.
    columns = [
        "question",
        "context",
        "answer_text",
        "retrieved_sentence",
        "score",
        "sentence_index",
        "answer_found",
    ]
    return pd.DataFrame(results, columns=columns)



# Baseline: bert-base-uncased with a QA head that was never trained. The
# loading warning in this cell's output reports that qa_outputs.weight/bias are
# newly initialized, so the scores come from random head weights.
# NOTE: this rebinds the global `model` that retrieve_answer_sentence() reads.
model_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
# Requires a CUDA device.
model = BertForQuestionAnswering.from_pretrained(model_name).to("cuda")
model.eval()

# Wall-clock timing of the evaluation on the cleaned dev set (`test`).
start = time.time()
results_bert = evaluate_bert_sentence(test, model, tokenizer)
end = time.time()
print(f"Untuned Bert Evaluation Time: {end - start:.2f} seconds")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processed 200/5923 examples...
Processed 400/5923 examples...
Processed 600/5923 examples...
Processed 800/5923 examples...
Processed 1000/5923 examples...
Processed 1200/5923 examples...
Processed 1400/5923 examples...
Processed 1600/5923 examples...
Processed 1800/5923 examples...
Processed 2000/5923 examples...
Processed 2200/5923 examples...
Processed 2400/5923 examples...
Processed 2600/5923 examples...
Processed 2800/5923 examples...
Processed 3000/5923 examples...
Processed 3200/5923 examples...
Processed 3400/5923 examples...
Processed 3600/5923 examples...
Processed 3800/5923 examples...
Processed 4000/5923 examples...
Processed 4200/5923 examples...
Processed 4400/5923 examples...
Processed 4600/5923 examples...
Processed 4800/5923 examples...
Processed 5000/5923 examples...
Processed 5200/5923 examples...
Processed 5400/5923 examples...
Processed 5600/5923 examples...
Processed 5800/5923 examples...

BERT Sentence-Level Retrieval Accuracy: 30.20%
Untuned Bert Evaluation Time

## Tuned BERT (base-uncased)

In [None]:
# NOTE: this redefines the function of the same name from the "Untuned Bert" cell.
def bert_sentence_scores_batch(model, tokenizer, question, sentences, max_length=384):
    """
    Return 1 score per sentence using a single batched forward pass.

    The question is paired with every sentence, each pair is truncated to
    `max_length` tokens and padded to a common length, and the score of a pair
    is its largest start logit plus its largest end logit.

    Args:
        model: QA model whose output exposes `start_logits` and `end_logits`.
        tokenizer: Callable tokenizer that accepts paired text lists.
        question (str): Question text, repeated once per sentence.
        sentences (list[str]): Candidate sentences; callers pass a non-empty list.
        max_length (int): Truncation length passed to the tokenizer.

    Returns:
        numpy.ndarray: One score per sentence, in input order.
    """
    # Run on whichever device the model lives on rather than assuming a GPU.
    device = getattr(model, "device", None)
    if device is None:
        device = next(model.parameters()).device

    enc = tokenizer(
        [question] * len(sentences),
        sentences,
        truncation=True,
        max_length=max_length,
        padding=True,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        out = model(**enc)

    scores = out.start_logits.max(dim=1).values + out.end_logits.max(dim=1).values
    return scores.cpu().numpy()

# NOTE: this redefines the function of the same name from the "Untuned Bert" cell.
def retrieve_sentence_with_bert(model, tokenizer, question, context):
    """Pick the context sentence with the highest QA-head score.

    Args:
        model: QA model forwarded to bert_sentence_scores_batch.
        tokenizer: Tokenizer forwarded to bert_sentence_scores_batch.
        question (str): Question text.
        context (str): Paragraph that is split into sentences with sent_tokenize.

    Returns:
        tuple: (best_sentence, score, sentence_index); ("", -1e9, -1) when the
        context yields no sentences.
    """
    candidates = sent_tokenize(context)
    if not candidates:
        return "", -1e9, -1

    sentence_scores = bert_sentence_scores_batch(model, tokenizer, question, candidates)
    top = int(sentence_scores.argmax())

    return candidates[top], float(sentence_scores[top]), top

def evaluate_bert_sentence(df, model, tokenizer):
    """Measure sentence-level retrieval accuracy of a QA model over ``df``.

    For each row the best-scoring sentence of ``context`` is retrieved for
    ``question``; the row counts as a hit when the normalised ``answer_text``
    is a substring of the normalised retrieved sentence. Progress is printed
    every 200 rows and the final accuracy is printed once at the end.

    Args:
        df: DataFrame with ``question``, ``context`` and ``answer_text`` columns.
        model: Model forwarded to ``retrieve_sentence_with_bert``.
        tokenizer: Tokenizer forwarded to ``retrieve_sentence_with_bert``.

    Returns:
        DataFrame with one row per example: the inputs, the retrieved sentence,
        its score and index, and the boolean ``answer_found`` flag.
    """
    results = []
    hits = 0
    n = len(df)

    # Count from 1 so the progress message reports completed examples.
    for i, (_, row) in enumerate(df.iterrows(), 1):
        q = row["question"]
        c = row["context"]
        a = row["answer_text"]

        best_sentence, score, s_idx = retrieve_sentence_with_bert(model, tokenizer, q, c)
        found = normalize_text(a) in normalize_text(best_sentence)
        hits += int(found)

        results.append({
            "question": q,
            "context": c,
            "answer_text": a,
            "retrieved_sentence": best_sentence,
            "score": score,
            "sentence_index": s_idx,
            "answer_found": found
        })

        if i % 200 == 0:
            print(f"Processed {i}/{n} examples...")

    # An empty frame has no accuracy to speak of; report 0 instead of raising
    # ZeroDivisionError.
    acc = hits / n if n else 0.0
    print(f"\nBERT Sentence-Level Retrieval Accuracy: {acc:.2%}")

    return pd.DataFrame(results)


# Load the fine-tuned BERT checkpoint from Drive, move it to the GPU and
# switch it to inference mode.
# NOTE(review): `BertTokenizerFast`, `time` and `test` are not defined in this
# cell; they must already exist in the kernel. Confirm on Restart & Run All.
path = "/content/drive/MyDrive/10701_Project/saved_models/bert-finetuned"
tokenizer = BertTokenizerFast.from_pretrained(path)
model = BertForQuestionAnswering.from_pretrained(path).to("cuda")
model.eval()

# Wall-clock timing of the full sentence-retrieval evaluation over `test`.
start = time.time()
results_bert = evaluate_bert_sentence(test, model, tokenizer)
end = time.time()
print(f"Tuned Bert Evaluation Time: {end - start:.2f} seconds")


Processed 200/5923 examples...
Processed 400/5923 examples...
Processed 600/5923 examples...
Processed 800/5923 examples...
Processed 1000/5923 examples...
Processed 1200/5923 examples...
Processed 1400/5923 examples...
Processed 1600/5923 examples...
Processed 1800/5923 examples...
Processed 2000/5923 examples...
Processed 2200/5923 examples...
Processed 2400/5923 examples...
Processed 2600/5923 examples...
Processed 2800/5923 examples...
Processed 3000/5923 examples...
Processed 3200/5923 examples...
Processed 3400/5923 examples...
Processed 3600/5923 examples...
Processed 3800/5923 examples...
Processed 4000/5923 examples...
Processed 4200/5923 examples...
Processed 4400/5923 examples...
Processed 4600/5923 examples...
Processed 4800/5923 examples...
Processed 5000/5923 examples...
Processed 5200/5923 examples...
Processed 5400/5923 examples...
Processed 5600/5923 examples...
Processed 5800/5923 examples...

BERT Sentence-Level Retrieval Accuracy: 82.88%
Tuned Bert Evaluation Time: 

## Untuned ALBERT

In [None]:
from transformers import AlbertTokenizer, AlbertForQuestionAnswering
from transformers import AlbertTokenizerFast
# Baseline: ALBERT base weights loaded into a QA model, with no fine-tuning
# done in this cell.
# NOTE(review): `time` and `test` must already exist in the kernel.
tokenizer = AlbertTokenizerFast.from_pretrained("albert-base-v2")
model = AlbertForQuestionAnswering.from_pretrained("albert-base-v2").to("cuda")
model.eval()


# Wall-clock timing of the sentence-retrieval evaluation.
# NOTE(review): the result is bound to `results_bert` although the model is
# ALBERT; the name is rebound by every evaluation cell.
start = time.time()
results_bert = evaluate_bert_sentence(test, model, tokenizer)
end = time.time()
# Label spelled "Untuned" (was "Ununed") so the log line matches the other cells.
print(f"Untuned Albert Evaluation Time: {end - start:.2f} seconds")


Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processed 200/5923 examples...
Processed 400/5923 examples...
Processed 600/5923 examples...
Processed 800/5923 examples...
Processed 1000/5923 examples...
Processed 1200/5923 examples...
Processed 1400/5923 examples...
Processed 1600/5923 examples...
Processed 1800/5923 examples...
Processed 2000/5923 examples...
Processed 2200/5923 examples...
Processed 2400/5923 examples...
Processed 2600/5923 examples...
Processed 2800/5923 examples...
Processed 3000/5923 examples...
Processed 3200/5923 examples...
Processed 3400/5923 examples...
Processed 3600/5923 examples...
Processed 3800/5923 examples...
Processed 4000/5923 examples...
Processed 4200/5923 examples...
Processed 4400/5923 examples...
Processed 4600/5923 examples...
Processed 4800/5923 examples...
Processed 5000/5923 examples...
Processed 5200/5923 examples...
Processed 5400/5923 examples...
Processed 5600/5923 examples...
Processed 5800/5923 examples...

BERT Sentence-Level Retrieval Accuracy: 27.01%
Ununed Albert Evaluation Tim

## Tuned ALBERT

In [None]:
# Load the fine-tuned ALBERT checkpoint from Drive onto the GPU in inference mode.
# NOTE(review): `AlbertTokenizerFast` / `AlbertForQuestionAnswering` are not
# imported in this cell; they must already exist in the kernel.
path = "/content/drive/MyDrive/10701_Project/saved_models/albert-finetuned"
tokenizer = AlbertTokenizerFast.from_pretrained(path)
model = AlbertForQuestionAnswering.from_pretrained(path).to("cuda")
model.eval()


# Wall-clock timing of the sentence-retrieval evaluation; the result rebinds
# the shared name `results_bert`.
start = time.time()
results_bert = evaluate_bert_sentence(test, model, tokenizer)
end = time.time()
print(f"Tuned Albert Evaluation Time: {end - start:.2f} seconds")


Processed 200/5923 examples...
Processed 400/5923 examples...
Processed 600/5923 examples...
Processed 800/5923 examples...
Processed 1000/5923 examples...
Processed 1200/5923 examples...
Processed 1400/5923 examples...
Processed 1600/5923 examples...
Processed 1800/5923 examples...
Processed 2000/5923 examples...
Processed 2200/5923 examples...
Processed 2400/5923 examples...
Processed 2600/5923 examples...
Processed 2800/5923 examples...
Processed 3000/5923 examples...
Processed 3200/5923 examples...
Processed 3400/5923 examples...
Processed 3600/5923 examples...
Processed 3800/5923 examples...
Processed 4000/5923 examples...
Processed 4200/5923 examples...
Processed 4400/5923 examples...
Processed 4600/5923 examples...
Processed 4800/5923 examples...
Processed 5000/5923 examples...
Processed 5200/5923 examples...
Processed 5400/5923 examples...
Processed 5600/5923 examples...
Processed 5800/5923 examples...

BERT Sentence-Level Retrieval Accuracy: 74.89%
Tuned Albert Evaluation Time

## Untuned RoBERTa

In [None]:
from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering
# Baseline: RoBERTa base weights loaded into a QA model, with no fine-tuning
# done in this cell.
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
model = RobertaForQuestionAnswering.from_pretrained("roberta-base").to("cuda")
model.eval()



# Wall-clock timing of the sentence-retrieval evaluation; the result rebinds
# the shared name `results_bert`.
start = time.time()
results_bert = evaluate_bert_sentence(test, model, tokenizer)
end = time.time()
print(f"Untuned Roberta Evaluation Time: {end - start:.2f} seconds")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processed 200/5923 examples...
Processed 400/5923 examples...
Processed 600/5923 examples...
Processed 800/5923 examples...
Processed 1000/5923 examples...
Processed 1200/5923 examples...
Processed 1400/5923 examples...
Processed 1600/5923 examples...
Processed 1800/5923 examples...
Processed 2000/5923 examples...
Processed 2200/5923 examples...
Processed 2400/5923 examples...
Processed 2600/5923 examples...
Processed 2800/5923 examples...
Processed 3000/5923 examples...
Processed 3200/5923 examples...
Processed 3400/5923 examples...
Processed 3600/5923 examples...
Processed 3800/5923 examples...
Processed 4000/5923 examples...
Processed 4200/5923 examples...
Processed 4400/5923 examples...
Processed 4600/5923 examples...
Processed 4800/5923 examples...
Processed 5000/5923 examples...
Processed 5200/5923 examples...
Processed 5400/5923 examples...
Processed 5600/5923 examples...
Processed 5800/5923 examples...

BERT Sentence-Level Retrieval Accuracy: 35.46%
Untuned Roberta Evaluation T

## Tuned RoBERTa

In [None]:
# Load the fine-tuned RoBERTa checkpoint from Drive onto the GPU in inference mode.
# NOTE(review): the RoBERTa classes are not imported in this cell; they must
# already exist in the kernel.
path = "/content/drive/MyDrive/10701_Project/saved_models/roberta-finetuned"
tokenizer = RobertaTokenizerFast.from_pretrained(path)
model = RobertaForQuestionAnswering.from_pretrained(path).to("cuda")
model.eval()

# Wall-clock timing of the sentence-retrieval evaluation; the result rebinds
# the shared name `results_bert`.
start = time.time()
results_bert = evaluate_bert_sentence(test, model, tokenizer)
end = time.time()
print(f"Tuned Roberta Evaluation Time: {end - start:.2f} seconds")

Processed 200/5923 examples...
Processed 400/5923 examples...
Processed 600/5923 examples...
Processed 800/5923 examples...
Processed 1000/5923 examples...
Processed 1200/5923 examples...
Processed 1400/5923 examples...
Processed 1600/5923 examples...
Processed 1800/5923 examples...
Processed 2000/5923 examples...
Processed 2200/5923 examples...
Processed 2400/5923 examples...
Processed 2600/5923 examples...
Processed 2800/5923 examples...
Processed 3000/5923 examples...
Processed 3200/5923 examples...
Processed 3400/5923 examples...
Processed 3600/5923 examples...
Processed 3800/5923 examples...
Processed 4000/5923 examples...
Processed 4200/5923 examples...
Processed 4400/5923 examples...
Processed 4600/5923 examples...
Processed 4800/5923 examples...
Processed 5000/5923 examples...
Processed 5200/5923 examples...
Processed 5400/5923 examples...
Processed 5600/5923 examples...
Processed 5800/5923 examples...

BERT Sentence-Level Retrieval Accuracy: 88.15%
Tuned Roberta Evaluation Tim

## Untuned SpanBERT

In [None]:
# Baseline: SpanBERT base (cased) loaded through the Auto* classes, with no
# fine-tuning done in this cell.
model_name = "SpanBERT/spanbert-base-cased"
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name).to("cuda")
model.eval()



# Wall-clock timing of the sentence-retrieval evaluation; the result rebinds
# the shared name `results_bert`.
start = time.time()
results_bert = evaluate_bert_sentence(test, model, tokenizer)
end = time.time()
print(f"Untuned SpanBert Evaluation Time: {end - start:.2f} seconds")



config.json:   0%|          | 0.00/413 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/215M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at SpanBERT/spanbert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/215M [00:00<?, ?B/s]

Processed 200/5923 examples...
Processed 400/5923 examples...
Processed 600/5923 examples...
Processed 800/5923 examples...
Processed 1000/5923 examples...
Processed 1200/5923 examples...
Processed 1400/5923 examples...
Processed 1600/5923 examples...
Processed 1800/5923 examples...
Processed 2000/5923 examples...
Processed 2200/5923 examples...
Processed 2400/5923 examples...
Processed 2600/5923 examples...
Processed 2800/5923 examples...
Processed 3000/5923 examples...
Processed 3200/5923 examples...
Processed 3400/5923 examples...
Processed 3600/5923 examples...
Processed 3800/5923 examples...
Processed 4000/5923 examples...
Processed 4200/5923 examples...
Processed 4400/5923 examples...
Processed 4600/5923 examples...
Processed 4800/5923 examples...
Processed 5000/5923 examples...
Processed 5200/5923 examples...
Processed 5400/5923 examples...
Processed 5600/5923 examples...
Processed 5800/5923 examples...

BERT Sentence-Level Retrieval Accuracy: 27.17%
Untuned SpanBert Evaluation 

## Tuned SpanBERT

In [None]:
# Load the fine-tuned SpanBERT checkpoint from Drive onto the GPU in inference mode.
# NOTE(review): `AutoTokenizer` / `AutoModelForQuestionAnswering` are not
# imported in this cell; they must already exist in the kernel.
path = "/content/drive/MyDrive/10701_Project/saved_models/SpanBERT-finetuned"
tokenizer = AutoTokenizer.from_pretrained(path)
model = AutoModelForQuestionAnswering.from_pretrained(path).to("cuda")
model.eval()

# Wall-clock timing of the sentence-retrieval evaluation; the result rebinds
# the shared name `results_bert`.
start = time.time()
results_bert = evaluate_bert_sentence(test, model, tokenizer)
end = time.time()
print(f"Tuned SpanBert Evaluation Time: {end - start:.2f} seconds")

Processed 200/5923 examples...
Processed 400/5923 examples...
Processed 600/5923 examples...
Processed 800/5923 examples...
Processed 1000/5923 examples...
Processed 1200/5923 examples...
Processed 1400/5923 examples...
Processed 1600/5923 examples...
Processed 1800/5923 examples...
Processed 2000/5923 examples...
Processed 2200/5923 examples...
Processed 2400/5923 examples...
Processed 2600/5923 examples...
Processed 2800/5923 examples...
Processed 3000/5923 examples...
Processed 3200/5923 examples...
Processed 3400/5923 examples...
Processed 3600/5923 examples...
Processed 3800/5923 examples...
Processed 4000/5923 examples...
Processed 4200/5923 examples...
Processed 4400/5923 examples...
Processed 4600/5923 examples...
Processed 4800/5923 examples...
Processed 5000/5923 examples...
Processed 5200/5923 examples...
Processed 5400/5923 examples...
Processed 5600/5923 examples...
Processed 5800/5923 examples...

BERT Sentence-Level Retrieval Accuracy: 83.71%
Tuned SpanBert Evaluation Ti

# Task 2 - Fine-Tuned BERT-based Span Extraction (Token-level QA)

The second task trains several BERT-based models for extractive QA.
Each model receives a question and a full context paragraph.
The fine-tuning objective is to predict the start and end token positions of the answer span.

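During training, the question and the context are tokenized together as a single sequence pair, and the gold answer's start/end positions are mapped from character indices to token indices.
The models are fine-tuned on the answerable questions of the SQuAD v2.0 training set (questions flagged `is_impossible` are dropped when the data is loaded), using the standard cross-entropy loss for span prediction.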

After training, each model is evaluated on the development/test set using:

- Exact Match (EM): predicted span matches the gold span after normalization.

- F1: token-level overlap between predicted and gold spans.

This task compares how fine-tuning affects token-level extraction accuracy for different BERT-based architectures.

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from scipy.spatial.distance import cosine
import nltk
from nltk.tokenize import sent_tokenize
import re
from tqdm import tqdm
import json
import torch
from transformers import (
    DistilBertTokenizer,
    DistilBertTokenizerFast,
    BertTokenizerFast,
    DistilBertForQuestionAnswering,
    TrainingArguments,
    Trainer,
)
from transformers import BertTokenizer, BertForQuestionAnswering
import wandb
import matplotlib.pyplot as plt
import string
import collections
import os
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from datasets import Dataset

def load_squad(path):
    """Load the answerable examples of a SQuAD-format JSON file into a DataFrame.

    Whitespace runs in contexts, questions and answers are collapsed to a
    single space and the ends are trimmed. Questions flagged ``is_impossible``
    are skipped. ``answer_start`` is re-based onto the normalised context so
    that ``context[answer_start:answer_start + len(answer_text)]`` still
    addresses the answer after normalisation (the raw offset points into the
    un-normalised context and drifts whenever whitespace was collapsed before
    the answer).

    Args:
        path: Path to a SQuAD-format JSON file.

    Returns:
        DataFrame with columns ``context``, ``question``, ``answer_text`` and
        ``answer_start``, keeping only the first row of each
        ``(question, context)`` pair.
    """
    def _squash(text):
        # Collapse every whitespace run to one space and trim both ends.
        return re.sub(r"\s+", " ", text).strip()

    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    records = []
    for article in data["data"]:
        for p in article["paragraphs"]:
            raw_context = p["context"]
            context = _squash(raw_context)
            for qa in p["qas"]:
                if qa.get("is_impossible"):
                    continue
                question = _squash(qa["question"])
                for ans in qa.get("answers", []):
                    raw_text = ans["text"]
                    # The answer text is stripped, so advance the raw offset
                    # past any leading whitespace that stripping removes.
                    raw_start = ans["answer_start"] + (len(raw_text) - len(raw_text.lstrip()))
                    # Normalising the prefix the same way as the context gives
                    # the answer's position inside the normalised context.
                    answer_start = len(re.sub(r"\s+", " ", raw_context[:raw_start]).lstrip())
                    records.append({
                        "context": context,
                        "question": question,
                        "answer_text": _squash(raw_text),
                        "answer_start": answer_start
                    })

    # Explicit columns keep drop_duplicates working when no record was found.
    columns = ["context", "question", "answer_text", "answer_start"]
    return pd.DataFrame(records, columns=columns).drop_duplicates(subset=["question", "context"])

# Read the train and dev splits from Drive into DataFrames.
train_df = load_squad('/content/drive/MyDrive/10701_Project/train-v2.0.json')
dev_df   = load_squad('/content/drive/MyDrive/10701_Project/dev-v2.0.json')


# Wrap the DataFrames as `datasets.Dataset` objects for the cells below.
train_dataset = Dataset.from_pandas(train_df)
eval_dataset  = Dataset.from_pandas(dev_df)

In [None]:
def normalize_answer(s):
  """Canonicalise an answer string: lowercase it, strip punctuation, drop the
  articles a/an/the, and collapse whitespace."""
  # 1) case-fold
  lowered = s.lower()
  # 2) discard every character listed in string.punctuation
  punctuation = set(string.punctuation)
  unpunctuated = ''.join(filter(lambda ch: ch not in punctuation, lowered))
  # 3) blank out standalone articles
  article_pattern = re.compile(r'\b(a|an|the)\b', re.UNICODE)
  without_articles = article_pattern.sub(' ', unpunctuated)
  # 4) squeeze whitespace runs and trim the ends
  return ' '.join(without_articles.split())

def get_tokens(s):
  """Return the whitespace-separated tokens of the normalised answer ``s``;
  a falsy ``s`` (empty string, None) yields an empty list."""
  return normalize_answer(s).split() if s else []

def compute_f1(prediction, ground_truth):
    """
    Compute token-level F1 score between predicted answer and true answer.

    Overlap is counted as a multiset intersection, so a token that occurs
    twice in both answers contributes two matches. A plain set intersection
    would under-count repeated tokens and score identical answers such as
    "cat cat" vs "cat cat" below 1.0.

    Args:
        prediction: Predicted answer string.
        ground_truth: Gold answer string.

    Returns:
        F1 as a float in [0, 1]. When either side has no tokens after
        normalisation the score is 1.0 if both are empty and 0.0 otherwise.
    """
    # Local import keeps the function usable regardless of cell order.
    from collections import Counter

    pred_tokens = get_tokens(prediction)
    truth_tokens = get_tokens(ground_truth)

    # With no tokens on one side precision/recall are undefined; the answers
    # agree only if both are empty.
    if not pred_tokens or not truth_tokens:
        return float(pred_tokens == truth_tokens)

    # Counter & Counter keeps the minimum count of each shared token.
    overlap = sum((Counter(pred_tokens) & Counter(truth_tokens)).values())
    if overlap == 0:
        return 0.0

    # precision = overlap / predicted
    precision = overlap / len(pred_tokens)
    # recall = overlap / truth
    recall = overlap / len(truth_tokens)
    # harmonic mean
    return 2 * (precision * recall) / (precision + recall)

def evaluate_answers(pred_answers, gold_answers):
    """Average Exact Match and F1 over paired predicted / gold answers.

    Args:
        pred_answers: Sequence of predicted answer strings.
        gold_answers: Sequence of gold answer strings, aligned with
            ``pred_answers``.

    Returns:
        ``(EM, F1)``: the fraction of pairs whose normalised strings are equal,
        and the mean token-level F1. Both are 0.0 when there are no predictions.
    """
    total = len(pred_answers)
    if total == 0:
        # Nothing to average; avoid ZeroDivisionError.
        return 0.0, 0.0

    exact_match = 0
    f1_total = 0

    for pred_ans, true_ans in zip(pred_answers, gold_answers):
        if normalize_answer(pred_ans) == normalize_answer(true_ans):
            exact_match += 1
        f1_total += compute_f1(pred_ans, true_ans)

    EM = exact_match / total
    F1 = f1_total / total
    return EM, F1


def evaluate_qa(model, tokenizer, dataset, n_samples=200):
    """Evaluate span extraction on the first ``n_samples`` rows of ``dataset``.

    For each example the question/context pair is encoded, the arg-max start
    and end positions are taken independently, and the token ids between them
    (inclusive) are decoded as the predicted answer. The sample count, Exact
    Match and F1 are printed.

    Args:
        model: QA model whose output exposes ``start_logits`` / ``end_logits``.
        tokenizer: Tokenizer matching ``model``; must provide ``decode``.
        dataset: Indexable collection of rows with ``question``, ``context``
            and ``answer_text`` keys.
        n_samples: Upper bound on the number of rows evaluated.

    Returns:
        ``(EM, F1)`` as computed by ``evaluate_answers``.
    """
    model.eval()
    # Encode onto the model's device so the function works whether the caller
    # left the model on the CPU or moved it to a GPU.
    device = getattr(model, "device", None)
    if device is None:
        device = next(model.parameters()).device

    pred_answers = []
    gold_answers = []

    for i in tqdm(range(min(n_samples, len(dataset)))):
        row = dataset[i]
        question = row['question']
        context  = row['context']
        true_ans = row['answer_text']

        inputs = tokenizer(question, context, return_tensors="pt", truncation=True).to(device)

        with torch.no_grad():
            outputs = model(**inputs)

        start_idx = int(torch.argmax(outputs.start_logits))
        end_idx   = int(torch.argmax(outputs.end_logits))

        # end < start produces an empty slice, which decodes to "".
        answer_ids = inputs["input_ids"][0][start_idx:end_idx+1].tolist()
        pred_ans = tokenizer.decode(answer_ids).strip()

        pred_answers.append(pred_ans)
        gold_answers.append(true_ans)

    EM, F1 = evaluate_answers(pred_answers, gold_answers)
    print(f"Test size     = {len(pred_answers)}")
    print(f"Exact Match   = {EM:.4f}")
    print(f"F1 Score      = {F1:.4f}")
    return EM, F1



In [None]:
# Baseline 1: bert-base-uncased loaded into a QA model and scored with
# evaluate_qa (default 200 examples) without any fine-tuning in this cell.
bert_qa_pretrained_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_qa_pretrained_model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
evaluate_qa(bert_qa_pretrained_model, bert_qa_pretrained_tokenizer , eval_dataset)


# Baseline 2: the "distilbert-base-cased-distilled-squad" checkpoint.
# Rebinds the module-level `tokenizer` / `model` names.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-cased-distilled-squad")
model     = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-cased-distilled-squad")
evaluate_qa(model, tokenizer, eval_dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  2%|▏         | 4/200 [00:08<07:15,  2.22s/it]


KeyboardInterrupt: 

In [None]:
# Score the "bert-large-uncased-whole-word-masking-finetuned-squad" checkpoint
# with evaluate_qa (default 200 examples); rebinds `tokenizer` / `model`.
tokenizer = BertTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
model = BertForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
evaluate_qa(model, tokenizer, eval_dataset)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 200/200 [10:03<00:00,  3.02s/it]

Test size     = 200
Exact Match   = 0.6300
F1 Score      = 0.7765





(0.63, 0.7764747335997336)

In [None]:
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mhaoqiny[0m ([33m10701-team[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
import time
# Timestamp prefix for the wandb run name; the model name is appended below.
run_name = f"run-{time.strftime('%Y%m%d-%H%M%S')}"


# TODO START:  change here
# Checkpoint to fine-tune; the trailing comments list the alternatives tried.
model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"#"twmkn9/albert-base-v2-squad2" #"bert-large-uncased-whole-word-masking-finetuned-squad" #"distilbert-base-cased-distilled-squad"

# tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
# model     = DistilBertForQuestionAnswering.from_pretrained(model_name)

# A fast tokenizer is used because the preprocessing below requests
# `return_offsets_mapping=True`.
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)


run_name = run_name + model_name
# NOTE(review): the wandb project is "qa-finetune-albert" while `model_name`
# is a BERT checkpoint — confirm the intended project.
wandb.init(project="qa-finetune-albert", entity="10701-team", name=run_name, reinit=True)


# Rebuild the datasets from the DataFrames so the `.map` calls below start
# from un-tokenised rows.
train_dataset = Dataset.from_pandas(train_df)
eval_dataset  = Dataset.from_pandas(dev_df)

def preprocess_train(example):
    """Tokenise one training example and attach token-level answer labels.

    The question/context pair is encoded (truncated and padded to 512 tokens)
    with the module-level ``tokenizer``. The gold answer's character span
    ``[answer_start, answer_start + len(answer_text))`` is then mapped to the
    context tokens that contain its first and last character.

    Only context tokens are considered. Offsets of question tokens are
    relative to the question string and could otherwise match the answer's
    character positions by coincidence. If the start or the end of the answer
    is not covered by a context token (for example because it was truncated
    away), both labels fall back to position 0.

    Args:
        example: Row with ``question``, ``context``, ``answer_text`` and
            ``answer_start``.

    Returns:
        The tokenizer output (without ``offset_mapping``) extended with
        ``start_positions`` and ``end_positions``.
    """
    tokenized = tokenizer(
        example["question"],
        example["context"],
        truncation=True,
        max_length=512,
        padding="max_length",
        return_offsets_mapping=True,
    )

    offsets = tokenized.pop("offset_mapping")
    # Per-token segment id: 1 marks tokens of the second sequence (the context).
    sequence_ids = tokenized.sequence_ids()
    start_char = example["answer_start"]
    end_char   = start_char + len(example["answer_text"])

    start_token = None
    end_token   = None
    for i, (s, e) in enumerate(offsets):
        if sequence_ids[i] != 1:
            continue
        if s <= start_char < e:
            start_token = i
        if s < end_char <= e:
            end_token = i

    # A half-located span would yield contradictory labels; treat it like a
    # missing answer and label position 0.
    if start_token is None or end_token is None:
        start_token = end_token = 0

    tokenized["start_positions"] = start_token
    tokenized["end_positions"]   = end_token
    return tokenized

def preprocess_eval(example):
    """Tokenise one evaluation example, keeping offsets and the raw text.

    The question/context pair is encoded with the module-level ``tokenizer``
    (truncated and padded to 512 tokens, offsets included); the original
    ``context`` and ``answer_text`` are copied onto the result.
    """
    tokenizer_options = dict(
        truncation=True,
        max_length=512,
        padding="max_length",
        return_offsets_mapping=True,
    )
    features = tokenizer(example["question"], example["context"], **tokenizer_options)

    # Carry the raw fields along next to the encoded features.
    for passthrough in ("context", "answer_text"):
        features[passthrough] = example[passthrough]

    return features

# Tokenise both splits one example at a time.
# NOTE: these assignments rebind `train_dataset` / `eval_dataset` to their
# tokenised versions.
train_dataset = train_dataset.map(preprocess_train)
eval_dataset  = eval_dataset.map(preprocess_eval)

# training hyper-parameters
# Evaluation and checkpointing both happen once per epoch; metrics are
# reported to wandb.
args = TrainingArguments(
    output_dir="bert-finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    #save_total_limit=2,
    learning_rate=2e-5,
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    fp16=True,
    report_to="wandb",
)

def compute_metrics_for_trainer(eval_pred):
    """Compute span-level EM and F1 for the Trainer and log them to wandb.

    For every prediction row the arg-max start and end token positions are
    converted to character offsets and the corresponding slice of the context
    is compared with the gold answer.

    The per-example context, offsets and gold answer are read from the
    module-level ``eval_dataset`` (row ``i`` pairs with prediction ``i``).
    ``eval_pred.label_ids`` holds the Trainer's label arrays, not dataset
    columns, so it cannot supply ``offset_mapping`` / ``context`` /
    ``answer_text``.

    Args:
        eval_pred: Object whose ``predictions`` is ``(start_logits, end_logits)``.

    Returns:
        ``{"EM": ..., "F1": ...}``.
    """
    start_logits, end_logits = eval_pred.predictions

    pred_answers = []
    gold_answers = []

    for i in range(len(start_logits)):
        example = eval_dataset[i]

        start_idx = int(np.argmax(start_logits[i]))
        end_idx   = int(np.argmax(end_logits[i]))

        offsets = example["offset_mapping"]
        context = example["context"]

        # Token offsets are (char_start, char_end) pairs; an end before the
        # start simply yields an empty prediction.
        char_start = offsets[start_idx][0]
        char_end   = offsets[end_idx][1]

        pred_ans = context[char_start:char_end]

        pred_answers.append(pred_ans)
        gold_answers.append(example["answer_text"])

    EM, F1 = evaluate_answers(pred_answers, gold_answers)
    wandb.log({"eval_EM": EM, "eval_F1": F1})

    return {"EM": EM, "F1": F1}

# Wire the model, arguments, tokenised datasets and metric callback together.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics_for_trainer,
)

trainer.train()

# NOTE(review): the weights are saved under "./albert-finetuned" although the
# checkpoint loaded in this cell is a BERT model and `output_dir` is
# "bert-finetuned" — confirm the intended directory.
model.save_pretrained("./albert-finetuned")
tokenizer.save_pretrained("./albert-finetuned")

# Close the wandb run opened by wandb.init above.
wandb.finish()

NameError: name 'BertTokenizerFast' is not defined

## Fine-Tuning - BERT

In [None]:
import time
import wandb
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
)
from transformers import BertTokenizerFast, BertForQuestionAnswering

import nltk
nltk.download('punkt')
nltk.download('punkt_tab')


def split_sentences(text):
    """Split ``text`` into a list of sentences via ``nltk.sent_tokenize``."""
    sentence_list = nltk.sent_tokenize(text)
    return sentence_list

import torch


#sentence-level score
def bert_score_sentences_batch(model, tokenizer, question, sentences):
    """Score every sentence against ``question`` in one batched forward pass.

    Each sentence is paired with the question and encoded as one row of a
    padded batch (truncated to 384 tokens). A row's score is its largest
    start logit plus its largest end logit, taken over every token position
    of the row (no masking is applied).

    Args:
        model: QA model whose output exposes ``start_logits`` and ``end_logits``.
        tokenizer: Callable tokenizer; must return an object with ``.to(device)``.
        question: Question string, repeated once per sentence.
        sentences: Non-empty list of candidate sentences.

    Returns:
        NumPy array with one score per sentence, in input order.
    """
    # Follow the model's own device instead of assuming a GPU is present, so
    # the same code runs on CPU-only runtimes and on any CUDA index.
    device = getattr(model, "device", None)
    if device is None:
        device = next(model.parameters()).device

    enc = tokenizer(
        [question] * len(sentences),
        sentences,
        truncation=True,
        max_length=384,
        padding=True,
        return_tensors="pt"
    ).to(device)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        out = model(**enc)

    #score for each sentence
    start_max = out.start_logits.max(dim=1).values
    end_max = out.end_logits.max(dim=1).values
    scores = (start_max + end_max).cpu().numpy()

    return scores


def sentence_level_hit(model, tokenizer, example):
    """Return 1 if the best-scoring sentence contains the gold answer, else 0.

    The context is split into sentences, each sentence is scored against the
    question, and the lower-cased gold answer is looked up as a substring of
    the lower-cased top sentence.

    Args:
        model: QA model forwarded to ``bert_score_sentences_batch``.
        tokenizer: Tokenizer forwarded to ``bert_score_sentences_batch``.
        example: Row with ``context``, ``question`` and ``answer_text``.

    Returns:
        1 for a hit, 0 for a miss (including a context with no sentences).
    """
    context = example["context"]
    question = example["question"]
    true_ans = example["answer_text"].lower()

    sentences = split_sentences(context)
    if not sentences:
        # Nothing to score: count a miss instead of sending an empty batch
        # to the tokenizer.
        return 0

    scores = bert_score_sentences_batch(model, tokenizer, question, sentences)

    best_sentence = sentences[int(scores.argmax())].lower()

    return int(true_ans in best_sentence)


# def bert_sentence_score(model, tokenizer, question, sentence):
#     encoding = tokenizer(
#         question,
#         sentence,
#         truncation=True,
#         max_length=384,
#         padding="max_length",
#         return_tensors="pt"
#     )

#     encoding = {k: v.cuda() for k, v in encoding.items()}

#     with torch.no_grad():
#         outputs = model(**encoding)

#     start = outputs.start_logits[0].max().item()
#     end   = outputs.end_logits[0].max().item()

#     return start + end




# Load dataset
# NOTE(review): `load_squad` is not defined in this cell; it must already
# exist in the kernel.
train_df = load_squad('/content/drive/MyDrive/10701_Project/train-v2.0.json')
dev_df   = load_squad('/content/drive/MyDrive/10701_Project/dev-v2.0.json')

# Uncomment to run on a 500-row subset of each split.
# train_df = train_df.head(500)
# dev_df   = dev_df.head(500)

train_dataset = Dataset.from_pandas(train_df)
eval_dataset  = Dataset.from_pandas(dev_df)


# Timestamp prefix for the wandb run name; the model name is appended below.
run_name = f"run-{time.strftime('%Y%m%d-%H%M%S')}"


# Start from bert-base-uncased; the fast tokenizer is used because the
# preprocessing below requests `return_offsets_mapping=True`.
model_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
model = BertForQuestionAnswering.from_pretrained(model_name)

run_name = run_name + model_name
wandb.init(project="qa-finetune-bert", entity="10701-team", name=run_name, reinit=True)


def preprocess(example):
    """Tokenise one example, attach answer labels and keep the raw fields.

    The question/context pair is encoded (truncated and padded to 512 tokens)
    with the module-level ``tokenizer``. The gold answer's character span
    ``[answer_start, answer_start + len(answer_text))`` is mapped to the
    context tokens that contain its first and last character.

    Only context tokens are considered. Offsets of question tokens are
    relative to the question string and could otherwise match the answer's
    character positions by coincidence. If the start or the end of the answer
    is not covered by a context token (for example because it was truncated
    away), both labels fall back to position 0.

    Args:
        example: Row with ``question``, ``context``, ``answer_text`` and
            ``answer_start``.

    Returns:
        The tokenizer output extended with ``start_positions``,
        ``end_positions``, ``context``, ``answer_text`` and ``offset_mapping``.
    """
    tokenized = tokenizer(
        example["question"],
        example["context"],
        truncation=True,
        max_length=512,
        padding="max_length",
        return_offsets_mapping=True,
    )

    offsets = tokenized.pop("offset_mapping")
    # Per-token segment id: 1 marks tokens of the second sequence (the context).
    sequence_ids = tokenized.sequence_ids()
    start_char = example["answer_start"]
    end_char   = start_char + len(example["answer_text"])

    start_token = None
    end_token   = None

    for i, (s, e) in enumerate(offsets):
        if sequence_ids[i] != 1:
            continue
        if s <= start_char < e:
            start_token = i
        if s < end_char <= e:
            end_token = i

    # A half-located span would yield contradictory labels; treat it like a
    # missing answer and label position 0.
    if start_token is None or end_token is None:
        start_token = end_token = 0

    tokenized["start_positions"] = start_token
    tokenized["end_positions"] = end_token

    # Kept so the metric callback can turn predicted tokens back into text.
    tokenized["context"] = example["context"]
    tokenized["answer_text"] = example["answer_text"]
    tokenized["offset_mapping"] = offsets

    return tokenized

# Tokenise both splits one example at a time into new names, leaving the
# un-tokenised datasets intact.
train_enc = train_dataset.map(preprocess)
eval_enc  = eval_dataset.map(preprocess)


# Drop the "__index_level_0__" column from both encoded datasets.
train_enc = train_enc.remove_columns(["__index_level_0__"])
eval_enc  = eval_enc.remove_columns(["__index_level_0__"])


# Evaluate every 1000 steps, checkpoint once per epoch, train in bf16 and
# report to wandb.
args = TrainingArguments(
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
    output_dir="bert-finetuned",
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    bf16=True,
    fp16=False,
    report_to="wandb",
)


def compute_metrics_for_trainer(eval_pred):
    """Compute EM, F1 and sentence-retrieval accuracy, and log them to wandb.

    Row ``i`` of the predictions is paired with ``eval_enc[i]``. The span
    prediction is the context slice between the arg-max start token's first
    character and the arg-max end token's last character. The sentence-level
    hit is computed with the module-level ``model`` and ``tokenizer``.

    Returns:
        Dict with ``EM``, ``F1`` and ``bert_sentence_retrieval_acc``.
    """
    start_logits, end_logits = eval_pred.predictions
    num_examples = len(start_logits)

    predicted_spans, reference_spans = [], []
    sentence_hits = 0

    for row in range(num_examples):
        features = eval_enc[row]

        # span-level: most likely start / end tokens mapped back to characters
        first_token = int(start_logits[row].argmax())
        last_token = int(end_logits[row].argmax())
        token_offsets = features["offset_mapping"]
        span_begin = token_offsets[first_token][0]
        span_end = token_offsets[last_token][1]

        predicted_spans.append(features["context"][span_begin:span_end])
        reference_spans.append(features["answer_text"])

        # sentence-level: does the top-ranked sentence contain the answer?
        sentence_hits += sentence_level_hit(model, tokenizer, features)

    EM, F1 = evaluate_answers(predicted_spans, reference_spans)
    retrieval_acc = sentence_hits / num_examples

    metrics = {"EM": EM, "F1": F1, "bert_sentence_retrieval_acc": retrieval_acc}
    wandb.log({
        "eval_EM": EM,
        "eval_F1": F1,
        "eval_bert_sentence_retrieval_acc": retrieval_acc
    })

    return metrics



# Wire the model, arguments, encoded datasets and metric callback together.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_enc,
    eval_dataset=eval_enc,
    compute_metrics=compute_metrics_for_trainer,
)

trainer.train()

# Local copy of the fine-tuned weights and tokenizer files.
model.save_pretrained("./bert-finetuned")
tokenizer.save_pretrained("./bert-finetuned")

# Close the wandb run opened by wandb.init above.
wandb.finish()

# Second copy on Drive; the "Tuned Bert" evaluation cell loads from this path.
save_path = "/content/drive/MyDrive/10701_Project/saved_models/bert-finetuned"

import os
os.makedirs(save_path, exist_ok=True)

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print("Model saved to:", save_path)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0,1
eval/EM,▁
eval/F1,▁
eval/bert_sentence_retrieval_acc,▁
eval/loss,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁
eval_EM,▁
eval_F1,▁
eval_bert_sentence_retrieval_acc,▁

0,1
eval/EM,0.53419
eval/F1,0.68213
eval/bert_sentence_retrieval_acc,0.85734
eval/loss,1.35439
eval/runtime,416.7501
eval/samples_per_second,14.212
eval/steps_per_second,0.89
eval_EM,0.53419
eval_F1,0.68213
eval_bert_sentence_retrieval_acc,0.85734


Map:   0%|          | 0/86776 [00:00<?, ? examples/s]

Map:   0%|          | 0/5923 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Em,F1,Bert Sentence Retrieval Acc
1000,1.3013,1.354544,0.533176,0.682159,0.856323
2000,1.2324,1.222911,0.577072,0.728757,0.862907
3000,1.1474,1.15098,0.592267,0.741938,0.859362
4000,1.1691,1.116469,0.603073,0.75393,0.857673
5000,1.0872,1.075527,0.615229,0.765378,0.86392
6000,0.803,1.120517,0.618268,0.770048,0.85244
7000,0.8657,1.100073,0.619956,0.771208,0.835387
8000,0.8776,1.105225,0.620125,0.771752,0.859699
9000,0.842,1.06508,0.630424,0.77719,0.860712
10000,0.752,1.084462,0.624008,0.775583,0.86257


0,1
eval/EM,▁▄▅▆▇▇▇▇█▇▇▇█▇███████
eval/F1,▁▄▅▆▇▇▇▇▇▇███████████
eval/bert_sentence_retrieval_acc,▇█▇▇█▆▃▇▇█▅▄▃▄▁▃▁▃▃▁▁
eval/loss,█▅▃▂▁▂▂▂▁▁▄▃▄▃▃▄▆▆▆▆▆
eval/runtime,▃▂▂▂▆▄▇▆▆▇▆▃▇▅▇█▅▅▄▁▁
eval/samples_per_second,▆▇▇▇▃▄▂▃▃▂▃▆▂▄▂▁▄▄▅██
eval/steps_per_second,▆▇▇▇▃▄▂▃▃▂▃▆▂▄▂▁▄▄▅██
eval_EM,▁▄▅▆▇▇▇▇█▇▇▇█▇███████
eval_F1,▁▄▅▆▇▇▇▇▇▇███████████
eval_bert_sentence_retrieval_acc,▇█▇▇█▆▃▇▇█▅▄▃▄▁▃▁▃▃▁▁

0,1
eval/EM,0.62823
eval/F1,0.78344
eval/bert_sentence_retrieval_acc,0.8288
eval/loss,1.25785
eval/runtime,102.2141
eval/samples_per_second,57.947
eval/steps_per_second,3.63
eval_EM,0.62823
eval_F1,0.78344
eval_bert_sentence_retrieval_acc,0.8288


Model saved to: /content/drive/MyDrive/10701_Project/saved_models/bert-finetuned


## Fine-Tuning - ALBERT

In [None]:
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mlylefenglin[0m ([33m10701-team[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Baseline 1: bert-base-uncased loaded into a QA model and scored with
# evaluate_qa (default 200 examples) without any fine-tuning in this cell.
bert_qa_pretrained_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_qa_pretrained_model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")
evaluate_qa(bert_qa_pretrained_model, bert_qa_pretrained_tokenizer , eval_dataset)


# Baseline 2: albert-base-v2; rebinds the module-level `tokenizer` / `model`.
from transformers import AlbertTokenizer, AlbertForQuestionAnswering
from transformers import AlbertTokenizerFast
tokenizer = AlbertTokenizerFast.from_pretrained("albert-base-v2")
model = AlbertForQuestionAnswering.from_pretrained("albert-base-v2")
evaluate_qa(model, tokenizer, eval_dataset)



# Baseline 3: roberta-base; rebinds `tokenizer` / `model` again.
from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering


tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
model = RobertaForQuestionAnswering.from_pretrained("roberta-base")
evaluate_qa(model, tokenizer, eval_dataset)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 200/200 [00:17<00:00, 11.43it/s]


Test size     = 200
Exact Match   = 0.0000
F1 Score      = 0.0305


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 200/200 [00:17<00:00, 11.15it/s]


Test size     = 200
Exact Match   = 0.0000
F1 Score      = 0.0352


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 200/200 [00:16<00:00, 11.91it/s]

Test size     = 200
Exact Match   = 0.0000
F1 Score      = 0.0611





(0.0, 0.061144337292220706)

In [None]:
# Authenticate this runtime with Weights & Biases (prompts for an API key)
# before starting the fine-tuning run that logs to W&B.
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mlylefenglin[0m ([33m10701-team[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
import time
import wandb
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
)
from transformers import AlbertTokenizer, AlbertForQuestionAnswering
from transformers import AlbertTokenizerFast

# Load the SQuAD v2.0 splits with the notebook's `load_squad` helper
# (defined in an earlier cell).
train_df = load_squad('/content/drive/MyDrive/10701_Project/train-v2.0.json')
dev_df   = load_squad('/content/drive/MyDrive/10701_Project/dev-v2.0.json')

# Uncomment for a quick smoke test on a small subset.
# train_df = train_df.head(500)
# dev_df   = dev_df.head(500)

train_dataset = Dataset.from_pandas(train_df)
eval_dataset  = Dataset.from_pandas(dev_df)

# Checkpoint to fine-tune.
model_name = "albert-base-v2"

tokenizer = AlbertTokenizerFast.from_pretrained(model_name)
model = AlbertForQuestionAnswering.from_pretrained(model_name)

# Timestamped W&B run name. A separator keeps the timestamp and the model name
# readable (previously they were fused, e.g. "...-120000albert-base-v2").
run_timestamp = time.strftime('%Y%m%d-%H%M%S')
run_name = f"run-{run_timestamp}-{model_name.replace('/', '-')}"
wandb.init(project="qa-finetune-albert", entity="10701-team", name=run_name, reinit=True)



def preprocess(example):
    """Tokenize one question/context pair and attach answer-span labels.

    Uses the module-level ``tokenizer``; every example is padded/truncated to
    512 tokens.

    Args:
        example: A single dataset row providing ``question``, ``context``,
            ``answer_text`` and ``answer_start`` (character offset of the
            answer inside ``context``).

    Returns:
        The tokenizer output extended with ``start_positions`` and
        ``end_positions`` (token indices of the answer span) plus the
        ``context``, ``answer_text`` and ``offset_mapping`` fields that
        ``compute_metrics_for_trainer`` reads back to compute EM/F1.
        When the answer is not fully contained in the (possibly truncated)
        context tokens, both labels are 0.
    """
    tokenized = tokenizer(
        example["question"],
        example["context"],
        truncation=True,
        max_length=512,
        padding="max_length",
        return_offsets_mapping=True,
    )

    offsets = tokenized.pop("offset_mapping")
    # Per-token segment id: None for special/padding tokens, 0 for question
    # tokens, 1 for context tokens. Available on fast tokenizers, which
    # return_offsets_mapping already requires.
    sequence_ids = tokenized.sequence_ids()

    start_char = example["answer_start"]
    end_char = start_char + len(example["answer_text"])

    start_token = None
    end_token = None

    for i, (s, e) in enumerate(offsets):
        # Question-token offsets are relative to the question string, so they
        # must not be compared against character positions in the context.
        if sequence_ids[i] != 1:
            continue
        if s <= start_char < e:
            start_token = i
        if s < end_char <= e:
            end_token = i

    # Answer missing or only partly present (e.g. cut off by truncation):
    # label the example with index 0 for both ends instead of a partial span.
    if start_token is None or end_token is None:
        start_token = 0
        end_token = 0

    tokenized["start_positions"] = start_token
    tokenized["end_positions"] = end_token

    # keep for EM/F1
    tokenized["context"] = example["context"]
    tokenized["answer_text"] = example["answer_text"]
    tokenized["offset_mapping"] = offsets

    return tokenized

# Tokenize both splits one example at a time with `preprocess`.
train_enc = train_dataset.map(preprocess)
eval_enc = eval_dataset.map(preprocess)

# Drop the "__index_level_0__" column from both splits (original note:
# "remove HF warnings"); presumably the pandas index carried over by
# Dataset.from_pandas — confirm against `load_squad`.
train_enc, eval_enc = (
    encoded.remove_columns(["__index_level_0__"]) for encoded in (train_enc, eval_enc)
)

# Training hyper-parameters, grouped by purpose.
training_kwargs = dict(
    # where checkpoints go and how often to evaluate / save
    output_dir="albert-finetuned",
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="epoch",
    # save_total_limit=2,
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # mixed precision: bf16 on, fp16 off
    bf16=True,
    fp16=False,
    # data loading
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
    # logging
    logging_dir="./logs",
    logging_steps=50,
    report_to="wandb",
)
args = TrainingArguments(**training_kwargs)

def compute_metrics_for_trainer(eval_pred):
    """Turn start/end logits into answer strings and score them with EM/F1.

    Row ``i`` of the logits is matched with ``eval_enc[i]`` (module-level
    encoded eval split), so predictions must be in the same order as
    ``eval_enc``.

    Args:
        eval_pred: Object whose ``predictions`` attribute unpacks into
            ``(start_logits, end_logits)``, each indexable per example.

    Returns:
        ``{"EM": ..., "F1": ...}`` as computed by ``evaluate_answers``
        (defined in an earlier cell).
    """
    start_logits, end_logits = eval_pred.predictions

    pred_answers = []
    gold_answers = []

    for i in range(len(start_logits)):
        example = eval_enc[i]

        s_idx = int(start_logits[i].argmax())
        e_idx = int(end_logits[i].argmax())

        if e_idx < s_idx:
            # Start and end are chosen independently, so the end can land
            # before the start. That is not a valid span; treat it as an empty
            # prediction rather than slicing with offsets that may belong to
            # different segments.
            pred = ""
        else:
            offsets = example["offset_mapping"]
            context = example["context"]
            # Map the token span back to characters in the context.
            pred = context[offsets[s_idx][0]:offsets[e_idx][1]]

        pred_answers.append(pred)
        gold_answers.append(example["answer_text"])

    EM, F1 = evaluate_answers(pred_answers, gold_answers)
    # NOTE(review): the W&B summary of this run lists both eval/EM and eval_EM,
    # so this manual log appears to duplicate what Trainer already reports —
    # confirm before removing.
    wandb.log({"eval_EM": EM, "eval_F1": F1})

    return {"EM": EM, "F1": F1}

# Wire the model, encoded splits and the EM/F1 metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_enc,
    eval_dataset=eval_enc,
    compute_metrics=compute_metrics_for_trainer,
)

# Run fine-tuning (evaluation/logging cadence comes from `args`).
trainer.train()

# Save the final model and tokenizer locally.
model.save_pretrained("./albert-finetuned")
tokenizer.save_pretrained("./albert-finetuned")

# Close the W&B run started earlier in this cell.
wandb.finish()

# Persist a second copy on Google Drive so it survives the Colab runtime.
save_path = "/content/drive/MyDrive/10701_Project/saved_models/albert-finetuned"

import os
os.makedirs(save_path, exist_ok=True)  # no error if the folder already exists

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print("Model saved to:", save_path)


Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/86776 [00:00<?, ? examples/s]

Map:   0%|          | 0/5923 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Em,F1
1000,0.9484,1.107717,0.602904,0.760099
2000,0.9722,1.026036,0.6208,0.781346
3000,0.9262,0.991346,0.63971,0.790257
4000,0.9822,1.044524,0.62367,0.776562
5000,0.9734,1.024035,0.635995,0.786829
6000,0.6852,1.003459,0.649333,0.799097
7000,0.7279,0.988576,0.650346,0.798476
8000,0.7675,0.957263,0.657437,0.80803
9000,0.7155,0.958015,0.657268,0.806821
10000,0.6387,0.966206,0.656931,0.808673


0,1
eval/EM,▁▃▆▄▅▇▇███▇▇▇███▇███▇
eval/F1,▁▄▅▃▅▇▇████▆▇██▇▇███▇
eval/loss,▄▃▂▃▃▂▂▁▁▁▃▃▄▃▃▄▇████
eval/runtime,█▇▇▅▃▅▅▃▄▃▂▆▃▃▁▂▅▄▅▁▃
eval/samples_per_second,▁▂▂▄▆▄▄▆▅▆▇▃▆▆█▇▄▅▄█▆
eval/steps_per_second,▁▂▂▄▆▄▄▆▅▆▇▃▆▆█▇▄▅▄█▆
eval_EM,▁▃▆▄▅▇▇███▇▇▇███▇███▇
eval_F1,▁▄▅▃▅▇▇████▆▇██▇▇███▇
train/epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▇▇▇▇▇███
train/global_step,▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇█

0,1
eval/EM,0.6517
eval/F1,0.80309
eval/loss,1.24525
eval/runtime,26.2438
eval/samples_per_second,225.691
eval/steps_per_second,14.137
eval_EM,0.6517
eval_F1,0.80309
total_flos,7665376403718144.0
train/epoch,4


Model saved to: /content/drive/MyDrive/10701_Project/saved_models/albert-finetuned


## Fine-Tuning - RoBERTa

In [None]:
import time
import wandb
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
)
from transformers import RobertaTokenizerFast, RobertaForQuestionAnswering

# Load the SQuAD v2.0 splits with the notebook's `load_squad` helper
# (defined in an earlier cell).
train_df = load_squad('/content/drive/MyDrive/10701_Project/train-v2.0.json')
dev_df   = load_squad('/content/drive/MyDrive/10701_Project/dev-v2.0.json')

# Uncomment for a quick smoke test on a small subset.
# train_df = train_df.head(500)
# dev_df   = dev_df.head(500)

# Built once; the frames are not modified afterwards, so a second conversion
# later in the cell was redundant.
train_dataset = Dataset.from_pandas(train_df)
eval_dataset  = Dataset.from_pandas(dev_df)

# Model-specific section: change the checkpoint / classes here to fine-tune
# a different model.
model_name = "roberta-base"
tokenizer = RobertaTokenizerFast.from_pretrained(model_name)
model = RobertaForQuestionAnswering.from_pretrained(model_name)

# Timestamped W&B run name. A separator keeps the timestamp and the model name
# readable (previously they were fused, e.g. "...-120000roberta-base").
run_timestamp = time.strftime('%Y%m%d-%H%M%S')
run_name = f"run-{run_timestamp}-{model_name.replace('/', '-')}"
wandb.init(project="qa-finetune-roberta", entity="10701-team", name=run_name, reinit=True)

def preprocess(example):
    """Tokenize one question/context pair and attach answer-span labels.

    Uses the module-level ``tokenizer``; every example is padded/truncated to
    512 tokens.

    Args:
        example: A single dataset row providing ``question``, ``context``,
            ``answer_text`` and ``answer_start`` (character offset of the
            answer inside ``context``).

    Returns:
        The tokenizer output extended with ``start_positions`` and
        ``end_positions`` (token indices of the answer span) plus the
        ``context``, ``answer_text`` and ``offset_mapping`` fields that
        ``compute_metrics_for_trainer`` reads back to compute EM/F1.
        When the answer is not fully contained in the (possibly truncated)
        context tokens, both labels are 0.
    """
    tokenized = tokenizer(
        example["question"],
        example["context"],
        truncation=True,
        max_length=512,
        padding="max_length",
        return_offsets_mapping=True,
    )

    offsets = tokenized.pop("offset_mapping")
    # Per-token segment id: None for special/padding tokens, 0 for question
    # tokens, 1 for context tokens. Available on fast tokenizers, which
    # return_offsets_mapping already requires.
    sequence_ids = tokenized.sequence_ids()

    start_char = example["answer_start"]
    end_char = start_char + len(example["answer_text"])

    start_token = None
    end_token = None

    for i, (s, e) in enumerate(offsets):
        # Question-token offsets are relative to the question string, so they
        # must not be compared against character positions in the context.
        if sequence_ids[i] != 1:
            continue
        if s <= start_char < e:
            start_token = i
        if s < end_char <= e:
            end_token = i

    # Answer missing or only partly present (e.g. cut off by truncation):
    # label the example with index 0 for both ends instead of a partial span.
    if start_token is None or end_token is None:
        start_token = 0
        end_token = 0

    tokenized["start_positions"] = start_token
    tokenized["end_positions"] = end_token

    # keep for EM/F1
    tokenized["context"] = example["context"]
    tokenized["answer_text"] = example["answer_text"]
    tokenized["offset_mapping"] = offsets

    return tokenized

# Tokenize both splits one example at a time with `preprocess`.
train_enc = train_dataset.map(preprocess)
eval_enc = eval_dataset.map(preprocess)

# Drop the "__index_level_0__" column from both splits (original note:
# "remove HF warnings"); presumably the pandas index carried over by
# Dataset.from_pandas — confirm against `load_squad`.
train_enc, eval_enc = (
    encoded.remove_columns(["__index_level_0__"]) for encoded in (train_enc, eval_enc)
)

# Training hyper-parameters, grouped by purpose.
# An earlier configuration (3 epochs, batch size 8, fp16, per-epoch evaluation)
# was replaced by the settings below.
training_kwargs = dict(
    # where checkpoints go and how often to evaluate / save
    output_dir="roberta-finetuned",
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="epoch",
    # save_total_limit=2,
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # mixed precision: bf16 on, fp16 off
    bf16=True,
    fp16=False,
    # data loading
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
    # logging
    logging_dir="./logs",
    logging_steps=50,
    report_to="wandb",
)
args = TrainingArguments(**training_kwargs)

def compute_metrics_for_trainer(eval_pred):
    """Turn start/end logits into answer strings and score them with EM/F1.

    Row ``i`` of the logits is matched with ``eval_enc[i]`` (module-level
    encoded eval split), so predictions must be in the same order as
    ``eval_enc``.

    Args:
        eval_pred: Object whose ``predictions`` attribute unpacks into
            ``(start_logits, end_logits)``, each indexable per example.

    Returns:
        ``{"EM": ..., "F1": ...}`` as computed by ``evaluate_answers``
        (defined in an earlier cell).
    """
    start_logits, end_logits = eval_pred.predictions

    pred_answers = []
    gold_answers = []

    for i in range(len(start_logits)):
        example = eval_enc[i]

        s_idx = int(start_logits[i].argmax())
        e_idx = int(end_logits[i].argmax())

        if e_idx < s_idx:
            # Start and end are chosen independently, so the end can land
            # before the start. That is not a valid span; treat it as an empty
            # prediction rather than slicing with offsets that may belong to
            # different segments.
            pred = ""
        else:
            offsets = example["offset_mapping"]
            context = example["context"]
            # Map the token span back to characters in the context.
            pred = context[offsets[s_idx][0]:offsets[e_idx][1]]

        pred_answers.append(pred)
        gold_answers.append(example["answer_text"])

    EM, F1 = evaluate_answers(pred_answers, gold_answers)
    # NOTE(review): the W&B summary of this run lists both eval/EM and eval_EM,
    # so this manual log appears to duplicate what Trainer already reports —
    # confirm before removing.
    wandb.log({"eval_EM": EM, "eval_F1": F1})

    return {"EM": EM, "F1": F1}

# Wire the model, encoded splits and the EM/F1 metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_enc,
    eval_dataset=eval_enc,
    compute_metrics=compute_metrics_for_trainer,
)

# Run fine-tuning (evaluation/logging cadence comes from `args`).
trainer.train()

# Save the final model and tokenizer locally.
model.save_pretrained("./roberta-finetuned")
tokenizer.save_pretrained("./roberta-finetuned")

# Close the W&B run started earlier in this cell.
wandb.finish()

# Persist a second copy on Google Drive so it survives the Colab runtime.
save_path = "/content/drive/MyDrive/10701_Project/saved_models/roberta-finetuned"

import os
os.makedirs(save_path, exist_ok=True)  # no error if the folder already exists

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print("Model saved to:", save_path)


Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/86776 [00:00<?, ? examples/s]

Map:   0%|          | 0/5923 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Em,F1
1000,1.0154,1.0693,0.621982,0.773128
2000,1.0055,1.007417,0.633463,0.787429
3000,0.9319,0.979861,0.650008,0.804305
4000,1.0241,0.93852,0.663853,0.812102
5000,0.9334,0.941798,0.657775,0.806834
6000,0.6976,0.967676,0.66419,0.814589
7000,0.7662,0.951025,0.668918,0.816181
8000,0.7594,0.936473,0.670944,0.817214
9000,0.7383,0.913982,0.671619,0.819722
10000,0.6562,0.986148,0.667736,0.816983


0,1
eval/EM,▁▂▄▆▅▆▇▇▇▆▇▇▇▇██▇▇███
eval/F1,▁▃▅▆▅▆▇▇▇▇█▇█▇▇██████
eval/loss,▇▅▄▂▂▃▂▂▁▄▄▄▅▄▄▅████▇
eval/runtime,▂▂▂▂▂▃▄▂▃▁▄▃▃▁▁▂▂▃▄▅█
eval/samples_per_second,▇▇▇▇▇▆▅▇▆█▅▆▆██▇▇▆▅▄▁
eval/steps_per_second,▇▇▇▇▇▆▅▇▆█▅▆▆██▇▇▆▅▄▁
eval_EM,▁▂▄▆▅▆▇▇▇▆▇▇▇▇██▇▇███
eval_F1,▁▃▅▆▅▆▇▇▇▇█▇█▇▇██████
train/epoch,▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇█████
train/global_step,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▅▆▆▆▆▆▆▇▇▇▇▇▇▇▇▇▇▇████

0,1
eval/EM,0.67685
eval/F1,0.82321
eval/loss,1.07302
eval/runtime,19.5975
eval/samples_per_second,302.232
eval/steps_per_second,18.931
eval_EM,0.67685
eval_F1,0.82321
total_flos,9.069714945009254e+16
train/epoch,4


Model saved to: /content/drive/MyDrive/10701_Project/saved_models/roberta-finetuned


## Fine-Tuning - SpanBERT

In [None]:
# Authenticate this runtime with Weights & Biases (prompts for an API key)
# before starting the fine-tuning run that logs to W&B.
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mlylefenglin[0m ([33m10701-team[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
import time
import wandb
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
)


# Load the SQuAD v2.0 splits with the notebook's `load_squad` helper
# (defined in an earlier cell).
train_df = load_squad('/content/drive/MyDrive/10701_Project/train-v2.0.json')
dev_df   = load_squad('/content/drive/MyDrive/10701_Project/dev-v2.0.json')

# Uncomment for a quick smoke test on a small subset.
# train_df = train_df.head(500)
# dev_df   = dev_df.head(500)

# Built once; the frames are not modified afterwards, so a second conversion
# later in the cell was redundant.
train_dataset = Dataset.from_pandas(train_df)
eval_dataset  = Dataset.from_pandas(dev_df)

# Checkpoint to fine-tune. AutoTokenizer / AutoModelForQuestionAnswering are
# already imported at the top of this cell.
model_name = "SpanBERT/spanbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Timestamped W&B run name. A separator keeps the timestamp and the model name
# readable, and the "/" of the hub id is replaced so the name has no slash.
run_timestamp = time.strftime('%Y%m%d-%H%M%S')
run_name = f"run-{run_timestamp}-{model_name.replace('/', '-')}"
wandb.init(project="qa-finetune-SpanBERT", entity="10701-team", name=run_name, reinit=True)

def preprocess(example):
    """Tokenize one question/context pair and attach answer-span labels.

    Uses the module-level ``tokenizer``; every example is padded/truncated to
    512 tokens.

    Args:
        example: A single dataset row providing ``question``, ``context``,
            ``answer_text`` and ``answer_start`` (character offset of the
            answer inside ``context``).

    Returns:
        The tokenizer output extended with ``start_positions`` and
        ``end_positions`` (token indices of the answer span) plus the
        ``context``, ``answer_text`` and ``offset_mapping`` fields that
        ``compute_metrics_for_trainer`` reads back to compute EM/F1.
        When the answer is not fully contained in the (possibly truncated)
        context tokens, both labels are 0.
    """
    tokenized = tokenizer(
        example["question"],
        example["context"],
        truncation=True,
        max_length=512,
        padding="max_length",
        return_offsets_mapping=True,
    )

    offsets = tokenized.pop("offset_mapping")
    # Per-token segment id: None for special/padding tokens, 0 for question
    # tokens, 1 for context tokens. Available on fast tokenizers, which
    # return_offsets_mapping already requires.
    sequence_ids = tokenized.sequence_ids()

    start_char = example["answer_start"]
    end_char = start_char + len(example["answer_text"])

    start_token = None
    end_token = None

    for i, (s, e) in enumerate(offsets):
        # Question-token offsets are relative to the question string, so they
        # must not be compared against character positions in the context.
        if sequence_ids[i] != 1:
            continue
        if s <= start_char < e:
            start_token = i
        if s < end_char <= e:
            end_token = i

    # Answer missing or only partly present (e.g. cut off by truncation):
    # label the example with index 0 for both ends instead of a partial span.
    if start_token is None or end_token is None:
        start_token = 0
        end_token = 0

    tokenized["start_positions"] = start_token
    tokenized["end_positions"] = end_token

    # keep for EM/F1
    tokenized["context"] = example["context"]
    tokenized["answer_text"] = example["answer_text"]
    tokenized["offset_mapping"] = offsets

    return tokenized

# Tokenize both splits one example at a time with `preprocess`.
train_enc = train_dataset.map(preprocess)
eval_enc = eval_dataset.map(preprocess)

# Drop the "__index_level_0__" column from both splits (original note:
# "remove HF warnings"); presumably the pandas index carried over by
# Dataset.from_pandas — confirm against `load_squad`.
train_enc, eval_enc = (
    encoded.remove_columns(["__index_level_0__"]) for encoded in (train_enc, eval_enc)
)

# Training hyper-parameters, grouped by purpose.
training_kwargs = dict(
    # where checkpoints go and how often to evaluate / save
    output_dir="SpanBERT-finetuned",
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="epoch",
    # save_total_limit=2,
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # mixed precision: bf16 on, fp16 off
    bf16=True,
    fp16=False,
    # data loading
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
    # logging
    logging_dir="./logs",
    logging_steps=50,
    report_to="wandb",
)
args = TrainingArguments(**training_kwargs)

def compute_metrics_for_trainer(eval_pred):
    """Turn start/end logits into answer strings and score them with EM/F1.

    Row ``i`` of the logits is matched with ``eval_enc[i]`` (module-level
    encoded eval split), so predictions must be in the same order as
    ``eval_enc``.

    Args:
        eval_pred: Object whose ``predictions`` attribute unpacks into
            ``(start_logits, end_logits)``, each indexable per example.

    Returns:
        ``{"EM": ..., "F1": ...}`` as computed by ``evaluate_answers``
        (defined in an earlier cell).
    """
    start_logits, end_logits = eval_pred.predictions

    pred_answers = []
    gold_answers = []

    for i in range(len(start_logits)):
        example = eval_enc[i]

        s_idx = int(start_logits[i].argmax())
        e_idx = int(end_logits[i].argmax())

        if e_idx < s_idx:
            # Start and end are chosen independently, so the end can land
            # before the start. That is not a valid span; treat it as an empty
            # prediction rather than slicing with offsets that may belong to
            # different segments.
            pred = ""
        else:
            offsets = example["offset_mapping"]
            context = example["context"]
            # Map the token span back to characters in the context.
            pred = context[offsets[s_idx][0]:offsets[e_idx][1]]

        pred_answers.append(pred)
        gold_answers.append(example["answer_text"])

    EM, F1 = evaluate_answers(pred_answers, gold_answers)
    # NOTE(review): the W&B summary of this run lists both eval/EM and eval_EM,
    # so this manual log appears to duplicate what Trainer already reports —
    # confirm before removing.
    wandb.log({"eval_EM": EM, "eval_F1": F1})

    return {"EM": EM, "F1": F1}

# Wire the model, encoded splits and the EM/F1 metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_enc,
    eval_dataset=eval_enc,
    compute_metrics=compute_metrics_for_trainer,
)

# Run fine-tuning (evaluation/logging cadence comes from `args`).
trainer.train()

# Save the final model and tokenizer locally.
model.save_pretrained("./SpanBERT-finetuned")
tokenizer.save_pretrained("./SpanBERT-finetuned")

# Close the W&B run started earlier in this cell.
wandb.finish()

# Persist a second copy on Google Drive so it survives the Colab runtime.
save_path = "/content/drive/MyDrive/10701_Project/saved_models/SpanBERT-finetuned"

import os
os.makedirs(save_path, exist_ok=True)  # no error if the folder already exists

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print("Model saved to:", save_path)


config.json:   0%|          | 0.00/413 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/215M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at SpanBERT/spanbert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  | |_| | '_ \/ _` / _` |  _/ -_)


model.safetensors:   0%|          | 0.00/215M [00:00<?, ?B/s]

[34m[1mwandb[0m: Currently logged in as: [33mlylefenglin[0m ([33m10701-team[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Map:   0%|          | 0/86776 [00:00<?, ? examples/s]

Map:   0%|          | 0/5923 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss,Em,F1
1000,1.467,1.402221,0.576735,0.739491
2000,1.1592,1.104636,0.618437,0.775948
3000,1.0111,1.023228,0.63971,0.795137
4000,1.0618,0.966372,0.650177,0.802299
5000,0.9866,0.966152,0.657268,0.807065
6000,0.7574,1.002852,0.658619,0.811022
7000,0.8111,0.956404,0.66419,0.816148
8000,0.8277,0.976028,0.663853,0.815195
9000,0.7461,0.946285,0.666048,0.814863
10000,0.6859,1.00132,0.668242,0.816004


0,1
eval/EM,▁▄▆▆▇▇▇▇█████████████
eval/F1,▁▄▆▆▇▇▇▇▇▇███████████
eval/loss,█▃▂▁▁▂▁▁▁▂▂▂▂▂▁▂▃▃▃▃▃
eval/runtime,▃▄█▃▇▇▁▆▄▆▂▃▁▂▇▂▅▂▄▆▅
eval/samples_per_second,▆▅▁▆▂▂█▃▅▃▇▆█▇▂▇▄▇▅▃▄
eval/steps_per_second,▆▅▁▆▂▂█▃▅▃▇▆█▇▂▇▄▇▅▃▄
eval_EM,▁▄▆▆▇▇▇▇█████████████
eval_F1,▁▄▆▆▇▇▇▇▇▇███████████
train/epoch,▁▁▁▁▂▂▂▂▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇█████
train/global_step,▁▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇█

0,1
eval/EM,0.66976
eval/F1,0.82125
eval/loss,1.05507
eval/runtime,20.9749
eval/samples_per_second,282.385
eval/steps_per_second,17.688
eval_EM,0.66976
eval_F1,0.82125
total_flos,9.069714945009254e+16
train/epoch,4


Model saved to: /content/drive/MyDrive/10701_Project/saved_models/SpanBERT-finetuned
