In [None]:
# Install the third-party packages and fetch the NLTK resources used by this notebook.
# These lines can be commented out once everything is already installed.
!pip install nltk requests transformers
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')


import requests
import re
import time
import json
from urllib.parse import quote

#########################
# 1. Transformer Baselines
#########################
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GPT2LMHeadModel, GPT2Tokenizer

# (tokenizer, model) pairs keyed by checkpoint name, so the weights are loaded
# once per session instead of once per question.
_T5_BASELINE_CACHE = {}


def baseline_generate_t5(question, num_candidates=5):
    """
    Generates candidate answers for a question using T5 (without any external knowledge).

    Args:
        question: The question text; it is prefixed with "question: " before encoding.
        num_candidates: Number of sampled sequences to return.

    Returns:
        A list of `num_candidates` decoded strings (special tokens stripped).
    """
    model_name = "t5-small"
    if model_name not in _T5_BASELINE_CACHE:
        # Loading is by far the most expensive step; do it only on first use.
        _T5_BASELINE_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModelForSeq2SeqLM.from_pretrained(model_name),
        )
    tokenizer, model = _T5_BASELINE_CACHE[model_name]

    input_text = "question: " + question
    input_ids = tokenizer.encode(input_text, return_tensors="pt")
    outputs = model.generate(
        input_ids,
        max_length=50,
        num_return_sequences=num_candidates,
        do_sample=True,  # top-k sampling so the returned candidates can differ
        top_k=50
    )
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

# (tokenizer, model) pairs keyed by checkpoint name, so the weights are loaded
# once per session instead of once per question.
_BART_BASELINE_CACHE = {}


def baseline_generate_bart(question, num_candidates=5):
    """
    Generates candidate answers for a question using BART (without knowledge integration).

    Args:
        question: The question text, encoded as-is (no task prefix).
        num_candidates: Number of sequences to return; also used as the beam width.

    Returns:
        A list of `num_candidates` decoded strings (special tokens stripped).
    """
    model_name = "facebook/bart-base"
    if model_name not in _BART_BASELINE_CACHE:
        # Loading is by far the most expensive step; do it only on first use.
        _BART_BASELINE_CACHE[model_name] = (
            AutoTokenizer.from_pretrained(model_name),
            AutoModelForSeq2SeqLM.from_pretrained(model_name),
        )
    tokenizer, model = _BART_BASELINE_CACHE[model_name]

    input_ids = tokenizer.encode(question, return_tensors="pt")
    outputs = model.generate(
        input_ids,
        max_length=50,
        num_return_sequences=num_candidates,
        num_beams=num_candidates,  # beam width must be >= num_return_sequences
        do_sample=True,
        top_k=50
    )
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

# (tokenizer, model) pairs keyed by checkpoint name, so the weights are loaded
# once per session instead of once per question.
_GPT2_BASELINE_CACHE = {}


def baseline_generate_gpt2(question, num_candidates=5):
    """
    Generates candidate answers for a question using GPT-2 (without knowledge integration).

    Args:
        question: The question text, used verbatim as the generation prompt.
        num_candidates: Number of sequences to return; also used as the beam width.

    Returns:
        A list of `num_candidates` decoded strings. Each string starts with the
        prompt, because a decoder-only model returns prompt + continuation.
    """
    model_name = "gpt2"
    if model_name not in _GPT2_BASELINE_CACHE:
        tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        # GPT-2 ships without a pad token; reuse EOS so padding is well defined.
        tokenizer.pad_token = tokenizer.eos_token
        _GPT2_BASELINE_CACHE[model_name] = (
            tokenizer,
            GPT2LMHeadModel.from_pretrained(model_name),
        )
    tokenizer, model = _GPT2_BASELINE_CACHE[model_name]

    encoded = tokenizer(question, return_tensors="pt")
    outputs = model.generate(
        encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        max_length=50,
        num_return_sequences=num_candidates,
        num_beams=num_candidates,
        do_sample=False,  # deterministic beam search
        pad_token_id=tokenizer.eos_token_id
    )
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]


#########################
# 2. NLTK Utilities for Lemmatization, POS Tagging, and Expansion
#########################
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn

# Shared WordNet lemmatizer instance, used by tokenize_and_lemmatize().
lemmatizer = WordNetLemmatizer()

def tokenize_and_lemmatize(sentence):
    """Lower-case and tokenize `sentence`, returning the set of lemmatized tokens."""
    return {
        lemmatizer.lemmatize(tok)
        for tok in nltk.word_tokenize(sentence.lower())
    }

def expand_word(word):
    """
    Return `word` together with the lemma names of each of its WordNet synsets
    and of those synsets' direct hypernyms (lower-cased, underscores as spaces).
    """
    related = {word}
    for synset in wn.synsets(word):
        # Treat the synset and its direct hypernyms uniformly.
        for sense in [synset, *synset.hypernyms()]:
            related.update(
                name.lower().replace("_", " ") for name in sense.lemma_names()
            )
    return related

def expand_tokens(token_set):
    """Return the union of expand_word(token) over every token in `token_set`."""
    merged = set()
    for tok in token_set:
        merged |= expand_word(tok)
    return merged

def determine_expected_answer_type(question):
    """
    Classify the expected answer as "action" or "object".

    Heuristic: the answer is an "action" when the tokenized, lower-cased
    question contains the token "do", "perform", or "act"; otherwise "object".
    """
    action_markers = {"do", "perform", "act"}
    question_tokens = nltk.word_tokenize(question.lower())
    return "action" if action_markers.intersection(question_tokens) else "object"

#########################
# 3. Improved Candidate Generation Using ConceptNet
#########################
# Root of the ConceptNet web API; node URIs such as "/c/en/rain" are appended to it.
BASE_URL = "http://api.conceptnet.io"

def get_edges_for_term(term, limit=50, timeout=10):
    """
    Fetch the edges attached to a ConceptNet node.

    Args:
        term: Node URI path (e.g. "/c/en/rain"); appended directly to BASE_URL.
        limit: Maximum number of edges requested from the API.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout `requests` can block indefinitely on an unresponsive host.

    Returns:
        The list under the response's "edges" key, or [] when the request
        fails, times out, or the body is not valid JSON (the error is printed).
    """
    url = f"{BASE_URL}{term}?offset=0&limit={limit}"
    try:
        resp = requests.get(url, timeout=timeout)
        resp.raise_for_status()
        data = resp.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on older `requests` versions.
        print(f"Error fetching ConceptNet data for {term}: {e}")
        return []
    return data.get("edges", [])

def text_to_conceptnet_uri(text, lang='en'):
    """
    Build a ConceptNet node path "/c/<lang>/<term>" from free text.

    The text is stripped, lower-cased, has its spaces turned into underscores,
    and is then percent-encoded.
    """
    slug = "_".join(text.strip().lower().split(" "))
    encoded = quote(slug)
    return f"/c/{lang}/{encoded}"

def extract_keywords_improved(question_text, expected_type):
    """
    Pick the query keywords for a question.

      - expected_type == "action": verbs (tags starting with "VB"), minus a
        small set of trivial verbs.
      - any other expected_type: nouns (tags starting with "NN").
      - If that yields nothing, fall back to every token that is not in a
        small stopword list.
    """
    tokens = nltk.word_tokenize(question_text.lower())
    tagged = nltk.pos_tag(tokens)

    if expected_type == "action":
        tag_prefix = "VB"
        excluded = {"name", "do", "get", "be", "have"}
    else:
        tag_prefix = "NN"
        excluded = set()

    keywords = [
        word
        for word, tag in tagged
        if tag.startswith(tag_prefix) and word not in excluded
    ]
    if keywords:
        return keywords

    # Nothing matched the POS filter: use all non-stopword tokens instead.
    stopwords = {"what", "name", "something", "you", "on", "the", "a", "an", "of"}
    return [word for word in tokens if word not in stopwords]

def filter_candidates_by_pos(candidates, expected_type):
    """
    Keep candidates that contain the expected part of speech: at least one verb
    tag ("VB*") for "action", at least one noun tag ("NN*") for anything else.
    Input order is preserved.
    """
    wanted_prefix = "VB" if expected_type == "action" else "NN"

    def has_wanted_tag(text):
        tagged = nltk.pos_tag(nltk.word_tokenize(text))
        return any(tag.startswith(wanted_prefix) for _, tag in tagged)

    return [cand for cand in candidates if has_wanted_tag(cand)]

def _conceptnet_term(node):
    """Return a node's sense-free "term" URI, falling back to its "@id"."""
    # NOTE(review): relies on ConceptNet node objects carrying a "term" field
    # that points at the general form of a sense-qualified id
    # (e.g. "/c/en/dance/v" -> "/c/en/dance"). If the field is absent the
    # behaviour is exactly the old "@id" comparison.
    return node.get("term") or node.get("@id", "")


def generate_candidates_improved(question_text, top_k=20):
    """
    Candidate generation using ConceptNet, with broader relation filters and contextual cues.

    Args:
        question_text: The natural-language question.
        top_k: Maximum number of edges requested per keyword.

    Returns:
        A list of English candidate phrases (underscores replaced by spaces),
        alphabetically ordered and filtered by the expected part of speech.
    """
    expected_type = determine_expected_answer_type(question_text)
    # extract_keywords_improved() already falls back to all non-stopword
    # tokens, so no second fallback is needed here.
    keywords = extract_keywords_improved(question_text, expected_type)

    # Contextual cue: match rain-related *tokens* ("rain", "rainy", "raining")
    # rather than a raw substring, which also fired on "train" or "brain".
    question_tokens = nltk.word_tokenize(question_text.lower())
    if any(token.startswith("rain") for token in question_tokens):
        keywords = list(keywords) + ["rain", "rainy", "weather", "wet", "drizzle"]

    # De-duplicate (order-preserving) so each keyword costs one HTTP request.
    keywords = list(dict.fromkeys(keywords))

    # Relaxed relation filters:
    if expected_type == "action":
        relation_set = {
            "/r/UsedFor", "/r/CapableOf", "/r/MotivatedBy", "/r/Desires",
            "/r/RelatedTo", "/r/HasSubevent", "/r/HasPrerequisite"
        }
    else:
        relation_set = {
            "/r/IsA", "/r/PartOf", "/r/HasA", "/r/RelatedTo", "/r/AtLocation"
        }

    candidates = set()
    for kw in keywords:
        uri = text_to_conceptnet_uri(kw)
        for edge in get_edges_for_term(uri, limit=top_k):
            if edge.get("rel", {}).get("@id", "") not in relation_set:
                continue
            start_term = _conceptnet_term(edge.get("start", {}))
            end_term = _conceptnet_term(edge.get("end", {}))
            # Keep the node on the *other* side of the edge from the keyword.
            if start_term == uri and end_term.startswith("/c/en/"):
                candidates.add(end_term)
            elif end_term == uri and start_term.startswith("/c/en/"):
                candidates.add(start_term)

    # Sorted so the result does not depend on set iteration order.
    text_candidates = sorted(
        {c.replace("/c/en/", "").replace("_", " ") for c in candidates}
    )

    # Filter candidates by expected POS
    return filter_candidates_by_pos(text_candidates, expected_type)


#########################
# 4. Answer Ranking
#########################
def score_brevity(answer):
    """Return 1 / (number of whitespace-separated words), or 0.0 for an empty answer."""
    word_count = len(answer.split())
    if word_count == 0:
        return 0.0
    return 1.0 / word_count

def get_concreteness(answer):
    """
    Return the mean concreteness rating of the words in `answer`.

    Ratings come from a small demonstration table of fruit names; any word not
    in the table is rated 3.0. An answer with no words scores 0.0.
    """
    # Example ratings for demonstration
    ratings = {
        fruit: 4.0
        for fruit in (
            'blueberry', 'watermelon', 'lemon', 'lime', 'pineapple', 'peach',
            'raspberry', 'orange', 'grape', 'mango', 'plums',
        )
    }
    ratings['apple'] = 5.0
    ratings['cherry'] = 4.5

    words = answer.lower().split()
    if not words:
        return 0.0
    return sum(ratings.get(word, 3.0) for word in words) / len(words)

def score_typicality(question, answer):
    """
    Jaccard overlap between the WordNet-expanded lemma sets of the question
    and the answer; 0.0 when either expanded set is empty.
    """
    question_lemmas = tokenize_and_lemmatize(question)
    answer_lemmas = tokenize_and_lemmatize(answer)
    question_terms = expand_tokens(question_lemmas)
    answer_terms = expand_tokens(answer_lemmas)
    if not (question_terms and answer_terms):
        return 0.0
    shared = question_terms & answer_terms
    combined = question_terms | answer_terms
    return len(shared) / len(combined)

def rank_candidates_by_composite(question, candidates, weights=(0.3, 0.4, 0.3)):
    """
    Rank candidate answers by a weighted sum of brevity, concreteness, and typicality.

    `weights` is indexed as (brevity, concreteness, typicality). Returns a list
    of (candidate, composite, brevity, concreteness, typicality) tuples sorted
    by composite score, highest first; ties keep their input order.
    """
    def score_row(candidate):
        brevity = score_brevity(candidate)
        concreteness = get_concreteness(candidate)
        typicality = score_typicality(question, candidate)
        composite = (
            weights[0]*brevity + weights[1]*concreteness + weights[2]*typicality
        )
        return (candidate, composite, brevity, concreteness, typicality)

    rows = [score_row(candidate) for candidate in candidates]
    return sorted(rows, key=lambda row: row[1], reverse=True)


#########################
# 5. Enhanced Post-Processing
#########################
import difflib

def postprocess_candidates(ranked_candidates, question, similarity_threshold=0.8):
    """
    Filter a ranked list of (candidate, composite, brevity, concreteness,
    typicality) tuples, keeping their order.

    - Drops candidates whose stripped text is shorter than 2 characters.
    - Drops a candidate when its case-insensitive difflib similarity ratio to
      an already-kept candidate is >= `similarity_threshold`.

    `question` is accepted but not used.
    """
    kept_rows = []
    kept_texts = []  # text of every accepted candidate, for the duplicate check

    def is_near_duplicate(text):
        lowered = text.lower()
        return any(
            difflib.SequenceMatcher(None, lowered, earlier.lower()).ratio()
            >= similarity_threshold
            for earlier in kept_texts
        )

    for row in ranked_candidates:
        text, _composite, _brevity, _concreteness, _typicality = row

        too_short = len(text.strip()) < 2
        if too_short or is_near_duplicate(text):
            continue

        kept_rows.append(row)
        kept_texts.append(text)

    return kept_rows


#########################
# 6. Main Pipeline
#########################
def main():
    """
    Run the full demo pipeline on three hard-coded sample questions.

    For each question this prints the T5 / BART / GPT-2 baseline generations,
    the raw ConceptNet candidates, the ranked candidates, the post-processed
    candidates, and finally the gold answers for reference.
    """
    sample_data = [
        {
            "question": "Name a fruit you might put in a pie",
            "answers": ["apple", "cherry", "blueberry"]
        },
        {
            "question": "Name something you might do at a party",
            "answers": ["dance", "talk", "mingle"]
        },
        {
            "question": "Name something you might find in a kitchen",
            "answers": ["knife", "fork", "spoon"]
        }
    ]

    for entry in sample_data:
        question = entry["question"]
        gold_answers = entry.get("answers", [])
        print("\n==============================")
        print(f"Processing Question: {question}")
        print("==============================")

        # 1) Baseline Transformer Outputs
        print("\nBaseline Evaluation (without knowledge integration):")
        print("\nT5 Generated Answers:")
        for ans in baseline_generate_t5(question):
            print(" -", ans)
        print("\nBART Generated Answers:")
        for ans in baseline_generate_bart(question):
            print(" -", ans)
        print("\nGPT-2 Generated Answers:")
        for ans in baseline_generate_gpt2(question):
            print(" -", ans)

        # 2) Candidate Generation using ConceptNet
        print("\nCandidate Generation Using ConceptNet:")
        candidates = generate_candidates_improved(question, top_k=20)
        if not candidates:
            print("No candidates found.")
            print("\nGold Answers (for reference):", gold_answers)
            continue
        print("Generated Candidates (raw):")
        print(candidates)

        # 3) Ranking
        ranked_candidates = rank_candidates_by_composite(question, candidates)
        print("\nRanked Candidates (candidate | composite | brevity | concreteness | typicality):")
        for cand, comp, brev, conc, typ in ranked_candidates:
            print(f"{cand:30s} | {comp:8.3f} | {brev:7.3f} | {conc:11.3f} | {typ:10.3f}")

        # 4) Enhanced Post-Processing
        print("\nPost-Processing for Redundancy & Coherence:")
        final_candidates = postprocess_candidates(ranked_candidates, question, similarity_threshold=0.8)
        if not final_candidates:
            print("All candidates removed during post-processing.")
        else:
            print("Final Post-Processed Candidates:")
            for cand, comp, brev, conc, typ in final_candidates:
                print(f"{cand:30s} | {comp:8.3f} | {brev:7.3f} | {conc:11.3f} | {typ:10.3f}")

        print("\nGold Answers (for reference):")
        print(gold_answers)

        # Short pause between questions (the original slept twice here by accident).
        time.sleep(0.5)

if __name__ == "__main__":
    main()




[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.



Processing Question: Name a fruit you might put in a pie

Baseline Evaluation (without knowledge integration):

T5 Generated Answers:


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

 - ( d) nomen it the fruit you might put in a pie
 - nomen a fruit you might put in a pie
 - name a fruit
 - name
 - name a fruit you might put in a pie

BART Generated Answers:


config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

 - Name a fruit you might put in a pie
 - Name a fruit you might put in a pie.
 - Name a fruit you might put in a pieadvertisement
 - Name a fruit you might put in a pie (
 - Namea fruit you might put in a pie

GPT-2 Generated Answers:


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

 - Name a fruit you might put in a pie crust.

You can also make your own pie crust by using a pastry brush.

You can also make your own pie crust by using a pastry brush. You can also make your own pie
 - Name a fruit you might put in a pie crust.

You can also make your own pie crust by using a pastry brush.

You can also make your own pie crust by using a pastry brush.

You can also make your
 - Name a fruit you might put in a pie crust.

You can also make your own pie crust by using a pie cutter.

You can also make your own pie crust by using a pie cutter. You can also make your own pie
 - Name a fruit you might put in a pie crust.

You can also make your own pie crust by using a pie crust cutter.

You can also make your own pie crust by using a pie crust cutter. You can also make your
 - Name a fruit you might put in a pie crust.

You can also make your own pie crust by using a pie crust cutter.

You can also make your own pie crust by using a pie crust cutter.

You can also

Cand

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

 - Name something
 - Name something
 - at a party
 - name something you might make at a party
 - nomnate

BART Generated Answers:
 - Name something you might do at a party
 - Name something you might do at a partyadvertisement
 - Name something you might do at a partyadvertisementadvertisement
 - Name something you might do at a party.
 - Name something you might do at a partyShare something you are passionate about at a

GPT-2 Generated Answers:
 - Name something you might do at a party.

If you don't know what you're doing, ask for help.

If you don't know what you're doing, ask for help. If you don't know what you're doing
 - Name something you might do at a party.

If you don't know what you're doing, ask for help.

If you don't know what you're doing, ask for help.

If you don't know what you
 - Name something you might do at a party.

If you don't know what you're doing, ask for help.

If you don't know what you're doing, ask for help.

If you don't know how to
 - Name something 

ISHAN'S CODE ON THE PROTOQA TRAIN DATASET

In [None]:
import json

# Load the JSONL file
def load_protoqa_questions(file_path):
    """
    Read a JSONL file and return each record's original question text.

    Args:
        file_path: Path to a JSON-lines file in which every record has the
            shape {"question": {"original": <str>, ...}, ...}.

    Returns:
        A list of question strings, in file order. Blank lines are skipped.
    """
    questions = []
    # Explicit UTF-8 so non-ASCII questions decode the same on every platform.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank or trailing empty lines
            entry = json.loads(line)  # Load each line as a JSON object
            questions.append(entry["question"]["original"])
    return questions

# Load every question from the ProtoQA training file and preview the first five.
protoqa_questions = load_protoqa_questions("/content/train.jsonl")
print(protoqa_questions[:5])

['At The Beach, Name Something That Might Protect You From Sun. ', 'After A Week Of Camping, What Luxury Oh Home Are You Most Excited To Have Again? ', 'After Having Kids Name Something That Happens That Interrupts A Couples Alone Time At Night', 'At What Age Might A Man Have Midlifecrisis? ', 'At What Age Does It Become Embarrassing To Still Live With Your Parents (Numeric Only)? ']


In [None]:
def main():
    """
    Run the baseline models and the ConceptNet pipeline on every question in
    "train.jsonl", printing each stage's output per question.
    """
    def print_rows(rows):
        # One aligned line per (candidate, composite, brevity, concreteness, typicality).
        for cand, comp, brev, conc, typ in rows:
            print(f"{cand:30s} | {comp:8.3f} | {brev:7.3f} | {conc:11.3f} | {typ:10.3f}")

    # Section header paired with the baseline generator it introduces.
    baselines = (
        ("\nT5 Generated Answers:", baseline_generate_t5),
        ("\nBART Generated Answers:", baseline_generate_bart),
        ("\nGPT-2 Generated Answers:", baseline_generate_gpt2),
    )

    # Load ProtoQA questions from JSONL
    for question in load_protoqa_questions("train.jsonl"):
        print("\n==============================")
        print(f"Processing Question: {question}")
        print("==============================")

        # 1) Baseline Transformer Outputs
        print("\nBaseline Evaluation (without knowledge integration):")
        for header, generate in baselines:
            print(header)
            for ans in generate(question):
                print(" -", ans)

        # 2) Candidate Generation using ConceptNet
        print("\nCandidate Generation Using ConceptNet:")
        candidates = generate_candidates_improved(question, top_k=20)
        if not candidates:
            print("No candidates found.")
            continue
        print("Generated Candidates (raw):")
        print(candidates)

        # 3) Ranking
        ranked_candidates = rank_candidates_by_composite(question, candidates)
        print("\nRanked Candidates (candidate | composite | brevity | concreteness | typicality):")
        print_rows(ranked_candidates)

        # 4) Enhanced Post-Processing
        print("\nPost-Processing for Redundancy & Coherence:")
        final_candidates = postprocess_candidates(ranked_candidates, question, similarity_threshold=0.8)
        if final_candidates:
            print("Final Post-Processed Candidates:")
            print_rows(final_candidates)
        else:
            print("All candidates removed during post-processing.")

        print("\n--------------------------------------------")

        # Brief pause between questions
        time.sleep(0.5)

if __name__ == "__main__":
    main()


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

--------------------------------------------

Processing Question: Name a sporting event that only happens once a year. 

Baseline Evaluation (without knowledge integration):

T5 Generated Answers:
 - sporting event
 - sporting event
 - Sport
 - the sport event that only occurs once annually
 - sport

BART Generated Answers:
 - Name a sporting event that only happens once a year. 
 - Name a sporting event that only happens once a year.
 - Name a sporting event that only happens once a year. ________________________
 - Name a sporting event that only happens once a year. _________________________
 - Name a sporting event that only happens once a year. ____________________________

GPT-2 Generated Answers:
 - Name a sporting event that only happens once a year.  If you're going to do it this year, you're going to want to do it this year.  If you're going to do it this year, you're going to
 - Name a sporting event that onl

KeyboardInterrupt: 

In [None]:
import json
import time
import pandas as pd
from IPython.display import display

# Load the JSONL file
def load_protoqa_questions(file_path, limit=100):
    """
    Load the first `limit` questions from a JSONL file.

    Args:
        file_path: Path to a JSON-lines file in which every record has the
            shape {"question": {"original": <str>, ...}, ...}.
        limit: Maximum number of questions to return.

    Returns:
        A list of at most `limit` question strings, in file order. Blank lines
        are skipped and do not count towards `limit`.
    """
    questions = []
    # Explicit UTF-8 so non-ASCII questions decode the same on every platform.
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if len(questions) >= limit:
                break  # Stop once `limit` questions have been collected
            if not line.strip():
                continue  # tolerate blank or trailing empty lines
            entry = json.loads(line)  # Load each line as a JSON object
            questions.append(entry["question"]["original"])
    return questions

def main():
    """
    Build and display a summary table for the ProtoQA questions.

    For every loaded question the table holds up to three rows, one per
    surviving ConceptNet candidate, each alongside the top-3 baseline
    generations of T5, BART and GPT-2. A question with no usable candidate
    gets a single placeholder row with zero scores.
    """
    # Only the first question is loaded; raise `limit` to evaluate more.
    protoqa_questions = load_protoqa_questions("/content/train.jsonl", limit=1)

    # Data storage for summary table
    results = []

    for idx, question in enumerate(protoqa_questions):
        print(f"\nProcessing Question {idx+1}: {question}")
        print("="*50)

        # 1) Baseline Transformer Outputs (top 3 of each, joined once per question)
        t5_summary = ", ".join(baseline_generate_t5(question)[:3])
        bart_summary = ", ".join(baseline_generate_bart(question)[:3])
        gpt2_summary = ", ".join(baseline_generate_gpt2(question)[:3])

        # 2) Candidate Generation using ConceptNet
        candidates = generate_candidates_improved(question, top_k=20)

        final_candidates = []
        if candidates:
            # 3) Ranking
            ranked_candidates = rank_candidates_by_composite(question, candidates)

            # 4) Post-process the full ranking first, then keep the top 3, so
            #    that removed duplicates are back-filled by the next-best answers.
            final_candidates = postprocess_candidates(
                ranked_candidates, question, similarity_threshold=0.8
            )[:3]

        if not final_candidates:
            # Record the miss explicitly instead of scoring a placeholder string
            # as if it were a real answer.
            final_candidates = [("No candidates found", 0.0, 0.0, 0.0, 0.0)]

        # Store results in a structured format (only top 3)
        for cand, comp, brev, conc, typ in final_candidates:
            results.append({
                "Question": question,
                "T5 Answer": t5_summary,
                "BART Answer": bart_summary,
                "GPT-2 Answer": gpt2_summary,
                "Candidate": cand,
                "Composite Score": round(comp, 3),
                "Brevity": round(brev, 3),
                "Concreteness": round(conc, 3),
                "Typicality": round(typ, 3)
            })

        # Brief pause between questions
        time.sleep(0.2)

    # Convert results to a DataFrame and display it in the Colab notebook
    df = pd.DataFrame(results)
    display(df)  # This will display the DataFrame directly in Google Colab

if __name__ == "__main__":
    main()



Processing Question 1: At The Beach, Name Something That Might Protect You From Sun. 


Unnamed: 0,Question,T5 Answer,BART Answer,GPT-2 Answer,Candidate,Composite Score,Brevity,Concreteness,Typicality
0,"At The Beach, Name Something That Might Protec...",Name Something That Might Protect You From Sun...,"At The Beach, Name Something That Might Protec...","At The Beach, Name Something That Might Protec...",light,1.507,1.0,3.0,0.024
1,"At The Beach, Name Something That Might Protec...",Name Something That Might Protect You From Sun...,"At The Beach, Name Something That Might Protec...","At The Beach, Name Something That Might Protec...",mark,1.507,1.0,3.0,0.022
2,"At The Beach, Name Something That Might Protec...",Name Something That Might Protect You From Sun...,"At The Beach, Name Something That Might Protec...","At The Beach, Name Something That Might Protec...",label,1.504,1.0,3.0,0.014


MODEL WITH SAMPLE SIZE

In [None]:
import json
import nltk
import requests
import torch
import pandas as pd
import time
import numpy as np
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    Trainer,
    TrainingArguments
)

# Ensure the NLTK resources are available: WordNet (plus the Open Multilingual
# WordNet data), the perceptron POS tagger, and the punkt tokenizer.
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

#########################
# 1. Data Loading Functions
#########################

def load_protoqa_data(filename):
    """
    Parse a JSONL file and return its JSON objects.

    Lines that fail to parse are reported with a printed message and skipped;
    lines that parse to something other than a dict are silently skipped.
    """
    records = []
    with open(filename, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            try:
                parsed = json.loads(raw_line.strip())
            except json.JSONDecodeError as e:
                print(f"Error parsing JSON in {filename}: {e}")
                continue
            if isinstance(parsed, dict):
                records.append(parsed)
    return records

def extract_gold_answers(entry):
    """
    Extract the gold answers from a ProtoQA entry's answer clusters.

    Args:
        entry: A parsed record; answers are read from
            entry["answers"]["clusters"][<id>]["answers"].

    Returns:
        A list of unique answer strings in first-seen order (cluster order,
        then order within each cluster). The order is deterministic, which
        matters because callers use the first element as the training target.
        Missing keys yield an empty list.
    """
    clusters = entry.get("answers", {}).get("clusters", {})
    # dict keys keep insertion order, giving an order-preserving de-duplication
    # (a plain set would make the order vary between runs).
    unique_answers = {}
    for cluster in clusters.values():
        for answer in cluster.get("answers", []):
            unique_answers.setdefault(answer, None)
    return list(unique_answers)

def load_protoqa_train(filename, max_samples=1000):
    """
    Return the first `max_samples` records of `filename` as
    {"question": <normalized question>, "gold_answers": [...]} dicts.
    """
    selected = load_protoqa_data(filename)[:max_samples]
    return [
        {
            "question": record["question"]["normalized"],
            "gold_answers": extract_gold_answers(record),
        }
        for record in selected
    ]

def load_protoqa_dev(filename, start_index=1000, num_samples=100):
    """
    Return `num_samples` records of `filename`, starting at `start_index`, as
    {"question": <normalized question>, "gold_answers": [...]} dicts.
    """
    stop_index = start_index + num_samples
    held_out = load_protoqa_data(filename)[start_index:stop_index]
    return [
        {
            "question": record["question"]["normalized"],
            "gold_answers": extract_gold_answers(record),
        }
        for record in held_out
    ]

#########################
# 2. Fine-Tuning T5 Model
#########################

class ProtoQADataset(torch.utils.data.Dataset):
    """
    Torch dataset that turns ProtoQA examples into seq2seq training tensors.

    Each example is a dict with a "question" string and a "gold_answers" list.
    The first gold answer is the target; "unknown" is used when the list is empty.
    """

    def __init__(self, data, tokenizer, max_input_length=64, max_target_length=20):
        """
        Args:
            data: Sequence of {"question": str, "gold_answers": list[str]} dicts.
            tokenizer: Callable tokenizer whose result exposes `input_ids` and
                `attention_mask`.
            max_input_length: Padded/truncated length of the encoded prompt.
            max_target_length: Padded/truncated length of the encoded answer.
        """
        self.data = data
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def __getitem__(self, index):
        """
        Return a dict with "input_ids", "attention_mask" and "labels" tensors
        for the example at `index`. Padded label positions are set to -100.
        """
        example = self.data[index]
        question = example["question"]
        gold_answer = example["gold_answers"][0] if example["gold_answers"] else "unknown"

        input_text = f"Provide a plausible answer for: {question}"
        input_enc = self.tokenizer(input_text, truncation=True, max_length=self.max_input_length, padding="max_length")
        target_enc = self.tokenizer(gold_answer, truncation=True, max_length=self.max_target_length, padding="max_length")

        labels = torch.tensor(target_enc.input_ids)
        # Padding must not contribute to the loss: -100 is the index the
        # cross-entropy loss ignores. Without this the model is mostly trained
        # to emit pad tokens, since targets are padded to max_target_length.
        labels[torch.tensor(target_enc.attention_mask) == 0] = -100

        return {
            "input_ids": torch.tensor(input_enc.input_ids),
            "attention_mask": torch.tensor(input_enc.attention_mask),
            "labels": labels
        }

def fine_tune_t5(train_file, max_train_samples=1000, max_eval_samples=100):
    """
    Fine-tune "t5-base" on ProtoQA question/answer pairs.

    Args:
        train_file: Path to the ProtoQA JSONL file.
        max_train_samples: Number of leading records used for training.
        max_eval_samples: Number of records, taken right after the training
            slice of the same file, used for per-epoch evaluation.

    Returns:
        (model, tokenizer) after training; checkpoints are written to
        "./t5-finetuned-protoqa".
    """
    tokenizer = T5Tokenizer.from_pretrained("t5-base")

    train_data = load_protoqa_train(train_file, max_samples=max_train_samples)
    eval_data = load_protoqa_dev(train_file, start_index=max_train_samples, num_samples=max_eval_samples)

    train_dataset = ProtoQADataset(train_data, tokenizer)
    eval_dataset = ProtoQADataset(eval_data, tokenizer)

    training_args = TrainingArguments(
        output_dir="./t5-finetuned-protoqa",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        save_strategy="epoch",  # must match the evaluation strategy for load_best_model_at_end
        evaluation_strategy="epoch",
        learning_rate=5e-5,
        load_best_model_at_end=True,
        # Mixed precision needs a CUDA device; enabling it unconditionally makes
        # the run fail on CPU-only machines.
        fp16=torch.cuda.is_available(),
        report_to="none"
    )

    model = T5ForConditionalGeneration.from_pretrained("t5-base")
    trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset)
    trainer.train()

    return model, tokenizer

#########################
# 3. Answer Generation and Evaluation
#########################

def generate_answers(model, tokenizer, question, prompt_prefix="Provide a plausible answer for: "):
    """
    Generate one answer for `question` using beam search.

    Args:
        model: Seq2seq model exposing `generate()`.
        tokenizer: Tokenizer matching `model`.
        question: The question text.
        prompt_prefix: Text prepended to the question. It defaults to the
            prefix ProtoQADataset uses during fine-tuning, so inference sees
            the same input format as training; pass "" to send the bare
            question.

    Returns:
        The top beam, decoded, lower-cased and stripped.
    """
    input_ids = tokenizer(f"{prompt_prefix}{question}", return_tensors="pt").input_ids

    # After training the model may live on a GPU; the inputs must be on the
    # same device or generate() fails.
    device = getattr(model, "device", None)
    if device is not None:
        input_ids = input_ids.to(device)

    output_ids = model.generate(
        input_ids,
        max_length=20,
        num_beams=5,
        length_penalty=1.2,
        early_stopping=True
    )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True).lower().strip()

def fuzzy_match(predicted, gold_answers, threshold=0.7):
    """
    Return True when `predicted` is similar to at least one gold answer.

    Both sides are lower-cased and stripped; similarity is difflib's
    SequenceMatcher ratio and must be strictly greater than `threshold`.
    """
    normalized_prediction = predicted.lower().strip()
    for gold in gold_answers:
        matcher = SequenceMatcher(None, normalized_prediction, gold.lower().strip())
        if matcher.ratio() > threshold:
            return True
    return False

def test_pipeline(test_data, model, tokenizer):
    """
    Generate one answer per test question and score it against the gold answers.

    Prints the per-question table and the mean score, then returns the
    DataFrame. The "Precision@5" column is 1.0 when the single generated
    answer fuzzy-matches any gold answer and 0.0 otherwise.
    """
    rows = []
    for entry in test_data:
        question = entry["question"]
        # Normalize gold answers (lower-case, stripped) and de-duplicate them.
        gold_answers = {ans.lower().strip() for ans in entry["gold_answers"]}

        # Generate model answers
        model_answer = generate_answers(model, tokenizer, question)

        # Score the single prediction
        is_hit = fuzzy_match(model_answer, gold_answers)

        rows.append({
            "Question": question,
            "Predicted Answer": model_answer,
            "Gold Answers": list(gold_answers),
            "Precision@5": 1.0 if is_hit else 0.0
        })

    df = pd.DataFrame(rows)
    report_columns = ["Question", "Predicted Answer", "Gold Answers", "Precision@5"]
    print(df[report_columns])
    print("\nAverage Precision@5 across test set:", df["Precision@5"].mean())
    return df

#########################
# 4. Main Function
#########################

def main():
    """
    Fine-tune T5 on the first 1000 records of "train.jsonl", then evaluate it
    on the 100 records that follow.

    NOTE(review): these are the same 100 records fine_tune_t5() uses as its
    evaluation split, so the reported score is not from unseen data.
    """
    train_file = "train.jsonl"
    train_size = 1000
    eval_size = 100

    model, tokenizer = fine_tune_t5(
        train_file,
        max_train_samples=train_size,
        max_eval_samples=eval_size
    )

    held_out = load_protoqa_dev(train_file, start_index=train_size, num_samples=eval_size)
    test_pipeline(held_out, model, tokenizer)

if __name__ == "__main__":
    main()


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Epoch,Training Loss,Validation Loss
1,No log,0.632911


Epoch,Training Loss,Validation Loss
1,No log,0.632911
2,No log,0.575854
3,No log,0.561754


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


                                             Question    Predicted Answer  \
0   name something a good fortune teller tells a w...             fortune   
1   name an animal you might hear in the forest at...               tiger   
2         name something people might bring to a gym.                 gym   
3        name a food served with melted cheese on it.              cheese   
4   name something that might be taught in a new p...  new parents course   
..                                                ...                 ...   
95  tell me a breakfast item that only tastes good...           pancakes?   
96  tell me the age when boys stop playing with st...     stuffed animals   
97  tell me the day of the week when you start thi...            saturday   
98  tell me a place where you feel the need to get...  stretch your legs.   
99        tell me a profession that works long hours.     work long hours   

                                         Gold Answers  Precision@5  
0   [h