In [None]:
!pip install sentence-transformers

In [None]:
import difflib
import re
from typing import List

import pandas as pd
from openai import OpenAI
from sentence_transformers import SentenceTransformer, util

In [None]:
# Never paste an API key into the notebook: it ends up in version control and in
# shared copies. With no `api_key` argument the OpenAI SDK reads the key from the
# OPENAI_API_KEY environment variable and raises a clear error if it is missing.
client = OpenAI()

In [None]:
# ---------- CONFIG ----------
# Chat model name passed to every OpenAI completion call in this notebook.
MODEL = "gpt-3.5-turbo"
# Match threshold referenced by the final verdict in the example-usage cell.
SIMILARITY_THRESHOLD = 0.7
# Sentence-embedding model; its files are fetched from the Hugging Face Hub on load.
model_1 = SentenceTransformer("all-MiniLM-L6-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# ---------- STEP 1: Generate Objective Questions ----------
# Leading list markers the model tends to emit: "-", "--", "* ", "• ", "1. ", "2) ".
# A number only counts as a marker when it is followed by "." or ")" and then
# whitespace (or end of line), so a question starting with "3.5%" is left intact.
_LIST_MARKER_RE = re.compile(r"^\s*(?:-+\s*|[*•]\s+|\d+[.)](?:\s+|$))+")


def _parse_question_lines(raw_answer: str) -> List[str]:
    """Turn a raw, line-oriented model reply into a clean list of questions."""
    questions = []
    for line in raw_answer.splitlines():
        cleaned = _LIST_MARKER_RE.sub("", line).strip("- ").strip()
        # Filter AFTER cleaning so separator lines such as "---" are dropped
        # instead of turning into empty questions that would later be scored.
        if cleaned:
            questions.append(cleaned)
    return questions


def generate_objective_questions(question_text: str) -> List[str]:
    """Ask the chat model for objective, fact-based questions about a text.

    Args:
        question_text: The text (optionally including a scoring rubric) that is
            interpolated into the prompt.

    Returns:
        One question per non-empty line of the model reply, with list bullets
        and numbering ("-", "--", "1.", "2)") removed. Returns an empty list
        when the model reply has no text content.
    """
    prompt = f"""
   You are a Key Fact Analyzer.

   Your role is to generate short, objective, fact-based questions from a given text and an optional scoring rubric.
   These questions will later be used to check whether a model-generated summary or answer includes the same factual content as the original text.

   Instructions:
Extract facts from the input text that are:

-- Names of people, roles, companies, or organizations

-- Dates of events, appointments, filings, or deadlines

-- Figures, statistics, or numerical values (percentages, amounts, durations)

-- Policies, decisions, actions, or outcomes

-- Sources or references mentioned explicitly

Frame one factual question per fact. Ensure each question:

-- Has a clear, direct answer

-- Is objective, factual, and specific (not open-ended or opinion-based)

-- Is formulated so that it can help evaluate correctness of another text

Supports a scoring rubric, if provided

-- At least one question targeting the factual detail relevant to the score


Example:
Scoring Question:
Q: Within how many months of the fiscal year end was the last AGM held?

Score 0 – More than six months

Score 1 – Within 4–6 months

Score 2 – Within 4 months

Generated Questions:

When did the fiscal year end?

When was the AGM held?

How many months after fiscal year end was the AGM conducted?

Mandatory Questions: Make sure to add these questions to teh list of questions

-- What are the source of information?

-- Based on the information, what should be the score?

    Text:
    {question_text}

    List of Questions:
    """
    response = client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}]
    )
    # `content` may be None (e.g. a reply without text); treat that as "no questions"
    # rather than crashing on None.strip().
    answer = response.choices[0].message.content or ""
    return _parse_question_lines(answer)

In [None]:
# ---------- STEP 2: Answer Questions from Text ----------
def answer_question_from_text(text: str, question: str) -> str:
    """Ask the chat model to answer one question using only the given text.

    Args:
        text: Source passage interpolated into the prompt.
        question: The question to answer from that passage.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string when the reply carries no text content.
    """
    prompt = f"""You are a fact extractor. Given the following text, answer the specific question.

Text:
{text}

Question:
{question}

Answer:"""
    response = client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}]
    )
    # `content` may be None; fall back to "" so callers always receive a str.
    content = response.choices[0].message.content or ""
    return content.strip()

In [None]:
# ---------- STEP 3: Compare Answers based on facts ----------
# The verdict must be the first token of the reply: optional punctuation/markup,
# then a lone "0" or "1" (so "10" or "1st" do not count).
_VERDICT_RE = re.compile(r"\W*([01])\b")


def check_key_fact_equivalence(gt_answer: str, llm_answer: str) -> int:
    """Ask the chat model whether two answers express the same key fact.

    Args:
        gt_answer: Answer derived from the ground-truth text.
        llm_answer: Answer derived from the model output being evaluated.

    Returns:
        1 if the model's verdict is "1", otherwise 0. Replies that wrap the
        digit in punctuation or markup, or follow it with an explanation
        ("1.", "**0**", "1 - same fact"), are accepted. Anything that cannot
        be parsed, including a reply without text, is scored 0.
    """
    prompt = f"""You are a key fact evaluator.
Determine if both answers are expressing the same key fact.
Return 1 if they refer to the same key fact even if phrased differently.
Return 0 if they refer to different facts.

GT Answer: {gt_answer}
LLM Answer: {llm_answer}

Respond with only 1 or 0."""
    response = client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}]
    )
    # `content` may be None; treat it like any other unparseable reply.
    result = response.choices[0].message.content or ""
    verdict = _VERDICT_RE.match(result)
    return int(verdict.group(1)) if verdict else 0

In [None]:
# # ---------- STEP 3: Compare Answers Semantically ----------
# def compute_similarity(ans1: str, ans2: str) -> float:
#     emb1 = model_1.encode(ans1, convert_to_tensor=True)
#     emb2 = model_1.encode(ans2, convert_to_tensor=True)
#     score = util.cos_sim(emb1, emb2).item()
#     return score


In [None]:
# # ---------- STEP 3: Compare Answers ----------
# def is_similar(a: str, b: str, threshold: float = SIMILARITY_THRESHOLD) -> bool:
#     return difflib.SequenceMatcher(None, a.lower(), b.lower()).ratio() >= threshold

In [None]:
# ---------- MAIN EVALUATION ----------
def evaluate(question_text: str, gt_text: str, llm_output: str):
    """Score how well `llm_output` reproduces the key facts of `gt_text`.

    Questions are generated from `question_text`; each one is answered from
    both texts and the two answers are compared for key-fact equivalence.

    Args:
        question_text: Text (and rubric) the questions are generated from.
        gt_text: Ground-truth text.
        llm_output: Model-generated text being evaluated.

    Returns:
        A `(df, match_percentage)` pair. `df` has one row per question with the
        columns "Question", "GT Answer", "LLM Answer", "Match" and "Score";
        `match_percentage` is the mean of "Score" times 100, or 0.0 when no
        questions were generated.
    """
    questions = generate_objective_questions(question_text)

    results = []
    for q in questions:
        gt_answer = answer_question_from_text(gt_text, q)
        llm_answer = answer_question_from_text(llm_output, q)
        score = check_key_fact_equivalence(gt_answer, llm_answer)

        results.append({
            "Question": q,
            "GT Answer": gt_answer,
            "LLM Answer": llm_answer,
            "Match": "✅" if score else "❌",
            "Score": score
        })

    # Name the columns explicitly so the frame keeps its schema even when no
    # questions came back; otherwise df["Score"] raises KeyError on an empty frame.
    columns = ["Question", "GT Answer", "LLM Answer", "Match", "Score"]
    df = pd.DataFrame(results, columns=columns)
    print(df)
    # The mean of an empty column is NaN; report 0.0 instead.
    match_percentage = float(df["Score"].mean() * 100) if results else 0.0
    return df, match_percentage

In [None]:
# ---------- EXAMPLE USAGE ----------
# Runs the full pipeline on one hand-written example, prints the per-question
# table plus a summary row, and saves the table as CSV.
if __name__ == "__main__":
    question_text = """
Question: Within how many months of the fiscal year end was the last AGM held?

Score 0 - More than six months after the fiscal year end

Score 1 - Within four-six months of the fiscal year end

Score 2 - Within four months of the fiscal year end
    """

    gt_text = """
The company scored 2 because the gap between FYE (March 31, 2024) and AGM (June 24, 2024) is less than 4 months. The calculation shows that there are 85 days between these two dates, which is less than 4 months. This information can be found on page 5 of the annual report, specifically in the section titled "About the Report" for the FYE, and on page 1 for the AGM notice.
Sources: pp. 1, 5 (annual_report.pdf)
"""

    llm_output = """
    The company scored 2 because the gap between FYE (March 31, 2024) and AGM (June 24, 2024) is less than 4 months. The calculation shows that there are 85 days between these two dates, which is less than 4 months. This information can be found on page 1 of the annual report, specifically in the section titled 'FYE (Financial Year End):' and 'AGM (Annual General Meeting) Date:'. Sources: pp. 1-4, 8-11, 24-28, 52-55 (annual_report_url_1_1_1.pdf)
    """
    df, final_score = evaluate(question_text, gt_text, llm_output)
    # evaluate() returns a percentage (0-100) while SIMILARITY_THRESHOLD is a
    # fraction (0.7); put both on the same scale, otherwise any score >= 0.7%
    # would be reported as a pass.
    final_match = final_score >= SIMILARITY_THRESHOLD * 100

    # Append final score row. It is written under the existing "Score" column so
    # the table does not gain a second, mostly-empty score column.
    final_row = pd.DataFrame([{
        "Question": "FINAL SCORE",
        "GT Answer": "",
        "LLM Answer": "",
        "Match": "✅" if final_match else "❌",
        "Score": f"{final_score:.2f}%"
    }])

    df = pd.concat([df, final_row], ignore_index=True)
    print(df)
    # Save
    df.to_csv("evaluation_results_53.csv", index=False)

    print(f"\nFinal Match Score: {final_score:.2f}%")

                                            Question  \
0                   1. When did the fiscal year end?   
1                     2. When was the last AGM held?   
2  3. How many months after the fiscal year end w...   
3  4. What is the source of information for the m...   
4  5. Based on the information provided, what sho...   

                                           GT Answer  \
0           The fiscal year ended on March 31, 2024.   
1                                     June 24, 2024.   
2  The last AGM was conducted approximately 2.8 m...   
3  The source of information for the mentioned AG...   
4  The score assigned for the timing of the last ...   

                                          LLM Answer Match  Score  
0           The fiscal year ended on March 31, 2024.     ✅      1  
1            The last AGM was held on June 24, 2024.     ✅      1  
2                                Less than 4 months.     ✅      1  
3  The source of information for the mentioned AG...  