## 3.3 Baseline Benchmark (Pre-Fine-Tuning)

In [None]:
# ============================================
# SECTION 3.3 — BASELINE BENCHMARK (Pre-Fine-Tuning)
# Model: google/flan-t5-base
# What this cell does:
#   1) Load model/tokenizer on CPU/GPU
#   2) Build/parse a test set of ≥10 Q/A (from your 3.1 dataset if possible; else fallback)
#   3) Define normalization & matching utilities (numeric tolerance + text)
#   4) Run baseline inference (answer, proxy-confidence, latency)
#   5) Compute accuracy & summary stats; save CSV artifacts
# ============================================

# -------- 1) Load model & tokenizer --------
import re, math, time, json
import numpy as np
import pandas as pd
from pathlib import Path

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint chosen in Section 3.2; the cells below reuse these globals.
MODEL_NAME = "google/flan-t5-base"

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
model.to(device)
model.eval()  # evaluation mode for the baseline run
print(f"[INFO] Loaded {MODEL_NAME} on {device}")

# -------- 2) Build a test set (≥10 items) --------
# Tries to parse your existing HuggingFace `dataset` from 3.1 which stores "text": "question: ... answer: ..."
# If not available/parsing fails, falls back to placeholders (edit with your real questions/answers).

def parse_qa_from_text(s: str):
    """Split a 'question: ... answer: ...' record into its two parts.

    The markers are matched case-insensitively and both parts may span
    several lines. Returns ``(question, answer)`` with surrounding whitespace
    removed, or ``(None, None)`` when the text does not follow the format.
    """
    found = re.match(
        r"\s*question\s*:\s*(.*?)\s*answer\s*:\s*(.*)\s*$",
        str(s).strip(),
        flags=re.I | re.S,
    )
    if found is None:
        return None, None
    question, answer = (part.strip() for part in found.groups())
    return question, answer

TEST_QA = []
try:
    # `dataset` is expected to come from the Section 3.1 cell; if it is
    # missing or has no "text" column the except branch reports it.
    _df = pd.DataFrame(dataset)
    # sample a couple more than needed so unparseable rows leave some margin
    sample_df = _df.sample(min(12, len(_df)), random_state=42)
    for text in sample_df["text"]:
        q, a = parse_qa_from_text(text)
        if not (q and a):
            continue
        TEST_QA.append({"question": q, "ground_truth": a})
except Exception as e:
    print(f"[INFO] Could not parse from your 3.1 dataset automatically: {e}")

# Fallback (question, ground truth) pairs, appended when fewer than 10 parsed.
_PLACEHOLDER_QA = [
    ("Total liabilities of 2023?", "$ 10,002 million."),
    ("What were total revenues in fiscal 2024?", "$ 16,052 million."),
    ("Net cash from operating activities in fiscal 2024?", "$ 454 million."),
    ("Adjusted free cash flow in fiscal 2024?", "$ 291 million."),
    ("What was the net loss in fiscal 2024?", "$ 340 million loss."),
    ("Year-over-year revenue change in fiscal 2024 vs 2023?", "6% decline"),
    ("Cash and cash equivalents at end of fiscal 2024?", "$ 1,554 million."),
    ("Which geographic segment had highest revenue in fiscal 2024?", "Americas segment"),
    ("What strategic partnership was highlighted in fiscal 2024?", "Alliances with AWS, Microsoft, and Google Cloud"),
    ("What is the capital of France?", "Not in scope"),
]

if len(TEST_QA) < 10:
    print("[NOTICE] Using placeholder test QA — replace with your real 10+ questions.")
    TEST_QA.extend(
        {"question": question, "ground_truth": answer}
        for question, answer in _PLACEHOLDER_QA
    )

# the baseline uses the first 10 items only (raise the cut-off to use more)
del TEST_QA[10:]
print(f"[INFO] Test set size: {len(TEST_QA)}")

# -------- 3) Normalization & matching utilities --------
def _norm_text(s: str) -> str:
    """Return ``str(s)`` lower-cased, trimmed, with whitespace runs collapsed to one space."""
    words = str(s).split()
    return " ".join(words).lower()

def _first_number(s: str):
    """Return the first numeric literal found in ``s`` as a float.

    Commas are removed before searching, so "1,554" is read as 1554. An
    optional sign, a decimal point and an exponent are accepted. Unit words
    are not interpreted: "$ 16,052 million." yields 16052.0.

    Returns None when ``s`` is None or contains no number.
    """
    if s is None:
        return None
    m = re.search(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?", str(s).replace(",", ""))
    if m is None:
        return None
    try:
        return float(m.group(0))
    except ValueError:
        # Catch only the conversion error; a bare `except:` would also
        # swallow KeyboardInterrupt and SystemExit.
        return None

def _is_not_in_scope(s: str) -> bool:
    """Tell whether ``s`` contains one of the known refusal / out-of-scope phrases.

    The text is normalised with ``_norm_text`` first, so matching ignores
    case and repeated whitespace.
    """
    normalized = _norm_text(s)
    refusal_phrases = (
        "not in scope", "out of scope", "no relevant data",
        "not found in the documents", "cannot answer from the provided documents",
        "data not available", "i don't know", "i do not know",
    )
    for phrase in refusal_phrases:
        if phrase in normalized:
            return True
    return False

def compare_answers(gold: str, pred: str, rel_tol: float = 0.02) -> bool:
    """
    Decide whether a predicted answer matches the ground truth.

    - Numeric compare if both sides expose a number (first number on each
      side, within rel_tol, default 2%). A gold value of 0 requires the
      prediction to be (almost) exactly 0.
    - A gold answer of 'Not in scope' is matched via the refusal-phrase
      heuristic in `_is_not_in_scope`.
    - Otherwise relaxed text containment in either direction on the
      normalized strings.

    A missing or empty prediction (or an empty gold text) never counts as a
    match.
    """
    if pred is None:
        return False
    gnum = _first_number(gold)
    pnum = _first_number(pred)
    if gnum is not None and pnum is not None:
        if gnum == 0:
            return abs(pnum) < 1e-12
        return math.isclose(pnum, gnum, rel_tol=rel_tol)
    g = _norm_text(gold)
    p = _norm_text(pred)
    if g == "not in scope":
        return _is_not_in_scope(p)
    # "" is a substring of every string, so without this guard an empty
    # prediction would be scored as correct by the containment test below.
    if not g or not p:
        return False
    return (g in p) or (p in g)

# -------- 4) Baseline inference (answer, proxy-confidence, latency) --------
@torch.no_grad()
def t5_generate_with_conf(question: str, max_new_tokens: int = 64):
    """
    Answer one question with the loaded model and report confidence and latency.

    Returns dict(answer, confidence, time_s):
    - answer: decoded output; if it contains 'answer:', only the text after it.
    - confidence: proxy in 0..1, the mean over generation steps of the highest
      softmax probability; None when no step scores were returned.
    - time_s: wall-clock seconds spent inside `model.generate`, rounded to 3 dp.

    Decoding is greedy (no sampling, one beam) so repeated runs are comparable.
    """
    encoded = tokenizer(f"question: {question}\nanswer:", return_tensors="pt").to(device)

    generation_options = dict(
        max_new_tokens=max_new_tokens,
        do_sample=False,
        num_beams=1,
        return_dict_in_generate=True,
        output_scores=True,
    )
    started = time.perf_counter()
    generated = model.generate(**encoded, **generation_options)
    elapsed = time.perf_counter() - started

    decoded = tokenizer.decode(generated.sequences[0], skip_special_tokens=True)

    # highest token probability at each generation step (batch item 0)
    step_peaks = [
        float(F.softmax(step_logits[0], dim=-1).max().item())
        for step_logits in generated.scores
    ]
    confidence = float(np.mean(step_peaks)) if step_peaks else None

    # keep only what follows an 'answer:' marker when the output has one
    marker = re.search(r"answer\s*:\s*(.*)$", decoded, flags=re.I | re.S)
    answer = marker.group(1).strip() if marker else decoded

    return {"answer": answer, "confidence": confidence, "time_s": round(elapsed, 3)}

# Run the baseline over every test item and collect one result row per item.
rows = []
for idx, qa in enumerate(TEST_QA, start=1):
    result = t5_generate_with_conf(qa["question"])
    proxy_conf = result["confidence"]
    rows.append({
        "id": idx,
        "question": qa["question"],
        "ground_truth": qa["ground_truth"],
        "pred_answer": result["answer"],
        "confidence": np.nan if proxy_conf is None else proxy_conf,
        "time_s": result["time_s"],
        "correct": bool(compare_answers(qa["ground_truth"], result["answer"])),
    })

baseline_df = pd.DataFrame(rows)
display(baseline_df)

# -------- 5) Summary stats & save artifacts --------
n_items = len(baseline_df)
has_confidence = baseline_df["confidence"].notna().any()
summary = {
    "n": n_items,
    "accuracy": float(baseline_df["correct"].mean()) if n_items else 0.0,
    "avg_confidence": float(baseline_df["confidence"].mean()) if has_confidence else None,
    "avg_time_s": float(baseline_df["time_s"].mean()) if n_items else None,
}
summary_df = pd.DataFrame([summary])
display(summary_df)

# Write both tables as CSV artifacts for the report, then list where they went.
RES_PATH = Path("section3_3_baseline_results.csv")
SUM_PATH = Path("section3_3_baseline_summary.csv")
for frame, target in ((baseline_df, RES_PATH), (summary_df, SUM_PATH)):
    frame.to_csv(target, index=False)
for target in (RES_PATH, SUM_PATH):
    print("[SAVED]", target.resolve())


## Section 4 – Setup & Interfaces 

In [None]:
# ==== Section 4.0: Imports, Data Types, and Placeholders ====

from dataclasses import dataclass
from typing import Callable, Dict, Any, List, Optional, Tuple
import pandas as pd
import numpy as np
import math, json, re, time
from pathlib import Path

@dataclass
class QAResult:
    """
    Standard result format used by the evaluators.

    Every answer function passed to `evaluate_method` returns one of these.
    The evaluator reads `answer`, `confidence` and `metadata`; `metadata` is
    written to the results table as a JSON string.
    """
    answer: str                   # answer text that is compared against the ground truth
    confidence: Optional[float]   # 0..1 (None if not available; logged as NaN)
    metadata: Dict[str, Any]      # e.g., {"citations": [...], "chunks":[...]}
    method_name: str              # e.g., "RAG" or "Fine-Tune"

# === PLACEHOLDERS: wire your real functions here ======================
# Replace the bodies of these two functions with calls to your actual
# implementations from Sections 2 & 3. Keep the signature.

def RAG_ANSWER_FN(question: str) -> QAResult:
    """Stub for the RAG pipeline: ignores `question` and returns a fixed placeholder result."""
    # TODO: call your real RAG pipeline here and return QAResult
    stub = QAResult(
        method_name="RAG",
        answer="PLACEHOLDER_RAG_ANSWER",
        confidence=0.0,
        metadata={"note": "replace with real RAG"},
    )
    return stub

def FT_ANSWER_FN(question: str) -> QAResult:
    """Stub for the fine-tuned model: ignores `question` and returns a fixed placeholder result."""
    # TODO: call your real Fine-Tuned model here and return QAResult
    stub = QAResult(
        method_name="Fine-Tune",
        answer="PLACEHOLDER_FT_ANSWER",
        confidence=0.0,
        metadata={"note": "replace with real FT"},
    )
    return stub


### 4.1 – Mandatory Questions

In [None]:
# ==== Section 4.1: Mandatory Questions (3 items) ====
# Categories must be: "relevant_high_conf", "relevant_low_conf", "irrelevant"

# The three mandatory evaluation questions. Each item carries the question
# text, its ground truth (a number or a string) and a category label.
# The "[MANDATORY]" prefix is what show_mandatory() searches for in 4.6.
# NOTE(review): evaluate_method passes the whole "question" string, prefix
# included, to the answer function — confirm the prefix should reach the model.
mandatory_items = [
    {
        "question": "[MANDATORY] (Relevant, high-confidence) What were Kyndryl’s total revenues in fiscal year 2024?",
        # $16,052 million per FY2024 report (≈$16.1B). Use numeric so the evaluator can apply tolerance.
        "ground_truth": 16052000000,  # USD
        "category": "relevant_high_conf",
    },
    {
        "question": "[MANDATORY] (Relevant, low-confidence) What was the year-over-year (reported) revenue change for Kyndryl in fiscal year 2024 versus fiscal year 2023?",
        # Reported YoY decline shown as (6%) in FY2024 press/10-K tables.
        "ground_truth": "6% decline",
        "category": "relevant_low_conf",
    },
    {
        "question": "[MANDATORY] (Irrelevant) What is the capital of France?",
        # Exact text "Not in scope" makes compare_answers use its out-of-scope phrase check.
        "ground_truth": "Not in scope",
        "category": "irrelevant",
    },
]


### 4.2 – Extended Evaluation Set

In [None]:
# ==== Section 4.2: Extended Evaluation Set with Accurate Values ====
# Categories: "financial_pnl", "financial_bs", "financial_cf",
# "segment", "definition", "trend", "ratio", "other"

# Ten additional evaluation questions, same item layout as mandatory_items:
# question text, ground truth (absolute USD number or a string) and category.
# NOTE(review): the losses are stored as negative numbers; compare_answers
# compares signed values with math.isclose, so a prediction only matches when
# the number extracted from it is negative too — confirm this is intended.
extended_items = [
    # 1) Net loss FY2024: $340 million
    {
        "question": "What was Kyndryl’s net loss for fiscal year 2024?",
        "ground_truth": -340000000,  # USD
        "category": "financial_pnl",
    },
    # 2) Cash & cash equivalents end FY2024: $1,554 million
    {
        "question": "How much cash and cash equivalents did Kyndryl report on its balance sheet at the end of fiscal year 2024?",
        "ground_truth": 1554000000,  # USD
        "category": "financial_bs",
    },
    # 3) Net cash provided by operating activities FY2024: $454 million
    {
        "question": "What was Kyndryl’s net cash provided by operating activities in fiscal year 2024?",
        "ground_truth": 454000000,  # USD
        "category": "financial_cf",
    },
    # 4) FY2024 Adjusted EBITDA: $2.4 billion
    {
        "question": "What was Kyndryl’s adjusted EBITDA for fiscal year 2024?",
        "ground_truth": 2400000000,  # USD
        "category": "financial_pnl",
    },
    # 5) FY2023 cash flows from operations: $781 million (contrast)
    {
        "question": "What were the cash flows from operations for Kyndryl in fiscal year 2023?",
        "ground_truth": 781000000,  # USD
        "category": "financial_cf",
    },
    # 6) FY2023 net loss: $1.4 billion
    {
        "question": "What was Kyndryl’s net loss for fiscal year 2023?",
        "ground_truth": -1400000000,  # USD
        "category": "financial_pnl",
    },
    # 7) YoY revenue trend FY2024 vs FY2023: 6% decline
    {
        "question": "What was the year-over-year revenue change (decline) in fiscal 2024 compared to fiscal 2023 for Kyndryl?",
        "ground_truth": "6% decline",
        "category": "trend",
    },
    # 8) Signings FY2024: $12.5 billion
    {
        "question": "How much were Kyndryl’s signings during fiscal year 2024?",
        "ground_truth": 12500000000,  # USD
        "category": "other",
    },
    # 9) Alliances initiative revenue FY2024: >$500 million vs FY2023 $1.2B
    # NOTE(review): a string ground truth with several numbers is compared on
    # its first number only (500 here) when the prediction also has a number.
    {
        "question": "How much revenue did Kyndryl recognize from alliances in fiscal year 2024 versus fiscal year 2023?",
        "ground_truth": "More than $500 million in 2024 and $1.2 billion in 2023",
        "category": "segment",
    },
    # 10) Adjusted free cash flow FY2024: $291 million
    {
        "question": "What was Kyndryl’s adjusted free cash flow in fiscal year 2024?",
        "ground_truth": 291000000,  # USD
        "category": "financial_cf",
    },
]

# Export full questions template to CSV (mandatory items first, then extended)
questions_df = pd.DataFrame(mandatory_items + extended_items)
TEMPLATE_PATH = Path("section4_questions_template.csv")
questions_df.to_csv(TEMPLATE_PATH, index=False)
print(f"Updated template saved to: {TEMPLATE_PATH.resolve()}")
# Bare expression: as the last line of a notebook cell it shows the first three rows.
questions_df.head(3)


### Comparison Utilities

In [None]:
# ==== Section 4.x: Normalization, Matching, Out-of-Scope detection ====

def _normalize_text(s: str) -> str:
    """Canonical form for text matching: ``str(s)`` trimmed, lower-cased, whitespace runs collapsed."""
    trimmed = str(s).strip()
    return re.sub(r"\s+", " ", trimmed).lower()

def _extract_first_number(s: str) -> Optional[float]:
    """Return the first number in ``s`` as a float, scaled by a trailing unit.

    Commas are removed before searching, so "16,052" is read as 16052. A unit
    directly after the number multiplies it, which lets answers written as
    "$16,052 million" or "$16.1B" be compared with ground truths stored in
    absolute dollars:

    - words, optionally separated by spaces: thousand, million, billion,
      trillion, mn, bn
    - single letters attached to the number: k, m, b, t

    Units are matched case-insensitively. Only the first number is used, so a
    year that precedes the figure (e.g. "FY2024 revenue was ...") is what
    gets returned.

    Returns None when ``s`` is None or contains no number.
    """
    if s is None:
        return None
    scale_by_unit = {
        "thousand": 1e3, "k": 1e3,
        "million": 1e6, "mn": 1e6, "m": 1e6,
        "billion": 1e9, "bn": 1e9, "b": 1e9,
        "trillion": 1e12, "t": 1e12,
    }
    m = re.search(
        r"([-+]?\d*\.?\d+(?:[eE][-+]?\d+)?)"
        r"(?:\s*(thousand|million|billion|trillion|mn|bn)\b|([kmbt])\b)?",
        str(s).replace(",", ""),
        flags=re.I,
    )
    if not m:
        return None
    try:
        value = float(m.group(1))
    except ValueError:
        # Catch only the conversion error; a bare `except:` would also
        # swallow KeyboardInterrupt and SystemExit.
        return None
    unit = m.group(2) or m.group(3)
    if unit:
        value *= scale_by_unit[unit.lower()]
    return value

def is_out_of_scope(answer: str) -> bool:
    """Tell whether ``answer`` contains a known refusal / out-of-scope phrase.

    None is treated as "not a refusal". Other values are normalised with
    ``_normalize_text`` before the phrases are searched for.
    """
    if answer is None:
        return False
    normalized = _normalize_text(answer)
    refusal_phrases = (
        "not in scope", "out of scope", "no relevant data",
        "cannot answer from the provided documents",
        "not found in the documents", "data not available",
        "i don't know", "i do not know",
    )
    for phrase in refusal_phrases:
        if phrase in normalized:
            return True
    return False

def compare_answers(ground_truth: Any, pred_text: str, rel_tol: float = 0.02) -> bool:
    """
    Decide whether a predicted answer matches the ground truth.

    Numeric: when the ground truth is a number (or text containing one) and
    the prediction contains a number, they must agree within the relative
    tolerance (default 2%); a ground truth of 0 requires a prediction of
    (almost) exactly 0.
    Text: normalized exact/substring match in either direction.
    'Not in scope' is checked via the `is_out_of_scope` heuristic.

    Returns False when the ground truth or the prediction is None, and when
    either normalized text is empty.
    """
    if ground_truth is None or pred_text is None:
        return False

    # numeric path
    gold_num = ground_truth if isinstance(ground_truth, (int, float)) else _extract_first_number(ground_truth)
    pred_num = _extract_first_number(pred_text)

    if gold_num is not None and pred_num is not None:
        if gold_num == 0:
            return abs(pred_num) < 1e-12
        return math.isclose(pred_num, float(gold_num), rel_tol=rel_tol)

    # text path
    g = _normalize_text(ground_truth)
    p = _normalize_text(pred_text)

    if g == "not in scope":
        return is_out_of_scope(p)

    # "" is a substring of every string, so without this guard an empty
    # prediction would be scored as correct by the containment test below.
    if not g or not p:
        return False

    return (g in p) or (p in g)


### 4.3 – Per-Method Evaluator

In [None]:
# ==== Section 4.3: Evaluate a single method on a list of Qs ====

def evaluate_method(
    questions: List[Dict[str, Any]],
    method_fn: Callable[[str], QAResult],
    method_label: str,
    rel_tol: float = 0.02
) -> pd.DataFrame:
    """
    Run one answer function over a list of questions and score every answer.

    Args:
        questions: items with "question" and "ground_truth" keys and an
            optional "category" (defaults to "uncategorized").
        method_fn: callable that takes the question text and returns a QAResult.
        method_label: value written to the "method" column (also used as the
            method name of the error result when `method_fn` raises).
        rel_tol: relative tolerance forwarded to `compare_answers`.

    Returns:
        DataFrame with one row per question: id (1-based), question,
        ground_truth, pred_answer, confidence (NaN when missing), time_s,
        correct, category, method, metadata_json.

    An exception raised by `method_fn` does not stop the run: it is recorded
    as an "[ERROR: ...]" answer for that question.
    """
    logs: List[Dict[str, Any]] = []

    for i, row in enumerate(questions, start=1):
        q = row["question"]
        gold = row["ground_truth"]
        category = row.get("category", "uncategorized")

        # time the call itself, including the failure path
        t0 = time.perf_counter()
        try:
            res = method_fn(q)
        except Exception as e:
            res = QAResult(answer=f"[ERROR: {e}]", confidence=None, metadata={"exception": str(e)}, method_name=method_label)
        dt = time.perf_counter() - t0

        correct = compare_answers(gold, res.answer, rel_tol=rel_tol)

        # Metadata may hold objects json cannot encode (e.g. retrieved chunk
        # objects). Stringify those instead of aborting the evaluation after
        # the answers have already been computed.
        try:
            metadata_json = json.dumps(res.metadata or {}, ensure_ascii=False, default=str)
        except (TypeError, ValueError) as e:
            metadata_json = json.dumps({"serialization_error": str(e)}, ensure_ascii=False)

        logs.append({
            "id": i,
            "question": q,
            "ground_truth": gold,
            "pred_answer": res.answer,
            "confidence": res.confidence if res.confidence is not None else np.nan,
            "time_s": round(dt, 3),
            "correct": bool(correct),
            "category": category,
            "method": method_label,
            "metadata_json": metadata_json,
        })

    return pd.DataFrame(logs)


### 4.4 – Full Comparison: RAG vs FT

In [None]:
# ==== Section 4.4: Run RAG vs FT and summarize ====

# Mandatory questions first, then the extended set.
ALL_QUESTIONS: List[Dict[str, Any]] = [*mandatory_items, *extended_items]

# Evaluate both methods on the same questions, RAG first, then stack the logs.
rag_df, ft_df = (
    evaluate_method(ALL_QUESTIONS, answer_fn, label, rel_tol=0.02)
    for answer_fn, label in ((RAG_ANSWER_FN, "RAG"), (FT_ANSWER_FN, "Fine-Tune"))
)
results_df = pd.concat([rag_df, ft_df], ignore_index=True)

# One summary row per method: mean correctness, confidence and latency, plus row count.
summary_df = results_df.groupby("method", as_index=False).agg(
    accuracy=pd.NamedAgg(column="correct", aggfunc="mean"),
    avg_confidence=pd.NamedAgg(column="confidence", aggfunc="mean"),
    avg_time_s=pd.NamedAgg(column="time_s", aggfunc="mean"),
    n=pd.NamedAgg(column="id", aggfunc="count"),
)

display(results_df.head(5))
display(summary_df)


### 4.5 – Save Results

In [None]:
# ==== Section 4.5: Persist results to CSVs ====

# Output folder (current directory); created if it does not exist yet.
OUT_DIR = Path(".")
OUT_DIR.mkdir(parents=True, exist_ok=True)

RESULTS_CSV = OUT_DIR.joinpath("section4_results.csv")
SUMMARY_CSV = OUT_DIR.joinpath("section4_summary.csv")

# Write the per-question log and the per-method summary.
for frame, target in ((results_df, RESULTS_CSV), (summary_df, SUMMARY_CSV)):
    frame.to_csv(target, index=False)

print("Saved:")
for target in (RESULTS_CSV, SUMMARY_CSV):
    print(f"- {target.resolve()}")


### 4.6 – Required Screenshots Helper

In [None]:
# ==== Section 4.6: Quick “screenshots” table filters ====
# Use these to capture the 3 mandatory queries for your PDF.

def show_mandatory(results: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of the rows whose question text contains the literal "[MANDATORY]" marker.

    Rows with a missing question are treated as not mandatory.
    """
    is_mandatory = results["question"].str.contains(r"\[MANDATORY\]", regex=True, na=False)
    selected = results.loc[is_mandatory]
    return selected.copy()

# Render the mandatory-question rows of the combined results (all methods in results_df).
display(show_mandatory(results_df))


### 4.7 – Category-wise Breakdown

In [None]:
# ==== Section 4.7: Extra analysis by category (optional but useful) ====

# Accuracy, mean latency and row count for every (method, category) pair.
_per_category = results_df.groupby(["method", "category"], as_index=False)
_category_metrics = {
    "accuracy": ("correct", "mean"),
    "avg_time_s": ("time_s", "mean"),
    "n": ("id", "count"),
}
by_cat = _per_category.agg(**_category_metrics).sort_values(["method", "category"])

display(by_cat)


### 4.8 – Plots (Accuracy & Latency)

In [None]:
# ==== Section 4.8: Simple Matplotlib charts (no styling/colors set) ====

import matplotlib.pyplot as plt

# One bar chart per summary metric: (column, title, y-axis label, y-axis limits).
# Accuracy is pinned to the 0..1 range; latency keeps the automatic range.
_chart_specs = (
    ("accuracy", "Accuracy by Method", "Accuracy", (0, 1)),
    ("avg_time_s", "Average Response Time by Method (s)", "Time (s)", None),
)

for _column, _title, _y_label, _y_limits in _chart_specs:
    plt.figure()
    plt.bar(summary_df["method"], summary_df[_column])
    plt.title(_title)
    plt.xlabel("Method")
    plt.ylabel(_y_label)
    if _y_limits is not None:
        plt.ylim(*_y_limits)
    plt.show()
