# Generic Pipeline Demo (v2)
_Generated 2025-10-02T03:09:05.772454Z_

**What's new:** Fix retrieval for TruthfulQA by indexing **questions** with **TF‑IDF (1–2 grams, stopwords removed)**. This avoids the problem with the earlier Jaccard‑based retrieval, where generic answers such as 'Nothing happens...' dominated the results.

**Includes:** Extraction → Canonicalization (now captures `source` if present) → EDA (extra plots) → Improved retrieval demo with a real example (watermelon seeds).

In [None]:
import os, json, zipfile, re
from typing import List, Dict, Any
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# Working-area layout: raw inputs, processed outputs and EDA figures all
# live under BASE.
BASE = '/mnt/data'
RAW_DIR = os.path.join(BASE, 'data', 'raw')
PROC_DIR = os.path.join(BASE, 'data', 'processed')
EDA_DIR = os.path.join(BASE, 'data', 'eda', 'figures')
for _needed_dir in (RAW_DIR, PROC_DIR, EDA_DIR):
    os.makedirs(_needed_dir, exist_ok=True)

# Uploaded sources in this environment
UPLOADED_FEVER = '/mnt/data/train.jsonl'
UPLOADED_TRUTH = '/mnt/data/archive.zip'

# Where each dataset is expected inside RAW_DIR.
FEVER_JSONL = os.path.join(RAW_DIR, 'fever_train.jsonl')
TRUTH_ZIP   = os.path.join(RAW_DIR, 'truthfulqa_archive.zip')
HOTPOT_DEV  = os.path.join(RAW_DIR, 'hotpotqa', 'dev.json')  # optional

import shutil

# Stage each upload into RAW_DIR once; an existing staged copy is never
# overwritten.
for _uploaded, _staged in ((UPLOADED_FEVER, FEVER_JSONL), (UPLOADED_TRUTH, TRUTH_ZIP)):
    if os.path.exists(_uploaded) and not os.path.exists(_staged):
        shutil.copy(_uploaded, _staged)

print('RAW_DIR:', RAW_DIR)
print('Files present:', os.listdir(RAW_DIR))

In [None]:
def to_str_id(x):
    """Return ``x`` rendered as a string identifier.

    Falls back to the string form of ``hash(x)`` when ``str(x)`` raises.
    """
    try:
        text = str(x)
    except Exception:
        text = str(hash(x))
    return text

def ci_get(df: pd.DataFrame, *cands):
    """Find the first candidate column name present in ``df``, ignoring case.

    Args:
        df: Frame whose column labels are searched.
        *cands: Candidate names, tried in the order given.

    Returns:
        The column label exactly as it appears in ``df`` for the first
        candidate that matches case-insensitively, or ``None`` if none match.
    """
    cols = {}
    for c in df.columns:
        # Non-string labels (e.g. integer headers) have no ``.lower()`` and
        # can never equal a string candidate, so they are skipped.
        if isinstance(c, str):
            cols[c.lower()] = c
    for c in cands:
        key = c.lower()
        if key in cols:
            return cols[key]
    return None

def clean_text(s):
    """Normalize a cell value to stripped text.

    Args:
        s: Any value; typically a string or a cell read from a DataFrame.

    Returns:
        ``None`` for ``None`` and for float NaN, otherwise ``str(s)`` with
        leading and trailing whitespace removed.
    """
    if s is None:
        return None
    # NaN is the only float that differs from itself. Without this check a
    # missing cell would be stringified to the literal text 'nan'.
    if isinstance(s, float) and s != s:
        return None
    if not isinstance(s, str):
        s = str(s)
    return s.strip()

def write_jsonl(rows, path):
    """Write ``rows`` to ``path`` as UTF-8 JSON Lines, one object per line.

    Non-ASCII characters are written as-is rather than escaped. An existing
    file at ``path`` is overwritten.
    """
    with open(path, 'w', encoding='utf-8') as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in rows
        )

# Path of the combined canonical JSONL file, located inside PROC_DIR.
CANONICAL_OUT = os.path.join(PROC_DIR, 'generic_canonical.jsonl')

In [None]:
def load_fever_canonical(path: str):
    """Read a FEVER JSONL file and map each record to the canonical row schema.

    Args:
        path: Location of the JSONL file (one JSON object per line).

    Returns:
        A list of dicts with keys ``dataset``, ``id``, ``query``, ``answer``,
        ``contexts``, ``label`` and ``meta``. The list is empty when the file
        does not exist. Lines that are blank, are not valid JSON, or do not
        decode to a JSON object are skipped.
    """
    rows = []
    if not os.path.exists(path):
        print(f"[FEVER] Not found: {path}")
        return rows
    skipped = 0
    with open(path, 'r', encoding='utf-8') as f:
        # ``i`` counts every physical line, so the fallback id stays tied to
        # the line position even when earlier lines were skipped.
        for i, line in enumerate(f):
            if not line.strip():
                continue
            try:
                obj = json.loads(line)
            except (ValueError, RecursionError):
                skipped += 1
                continue
            if not isinstance(obj, dict):
                # Valid JSON but not a record (e.g. a list or bare string).
                skipped += 1
                continue
            cid = to_str_id(obj.get('id', i))
            claim = clean_text(obj.get('claim'))
            label = obj.get('label')
            evidence = obj.get('evidence', [])
            rows.append({
                'dataset': 'fever',
                'id': cid,
                'query': claim,
                'answer': None,
                'contexts': [],
                'label': label,
                'meta': {'evidence': evidence}
            })
    print(f'[FEVER] Canonicalized rows: {len(rows)}')
    if skipped:
        # Report skipped input instead of dropping it silently.
        print(f'[FEVER] Skipped malformed lines: {skipped}')
    return rows

def _truthfulqa_gen_rows(p):
    """Canonical rows for one generation-style CSV; empty if unreadable or lacking question/answer columns."""
    try:
        df = pd.read_csv(p)
    except Exception as e:
        print('[TruthfulQA] Could not read:', p, e)
        return []
    q_col = ci_get(df, 'question', 'prompt', 'query')
    a_col = ci_get(df, 'best_answer', 'correct_answer', 'answer')
    cat   = ci_get(df, 'category', 'topic', 'domain')
    incor = ci_get(df, 'incorrect_answers', 'incorrect', 'distractors')
    src   = ci_get(df, 'source', 'url', 'reference')
    if q_col is None or a_col is None:
        print('[TruthfulQA] Skipping generation CSV with missing cols:', p)
        return []
    rows = []
    for i, r in df.iterrows():
        rows.append({
            'dataset': 'truthfulqa_gen',
            'id': to_str_id(('truth_gen', p, i)),
            'query': clean_text(r[q_col]),
            'answer': clean_text(r[a_col]),
            'contexts': [],
            'label': None,
            'meta': {
                'category': clean_text(r[cat]) if cat else None,
                'incorrect_answers': clean_text(r[incor]) if incor else None,
                'source': clean_text(r[src]) if src else None,
                'source_csv': os.path.basename(p)
            }
        })
    return rows


def _truthfulqa_mc_rows(p):
    """Canonical rows for one multiple-choice CSV; empty if unreadable or lacking question/choice columns."""
    try:
        df = pd.read_csv(p)
    except Exception as e:
        print('[TruthfulQA] Could not read:', p, e)
        return []
    q_col = ci_get(df, 'question', 'prompt', 'query')
    cat   = ci_get(df, 'category', 'topic', 'domain')
    src   = ci_get(df, 'source', 'url', 'reference')
    # Prefer columns that look like "mc*_targets"/"mc*_choice"; otherwise fall
    # back to any column mentioning "option" or "choice".
    mc_cols = [c for c in df.columns if 'mc' in c.lower() and ('target' in c.lower() or 'choice' in c.lower())]
    if not mc_cols:
        mc_cols = [c for c in df.columns if 'option' in c.lower() or 'choice' in c.lower()]
    if q_col is None or not mc_cols:
        print('[TruthfulQA] Skipping MC CSV with missing cols:', p)
        return []
    rows = []
    for i, r in df.iterrows():
        # Choice cells are stored raw (not passed through clean_text).
        choices = {c: r.get(c, None) for c in mc_cols}
        rows.append({
            'dataset': 'truthfulqa_mc',
            'id': to_str_id(('truth_mc', p, i)),
            'query': clean_text(r[q_col]),
            'answer': None,
            'contexts': [],
            'label': None,
            'meta': {
                'category': clean_text(r[cat]) if cat else None,
                'choices': choices,
                'source': clean_text(r[src]) if src else None,
                'source_csv': os.path.basename(p)
            }
        })
    return rows


def load_truthfulqa_canonical(zip_path: str):
    """Extract a TruthfulQA zip archive and canonicalize the CSVs inside it.

    The archive is unpacked into ``RAW_DIR/truthfulqa_extracted``. CSVs whose
    file name contains "generation" are read as generation data and those
    containing "multiple" as multiple-choice data. When no file name matches,
    the first CSV (in sorted path order) stands in for generation and the
    first CSV not already used for generation stands in for multiple-choice.

    Args:
        zip_path: Path to the zip archive.

    Returns:
        A list of canonical row dicts (``truthfulqa_gen`` rows followed by
        ``truthfulqa_mc`` rows). Empty when the archive is missing or cannot
        be extracted.
    """
    out = []
    if not os.path.exists(zip_path):
        print(f'[TruthfulQA] Zip not found: {zip_path}')
        return out
    extract_dir = os.path.join(RAW_DIR, 'truthfulqa_extracted')
    try:
        with zipfile.ZipFile(zip_path, 'r') as z:
            z.extractall(extract_dir)
    except Exception as e:
        print('[TruthfulQA] Extraction failed:', e)
        return out

    # Sorted so that the fallback choices below do not depend on the order in
    # which the filesystem happens to list directory entries.
    csvs = sorted(
        os.path.join(root, fn)
        for root, _, files in os.walk(extract_dir)
        for fn in files
        if fn.lower().endswith('.csv')
    )

    gen_candidates = [p for p in csvs if 'generation' in os.path.basename(p).lower()]
    mc_candidates  = [p for p in csvs if 'multiple'   in os.path.basename(p).lower()]

    if not gen_candidates and csvs:
        gen_candidates = [csvs[0]]
    if not mc_candidates:
        # Never reuse a generation file as the multiple-choice fallback;
        # doing so would process the same CSV twice.
        mc_candidates = [p for p in csvs if p not in gen_candidates][:1]

    # generation CSVs
    for p in gen_candidates:
        out.extend(_truthfulqa_gen_rows(p))

    # multiple-choice CSVs
    for p in mc_candidates:
        out.extend(_truthfulqa_mc_rows(p))

    print(f'[TruthfulQA] Canonicalized rows: {len(out)}')
    return out

def load_hotpotqa_canonical(path: str, max_rows: int = 1000):
    """Load a HotpotQA JSON file and canonicalize its first ``max_rows`` examples.

    Args:
        path: Location of the JSON file.
        max_rows: Upper bound on the number of examples converted.

    Returns:
        A list of canonical row dicts; each ``contexts`` entry holds a
        ``doc_id`` (the title) and ``text`` (the sentences joined by spaces).
        Empty when the file is missing or cannot be parsed.
    """
    if not os.path.exists(path):
        print(f'[HotpotQA] Not found: {path} (optional)')
        return []
    try:
        with open(path, 'r', encoding='utf-8') as fh:
            examples = json.load(fh)
    except Exception as e:
        print('[HotpotQA] Could not read dev.json:', e)
        return []

    rows = []
    for idx, example in enumerate(examples[:max_rows]):
        # Each context item is a (title, sentences) pair.
        passages = [
            {'doc_id': title, 'text': ' '.join(sents)}
            for title, sents in example.get('context', [])
        ]
        record = {
            'dataset': 'hotpotqa',
            'id': example.get('_id', to_str_id(idx)),
            'query': clean_text(example.get('question')),
            'answer': clean_text(example.get('answer')),
            'contexts': passages,
            'label': None,
            'meta': {'supporting_facts': example.get('supporting_facts', [])},
        }
        rows.append(record)
    print(f'[HotpotQA] Canonicalized rows: {len(rows)}')
    return rows

In [None]:
# Run every loader, then merge the results in a fixed order:
# FEVER, TruthfulQA, HotpotQA.
all_rows = []
fever_rows = load_fever_canonical(FEVER_JSONL)
truthful_rows = load_truthfulqa_canonical(TRUTH_ZIP)
hotpot_rows = load_hotpotqa_canonical(HOTPOT_DEV, max_rows=1000)

all_rows += fever_rows + truthful_rows + hotpot_rows

print(
    'Counts:',
    {
        'fever': len(fever_rows),
        'truthfulqa': len(truthful_rows),
        'hotpotqa': len(hotpot_rows),
        'total': len(all_rows),
    },
)

# The canonical file is only (re)written when at least one row was loaded.
if all_rows:
    write_jsonl(all_rows, CANONICAL_OUT)
    print('Wrote canonical JSONL →', CANONICAL_OUT)

In [None]:
from collections import Counter

def length_stats(texts):
    """Summarize whitespace-token counts of the string entries in ``texts``.

    Non-string entries are ignored. Returns a dict with ``count``, ``mean``,
    ``median`` and ``p95`` (95th percentile); all four are 0 when ``texts``
    contains no strings.
    """
    word_counts = np.array([len(str(t).split()) for t in texts if isinstance(t, str)])
    if word_counts.size == 0:
        return {'count':0,'mean':0,'median':0,'p95':0}
    return {
        'count': int(word_counts.size),
        'mean': float(word_counts.mean()),
        'median': float(np.median(word_counts)),
        'p95': float(np.percentile(word_counts, 95)),
    }

# Query-length summary, one row per dataset that actually loaded.
stats = []
for _label, _rows in (('FEVER', fever_rows), ('TruthfulQA', truthful_rows), ('HotpotQA', hotpot_rows)):
    if _rows:
        s = length_stats([r['query'] for r in _rows])
        s.update({'dataset': _label})
        stats.append(s)

# Passing ``columns=`` fixes the column order and still yields an empty frame
# with the right headers when no dataset loaded; selecting the columns from an
# empty frame afterwards would raise KeyError.
summary_df = pd.DataFrame(stats, columns=['dataset','count','mean','median','p95'])
summary_df

In [None]:
# FEVER plots
if fever_rows:
    df = pd.DataFrame(fever_rows)

    # Label balance; rows without a label get their own bar.
    plt.figure()
    df['label'].value_counts(dropna=False).plot(kind='bar')
    plt.title('FEVER label distribution')
    plt.xlabel('Label')
    plt.ylabel('Count')
    plt.tight_layout()
    plt.show()

    # Claim length in whitespace-separated words (missing claims count as 0).
    lengths = df['query'].fillna('').apply(lambda text: len(str(text).split()))
    plt.figure()
    plt.hist(lengths, bins=40)
    plt.title('FEVER claim length (words)')
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()

    # Empirical CDF of the same lengths.
    vals = np.sort(lengths.values)
    cdf = np.arange(1, len(vals)+1) / len(vals)
    plt.figure()
    plt.plot(vals, cdf)
    plt.title('FEVER claim length CDF')
    plt.xlabel('Words')
    plt.ylabel('CDF')
    plt.tight_layout()
    plt.show()

    # NOTE(review): this preview is a bare expression nested inside the `if`,
    # so the notebook probably does not render it - confirm.
    df[['id','query','label']].head(3)

In [None]:
# TruthfulQA plots
if truthful_rows:
    df = pd.DataFrame(truthful_rows)

    # Category is stored inside each row's meta dict; a missing meta is
    # treated as an empty dict.
    cats = df['meta'].apply(lambda meta: (meta or {}).get('category', None))
    top = cats.value_counts(dropna=True).head(15)
    if not top.empty:
        plt.figure()
        top.plot(kind='bar')
        plt.title('TruthfulQA categories (top 15)')
        plt.xlabel('Category')
        plt.ylabel('Count')
        plt.tight_layout()
        plt.show()

    # Question length in whitespace-separated words.
    q_lengths = df['query'].fillna('').apply(lambda text: len(str(text).split()))
    plt.figure()
    plt.hist(q_lengths, bins=40)
    plt.title('TruthfulQA question length (words)')
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()

    # Empirical CDF of the same lengths.
    vals = np.sort(q_lengths.values)
    cdf = np.arange(1, len(vals)+1) / len(vals)
    plt.figure()
    plt.plot(vals, cdf)
    plt.title('TruthfulQA question length CDF')
    plt.xlabel('Words')
    plt.ylabel('CDF')
    plt.tight_layout()
    plt.show()

    # NOTE(review): this preview is a bare expression nested inside the `if`,
    # so the notebook probably does not render it - confirm.
    df[['id','query','answer']].head(3)

In [None]:
# HotpotQA plots (if present)
if hotpot_rows:
    df = pd.DataFrame(hotpot_rows)

    # Question length in whitespace-separated words.
    lengths = df['query'].fillna('').apply(lambda text: len(str(text).split()))
    plt.figure()
    plt.hist(lengths, bins=40)
    plt.title('HotpotQA question length (words)')
    plt.xlabel('Words')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()

    # Number of context entries per question; non-list values count as 0.
    num_ctx = df['contexts'].apply(lambda ctxs: len(ctxs) if isinstance(ctxs, list) else 0)
    plt.figure()
    plt.hist(num_ctx, bins=40)
    plt.title('HotpotQA #contexts per question')
    plt.xlabel('#contexts')
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()

    # NOTE(review): this preview is a bare expression nested inside the `if`,
    # so the notebook probably does not render it - confirm.
    df[['id','query','answer']].head(3)

## Improved Retrieval for TruthfulQA (TF‑IDF over Questions)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Rows whose dataset tag starts with 'truthfulqa' form the searchable corpus.
truth_q_rows = [r for r in truthful_rows if r['dataset'].startswith('truthfulqa')]

if truth_q_rows:
    # A missing question is indexed as an empty document so that row
    # positions in X line up with positions in truth_q_rows.
    questions = [r['query'] or '' for r in truth_q_rows]
    tfidf = TfidfVectorizer(ngram_range=(1,2), stop_words='english')
    X = tfidf.fit_transform(questions)

    def search_truthfulqa(query: str, k: int = 5):
        """Return up to ``k`` indexed questions most similar to ``query``.

        Args:
            query: Free-text question to look up.
            k: Maximum number of hits to return.

        Returns:
            A list of dicts (``score``, ``id``, ``query``, ``answer``,
            ``category``, ``source``) ordered by descending similarity. The
            list is empty for an empty query or a non-positive ``k``. Rows
            with zero similarity are never returned.
        """
        # A negative k would otherwise slice as "all but the last |k|".
        if not query or k <= 0:
            return []
        qv = tfidf.transform([query])
        sims = linear_kernel(qv, X).ravel()  # cosine because TF-IDF vectors are L2-normalized by default
        topk = sims.argsort()[::-1][:k]
        results = []
        for idx in topk:
            if sims[idx] <= 0:
                # Scores are in descending order, so everything after this
                # shares no vocabulary with the query either.
                break
            r = truth_q_rows[idx]
            meta = r.get('meta') or {}
            results.append({
                'score': float(sims[idx]),
                'id': r['id'],
                'query': r['query'],
                'answer': r.get('answer'),
                'category': meta.get('category'),
                'source': meta.get('source')
            })
        return results
else:
    def search_truthfulqa(query: str, k: int = 5):
        """Stub used when no TruthfulQA rows were loaded; always returns an empty list."""
        return []

print('TruthfulQA TF-IDF index built:', len(truth_q_rows), 'questions')

In [None]:
# Run one known TruthfulQA question through the index and print each hit.
demo_q = "What happens to you if you eat watermelon seeds?"
print("DEMO query:", demo_q)
demo_hits = search_truthfulqa(demo_q, k=5)
for item in demo_hits:
    print(f"score={item['score']:.3f} | Q={item['query']}")
    print(f"  → ANSWER: {item['answer']}")
    # The source line is printed only when the hit carries a source.
    has_source = bool(item.get('source'))
    if has_source:
        print(f"  source: {item['source']}")
    print()

### ✅ Why this works better
- We index **questions**, not answers.  
- We use **TF‑IDF with bigrams + stopword removal**, which matches key phrases like *“watermelon seeds”* and avoids generic noise like *“Nothing happens…”*.  
- For HotpotQA (when added), you’ll switch to **document/context retrieval**, which is true RAG. For Workbook‑1, this question‑level retrieval is sufficient to show that the pipeline works end to end and returns accurate matches on the Generic datasets.

**Next steps**:  
- Swap TF‑IDF for **embedding retrieval (BGE)**, add a **reranker**, and then add the **guardrails** (NLI + judge + trust score).