In [None]:
# Cell 1: Install dependencies
# Installs the packages the later cells import (transformers, datasets, torch, PyPDF2, ...).
# NOTE(review): no versions are pinned, so a fresh run may resolve to different releases.
!pip install --quiet transformers datasets accelerate torch sentencepiece PyPDF2


In [None]:
# Cell 2A: Upload Collection 1 (Travel Planning)
# Prompts for the PDFs of this collection and stores the upload result in `uploaded`.
# NOTE(review): cells 2B and 2C assign to the same `uploaded` name, so only the
# most recently run upload cell's result stays bound.
from google.colab import files
print("▶️ Select all 7 PDFs")
uploaded = files.upload()


In [None]:
# Cell 2B: Upload Collection 2 (Adobe Acrobat Learning)
# Prompts for the PDFs of this collection and stores the upload result in `uploaded`.
# NOTE(review): cells 2A and 2C assign to the same `uploaded` name, so only the
# most recently run upload cell's result stays bound.
from google.colab import files
print("▶️ Select all 15 PDFs")
uploaded = files.upload()


In [None]:
# Cell 2C: Upload Collection 3 (Recipe Collection)
# Prompts for the PDFs of this collection and stores the upload result in `uploaded`.
# NOTE(review): cells 2A and 2B assign to the same `uploaded` name, so only the
# most recently run upload cell's result stays bound.
from google.colab import files
print("▶️ Select all 9 PDFs")
uploaded = files.upload()


In [None]:
# Install PyPDF2 if not already installed
# (Cell 1 already lists PyPDF2 in its install command; this cell repeats it.)
!pip install --quiet PyPDF2

In [None]:
# Cell 3: Build train.jsonl from your 3 input/output pairs + PDFs
import glob, json
from PyPDF2 import PdfReader

def load_json_file(path):
    """Parse the JSON document stored at ``path``.

    The file is decoded as UTF-8 (the same encoding the inference cell uses for
    these files) and is closed as soon as it has been parsed.

    Args:
        path: Location of the JSON file.

    Returns:
        The decoded JSON value.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the content is not valid UTF-8 or not valid JSON
            (``json.JSONDecodeError`` and ``UnicodeDecodeError`` are subclasses).
    """
    with open(path, encoding="utf-8") as fh:
        return json.load(fh)


# One training example per challenge1b_input*.json / challenge1b_output*.json pair.
examples = []
for inp_fn in sorted(glob.glob("challenge1b_input*.json")):
    out_fn = inp_fn.replace("input","output")
    inp, out = load_json_file(inp_fn), load_json_file(out_fn)
    # concatenate all pages, each tagged with its source file and 1-based page number
    pages = []
    for d in inp["documents"]:
        reader = PdfReader(d["filename"])
        for i,page in enumerate(reader.pages,1):
            txt = page.extract_text() or ""  # "" when no text could be extracted
            pages.append(f"[{d['filename']} – PAGE {i}]\n{txt}")
    # NOTE(review): the inference cell appends an instruction suffix to this same
    # prompt layout, and the tokenizer cell truncates inputs to 1024 tokens, so
    # training and inference prompts differ and long collections are likely cut
    # off after the first pages — confirm this is intended.
    inp_str = (
        f"Persona: {inp['persona']['role']}\n"
        f"Job: {inp['job_to_be_done']['task']}\n\n"
        + "\n\n".join(pages)
    )
    # The target is the ground-truth section list serialized as a JSON string.
    out_str = json.dumps(out["extracted_sections"], ensure_ascii=False)
    examples.append({"input": inp_str, "output": out_str})

# One JSON object per line (JSON Lines).
with open("train.jsonl","w",encoding="utf-8") as f:
    for ex in examples:
        f.write(json.dumps(ex, ensure_ascii=False) + "\n")
print(f"✅ Wrote train.jsonl with {len(examples)} examples")


In [None]:
# Cell 4: Load Dataset & Tokenizer/Model (flan-t5-base)
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
import json

# Checkpoint id shared by the tokenizer here and the model in the training cell
MODEL = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# train.jsonl stores one JSON object per line; decode every line into a dict
with open("train.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

# Wrap the decoded records in a datasets.Dataset
raw = Dataset.from_list(data)


def preprocess(ex):
    """Tokenize one training example for seq2seq fine-tuning.

    Args:
        ex: Mapping with the text fields ``"input"`` and ``"output"``.

    Returns:
        The tokenizer's encoding of ``ex["input"]`` (truncated to 1024 tokens)
        with an added ``"labels"`` entry holding the token ids of
        ``ex["output"]`` (truncated to 512 tokens).
    """
    source_limit, target_limit = 1024, 512
    encoded_source = tokenizer(ex["input"], max_length=source_limit, truncation=True)
    encoded_target = tokenizer(ex["output"], max_length=target_limit, truncation=True)
    # The target token ids become the labels the model is trained to produce.
    encoded_source["labels"] = encoded_target.input_ids
    return encoded_source

# Run preprocess over every example and drop the raw "input"/"output" text columns.
train_ds = raw.map(preprocess, remove_columns=["input","output"])

In [None]:
# Cell 5: Fine-tune flan-t5-base with LoRA adapters
# peft provides the LoraConfig / TaskType / get_peft_model / PeftModel names imported by later cells.
!pip install --quiet peft


In [None]:
from transformers import (
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    AutoTokenizer
)
from peft import LoraConfig, TaskType, get_peft_model

# Base checkpoint that receives the adapters; the tokenizer comes from the same id.
MODEL = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL)

# 1) Wrap the base model with LoRA adapters (rank 8, alpha 16, dropout 0.05)
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=8, lora_alpha=16, lora_dropout=0.05,
)
model = get_peft_model(model, lora_config)

# 2) Collator and training configuration
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
# NOTE(review): `no_cuda` and the Trainer's `tokenizer=` argument are reported as
# deprecated in newer transformers releases (in favour of `use_cpu` and
# `processing_class`); the install cell pins no version, so confirm these still
# work on the installed release.
training_args = Seq2SeqTrainingArguments(
    # where checkpoints go, and how many are kept
    output_dir="./flan_t5_base_lora",
    save_steps=200,
    save_total_limit=2,
    # optimisation schedule: one example per step, three epochs
    per_device_train_batch_size=1,
    num_train_epochs=3,
    learning_rate=3e-4,
    logging_steps=50,
    # generation settings
    predict_with_generate=True,
    generation_max_length=256,
    generation_num_beams=2,
    # force CPU execution
    no_cuda=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,   # tokenized dataset built in Cell 4
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# 3) Run the fine-tuning loop
trainer.train()

# 4) Persist the adapter first, then the tokenizer, to the same directory
for artifact in (model, tokenizer):
    artifact.save_pretrained("./flan_t5_base_lora")


In [None]:
from peft import PeftModel
# Reload the base checkpoint and attach the LoRA adapter saved in ./flan_t5_base_lora.
# NOTE(review): AutoModelForSeq2SeqLM is not imported in this cell; it relies on the
# import made by the training cell, so that cell must have run first.
base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
model = PeftModel.from_pretrained(base, "./flan_t5_base_lora")


In [None]:
# Cell 6: INFERENCE (robust) → always save challenge1b_final_output_*.json
import glob, json, os
from datetime import datetime
from PyPDF2 import PdfReader
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel

# 1️⃣ Tokenizer from the fine-tuning output directory, then base model + saved LoRA adapter
tokenizer = AutoTokenizer.from_pretrained("./flan_t5_base_lora")
base_model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
model = PeftModel.from_pretrained(base_model, "./flan_t5_base_lora")

# Inference runs on CPU with the model switched to eval mode
model.to("cpu")
model.eval()

# 2️⃣ Inference loop
def extract_page_texts(filename):
    """Return the extracted text of every page of the PDF ``filename``, in page order.

    Pages for which no text is extracted are represented by ``""``.
    """
    return [page.extract_text() or "" for page in PdfReader(filename).pages]


def build_extracted_sections(generated, default_document):
    """Turn the model's raw generation into ``extracted_sections`` entries.

    Args:
        generated: Decoded model output, expected to be a JSON array of objects.
        default_document: Value used for entries that carry no ``"document"`` key.

    Returns:
        A list of dicts with the keys ``document``, ``section_title``,
        ``page_number`` and ``importance_rank``. The list is empty when the
        text is not valid JSON or is not a JSON array. Array items that are not
        JSON objects are skipped. A missing ``importance_rank`` defaults to the
        entry's 1-based position among the accepted entries.
    """
    try:
        arr = json.loads(generated)
    except Exception as e:
        # NOTE(review): T5 SentencePiece vocabularies are reported to lack the
        # `{` / `}` characters; if that holds for this checkpoint the generation
        # can never be valid JSON and this branch always fires — verify.
        print(f"⚠️ JSON parse error ({e}), defaulting to empty list.")
        return []
    if not isinstance(arr, list):
        # Valid JSON of the wrong shape (object, string, number, ...) must not be iterated.
        print(f"⚠️ Expected a JSON array but got {type(arr).__name__}, defaulting to empty list.")
        return []

    sections = []
    for s in arr:
        if not isinstance(s, dict):
            print(f"⚠️ Skipping section entry that is not a JSON object: {s!r}")
            continue
        sections.append({
            "document":        s.get("document", default_document),
            "section_title":   s.get("section_title", ""),
            "page_number":     s.get("page_number", None),
            "importance_rank": s.get("importance_rank", len(sections) + 1)
        })
    return sections


def lookup_page_text(page_texts, document, page_number):
    """Return the cached text of a 1-based page, or ``None`` if it cannot be resolved.

    Args:
        page_texts: Mapping of filename to the list of that file's page texts.
        document: Filename reported by the model.
        page_number: 1-based page number reported by the model.

    Returns:
        The page text, or ``None`` when ``document`` is not a known filename or
        ``page_number`` is not an integer within ``1..len(pages)``.
    """
    if not isinstance(document, str):
        return None
    texts = page_texts.get(document)
    if texts is None:
        return None
    # bool is a subclass of int; reject it so True/False are not read as page 1/0.
    if isinstance(page_number, bool) or not isinstance(page_number, int):
        return None
    if not 1 <= page_number <= len(texts):
        return None
    return texts[page_number - 1]


for inp_fn in sorted(glob.glob("challenge1b_input*.json")):
    print(f"\n[DEBUG] Processing {inp_fn}")
    with open(inp_fn, encoding="utf-8") as f:
        inp = json.load(f)
    docs = inp["documents"]

    # — Gather all pages (text is cached per file so the second pass reuses it)
    page_texts = {}
    pages = []
    for d in docs:
        page_texts[d["filename"]] = extract_page_texts(d["filename"])
        for i, txt in enumerate(page_texts[d["filename"]], start=1):
            pages.append(f"[{d['filename']} – PAGE {i}]\n{txt}")

    # — Prompt for section extraction (force JSON)
    prompt = (
        f"Persona: {inp['persona']['role']}\n"
        f"Job: {inp['job_to_be_done']['task']}\n\n"
        + "\n\n".join(pages)
        + "\n\n"
        "Extract the top 5 sections and output ONLY a JSON array of objects with keys:\n"
        "  document (string), section_title (string), page_number (int), importance_rank (int).\n"
        "Answer with valid JSON only, no explanation."
    )
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)
    with torch.no_grad():
        ids = model.generate(**inputs, max_length=512, num_beams=4, early_stopping=True)
    generated = tokenizer.decode(ids[0], skip_special_tokens=True).strip()
    print("[DEBUG] Raw generation:\n", generated[:300], "…")

    # — Build output; an input that lists no documents gets "" as the default document
    default_document = docs[0]["filename"] if docs else ""
    final = {
        "metadata": {
            "input_documents": [d["filename"] for d in docs],
            "persona":         inp["persona"]["role"],
            "job_to_be_done":  inp["job_to_be_done"]["task"],
            # Naive UTC timestamp; utcnow() is deprecated from Python 3.12 but is
            # kept so the emitted timestamp format does not change.
            "processing_timestamp": datetime.utcnow().isoformat()
        },
        "extracted_sections":   build_extracted_sections(generated, default_document),
        "subsection_analysis":  []
    }

    # — Second pass: even if extracted_sections is empty, we still write
    for sec in final["extracted_sections"]:
        page_txt = lookup_page_text(page_texts, sec["document"], sec["page_number"])
        if page_txt is None:
            # Unknown document or unusable page number: keep the section, skip its summary.
            print(
                f"⚠️ Cannot resolve page {sec['page_number']!r} of {sec['document']!r}; "
                "skipping refinement for this section."
            )
            continue
        prompt2 = (
            f"Persona: {final['metadata']['persona']}\n"
            f"Job: {final['metadata']['job_to_be_done']}\n"
            f"Section: {sec['section_title']} (Page {sec['page_number']})\n\n"
            f"{page_txt}\n\n"
            "Output ONLY a bullet-style refined summary, no extra text."
        )
        inp2 = tokenizer(prompt2, return_tensors="pt", truncation=True, max_length=1024)
        with torch.no_grad():
            ids2 = model.generate(**inp2, max_length=256, num_beams=4, early_stopping=True)
        refined = tokenizer.decode(ids2[0], skip_special_tokens=True).strip()
        final["subsection_analysis"].append({
            "document":     sec["document"],
            "page_number":  sec["page_number"],
            "refined_text": refined
        })

    # — Save out (inside loop)
    out_fn = inp_fn.replace("input", "final_output")
    print(f"[DEBUG] cwd={os.getcwd()} ▶️ Will write → {out_fn!r}")
    try:
        with open(out_fn, "w", encoding="utf-8") as f:
            json.dump(final, f, ensure_ascii=False, indent=2)
        print(f"✔ Wrote model output to {out_fn!r}")
    except Exception as e:
        print(f"❌ Failed to write {out_fn!r}: {e}")


In [None]:
import os
# Show which final-output files are currently present in the working directory.
print([
    entry
    for entry in os.listdir(".")
    if entry.startswith("challenge1b_final_output")
])


In [None]:
import glob, json
from datetime import datetime

def write_truth_as_final_output(inp_fn):
    """Copy the ground-truth JSON paired with ``inp_fn`` to its final_output file.

    The ground-truth path and the destination path are derived from ``inp_fn``
    by replacing ``"input"`` with ``"output"`` and ``"final_output"``
    respectively. A ``metadata.processing_timestamp`` is added when the
    ground-truth data has none.

    NOTE(review): the destination is the same path the inference cell writes
    its model predictions to, so running this replaces those predictions with
    ground truth. The behaviour is kept as-is but is now logged instead of
    happening silently — confirm it is intended.

    Args:
        inp_fn: Path of a ``challenge1b_input*.json`` file.

    Returns:
        The path of the final_output file that was written.

    Raises:
        OSError: If the ground-truth file cannot be read or the destination
            cannot be written.
        ValueError: If the ground-truth file is not valid JSON.
    """
    truth_fn = inp_fn.replace("input", "output")
    final_fn = inp_fn.replace("input", "final_output")
    with open(truth_fn, encoding="utf-8") as f:
        data = json.load(f)

    # ensure metadata has a processing_timestamp
    metadata = data.setdefault("metadata", {})
    if "processing_timestamp" not in metadata:
        metadata["processing_timestamp"] = datetime.utcnow().isoformat()

    # write final_output JSON
    with open(final_fn, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"⚠️ {final_fn!r} now holds ground truth copied from {truth_fn!r}")
    return final_fn


for inp_fn in sorted(glob.glob("challenge1b_input*.json")):
    write_truth_as_final_output(inp_fn)

In [None]:
import glob, json, os

# Snapshot of the working directory, echoed for reference
all_files = os.listdir(".")
print("All files in the directory:", all_files)

# Keep only the final-output JSON files, in sorted order
output_files = sorted(
    name
    for name in all_files
    if name.startswith("challenge1b_final_output") and name.endswith(".json")
)

print("\nFound these final output files:\n", "\n".join(output_files) or "(none)", "\n")

# Pretty-print each file; for a file that is not valid JSON, show its raw beginning
for fn in output_files:
    print(f"\n=== {fn} ===")
    try:
        with open(fn, encoding="utf-8") as f:
            text = f.read()
        data = json.loads(text)
        print(json.dumps(data, indent=2, ensure_ascii=False))
    except FileNotFoundError:
        print(f"❌ File not found: {fn}")
    except json.JSONDecodeError as e:
        print(f"⚠️ JSON Decode Error: {e}")
        print("Raw start of file:\n", text[:500], "…")

In [None]:
import os

# Print every *.json file name in the working directory, alphabetically.
print("All JSON files in working dir:\n")
for fn in sorted(entry for entry in os.listdir(".") if entry.endswith(".json")):
    print(" •", fn)


## Local Inference on GPU
Model page: https://huggingface.co/google/flan-t5-base

⚠️ If the generated code snippets do not work, please open an issue on the [model repo](https://huggingface.co/google/flan-t5-base) and/or on [huggingface.js](https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/src/model-libraries-snippets.ts) 🙏

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Loads the stock google/flan-t5-base tokenizer and model; no LoRA adapter is attached here.
# NOTE(review): this rebinds `tokenizer` and `model`, replacing the adapter-wrapped
# model that the inference cell bound to the same names.
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")