In [1]:
!pip install -q transformers accelerate sentencepiece

import json
from pathlib import Path
from typing import Dict, Any
from tqdm import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


In [2]:
# Input JSONL holding the cleaned chunks, one JSON object per line.
# The absolute /content/... location is environment-specific (looks like a Colab path) — point it at your own file.
CLEANED_PATH = Path("/content/cleaned.jsonl")   # change this
# Output JSONL for the refined rows; a relative path, so it resolves against the current working directory.
OUTPUT_PATH = Path("refined.jsonl")


In [3]:
!pip install -q huggingface_hub

from huggingface_hub import login
# Prompts interactively for a Hugging Face access token.
# Enter the token only in the prompt: never paste it into the notebook source, a comment, or a saved output.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# NOTE: google/gemma-2-2b-it is small but it is a *gated* repo: loading it fails with a 401
# "gated repo" error unless access has been granted on its model page and the session is
# authenticated with the Hub.
# TinyLlama/TinyLlama-1.1B-Chat-v1.0 was tried as an alternative but was too slow (~2:30 min for 1 chunk).
model_name = "google/gemma-2-2b-it"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# float16 when a CUDA device is available, float32 otherwise; device placement is delegated to device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)
# Inference only: switch the model to evaluation mode.
model.eval()

# call_llm passes tokenizer.pad_token_id to generate(), so fall back to the EOS token
# when the tokenizer does not define a padding token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/google/gemma-2-2b-it.
401 Client Error. (Request ID: Root=1-6916e7a7-5764cbb0500100430e9bef62;28ef9862-25ee-428c-8499-a057d77c5b4d)

Cannot access gated repo for url https://huggingface.co/google/gemma-2-2b-it/resolve/main/config.json.
Access to model google/gemma-2-2b-it is restricted. You must have access to it and be authenticated to access it. Please log in.

In [None]:
def build_prompt(chunk_text: str, rule_label: str) -> str:
    """Render the classification/extraction prompt for a single report passage.

    Args:
        chunk_text: The passage text, embedded between triple double-quotes.
        rule_label: The preliminary rule-based label, quoted in the prompt as a hint.

    Returns:
        The full prompt string, which asks for a pure-JSON answer with the keys
        "predicted_label", "extracted_original" and "canonical_rewrite".
    """
    # Plain template filled through str.format: doubled braces render as the
    # literal braces of the JSON skeleton, single-brace names are substituted.
    template = """
You are an expert in corporate strategy and annual reports.

You receive a passage extracted from a company's annual report. It may contain information about:
- Mission: why the organization exists, its purpose.
- Vision: what the organization wants to become or achieve in the future.
- Goals: specific objectives or targets to achieve.
- Strategy: how the organization plans to achieve its goals or competitive advantage.
- Core Values: guiding principles or beliefs.
- Other: none of the above.

The passage was preliminarily labeled (using rules) as: "{rule_label}". This label may be correct or incorrect.

TASKS:
1. Decide which SINGLE label fits this passage best:
   ["Mission", "Vision", "Goals", "Strategy", "Core Values", "Other"].
2. From the passage, extract only the most relevant sentence(s) that express this element.
3. Optionally, rewrite it in a concise, clear form while keeping the original meaning.
4. If the passage is not actually about mission, vision, goals, strategy, or core values, choose "Other" and leave the extracted and canonical fields empty.

Passage:
\"\"\"{chunk_text}\"\"\"

Respond in pure JSON (no explanation, no markdown), exactly in this structure:

{{
  "predicted_label": "Mission | Vision | Goals | Strategy | Core Values | Other",
  "extracted_original": [
    "first key sentence",
    "second key sentence"
  ],
  "canonical_rewrite": "short clean version, or empty string if not applicable"
}}
"""
    return template.format(chunk_text=chunk_text, rule_label=rule_label)


In [None]:
def call_llm(prompt: str, max_new_tokens: int = 256) -> str:
    """
    Call the loaded HF model with a plain text prompt and return only the newly generated text.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        prompt: Prompt text. It is tokenized with truncation to at most 2000 tokens.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The decoded continuation (special tokens removed, surrounding whitespace stripped),
        without the prompt. Sampling is enabled, so repeated calls may return different text.
    """
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=2000,
    ).to(model.device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=0.4,
            top_p=0.9,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    # For a causal LM, generate() returns the prompt tokens followed by the new tokens.
    # Drop the prompt at the token level: comparing decoded text against the original
    # prompt string is unreliable, because a truncated prompt or a tokenizer that does not
    # round-trip the text exactly would leave the prompt (and its JSON example) in the result.
    prompt_token_count = inputs["input_ids"].shape[1]
    new_token_ids = output_ids[0][prompt_token_count:]

    return tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()


In [None]:
def parse_llm_json(raw: str) -> Dict[str, Any]:
    """Parse the model's reply into a dict, tolerating text around the JSON object.

    Strategy:
      1. Parse the whole reply as JSON and accept it only if it is an object (dict).
      2. Otherwise decode the first JSON object starting at the first '{', ignoring
         anything after it (markdown fences, commentary, a second object).
      3. If the reply contains no '{' ... '}' pair at all, return the minimal default.

    Args:
        raw: Raw text returned by the model.

    Returns:
        A dict. The minimal default is
        {"predicted_label": "Other", "extracted_original": [], "canonical_rewrite": ""}.

    Raises:
        json.JSONDecodeError: If the reply contains braces but the text starting at the
            first '{' is not a valid JSON object.
    """
    raw = raw.strip()

    # Direct parse. Valid JSON that is not an object (a list, string, number, ...) is
    # rejected here so callers can always rely on dict methods such as .get().
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, dict):
        return parsed

    start = raw.find("{")
    end = raw.rfind("}")
    if start != -1 and end > start:
        # raw_decode stops at the end of the first complete JSON value, so trailing text
        # after the object (even text containing braces) no longer breaks parsing.
        # The value starts with '{', so a successful decode is always a dict.
        obj, _ = json.JSONDecoder().raw_decode(raw[start:])
        return obj

    # Fallback: minimal default
    return {
        "predicted_label": "Other",
        "extracted_original": [],
        "canonical_rewrite": ""
    }


In [None]:
def iter_cleaned_rows(path: Path):
    """Lazily yield one parsed JSON value per non-blank line of a UTF-8 JSONL file.

    Lines are stripped of surrounding whitespace first; lines that are empty after
    stripping are skipped. A line that is not valid JSON raises json.JSONDecodeError.
    """
    with path.open("r", encoding="utf-8") as handle:
        stripped_lines = (raw_line.strip() for raw_line in handle)
        yield from (json.loads(entry) for entry in stripped_lines if entry)


def refine_chunks(
    input_path: Path,
    output_path: Path,
    limit: int = None  # for testing; None = all
):
    """Run every cleaned chunk through the LLM and write the refined rows as JSONL.

    For each input row the prompt is built from row["text"] and the rule-based label
    (row["label"], defaulting to "Other"). If the model call or the parsing of its reply
    fails, the error is printed and the row falls back to the rule label with empty
    extraction fields, so one bad reply does not abort the run.

    Args:
        input_path: JSONL file of cleaned chunks; every row must contain a "text" key.
        output_path: Destination JSONL file. It is opened in write mode, so any
            existing content is replaced.
        limit: Maximum number of rows to process; None processes all rows.
    """
    with output_path.open("w", encoding="utf-8") as out:
        for i, row in enumerate(tqdm(iter_cleaned_rows(input_path))):
            if limit is not None and i >= limit:
                break

            text = row["text"]
            rule_label = row.get("label", "Other")

            prompt = build_prompt(text, rule_label)

            try:
                raw_resp = call_llm(prompt)
                parsed = parse_llm_json(raw_resp)
                # A reply that is valid JSON but not an object (e.g. a bare list or string)
                # would break the .get() calls below; route it to the fallback instead.
                if not isinstance(parsed, dict):
                    raise TypeError(
                        f"expected a JSON object, got {type(parsed).__name__}"
                    )
            except Exception as e:
                print(f"Error on row {i}: {e}")
                parsed = {
                    "predicted_label": rule_label,
                    "extracted_original": [],
                    "canonical_rewrite": ""
                }

            refined_row = {
                # original metadata
                "pdf": row.get("pdf"),
                "page": row.get("page"),
                "rule_label": rule_label,
                "text": text,
                # LLM outputs
                "predicted_label": parsed.get("predicted_label", "Other"),
                "extracted_original": parsed.get("extracted_original", []),
                "canonical_rewrite": parsed.get("canonical_rewrite", "")
            }

            out.write(json.dumps(refined_row, ensure_ascii=False) + "\n")
            # Each row can take a long time to produce; flush so finished rows reach the
            # file even if the run is interrupted later.
            out.flush()


# Smoke test first: process only the first 2 chunks to check the pipeline end to end
# before committing to a full run.
refine_chunks(CLEANED_PATH, OUTPUT_PATH, limit=2)


In [None]:
!head -n 5 refined.jsonl


In [None]:
# Full run over every row (limit=None). The output file is opened in write mode,
# so this replaces the rows written by the 2-chunk test run.
refine_chunks(CLEANED_PATH, OUTPUT_PATH, limit=None)
