### Setting cache to be within our project folder

In [None]:
!nvidia-smi
!export HF_HOME="/home/jovyan/groupshare/lukelo_duoaa_project"
!export PYTHONPYCACHEPREFIX="/home/jovyan/groupshare/lukelo_duoaa_project"
!export PIP_CACHE_DIR="/home/jovyan/groupshare/lukelo_duoaa_project"
!export JUPYTER_RUNTIME_DIR="/home/jovyan/groupshare/lukelo_duoaa_project"

## LLM as a JUDGE

In [None]:
import os
import glob
import json
import re

import torch
import transformers
import pandas as pd
from tqdm.auto import tqdm
import os
from dotenv import load_dotenv
# Load variables via python-dotenv, then read the Hugging Face access token
# from the environment (None when 'HF_token' is not set).
load_dotenv()

HF_TOKEN = os.environ.get('HF_token')

# ======================================================
# 1. Judge prompt (with 1–4 scale: Bad…Excellent)
# ======================================================

# System message sent to the judge model: fixes its role as an evaluator and
# demands strict adherence to the requested output format.
JUDGE_SYSTEM = " ".join(
    [
        "You are a strict, careful evaluator of autonomous driving scene descriptions.",
        "You must follow the instructions exactly and output in the required format.",
    ]
)

# User-message template for the judge model. It is filled with str.format()
# using the named fields {annotations}, {scene_desc} and {object_desc}, so any
# literal curly brace added to this text must be doubled.
# The response format requested at the end ("Hallucinations:", "Safety
# relevance:", "Overall quality:", "Short explanation:") is what
# parse_judge_output searches for; keep the two in sync when editing.
JUDGE_USER_TEMPLATE = """
You are evaluating descriptions of autonomous driving scenes.

You are given:
- Ground-truth object annotations from the dataset for ONE frame.
- A VLM scene description.
- A VLM object-focused description.

Use ONLY the information in the ground-truth annotations and the two VLM descriptions.
Ignore any prior knowledge about driving scenes.

Ground-truth annotations (list of objects for this frame):
{annotations}

VLM scene description:
{scene_desc}

VLM object description:
{object_desc}

You must give 3 ratings. For EACH rating, use the following 4-point scale:

1 = Bad
    - Very poor, mostly wrong or unhelpful
2 = Could be Improved
    - Some relevant content but many issues or missing pieces
3 = Acceptable
    - Generally okay and usable, but still clearly improvable
4 = Excellent
    - Clear, relevant, and strong according to the definition below

Apply this scale to the following criteria:

1) Hallucinations
“Does the VLM description introduce any objects or events that are not present in the ground-truth annotations of this sample?”
- 1 (Bad) = many hallucinated objects / events
- 2 (Could be Improved) = several hallucinations or serious mistakes
- 3 (Acceptable) = a few minor hallucinations
- 4 (Excellent) = no obvious hallucinations

(Rate this based on BOTH the VLM scene description and the VLM object description.)

2) Safety relevance
“How well does the description focus on safety-relevant elements: vehicles and pedestrians?”
- 1 (Bad) = mostly irrelevant, misses safety-critical stuff
- 2 (Could be Improved) = mentions a few relevant things but misses many
- 3 (Acceptable) = mentions some important vehicles / pedestrians
- 4 (Excellent) = clearly emphasizes safety-critical elements (vehicles and pedestrians)

(Rate this based on the VLM object description.)

3) Overall quality
“Overall, how good is this description as a human explanation of the scene, independent of how factually accurate it is?”
- 1 (Bad) = poor, confusing or very unhelpful
- 2 (Could be Improved) = partially understandable, but weak overall
- 3 (Acceptable) = okay / usable but incomplete
- 4 (Excellent) = very good, clear, and easy to understand

(Rate this based on the VLM scene description.)

IMPORTANT:
- Use ONLY integer scores from 1 to 4.
- Be consistent with the scale definition.

Respond EXACTLY in the following format:

Feedback:::
Hallucinations: <integer 1-4>
Safety relevance: <integer 1-4>
Overall quality: <integer 1-4>
Short explanation: <one or two sentences explaining your ratings>
"""

# ======================================================
# 2. Helpers: build prompt & parse output
# ======================================================
from collections import Counter

def extract_category_names(annotations):
    """
    Collect the distinct ``category_name`` values found in ``annotations``.

    Each annotation is queried with ``.get("category_name", "unknown")``, so
    entries lacking the key contribute ``"unknown"``. Duplicates are dropped
    and the order of first appearance is kept. A falsy ``annotations``
    (``None`` or empty) yields ``[]``.

    Example: ["human.pedestrian.adult", "vehicle.car"]
    """
    if not annotations:
        return []

    names = [entry.get("category_name", "unknown") for entry in annotations]

    # Order-preserving de-duplication: keep only the first occurrence.
    already_seen = set()
    ordered_unique = []
    for name in names:
        if name not in already_seen:
            already_seen.add(name)
            ordered_unique.append(name)

    return ordered_unique

    
def build_judge_prompt(sample: dict) -> str:
    """
    Fill JUDGE_USER_TEMPLATE with the contents of one JSON sample.

    Keys read from ``sample`` (a missing key falls back to its default):
      - "annotations" (list, default []): reduced to its unique category
        names via extract_category_names before insertion
      - "vlm_scene_description" (str, default "")
      - "vlm_object_description" (str, default "")
    """
    category_names = extract_category_names(sample.get("annotations", []))

    # The category list is interpolated as-is, i.e. rendered with its Python
    # list representation inside the prompt.
    template_fields = {
        "annotations": category_names,
        "scene_desc": sample.get("vlm_scene_description", ""),
        "object_desc": sample.get("vlm_object_description", ""),
    }
    return JUDGE_USER_TEMPLATE.format(**template_fields)


def _extract_int_field(label: str, text: str):
    """
    Extract integer 1–4 after a label like 'Hallucinations:' from the judge output.
    """
    pattern = rf"{label}\s*:\s*([1-4])"
    m = re.search(pattern, text, flags=re.IGNORECASE)
    return int(m.group(1)) if m else None


def parse_judge_output(text: str) -> dict:
    """
    Turn the raw judge reply into a dict of scores plus the explanation.

    Returned keys: "hallucinations", "safety_relevance", "overall_quality"
    (each as produced by _extract_int_field), "short_explanation" (everything
    after "Short explanation:", stripped, or None if that label is absent)
    and "raw_judge_output" (the unmodified input text).
    """
    parsed = {
        key: _extract_int_field(label, text)
        for key, label in (
            ("hallucinations", "Hallucinations"),
            ("safety_relevance", "Safety relevance"),
            ("overall_quality", "Overall quality"),
        )
    }

    # DOTALL lets the explanation span several lines up to the end of text.
    explanation_match = re.search(
        r"Short explanation\s*:\s*(.+)", text, flags=re.IGNORECASE | re.DOTALL
    )
    if explanation_match:
        parsed["short_explanation"] = explanation_match.group(1).strip()
    else:
        parsed["short_explanation"] = None

    parsed["raw_judge_output"] = text
    return parsed

# ======================================================
# 3. Call Llama 3 via transformers.pipeline
# ======================================================

def call_llm_judge_pipeline(
    prompt: str,
    pipe,
    temperature: float = 0.2,
    max_new_tokens: int = 300,
) -> str:
    """
    Use a chat text-generation pipeline to get the judge's answer for a prompt.

    Parameters
    ----------
    prompt : str
        The user message (normally produced by build_judge_prompt).
    pipe
        A transformers "text-generation" pipeline exposing ``.tokenizer``.
    temperature : float
        Sampling temperature. A value <= 0 switches to greedy decoding.
    max_new_tokens : int
        Upper bound on the number of generated tokens.

    Returns
    -------
    str
        The "content" of the last message in the generated conversation.
    """
    messages = [
        {"role": "system", "content": JUDGE_SYSTEM},
        {"role": "user", "content": prompt},
    ]

    tokenizer = pipe.tokenizer

    # Stop on the regular EOS token and, when the tokenizer really has it, on
    # the "<|eot_id|>" token. A tokenizer without that token may return None
    # or its unknown-token id for the lookup; neither is a valid stop id.
    terminators = []
    if tokenizer.eos_token_id is not None:
        terminators.append(tokenizer.eos_token_id)
    eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
    unk_id = getattr(tokenizer, "unk_token_id", None)
    if eot_id is not None and eot_id != unk_id and eot_id not in terminators:
        terminators.append(eot_id)

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if terminators:
        generation_kwargs["eos_token_id"] = terminators

    if temperature > 0:
        generation_kwargs.update(
            do_sample=True,
            temperature=temperature,
            top_p=0.9,
        )
    else:
        # Greedy decoding: sampling-only parameters are left out because
        # temperature=0 is not a usable sampling value.
        generation_kwargs["do_sample"] = False

    outputs = pipe(messages, **generation_kwargs)

    # With chat-style input, generated_text is the list of messages; the last
    # entry is the assistant's reply.
    assistant_msg = outputs[0]["generated_text"][-1]
    return assistant_msg["content"]

# ======================================================
# 4. Main: run_llm_judge over a folder
# ======================================================

def run_llm_judge(
    input_dir: str,
    model_id: str = "meta-llama/Meta-Llama-3-8B-Instruct",
    output_csv: str = None,
    max_files: int = None,
    temperature: float = 0.2,
    max_new_tokens: int = 300,
):
    """
    Run LLM-as-a-judge over all JSON samples in a folder.

    Parameters
    ----------
    input_dir : str
        Directory containing per-sample JSON files (e.g., "JsonFiles/").
    model_id : str
        Hugging Face model ID, e.g. "meta-llama/Meta-Llama-3-8B-Instruct".
    output_csv : str or None
        If not None, save the resulting DataFrame to this path.
    max_files : int or None
        If not None, only process the first `max_files` JSON files
        (in sorted path order).
    temperature : float
        Sampling temperature for the judge model (0.0 = deterministic).
    max_new_tokens : int
        Max new tokens for the judge output.

    Returns
    -------
    df : pandas.DataFrame
        One row per successfully loaded JSON sample with judge scores and raw
        output. Empty when no JSON files are found. Samples whose model call
        fails are kept with empty scores so the failure is visible.
    """

    # 1) Collect JSON files
    json_paths = sorted(glob.glob(os.path.join(input_dir, "*.json")))
    if max_files is not None:
        json_paths = json_paths[:max_files]

    if not json_paths:
        print(f"No JSON files found in {input_dir}")
        return pd.DataFrame()

    print(f"Found {len(json_paths)} JSON files in {input_dir}")
    print(f"Loading model: {model_id}")

    # 2) Load the judge pipeline ONCE; it is reused for every sample
    pipe = transformers.pipeline(
        "text-generation",
        model=model_id,
        model_kwargs={"torch_dtype": torch.bfloat16},
        device_map="auto",
        token=HF_TOKEN,
    )

    print("Model loaded.")

    # 3) Loop over files and judge
    rows = []
    for path in tqdm(json_paths, desc="Judging samples"):
        try:
            with open(path, "r", encoding="utf-8") as f:
                sample = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both malformed JSON and undecodable bytes.
            print(f"Error loading {path}: {e}")
            continue

        if not isinstance(sample, dict):
            print(f"Error loading {path}: expected a JSON object at top level")
            continue

        prompt = build_judge_prompt(sample)

        # Best effort: a failing model call must not abort the whole run.
        try:
            judge_text = call_llm_judge_pipeline(
                prompt,
                pipe,
                temperature=temperature,
                max_new_tokens=max_new_tokens,
            )
        except Exception as e:
            print(f"Error calling model on {path}: {e}")
            judge_text = ""

        scores = parse_judge_output(judge_text)

        rows.append(
            {
                "file": os.path.basename(path),
                "scene_name": sample.get("scene_name"),
                "sample_index": sample.get("sample_index"),
                "hallucinations": scores["hallucinations"],
                "safety_relevance": scores["safety_relevance"],
                "overall_quality": scores["overall_quality"],
                "short_explanation": scores["short_explanation"],
                "raw_judge_output": scores["raw_judge_output"],
            }
        )

    df = pd.DataFrame(rows)

    if output_csv is not None:
        df.to_csv(output_csv, index=False)
        print(f"Saved {len(df)} rows to {output_csv}")

    return df


In [None]:
# Trial run of the judge on the first two JSON samples only.
run_llm_judge(
    "/home/jovyan/OpenEMMA/data/JSON_file",
    max_files=2,
    model_id="meta-llama/Meta-Llama-3-8B-Instruct",
    temperature=0.2,
    max_new_tokens=300,
    output_csv="/home/jovyan/OpenEMMA/data/trial_llm_as_judge",
)

In [None]:
import os

# HF_HOME has to be in the environment BEFORE transformers / huggingface_hub
# are imported for the first time; assigning it afterwards does not move the
# cache. NOTE(review): if those libraries were already imported earlier in
# this kernel session, restart the kernel for the new location to apply.
os.environ["HF_HOME"] = "/home/jovyan/groupshare/lukelo_duoaa_project/.cache"

import torch
import transformers
from dotenv import load_dotenv

load_dotenv()

HF_TOKEN = os.getenv('HF_token')

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# model_id = "mistralai/Mistral-7B-Instruct-v0.3"
pipeline = transformers.pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"torch_dtype": torch.bfloat16},
    device_map="auto",
    token=HF_TOKEN
)
print("CUDA available:", torch.cuda.is_available())
print("Pipeline device:", pipeline.device)

# Minimal chat round-trip to confirm the model loads and generates.
messages = [
    {"role": "system", "content": "You are a kids chatbot who always responds in kids speak!"},
    {"role": "user", "content": "Who are you?"},
]

# Stop on either the regular EOS token or the "<|eot_id|>" token.
terminators = [
    pipeline.tokenizer.eos_token_id,
    pipeline.tokenizer.convert_tokens_to_ids("<|eot_id|>")
]

outputs = pipeline(
    messages,
    max_new_tokens=256,
    eos_token_id=terminators,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    # Same explicit padding id as call_llm_judge_pipeline uses.
    pad_token_id=pipeline.tokenizer.eos_token_id,
)
# The last entry of generated_text is the assistant's reply message.
print(outputs[0]["generated_text"][-1])
