In [5]:
import os
import pandas as pd
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Constants



In [6]:
MODEL_DIR = "/home/sd3528/hetav-2/experiments/llama3-8b-qlora-prompt-minority-sampling"  # Path to fine-tuned model
TEST_CSV = "/home/sd3528/hetav-2/data/eval_30_each.csv"

In [12]:
def apply_label_mapping(_label, labels):
    """
    Apply label mapping to convert string labels to integer labels.
    """
    if _label in labels:
        return labels[int(_label)]
    else:
        raise ValueError(f"Label '{_label}' not found in the mapping.")

In [13]:

# Label categories (should match training)
labels = [    
    "Judicial Accountability and Policy Demands",            
    "Public Safety",                                         
    "Socioeconomic Privilege",                               
    "Victim Sympathy",                                       
    "Anger or Outrage",                                      
    "Irrelevant/General Comments",                           
    "Views on Similar Cases in the Past"                     
]

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_DIR, device_map="auto", torch_dtype=torch.float16)



Loading checkpoint shards: 100%|██████████| 4/4 [00:02<00:00,  1.44it/s]


In [15]:
# Load test data
df = pd.read_csv(TEST_CSV)
df.columns = df.columns.str.strip()
df["text"] = df["text"].fillna("").astype(str)
# df["Topic_Label"] = df["human_label"].apply(lambda x: apply_label_mapping(x, {label: i for i, label in enumerate(labels)}))
df["Topic_Label"] =  df['Label_Topic']

# Prompt builder
def make_prompt(text):
    return f"""
### Instruction:
Classify the comment into {', '.join(labels)}
Return the answer as the corresponding label.

### Text: {text}
### Answer:""".strip()

df["prompt"] = df["text"].apply(make_prompt)

# Inference
device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)#, device=0 if device == "cuda" else -1)

preds = []
for prompt in tqdm(df["prompt"].tolist(), desc="Generating predictions"):
    output = pipe(prompt, max_new_tokens=20, do_sample=False)[0]["generated_text"]
    answer = output.split("### Answer:")[-1].strip().split("\n")[0]
    preds.append(answer.lower())


Device set to use cuda:0
Generating predictions:   0%|          | 0/210 [00:00<?, ?it/s]

Generating predictions: 100%|██████████| 210/210 [02:45<00:00,  1.27it/s]


In [17]:
print(preds)

['judicial accountability and policy demands', 'judicial accountability and policy demands', 'judicial accountability and policy demands', 'victim sympathy', 'judicial accountability and policy demands', 'judicial accountability and policy demands', 'judicial accountability and policy demands', 'judicial accountability and policy demands :clown_face::nerd_face:', 'judicial accountability and policy demands', 'judicial', 'judicial accountability and policy demands', 'anger or outrage', 'public safety', 'judicial accountability and policy demands', 'judicial accountability and policy demands', 'judicial accountability and policy demands', 'judicial accountability and policy demands', 'judicial accountability and policy demands', 'judicial accountability and policy demands', 'judicial accountability and policy demands', 'judicial', 'judicial accountability and policy demands', 'judicial accountability and policy demands', 'judicial accountability and policy demands', 'judicial accountabil

In [16]:

# Evaluation
true_labels = df["Label_Topic"].tolist()

print("\nAccuracy:", accuracy_score(true_labels, preds))
print("\nClassification Report:\n", classification_report(true_labels, preds, zero_division=0))
print("\nConfusion Matrix:\n", confusion_matrix(true_labels, preds))


Accuracy: 0.0

Classification Report:
                                                                            precision    recall  f1-score   support

                                                                                0.00      0.00      0.00       0.0
                                                                        0       0.00      0.00      0.00       0.0
                                                                        1       0.00      0.00      0.00       0.0
1. judicial accountability and policy demands, 2. socioeconomic privilege       0.00      0.00      0.00       0.0
                         :red_heart::crying_face::crying_face::red_heart:       0.00      0.00      0.00       0.0
                                                         Anger or Outrage       0.00      0.00      0.00      21.0
                                              Irrelevant/General Comments       0.00      0.00      0.00      25.0
                               Judicial