This notebook computes classification metrics (precision, recall, F1-score and accuracy) for each of the models tried, in both zero-shot and few-shot settings.

In [1]:
import json
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

def load_json_to_df(path):
    """Load a model-output JSON file into a DataFrame ready for scoring.

    The file is flattened with ``pd.json_normalize``, so a nested
    ``llm_output["label"]`` becomes the column ``llm_output.label``.

    Parameters
    ----------
    path : str or path-like
        Location of the UTF-8 encoded JSON file.

    Returns
    -------
    pandas.DataFrame
        The flattened records with two renamed columns:
        ``human_label`` (from ``label``) and ``model_pred``
        (from ``llm_output.label``, shifted by +1). Predictions that are
        missing or not numeric are NaN.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Flatten llm_output.label
    df = pd.json_normalize(data)

    # Rename columns for convenience
    df = df.rename(columns={
        "label": "human_label",
        "llm_output.label": "model_pred"
    })

    # When no record carries a parsed label the flattened column does not exist;
    # represent that as "all predictions missing" instead of failing with KeyError.
    if "model_pred" not in df.columns:
        df["model_pred"] = float("nan")

    # The label comes from free-form LLM output, so it may not be numeric.
    # Coerce such values to NaN (they are dropped at scoring time) rather than
    # letting the arithmetic below raise TypeError.
    numeric_pred = pd.to_numeric(df["model_pred"], errors="coerce")

    # Fix indexing: model predictions are 0-indexed (0-6), human labels are 1-indexed (1-7)
    # Add 1 to model predictions to align with human labels
    # NOTE(review): the recorded few-shot output contains a class 8, i.e. a raw
    # prediction of 7 - confirm that every file really is 0-indexed.
    df["model_pred"] = numeric_pred + 1

    return df

def compute_metrics(df):
    """Print a data summary and a classification report for one model.

    Rows whose ``model_pred`` or ``human_label`` is NaN are excluded from the
    evaluation; the input frame itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``human_label`` and ``model_pred``.

    Returns
    -------
    None
        Everything is reported via ``print``. If no valid rows remain, an
        error message is printed and no report is produced.
    """
    total_count = len(df)
    nan_count = int(df["model_pred"].isna().sum())
    # Guard the percentage so an empty frame reports 0.00% instead of "nan%".
    nan_pct = nan_count / total_count * 100 if total_count else 0.0

    print("\n===== Data Summary =====")
    print(f"Total samples: {total_count}")
    print(f"NaN predictions: {nan_count} ({nan_pct:.2f}%)")

    # dropna returns a new frame, so the caller's data is never touched.
    df_clean = df
    if nan_count > 0:
        print(f"\nRemoving {nan_count} samples with NaN predictions for metrics calculation...")
        df_clean = df_clean.dropna(subset=["model_pred"])

    # Also check for NaN in true labels
    nan_true = int(df_clean["human_label"].isna().sum())
    if nan_true > 0:
        print(f"Removing {nan_true} samples with NaN true labels...")
        df_clean = df_clean.dropna(subset=["human_label"])

    if len(df_clean) == 0:
        print("\n ERROR: No valid samples remaining after removing NaN values!")
        return

    # A column that held NaN is float even after the NaN rows are dropped, which
    # makes the report list classes as 1.0, 2.0, ... next to integer true labels.
    # Cast both sides to int, as the other compute_*_metrics functions do.
    y_true = df_clean["human_label"].astype(int)
    y_pred = df_clean["model_pred"].astype(int)

    print(f"\nValid samples for evaluation: {len(df_clean)}")
    # .tolist() yields plain Python ints so the lists print as [1, 2, ...].
    print(f"Unique classes in true labels: {sorted(y_true.unique().tolist())}")
    print(f"Unique classes in predictions: {sorted(y_pred.unique().tolist())}")

    print("\n===== Classification Report =====")
    print(classification_report(y_true, y_pred, digits=4, zero_division=0.0))

In [None]:
def inspect_data(df, name="Dataset"):
    """Print a diagnostic overview of ``df``.

    Reports, in order: shape, column names, per-column NaN counts (only for
    columns that have any), dtypes, the first rows with a NaN ``model_pred``
    (if that column exists and has NaN), and the value counts of
    ``human_label`` and ``model_pred`` when those columns are present.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    name : str
        Label used in the section header.
    """
    has_truth = "human_label" in df.columns
    has_pred = "model_pred" in df.columns

    print(f"\n===== {name} Inspection =====")
    print(f"Shape: {df.shape}")
    print(f"\nColumns: {df.columns.tolist()}")

    # Per-column missing-value tally; columns without NaN are skipped.
    print("\nNaN counts:")
    missing = df.isna().sum()
    affected = missing.loc[missing > 0]
    for column in affected.index:
        share = missing[column] / len(df) * 100
        print(f"  {column}: {missing[column]} ({share:.2f}%)")

    print("\nData types:")
    print(df.dtypes)

    # Preview of the rows whose prediction could not be obtained.
    if has_pred and df["model_pred"].isna().any():
        pred_missing = df["model_pred"].isna()
        print("\nSample rows with NaN predictions:")
        print(df[pred_missing].head())

    if has_truth:
        print("\nClass distribution (human_label):")
        truth_counts = df["human_label"].value_counts()
        print(truth_counts.sort_index())

    if has_pred:
        print("\nClass distribution (model_pred):")
        # dropna=False keeps NaN as its own bucket in the tally.
        pred_counts = df["model_pred"].value_counts(dropna=False)
        print(pred_counts.sort_index())


## Model Evaluations

### Claude Haiku (Zero-shot and Few-shot)

In [None]:
# Claude Haiku prediction files for the zero-shot and few-shot runs.
claude_zero_path = "/Users/hetavpatel/Desktop/Data Science/Grad DS Work/DSCI 601 Applied Data Science/old_repos/NLPPunePorsche/LLMsTM/DSCI602/outputs-puneporshe/Zero-Shot/claude-haiku-4-5-20251001_outputs.json"
claude_few_path = "/Users/hetavpatel/Desktop/Data Science/Grad DS Work/DSCI 601 Applied Data Science/old_repos/NLPPunePorsche/LLMsTM/DSCI602/outputs-puneporshe/Few-Shot/claude-haiku-4-5-20251001_outputs.json"

# Print a header before each report, as the other model cells do; without it
# the two reports in the output cannot be told apart.
print("Claude Haiku Zero-shot")
df_haiku_zeroshot = load_json_to_df(claude_zero_path)
compute_metrics(df_haiku_zeroshot)

print("Claude Haiku Few-shot")
df_haiku_fewshot = load_json_to_df(claude_few_path)
compute_metrics(df_haiku_fewshot)


===== Data Summary =====
Total samples: 10
NaN predictions: 0 (0.00%)

Valid samples for evaluation: 10
Unique classes in true labels: [1, 3, 5, 6]
Unique classes in predictions: [1, 3, 4, 6]

===== Classification Report =====
              precision    recall  f1-score   support

           1     0.3333    1.0000    0.5000         1
           3     1.0000    1.0000    1.0000         1
           4     0.0000    0.0000    0.0000         0
           5     0.0000    0.0000    0.0000         4
           6     0.6667    0.5000    0.5714         4

    accuracy                         0.4000        10
   macro avg     0.4000    0.5000    0.4143        10
weighted avg     0.4000    0.4000    0.3786        10


===== Data Summary =====
Total samples: 10
NaN predictions: 0 (0.00%)

Valid samples for evaluation: 10
Unique classes in true labels: [1, 3, 5, 6]
Unique classes in predictions: [1, 3, 4, 6]

===== Classification Report =====
              precision    recall  f1-score   support



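**Note:** The Claude Haiku files contain only 10 samples, whereas the other models' files contain 2,707, so these metrics are not comparable with the rest. **TODO:** check with Soumyajit whether the full Claude Haiku outputs are available.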

---

### GPT-4o-mini (Zero-shot and Few-shot)

In [4]:
# Locations of the GPT-4o-mini prediction files. The few-shot path is defined
# here but only used by the few-shot evaluation cell further down.
gpt_mini_zero_path = "/Users/hetavpatel/Desktop/Data Science/Grad DS Work/DSCI 601 Applied Data Science/old_repos/NLPPunePorsche/LLMsTM/DSCI602/outputs-puneporshe/Zero-Shot/gpt-4o-mini_outputs.json"
gpt_mini_few_path = "/Users/hetavpatel/Desktop/Data Science/Grad DS Work/DSCI 601 Applied Data Science/old_repos/NLPPunePorsche/LLMsTM/DSCI602/outputs-puneporshe/Few-Shot/gpt-4o-mini_outputs.json"

# Zero-shot evaluation: print a header, load the predictions, then report metrics.
print("GPT-4o-mini Zero-shot")
df_mini_zeroshot = load_json_to_df(gpt_mini_zero_path)
compute_metrics(df_mini_zeroshot)

GPT-4o-mini Zero-shot

===== Data Summary =====
Total samples: 2707
NaN predictions: 0 (0.00%)

Valid samples for evaluation: 2707
Unique classes in true labels: [1, 2, 3, 4, 5, 6, 7]
Unique classes in predictions: [1, 2, 3, 4, 5, 6, 7]

===== Classification Report =====
              precision    recall  f1-score   support

           1     0.2533    0.4113    0.3135       231
           2     0.1828    0.8793    0.3027        58
           3     0.4435    0.9351    0.6017       231
           4     0.4607    0.6241    0.5301       141
           5     0.5484    0.2853    0.3753       715
           6     0.7910    0.4977    0.6110      1065
           7     0.5165    0.6466    0.5743       266

    accuracy                         0.5009      2707
   macro avg     0.4566    0.6113    0.4727      2707
weighted avg     0.5942    0.5009    0.5081      2707



In [None]:
# Few-shot evaluation (gpt_mini_few_path is defined in the zero-shot cell above).
print("GPT-4o-mini Few-shot")

df_mini_fewshot = load_json_to_df(gpt_mini_few_path)
# Optional diagnostic: uncomment the next line to print dtypes, NaN rows and
# class distributions before scoring, e.g. to look into a NaN prediction.
# inspect_data(df_mini_fewshot, name="GPT-4o-mini Few-shot")
compute_metrics(df_mini_fewshot)

GPT-4o-mini Few-shot

===== Data Summary =====
Total samples: 2707
NaN predictions: 1 (0.04%)

Removing 1 samples with NaN predictions for metrics calculation...

Valid samples for evaluation: 2706
Unique classes in true labels: [1, 2, 3, 4, 5, 6, 7]
Unique classes in predictions: [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]

===== Classification Report =====
              precision    recall  f1-score   support

         1.0     0.2651    0.2857    0.2750       231
         2.0     0.1672    0.9138    0.2827        58
         3.0     0.4136    0.8182    0.5494       231
         4.0     0.1370    0.0714    0.0939       140
         5.0     0.3659    0.3147    0.3383       715
         6.0     0.7333    0.3718    0.4935      1065
         7.0     0.3782    0.6128    0.4677       266
         8.0     0.0000    0.0000    0.0000         0

    accuracy                         0.4072      2706
   macro avg     0.3075    0.4236    0.3126      2706
weighted avg     0.4911    0.4072    0.4109   

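Next, let's run the same evaluation on the gpt-oss outputs to see whether it works there. These files keep the prediction under `llm_output_gpt` → `content`, so they need their own loader.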

In [22]:
def load_gpt_oss_json_to_df(json_path):
    """
    Load GPT-OSS outputs into a dataframe with one row per JSON entry.

    Columns:
    - 'human_label' = the entry's 'llm_output' value, copied as-is
      (None when the key is absent). In these files that key is treated as
      the reference label.
    - 'oss_pred' = int parsed from 'llm_output_gpt'['content'], which may be a
      JSON string such as '{"label": 6}' or an already-decoded dict with a
      'label' key. None when no integer label can be recovered.

    A summary (row count and null counts per column) is printed before the
    dataframe is returned.
    """
    import json
    import re

    import pandas as pd

    label_pattern = re.compile(r'"label"\s*:\s*(\d+)')

    def _as_int(value):
        """Convert a raw label to int, or return None if that is not possible."""
        if value is None:
            return None
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            # The label is model-generated text; one bad value must not abort the load.
            return None

    def _extract_label(content):
        """Recover the integer label from a 'content' payload, or None."""
        if isinstance(content, dict):
            # Already decoded: read the key directly instead of re-parsing.
            return _as_int(content.get("label"))
        try:
            parsed = json.loads(content)
        except (TypeError, ValueError):
            parsed = None
        if isinstance(parsed, dict):
            return _as_int(parsed.get("label"))
        # Not a JSON object (invalid JSON or extra text around it):
        # fall back to pulling the first '"label": <digits>' out of the text.
        match = label_pattern.search(str(content))
        return int(match.group(1)) if match else None

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    elements = []
    for entry in data:
        gpt_block = entry.get("llm_output_gpt", None)
        # Only a dict can carry a "content" field; anything else means no prediction.
        content = gpt_block.get("content", None) if isinstance(gpt_block, dict) else None
        elements.append({
            "human_label": entry.get("llm_output", None),
            "oss_pred": _extract_label(content) if content else None,
        })

    df = pd.DataFrame(elements)
    print(f"Loaded {len(df)} GPT-OSS elements")
    print("Nulls in human_label:", df['human_label'].isnull().sum())
    print("Nulls in oss_pred:", df['oss_pred'].isnull().sum())
    return df

def compute_oss_metrics(df):
    """
    Print accuracy and a classification report for GPT-OSS predictions.

    Rows with a missing 'human_label' or 'oss_pred' are left out; the remaining
    values of both columns are cast to int before scoring.
    """
    from sklearn.metrics import accuracy_score, classification_report

    label_columns = ['human_label', 'oss_pred']
    scored_rows = df.dropna(subset=label_columns)
    truth, guess = (scored_rows[column].astype(int) for column in label_columns)

    print(f"OSS Accuracy: {accuracy_score(truth, guess):.3f}")
    print("OSS Classification Report:")
    report_text = classification_report(truth, guess, zero_division=0.0)
    print(report_text)

# Paths to the GPT-OSS 20B output files: zero-shot, and few-shot (the "_few" file).
# Only the zero-shot file is evaluated in this cell; the few-shot path is used
# by the following cell.
gpt_oss_zero_path = "/Users/hetavpatel/Desktop/Data Science/Grad DS Work/DSCI 601 Applied Data Science/old_repos/NLPPunePorsche/LLMsTM/DSCI602/outputs-puneporshe/gpt-oss_20b_outputs.json"
gpt_oss_few_path = "/Users/hetavpatel/Desktop/Data Science/Grad DS Work/DSCI 601 Applied Data Science/old_repos/NLPPunePorsche/LLMsTM/DSCI602/outputs-puneporshe/gpt-oss_20b_outputs_few.json"

# Zero-shot evaluation: header, load (prints row and null counts), then metrics.
print("OSS Zero-shot")
df_oss_zeroshot = load_gpt_oss_json_to_df(gpt_oss_zero_path)
compute_oss_metrics(df_oss_zeroshot)


OSS Zero-shot
Loaded 2707 GPT-OSS elements
Nulls in human_label: 0
Nulls in oss_pred: 1
OSS Accuracy: 0.520
OSS Classification Report:
              precision    recall  f1-score   support

           1       0.28      0.47      0.35       231
           2       0.25      0.78      0.38        58
           3       0.50      0.86      0.63       231
           4       0.43      0.60      0.50       141
           5       0.47      0.44      0.45       715
           6       0.78      0.49      0.60      1064
           7       0.65      0.50      0.57       266

    accuracy                           0.52      2706
   macro avg       0.48      0.59      0.50      2706
weighted avg       0.59      0.52      0.53      2706



In [23]:
# Few-shot evaluation for GPT-OSS (gpt_oss_few_path is defined in the previous cell).
print("OSS Few-shot")
df_oss_fewshot = load_gpt_oss_json_to_df(gpt_oss_few_path)

# Rows without a parsed prediction are dropped inside compute_oss_metrics.
compute_oss_metrics(df_oss_fewshot)

OSS Few-shot
Loaded 2707 GPT-OSS elements
Nulls in human_label: 0
Nulls in oss_pred: 0
OSS Accuracy: 0.504
OSS Classification Report:
              precision    recall  f1-score   support

           1       0.28      0.42      0.34       231
           2       0.23      0.83      0.37        58
           3       0.49      0.89      0.63       231
           4       0.41      0.61      0.49       141
           5       0.46      0.45      0.46       715
           6       0.81      0.43      0.57      1065
           7       0.58      0.53      0.55       266

    accuracy                           0.50      2707
   macro avg       0.47      0.59      0.48      2707
weighted avg       0.59      0.50      0.51      2707



In [27]:
# ------------------- Mistral 7B output loader and metrics ------------------

def load_mistral_json_to_df(json_path):
    """
    Load Mistral-7B outputs into a dataframe with one row per JSON entry.

    Columns:
    'human_label' = the entry's 'llm_output' value, copied as-is (None when the
        key is absent); it is treated as the true label.
    'mistral_pred' = int parsed from 'llm_output_mistral', normally a string
        such as '{\n    "label": 4\n}'. None when no integer label can be
        recovered (empty output, null/non-numeric label, no match).

    A summary (row count and null counts per column) is printed before the
    dataframe is returned.
    """
    import json
    import re

    import pandas as pd

    label_pattern = re.compile(r'"label"\s*:\s*(\d+)')

    def _as_int(value):
        """Convert a raw label to int, or return None if that is not possible."""
        if value is None:
            return None
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            # Model-generated labels can be arbitrary text; skip instead of crashing.
            return None

    def _extract_label(raw):
        """Recover the integer label from one raw model output, or None."""
        if isinstance(raw, dict):
            return _as_int(raw.get("label"))
        try:
            parsed = json.loads(raw)
        except (TypeError, ValueError):
            parsed = None
        if isinstance(parsed, dict):
            return _as_int(parsed.get("label"))
        # Not a JSON object: look for the first '"label": <digits>' in the text.
        # str() keeps the search from raising on non-string payloads.
        match = label_pattern.search(str(raw))
        return int(match.group(1)) if match else None

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    elements = []
    for entry in data:
        pred_raw = entry.get("llm_output_mistral", None)
        elements.append({
            "human_label": entry.get("llm_output", None),
            "mistral_pred": _extract_label(pred_raw) if pred_raw else None,
        })

    df = pd.DataFrame(elements)
    print(f"Loaded {len(df)} Mistral elements")
    print("Nulls in human_label:", df['human_label'].isnull().sum())
    print("Nulls in mistral_pred:", df['mistral_pred'].isnull().sum())
    return df

def compute_mistral_metrics(df):
    """
    Print accuracy and a classification report for Mistral predictions.

    Only rows where both 'human_label' and 'mistral_pred' are present take part;
    both columns are cast to int before scoring.
    """
    from sklearn.metrics import accuracy_score, classification_report

    # Keep the rows in which neither the true label nor the prediction is missing.
    is_complete = df[['human_label', 'mistral_pred']].notna().all(axis=1)
    truth = df.loc[is_complete, "human_label"].astype(int)
    guess = df.loc[is_complete, "mistral_pred"].astype(int)

    accuracy = accuracy_score(truth, guess)
    print(f"Mistral Accuracy: {accuracy:.3f}")
    print("Mistral Classification Report:")
    # zero_division=0.0 is passed through for classes without predictions/support.
    print(classification_report(truth, guess, zero_division=0.0))

# Paths for Mistral 7B outputs: zero-shot, and few-shot (the "_few" file).
mistral_zero_path = "/Users/hetavpatel/Desktop/Data Science/Grad DS Work/DSCI 601 Applied Data Science/old_repos/NLPPunePorsche/LLMsTM/DSCI602/outputs-puneporshe/mistral_7b_outputs.json"
mistral_few_path = "/Users/hetavpatel/Desktop/Data Science/Grad DS Work/DSCI 601 Applied Data Science/old_repos/NLPPunePorsche/LLMsTM/DSCI602/outputs-puneporshe/mistral_7b_outputs_few.json"

# Zero-shot: header, load (prints row and null counts), then metrics.
print("Mistral Zero-shot")
df_mistral_zeroshot = load_mistral_json_to_df(mistral_zero_path)
compute_mistral_metrics(df_mistral_zeroshot)

# Few-shot: same steps on the few-shot file.
print("Mistral Few-shot")
df_mistral_fewshot = load_mistral_json_to_df(mistral_few_path)
compute_mistral_metrics(df_mistral_fewshot)


Mistral Zero-shot
Loaded 2707 Mistral elements
Nulls in human_label: 0
Nulls in mistral_pred: 2
Mistral Accuracy: 0.280
Mistral Classification Report:
              precision    recall  f1-score   support

           1       0.19      0.49      0.28       231
           2       0.11      0.93      0.20        58
           3       0.30      0.78      0.43       231
           4       0.22      0.60      0.32       141
           5       0.42      0.07      0.12       715
           6       0.61      0.14      0.23      1063
           7       0.48      0.47      0.47       266

    accuracy                           0.28      2705
   macro avg       0.33      0.50      0.29      2705
weighted avg       0.45      0.28      0.25      2705

Mistral Few-shot
Loaded 2707 Mistral elements
Nulls in human_label: 0
Nulls in mistral_pred: 1
Mistral Accuracy: 0.258
Mistral Classification Report:
              precision    recall  f1-score   support

           1       0.17      0.23      0.20    

In [17]:
def load_llama_json_to_df(json_path):
    """
    Load Llama 3.1 8B outputs into a dataframe with one row per JSON entry.

    Columns:
    'Serial_Number' = the entry's 'Serial_Number' value (None if absent).
    'human_label' = the entry's 'llm_output' value, copied as-is (None when the
        key is absent); it is treated as the true label.
    'llama_pred' = int parsed from 'llm_output_llama', normally a string such
        as '{\n    "label": 5\n}'. None when no integer label can be recovered
        (empty output, null/non-numeric label, no match).

    A summary (row count and null counts per column) is printed before the
    dataframe is returned.
    """
    import json
    import re

    import pandas as pd

    label_pattern = re.compile(r'"label"\s*:\s*(\d+)')

    def _as_int(value):
        """Convert a raw label to int, or return None if that is not possible."""
        if value is None:
            return None
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            # Model-generated labels can be arbitrary text; skip instead of crashing.
            return None

    def _extract_label(raw):
        """Recover the integer label from one raw model output, or None."""
        if isinstance(raw, dict):
            return _as_int(raw.get("label"))
        try:
            parsed = json.loads(raw)
        except (TypeError, ValueError):
            parsed = None
        if isinstance(parsed, dict):
            return _as_int(parsed.get("label"))
        # Not a JSON object: look for the first '"label": <digits>' in the text.
        # str() keeps the search from raising on non-string payloads.
        match = label_pattern.search(str(raw))
        return int(match.group(1)) if match else None

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    elements = []
    for entry in data:
        pred_raw = entry.get("llm_output_llama", None)
        elements.append({
            # Kept so that entries with unparsable predictions can be traced back.
            "Serial_Number": entry.get("Serial_Number", None),
            "human_label": entry.get("llm_output", None),
            "llama_pred": _extract_label(pred_raw) if pred_raw else None,
        })

    df = pd.DataFrame(elements)
    print(f"Loaded {len(df)} Llama elements")
    print("Nulls in human_label:", df['human_label'].isnull().sum())
    print("Nulls in llama_pred:", df['llama_pred'].isnull().sum())
    return df

def compute_llama_metrics(df):
    """
    Print accuracy and a classification report for Llama predictions.

    Rows lacking either 'human_label' or 'llama_pred' are excluded, and both
    columns are cast to int before scoring.
    """
    from sklearn.metrics import accuracy_score, classification_report

    scored = (
        df.dropna(subset=['human_label', 'llama_pred'])
        .astype({"human_label": int, "llama_pred": int})
    )
    truth = scored["human_label"]
    guess = scored["llama_pred"]

    print(f"Llama Accuracy: {accuracy_score(truth, guess):.3f}")
    print("Llama Classification Report:")
    print(classification_report(truth, guess, zero_division=0.0))


# Paths to the Llama 3.1 8B output files: few-shot (the "_few" file) and zero-shot.
llama_few_path = "/Users/hetavpatel/Desktop/Data Science/Grad DS Work/DSCI 601 Applied Data Science/old_repos/NLPPunePorsche/LLMsTM/DSCI602/outputs-puneporshe/llama3.1_8b_outputs_few.json"
llama_zero_path = "/Users/hetavpatel/Desktop/Data Science/Grad DS Work/DSCI 601 Applied Data Science/old_repos/NLPPunePorsche/LLMsTM/DSCI602/outputs-puneporshe/llama3.1_8b_outputs.json"

# Zero-shot: header, load (prints row and null counts), then metrics.
print("Llama Zero-shot")
df_llama_zeroshot = load_llama_json_to_df(llama_zero_path)
compute_llama_metrics(df_llama_zeroshot)

# Few-shot: same steps on the few-shot file.
print("Llama Few-shot")
df_llama_fewshot = load_llama_json_to_df(llama_few_path)
compute_llama_metrics(df_llama_fewshot)



Llama Zero-shot
Loaded 2707 Llama elements
Nulls in human_label: 0
Nulls in llama_pred: 1
Llama Accuracy: 0.373
Llama Classification Report:
              precision    recall  f1-score   support

           1       0.30      0.45      0.36       231
           2       0.21      0.84      0.34        58
           3       0.28      0.88      0.43       231
           4       0.20      0.69      0.31       141
           5       0.51      0.16      0.24       714
           6       0.72      0.29      0.41      1065
           7       0.49      0.51      0.50       266

    accuracy                           0.37      2706
   macro avg       0.39      0.55      0.37      2706
weighted avg       0.53      0.37      0.37      2706

Llama Few-shot
Loaded 2707 Llama elements
Nulls in human_label: 0
Nulls in llama_pred: 38
Llama Accuracy: 0.446
Llama Classification Report:
              precision    recall  f1-score   support

           1       0.31      0.48      0.37       229
           2

In [18]:


# For each Llama run (zero-shot, then few-shot), list the Serial_Number of every
# entry whose 'llama_pred' is null, or print "None" when there are no such rows.

print("\nSerial Numbers with null llama_pred in Zero-shot file:")
null_zeroshot = df_llama_zeroshot.loc[df_llama_zeroshot['llama_pred'].isna()]
print("None" if null_zeroshot.empty else null_zeroshot['Serial_Number'].tolist())

print("\nSerial Numbers with null llama_pred in Few-shot file:")
null_fewshot = df_llama_fewshot.loc[df_llama_fewshot['llama_pred'].isna()]
print("None" if null_fewshot.empty else null_fewshot['Serial_Number'].tolist())







Serial Numbers with null llama_pred in Zero-shot file:
[2864]

Serial Numbers with null llama_pred in Few-shot file:
[219, 1697, 3047, 1758, 2605, 1107, 1991, 921, 1767, 1310, 835, 2667, 2639, 918, 1642, 2864, 243, 1331, 904, 2118, 1339, 2359, 773, 2972, 395, 2066, 3042, 1604, 2487, 2405, 2057, 2236, 1955, 498, 2198, 1970, 2503, 1722]
