In [None]:
import numpy as np
import pandas as pd
import json
import os
from os.path import join
from sklearn.metrics import classification_report


def report_gen(y_pred, y_true, report_name=None, out_loc="report_llama"):
    """Build overall and per-type classification results with misclassification details.

    Args:
        y_pred: Sequence of predicted labels, aligned element-wise with ``y_true``.
        y_true: Sequence of ground-truth labels.
        report_name: Suffix used in the output file names. When ``None``
            (the default) nothing is written to disk.
        out_loc: Directory that receives the output files. It is created,
            including missing parent directories, when it does not exist.

    Returns:
        tuple: ``(overall, df_report)``.
            ``overall`` maps ``"accuracy"``, ``"macro avg"`` and
            ``"weighted avg"`` to the matching ``classification_report`` entries.
            ``df_report`` has one row per label with the columns ``type``,
            ``precision``, ``recall``, ``f1-score``, ``support``,
            ``mis_from_top5`` and ``mis_to_top5``. The two ``*_top5`` columns
            hold lists of ``(label, count)`` pairs, most frequent first.
    """
    report = classification_report(y_true, y_pred, output_dict=True, zero_division=0)

    summary_keys = ("accuracy", "macro avg", "weighted avg")
    overall = {}
    rows = []
    for key, metrics in report.items():
        if key in summary_keys:
            overall[key] = metrics
        else:
            # Copy into a fresh record rather than mutating the report entry.
            rows.append({"type": key, **metrics})

    # Build the frame in one go; growing it with pd.concat per row is quadratic
    # and makes the column dtypes depend on how pandas treats empty frames.
    df_report = pd.DataFrame(
        rows, columns=["type", "precision", "recall", "f1-score", "support"]
    )

    # Extract misclassification details.
    # classification_report keys its per-label rows by the string form of each
    # label, so the lookup table uses str(label) as well; otherwise non-string
    # labels (e.g. ints) would never be found from df_report["type"].
    confusions = {}
    for true_label, pred_label in zip(y_true, y_pred):
        true_key, pred_key = str(true_label), str(pred_label)
        for label_key in (true_key, pred_key):
            if label_key not in confusions:
                confusions[label_key] = {"mis_to": {}, "mis_from": {}}

        if true_label != pred_label:
            mis_to = confusions[true_key]["mis_to"]
            mis_to[pred_key] = mis_to.get(pred_key, 0) + 1
            mis_from = confusions[pred_key]["mis_from"]
            mis_from[true_key] = mis_from.get(true_key, 0) + 1

    def first_five(counts):
        # Five most frequent (label, count) pairs, highest count first.
        return sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:5]

    # Labels wrongly predicted AS this type (these errors hurt precision).
    df_report["mis_from_top5"] = df_report["type"].map(
        lambda label: first_five(confusions[label]["mis_from"])
    )
    # Labels this type was wrongly predicted TO BE (these errors hurt recall).
    df_report["mis_to_top5"] = df_report["type"].map(
        lambda label: first_five(confusions[label]["mis_to"])
    )

    # Save results
    if report_name is not None:
        # makedirs handles nested paths and an already-existing directory.
        os.makedirs(out_loc, exist_ok=True)

        df_report.sort_values(["f1-score"], ascending=False).to_csv(
            join(out_loc, "results_per_type_{}.csv".format(report_name))
        )

        with open(join(out_loc, "overall_{}.json".format(report_name)), "w") as outfile:
            json.dump(overall, outfile)

    return overall, df_report

In [None]:
# Data processing and result evaluation
import json
import numpy as np
import re


def canonical_header(h, max_header_len=30):
    """Convert any header to its canonical lowerCamelCase form.

    Args:
        h: Header value; it is converted with ``str()`` first.
        max_header_len: Headers whose raw string form is longer than this
            are rejected. The check runs before parenthesised content is
            stripped.

    Returns:
        str: The words of the header joined in lowerCamelCase, or ``"-"``
        when the header is too long or contains no word characters.
    """
    text = str(h)
    if len(text) > max_header_len:
        return "-"
    text = re.sub(r"\([^)]*\)", "", text)  # trim content in parentheses
    text = re.sub(r"([A-Z][a-z])", r" \1", text)  # space before each Capitalised word
    # Raw string: "\W" in a plain literal is an invalid escape sequence.
    # \W does not match "_", so snake_case pieces stay glued together.
    words = [word.lower() for word in re.split(r"\W", text) if word]
    if not words:
        return "-"
    return words[0] + "".join(word.capitalize() for word in words[1:])


def extract_predictions_from_jsonl(file_path):
    """Extract predictions and ground truth from a JSONL file.

    Each non-blank line must be a JSON object with a ``"sample"`` string (the
    raw model transcript) and a ``"gold"`` JSON string. The prediction is the
    text following the first ``"assistant\\n\\n"`` marker in ``sample``. Both
    the prediction and the gold value are expected to be JSON objects with a
    ``"colnames"`` list. All column names are passed through
    ``canonical_header``.

    A prediction that is not valid JSON, is not a JSON object, has a
    non-list ``"colnames"``, or has a different number of columns than the
    gold entry is replaced by ``"???"`` for every gold column, so it counts
    as wrong.

    Args:
        file_path: Path to the JSONL file (read as UTF-8).

    Returns:
        tuple: ``(predictions, ground_truth)`` as two flat, element-wise
        aligned ``numpy`` arrays of column names.
    """
    predictions = []
    ground_truth = []

    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Blank lines (e.g. a trailing newline) carry no entry.
                continue
            try:
                entry = json.loads(line)

                # Extract the model output after "assistant\n\n"
                sample = entry["sample"]
                matches = re.findall(
                    r"assistant\n\n(.*?)(?=assistant|$)", sample, re.DOTALL
                )
                if not matches:
                    # NOTE(review): entries without an assistant marker are
                    # dropped from both arrays, so they are not scored at all.
                    # Confirm this is intended rather than scoring as "???".
                    continue

                # Extract and canonicalize ground truth (parsed once).
                gold_json = json.loads(entry["gold"])
                gold_cols = [
                    canonical_header(col) for col in gold_json.get("colnames", [])
                ]

                # Take the first prediction if multiple exist
                pred_text = matches[0].strip()
                try:
                    pred_json = json.loads(pred_text)
                except json.JSONDecodeError:
                    pred_json = None

                # Model output is untrusted: it can be valid JSON without being
                # an object, or carry a non-list "colnames".
                pred_cols = None
                if isinstance(pred_json, dict):
                    raw_cols = pred_json.get("colnames", [])
                    if isinstance(raw_cols, list):
                        pred_cols = [canonical_header(col) for col in raw_cols]

                # Handle unusable predictions and mismatched column counts
                if pred_cols is None or len(pred_cols) != len(gold_cols):
                    pred_cols = ["???"] * len(gold_cols)

                predictions.extend(pred_cols)
                ground_truth.extend(gold_cols)

            except (json.JSONDecodeError, KeyError, TypeError, AttributeError) as e:
                # One malformed line must not abort the whole file.
                print(f"Error processing line: {e}")
                continue

    return np.array(predictions), np.array(ground_truth)


if __name__ == "__main__":
    # Driver: parse the generated samples, score them, and write the results
    # to the current working directory.
    #
    # Process JSONL file and get canonicalized predictions and truth as two
    # flat, element-wise aligned arrays.
    # NOTE(review): the absolute input path is hard-coded; consider making it
    # configurable if this runs outside the environment it was written for.
    preds, truths = extract_predictions_from_jsonl("/content/generated_samples.jsonl")

    # Calculate metrics. report_gen takes (y_pred, y_true) in that order;
    # report_name is left at its default of None, so report_gen itself writes
    # no files and the results are saved explicitly below.
    overall, report = report_gen(preds, truths)

    # Save results: summary metrics as JSON, per-type table as CSV without
    # the index column.
    with open("overall.json", "w") as f:
        json.dump(overall, f)
    report.to_csv("report.csv", index=False)