In [1]:
with open("test_data_locale.txt", "r", encoding="utf-8") as f:
    locales = [line.strip() for line in f]

# lora2

In [2]:
import json
from sklearn.metrics import f1_score

VALID_LABELS = {"Exact", "Substitute", "Complement", "Irrelevant"}

y_true = []
y_pred = []

with open("lora2_test_output.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        item = json.loads(line)

        ref = item["label"].strip()
        model_out = item["predict"].strip()

        # 提取第一个词
        first_word = model_out.split()[0]

        # 可选：防止异常输出污染评估
        if first_word not in VALID_LABELS:
            continue   # 或者标记为 Irrelevant / skip

        y_true.append(ref)
        y_pred.append(first_word)

# 计算 micro F1
micro_f1 = f1_score(y_true, y_pred, average="micro")

print(f"Micro F1: {micro_f1:.4f}")
print(f"Evaluated samples: {len(y_true)}")

Micro F1: 0.5344
Evaluated samples: 2878


In [3]:
from collections import defaultdict

labels = set(y_true)
stats = {}

for label in labels:
    tp = sum((p == label and t == label) for p, t in zip(y_pred, y_true))
    fp = sum((p == label and t != label) for p, t in zip(y_pred, y_true))
    fn = sum((p != label and t == label) for p, t in zip(y_pred, y_true))
    tn = sum((p != label and t != label) for p, t in zip(y_pred, y_true))

    precision = tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = tp / (tp + fn) if tp + fn > 0 else 0.0
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    f1 = 2 * precision * recall / (precision + recall)

    stats[label] = {
        "precision": precision,
        "recall": recall,
        "accuracy": accuracy,
        "support": tp + fn,
        "f1": f1
    }

for k, v in stats.items():
    print(k, v)

Substitute {'precision': 0.5353805073431241, 'recall': 0.4112820512820513, 'accuracy': 0.6796386379430159, 'support': 975, 'f1': 0.4651972157772622}
Complement {'precision': 0.2289156626506024, 'recall': 0.5984251968503937, 'accuracy': 0.893328700486449, 'support': 127, 'f1': 0.3311546840958606}
Irrelevant {'precision': 0.3936899862825789, 'recall': 0.6145610278372591, 'accuracy': 0.7838776928422516, 'support': 467, 'f1': 0.479933110367893}
Exact {'precision': 0.7247191011235955, 'recall': 0.5912910618792971, 'accuracy': 0.711952744961779, 'support': 1309, 'f1': 0.6512410601598654}


In [4]:
from sklearn.metrics import confusion_matrix
import pandas as pd

labels = ["Exact", "Substitute", "Complement", "Irrelevant"]

cm = confusion_matrix(y_true, y_pred, labels=labels)
df_cm = pd.DataFrame(cm, index=labels, columns=labels)

print(df_cm)

            Exact  Substitute  Complement  Irrelevant
Exact         774         261         122         152
Substitute    231         401          81         262
Complement     18           5          76          28
Irrelevant     45          82          53         287


In [5]:
from collections import defaultdict

# stats[locale][label] = correct_count
stats = defaultdict(lambda: defaultdict(int))

for loc, t, p in zip(locales, y_true, y_pred):
    if t == p:
        stats[loc][t] += 1

stats

defaultdict(<function __main__.<lambda>()>,
            {'es': defaultdict(int,
                         {'Substitute': 80,
                          'Complement': 22,
                          'Exact': 144,
                          'Irrelevant': 62}),
             'us': defaultdict(int,
                         {'Complement': 37,
                          'Substitute': 217,
                          'Exact': 413,
                          'Irrelevant': 148}),
             'jp': defaultdict(int,
                         {'Complement': 17,
                          'Irrelevant': 77,
                          'Substitute': 104,
                          'Exact': 217})})

In [6]:
from collections import defaultdict

# stats[locale][label] = {tp, fp, fn}
stats = defaultdict(lambda: defaultdict(lambda: {"tp": 0, "fp": 0, "fn": 0}))

labels = set(y_true)

# 1. 统计 TP / FP / FN（在每个 locale 内）
for loc, t, p in zip(locales, y_true, y_pred):
    for label in labels:
        if p == label and t == label:
            stats[loc][label]["tp"] += 1
        elif p == label and t != label:
            stats[loc][label]["fp"] += 1
        elif p != label and t == label:
            stats[loc][label]["fn"] += 1

# 2. 计算 precision / recall / f1
metrics = defaultdict(dict)

for loc, label_stats in stats.items():
    for label, c in label_stats.items():
        tp, fp, fn = c["tp"], c["fp"], c["fn"]

        precision = tp / (tp + fp) if tp + fp > 0 else 0.0
        recall    = tp / (tp + fn) if tp + fn > 0 else 0.0
        f1        = (
            2 * precision * recall / (precision + recall)
            if precision + recall > 0 else 0.0
        )

        metrics[loc][label] = {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "support": tp + fn,
            "tp": tp,
            "fp": fp,
            "fn": fn,
        }


In [7]:
metrics

defaultdict(dict,
            {'us': {'Substitute': {'precision': 0.48008849557522126,
               'recall': 0.4157088122605364,
               'f1': 0.4455852156057495,
               'support': 522,
               'tp': 217,
               'fp': 235,
               'fn': 305},
              'Irrelevant': {'precision': 0.39361702127659576,
               'recall': 0.5648854961832062,
               'f1': 0.4639498432601881,
               'support': 262,
               'tp': 148,
               'fp': 228,
               'fn': 114},
              'Exact': {'precision': 0.7132987910189983,
               'recall': 0.5665294924554184,
               'f1': 0.6314984709480124,
               'support': 729,
               'tp': 413,
               'fp': 166,
               'fn': 316},
              'Complement': {'precision': 0.22289156626506024,
               'recall': 0.6166666666666667,
               'f1': 0.3274336283185841,
               'support': 60,
               'tp': 37,
 

# lora1

In [8]:
import json
from sklearn.metrics import f1_score

VALID_LABELS = {"Exact", "Substitute", "Complement", "Irrelevant"}

y_true = []
y_pred = []

with open("lora1_test_output.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        item = json.loads(line)

        ref = item["label"].strip()
        model_out = item["predict"].strip()

        # 提取第一个词
        first_word = model_out.split()[0]

        # 可选：防止异常输出污染评估
        if first_word not in VALID_LABELS:
            continue   # 或者标记为 Irrelevant / skip

        y_true.append(ref)
        y_pred.append(first_word)

# 计算 micro F1
micro_f1 = f1_score(y_true, y_pred, average="micro")

print(f"Micro F1: {micro_f1:.4f}")
print(f"Evaluated samples: {len(y_true)}")

Micro F1: 0.4868
Evaluated samples: 2880


In [9]:
y_true

['Substitute',
 'Exact',
 'Exact',
 'Exact',
 'Substitute',
 'Substitute',
 'Complement',
 'Exact',
 'Exact',
 'Complement',
 'Substitute',
 'Exact',
 'Complement',
 'Substitute',
 'Substitute',
 'Complement',
 'Exact',
 'Exact',
 'Exact',
 'Exact',
 'Exact',
 'Substitute',
 'Irrelevant',
 'Substitute',
 'Irrelevant',
 'Exact',
 'Substitute',
 'Exact',
 'Substitute',
 'Substitute',
 'Exact',
 'Irrelevant',
 'Exact',
 'Substitute',
 'Substitute',
 'Exact',
 'Exact',
 'Exact',
 'Exact',
 'Exact',
 'Exact',
 'Substitute',
 'Complement',
 'Complement',
 'Exact',
 'Substitute',
 'Exact',
 'Substitute',
 'Exact',
 'Irrelevant',
 'Exact',
 'Substitute',
 'Complement',
 'Irrelevant',
 'Substitute',
 'Substitute',
 'Exact',
 'Exact',
 'Exact',
 'Irrelevant',
 'Substitute',
 'Exact',
 'Exact',
 'Exact',
 'Exact',
 'Substitute',
 'Substitute',
 'Substitute',
 'Exact',
 'Exact',
 'Exact',
 'Substitute',
 'Substitute',
 'Substitute',
 'Exact',
 'Exact',
 'Exact',
 'Exact',
 'Exact',
 'Exact',
 'Sub

In [10]:
y_pred

['Irrelevant',
 'Complement',
 'Exact',
 'Exact',
 'Exact',
 'Irrelevant',
 'Complement',
 'Irrelevant',
 'Substitute',
 'Irrelevant',
 'Exact',
 'Exact',
 'Irrelevant',
 'Irrelevant',
 'Exact',
 'Complement',
 'Irrelevant',
 'Irrelevant',
 'Complement',
 'Exact',
 'Exact',
 'Irrelevant',
 'Irrelevant',
 'Complement',
 'Exact',
 'Exact',
 'Irrelevant',
 'Exact',
 'Complement',
 'Exact',
 'Exact',
 'Irrelevant',
 'Complement',
 'Complement',
 'Exact',
 'Exact',
 'Exact',
 'Complement',
 'Exact',
 'Exact',
 'Exact',
 'Exact',
 'Complement',
 'Irrelevant',
 'Exact',
 'Irrelevant',
 'Exact',
 'Complement',
 'Complement',
 'Irrelevant',
 'Exact',
 'Exact',
 'Complement',
 'Irrelevant',
 'Irrelevant',
 'Irrelevant',
 'Exact',
 'Exact',
 'Complement',
 'Irrelevant',
 'Substitute',
 'Exact',
 'Irrelevant',
 'Exact',
 'Irrelevant',
 'Irrelevant',
 'Irrelevant',
 'Complement',
 'Substitute',
 'Exact',
 'Exact',
 'Irrelevant',
 'Exact',
 'Substitute',
 'Complement',
 'Irrelevant',
 'Exact',
 'Com

In [11]:
from collections import defaultdict

labels = set(y_true)
stats = {}

for label in labels:
    tp = sum((p == label and t == label) for p, t in zip(y_pred, y_true))
    fp = sum((p == label and t != label) for p, t in zip(y_pred, y_true))
    fn = sum((p != label and t == label) for p, t in zip(y_pred, y_true))
    tn = sum((p != label and t != label) for p, t in zip(y_pred, y_true))

    precision = tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = tp / (tp + fn) if tp + fn > 0 else 0.0
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    f1 = 2 * precision * recall / (precision + recall)

    stats[label] = {
        "precision": precision,
        "recall": recall,
        "accuracy": accuracy,
        "support": tp + fn,
        "f1": f1
    }

for k, v in stats.items():
    print(k, v)


Substitute {'precision': 0.6877828054298643, 'recall': 0.1558974358974359, 'accuracy': 0.6902777777777778, 'support': 975, 'f1': 0.25418060200668896}
Complement {'precision': 0.17829457364341086, 'recall': 0.5433070866141733, 'accuracy': 0.8694444444444445, 'support': 127, 'f1': 0.2684824902723736}
Irrelevant {'precision': 0.33513513513513515, 'recall': 0.7965738758029979, 'accuracy': 0.7107638888888889, 'support': 467, 'f1': 0.4717818642993025}
Exact {'precision': 0.6962134251290878, 'recall': 0.6170861937452327, 'accuracy': 0.703125, 'support': 1311, 'f1': 0.6542660735948241}


In [12]:
TP = sum((p == t) for p, t in zip(y_pred, y_true))
FP = sum((p != t) for p, t in zip(y_pred, y_true))
FN = FP  # single-label multi-class 下 FP == FN

micro_precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
micro_recall    = TP / (TP + FN) if (TP + FN) > 0 else 0.0
micro_f1        = (2 * micro_precision * micro_recall /
                   (micro_precision + micro_recall)
                   if (micro_precision + micro_recall) > 0 else 0.0)

print("micro_precision:", micro_precision)
print("micro_recall   :", micro_recall)
print("micro_f1       :", micro_f1)

micro_precision: 0.48680555555555555
micro_recall   : 0.48680555555555555
micro_f1       : 0.48680555555555555


In [13]:
from sklearn.metrics import confusion_matrix
import pandas as pd

labels = ["Exact", "Substitute", "Complement", "Irrelevant"]

cm = confusion_matrix(y_true, y_pred, labels=labels)
df_cm = pd.DataFrame(cm, index=labels, columns=labels)

print(df_cm)

            Exact  Substitute  Complement  Irrelevant
Exact         809          55         165         282
Substitute    292         152         115         416
Complement     17           1          69          40
Irrelevant     44          13          38         372


In [14]:
from collections import defaultdict

# stats[locale][label] = correct_count
stats = defaultdict(lambda: defaultdict(int))

for loc, t, p in zip(locales, y_true, y_pred):
    if t == p:
        stats[loc][t] += 1

stats

defaultdict(<function __main__.<lambda>()>,
            {'jp': defaultdict(int,
                         {'Exact': 223,
                          'Irrelevant': 101,
                          'Complement': 17,
                          'Substitute': 48}),
             'us': defaultdict(int,
                         {'Exact': 457,
                          'Irrelevant': 197,
                          'Substitute': 56,
                          'Complement': 28}),
             'es': defaultdict(int,
                         {'Complement': 24,
                          'Exact': 129,
                          'Substitute': 48,
                          'Irrelevant': 74})})

In [15]:
from collections import defaultdict

# stats[locale][label] = {tp, fp, fn}
stats = defaultdict(lambda: defaultdict(lambda: {"tp": 0, "fp": 0, "fn": 0}))

labels = set(y_true)

# 1. 统计 TP / FP / FN（在每个 locale 内）
for loc, t, p in zip(locales, y_true, y_pred):
    for label in labels:
        if p == label and t == label:
            stats[loc][label]["tp"] += 1
        elif p == label and t != label:
            stats[loc][label]["fp"] += 1
        elif p != label and t == label:
            stats[loc][label]["fn"] += 1

# 2. 计算 precision / recall / f1
metrics = defaultdict(dict)

for loc, label_stats in stats.items():
    for label, c in label_stats.items():
        tp, fp, fn = c["tp"], c["fp"], c["fn"]

        precision = tp / (tp + fp) if tp + fp > 0 else 0.0
        recall    = tp / (tp + fn) if tp + fn > 0 else 0.0
        f1        = (
            2 * precision * recall / (precision + recall)
            if precision + recall > 0 else 0.0
        )

        metrics[loc][label] = {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "support": tp + fn,
            "tp": tp,
            "fp": fp,
            "fn": fn,
        }


In [16]:
metrics

defaultdict(dict,
            {'us': {'Substitute': {'precision': 0.5957446808510638,
               'recall': 0.10666666666666667,
               'f1': 0.18093699515347336,
               'support': 525,
               'tp': 56,
               'fp': 38,
               'fn': 469},
              'Irrelevant': {'precision': 0.3294314381270903,
               'recall': 0.7943548387096774,
               'f1': 0.46572104018912525,
               'support': 248,
               'tp': 197,
               'fp': 401,
               'fn': 51},
              'Exact': {'precision': 0.677037037037037,
               'recall': 0.6134228187919463,
               'f1': 0.643661971830986,
               'support': 745,
               'tp': 457,
               'fp': 218,
               'fn': 288},
              'Complement': {'precision': 0.13526570048309178,
               'recall': 0.5,
               'f1': 0.2129277566539924,
               'support': 56,
               'tp': 28,
               'fp':

# base + think

In [17]:
import json
from sklearn.metrics import f1_score

VALID_LABELS = {"Exact", "Substitute", "Complement", "Irrelevant"}

y_true_bsl = []
y_pred_bsl = []

with open("baseline_test_output.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        item = json.loads(line)

        ref = item["label"].strip()
        model_out = item["predict"]

        label = None

        # case 1: 有完整 <think>...</think>
        if "<think>" in model_out and "</think>" in model_out:
            after_think = model_out.split("</think>", 1)[1].lstrip("\n")
            label = after_think.split()[0] if after_think.split() else None

        # case 2: 没有 thinking，取最后一个合法标签
        if label not in VALID_LABELS:
            for word in reversed(model_out.split()):
                if word in VALID_LABELS:
                    label = word
                    break
        
        if label not in VALID_LABELS:
            label = 'Irrelevant'

        y_true_bsl.append(ref)
        y_pred_bsl.append(label)

# 计算 micro F1
micro_f1_bsl = f1_score(y_true_bsl, y_pred_bsl, average="micro")

print(f"Micro F1: {micro_f1_bsl:.4f}")
print(f"Evaluated samples: {len(y_true_bsl)}")

Micro F1: 0.5160
Evaluated samples: 2880


In [18]:
from collections import defaultdict

labels = set(y_true_bsl)
stats_bsl = {}

for label in labels:
    tp = sum((p == label and t == label) for p, t in zip(y_pred_bsl, y_true_bsl))
    fp = sum((p == label and t != label) for p, t in zip(y_pred_bsl, y_true_bsl))
    fn = sum((p != label and t == label) for p, t in zip(y_pred_bsl, y_true_bsl))
    tn = sum((p != label and t != label) for p, t in zip(y_pred_bsl, y_true_bsl))

    precision = tp / (tp + fp) if tp + fp > 0 else 0.0
    recall = tp / (tp + fn) if tp + fn > 0 else 0.0
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    f1 = 2 * precision * recall / (precision + recall)

    stats_bsl[label] = {
        "precision": precision,
        "recall": recall,
        "accuracy": accuracy,
        "support": tp + fn,
        "f1": f1
    }

for k, v in stats_bsl.items():
    print(k, v)

Substitute {'precision': 0.5242718446601942, 'recall': 0.27692307692307694, 'accuracy': 0.6701388888888888, 'support': 975, 'f1': 0.3624161073825503}
Complement {'precision': 0.36666666666666664, 'recall': 0.3464566929133858, 'accuracy': 0.9447916666666667, 'support': 127, 'f1': 0.35627530364372473}
Irrelevant {'precision': 0.3065134099616858, 'recall': 0.6852248394004282, 'accuracy': 0.6975694444444445, 'support': 467, 'f1': 0.4235605559232296}
Exact {'precision': 0.7094088259783514, 'recall': 0.6498855835240275, 'accuracy': 0.7194444444444444, 'support': 1311, 'f1': 0.6783439490445861}


In [19]:
from sklearn.metrics import confusion_matrix
import pandas as pd

labels = ["Exact", "Substitute", "Complement", "Irrelevant"]

cm = confusion_matrix(y_true_bsl, y_pred_bsl, labels=labels)
df_cm = pd.DataFrame(cm, index=labels, columns=labels)

print(df_cm)

            Exact  Substitute  Complement  Irrelevant
Exact         852         167          30         262
Substitute    259         270          27         419
Complement     24          16          44          43
Irrelevant     66          62          19         320


In [20]:
TP = sum((p == t) for p, t in zip(y_pred_bsl, y_true_bsl))
FP = sum((p != t) for p, t in zip(y_pred_bsl, y_true_bsl))
FN = FP  # single-label multi-class 下 FP == FN

micro_precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
micro_recall    = TP / (TP + FN) if (TP + FN) > 0 else 0.0
micro_f1        = (2 * micro_precision * micro_recall /
                   (micro_precision + micro_recall)
                   if (micro_precision + micro_recall) > 0 else 0.0)

print("micro_precision:", micro_precision)
print("micro_recall   :", micro_recall)
print("micro_f1       :", micro_f1)

micro_precision: 0.5159722222222223
micro_recall   : 0.5159722222222223
micro_f1       : 0.5159722222222223


In [21]:
from collections import defaultdict

# stats[locale][label] = correct_count
stats_bsl = defaultdict(lambda: defaultdict(int))

for loc, t, p in zip(locales, y_true_bsl, y_pred_bsl):
    if t == p:
        stats_bsl[loc][t] += 1

stats_bsl

defaultdict(<function __main__.<lambda>()>,
            {'jp': defaultdict(int,
                         {'Exact': 248,
                          'Complement': 8,
                          'Irrelevant': 88,
                          'Substitute': 54}),
             'us': defaultdict(int,
                         {'Exact': 464,
                          'Substitute': 155,
                          'Irrelevant': 164,
                          'Complement': 19}),
             'es': defaultdict(int,
                         {'Substitute': 61,
                          'Complement': 17,
                          'Exact': 140,
                          'Irrelevant': 68})})

In [22]:
from collections import defaultdict

# stats[locale][label] = {tp, fp, fn}
stats = defaultdict(lambda: defaultdict(lambda: {"tp": 0, "fp": 0, "fn": 0}))

labels = set(y_true_bsl)

# 1. 统计 TP / FP / FN（在每个 locale 内）
for loc, t, p in zip(locales, y_true_bsl, y_pred_bsl):
    for label in labels:
        if p == label and t == label:
            stats[loc][label]["tp"] += 1
        elif p == label and t != label:
            stats[loc][label]["fp"] += 1
        elif p != label and t == label:
            stats[loc][label]["fn"] += 1

# 2. 计算 precision / recall / f1
metrics = defaultdict(dict)

for loc, label_stats in stats.items():
    for label, c in label_stats.items():
        tp, fp, fn = c["tp"], c["fp"], c["fn"]

        precision = tp / (tp + fp) if tp + fp > 0 else 0.0
        recall    = tp / (tp + fn) if tp + fn > 0 else 0.0
        f1        = (
            2 * precision * recall / (precision + recall)
            if precision + recall > 0 else 0.0
        )

        metrics[loc][label] = {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "support": tp + fn,
            "tp": tp,
            "fp": fp,
            "fn": fn,
        }


In [23]:
metrics

defaultdict(dict,
            {'us': {'Substitute': {'precision': 0.4889589905362776,
               'recall': 0.29523809523809524,
               'f1': 0.3681710213776722,
               'support': 525,
               'tp': 155,
               'fp': 162,
               'fn': 370},
              'Irrelevant': {'precision': 0.3048327137546468,
               'recall': 0.6612903225806451,
               'f1': 0.41730279898218825,
               'support': 248,
               'tp': 164,
               'fp': 374,
               'fn': 84},
              'Exact': {'precision': 0.7171561051004637,
               'recall': 0.6228187919463087,
               'f1': 0.6666666666666666,
               'support': 745,
               'tp': 464,
               'fp': 183,
               'fn': 281},
              'Complement': {'precision': 0.2638888888888889,
               'recall': 0.3392857142857143,
               'f1': 0.29687499999999994,
               'support': 56,
               'tp': 19,
  