In [2]:
import json
from sklearn.model_selection import train_test_split

In [3]:
import json
import random
from itertools import permutations

def convert_label_studio_to_re_format(
    input_file, 
    output_file, 
    negative_ratio=0.5 
):
    """
    Chuyển đổi JSON export từ Label Studio sang định dạng train cho RBERT.
    Đã sửa để hỗ trợ cả Single Object JSON và List JSON.
    """
    
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            ls_data = json.load(f)
    except FileNotFoundError:
        print(f"Lỗi: Không tìm thấy file {input_file}")
        return []

    # --- SỬA LỖI QUAN TRỌNG: Chuẩn hóa dữ liệu về dạng List ---
    # Nếu file json chỉ chứa 1 object (bắt đầu bằng { "id":... }), ta gói nó vào list
    if isinstance(ls_data, dict):
        ls_data = [ls_data]
    
    final_data = []
    stats = {"positive": 0, "negative": 0, "relations": {}}

    for task in ls_data:
        # 1. Lấy văn bản gốc
        # Kiểm tra kỹ key chứa text (thường là 'text' hoặc 'sentence' tùy config Label Studio)
        data_block = task.get('data', {})
        text = data_block.get('text') or data_block.get('sentence')
        
        if not text:
            print(f"Bỏ qua Task ID {task.get('id', 'Unknown')}: Không tìm thấy text.")
            continue

        # 2. Parse Entities (Nodes)
        entities = {}
        annotations = task.get('annotations', [])
        if not annotations:
            continue
            
        # Lấy annotation đầu tiên (thường là bản mới nhất hoặc ground_truth)
        result = annotations[0].get('result', [])
        
        # Bước 2a: Map ID -> Entity Info
        for item in result:
            if item['type'] == 'labels':
                entity_id = item['id']
                value = item['value']
                
                # Lấy nhãn đầu tiên trong mảng labels
                label_type = value['labels'][0] if value.get('labels') else "Unknown"

                entities[entity_id] = {
                    'text': value['text'],
                    'start': value['start'],
                    'end': value['end'],
                    'type': label_type  # Quan trọng cho Type-Aware Model
                }

        # Bước 2b: Lấy Relations (Edges)
        existing_relations = set() 
        
        for item in result:
            if item['type'] == 'relation':
                from_id = item['from_id']
                to_id = item['to_id']
                relation_label = item['labels'][0] if item.get('labels') else "NO_RELATION"
                
                if from_id in entities and to_id in entities:
                    subj = entities[from_id]
                    obj = entities[to_id]
                    
                    sample = {
                        "text": text,
                        "relation": relation_label,
                        "subj": subj,
                        "obj": obj
                    }
                    final_data.append(sample)
                    existing_relations.add((from_id, to_id))
                    
                    stats["positive"] += 1
                    stats["relations"][relation_label] = stats["relations"].get(relation_label, 0) + 1

        # Bước 3: Negative Sampling (Tạo mẫu No_Relation)
        entity_ids = list(entities.keys())
        all_pairs = list(permutations(entity_ids, 2)) 
        
        potential_negatives = []
        for e1_id, e2_id in all_pairs:
            if (e1_id, e2_id) not in existing_relations:
                potential_negatives.append((e1_id, e2_id))
        
        # Logic lấy số lượng negative sample
        num_neg_to_take = int(len(existing_relations) * negative_ratio)
        
        # Nếu có entity nhưng chưa có relation nào, vẫn lấy 1 mẫu negative để model học
        if len(existing_relations) == 0 and len(potential_negatives) > 0 and negative_ratio > 0:
             num_neg_to_take = 1
        # Nếu tính ra 0 nhưng user muốn lấy (negative_ratio > 0) và có relations, tối thiểu lấy 1
        elif len(existing_relations) > 0 and num_neg_to_take == 0 and negative_ratio > 0:
             num_neg_to_take = 1

        random.shuffle(potential_negatives)
        selected_negatives = potential_negatives[:num_neg_to_take]
        
        for e1_id, e2_id in selected_negatives:
            subj = entities[e1_id]
            obj = entities[e2_id]
            
            sample = {
                "text": text,
                "relation": "NO_RELATION",
                "subj": subj,
                "obj": obj
            }
            final_data.append(sample)
            stats["negative"] += 1

    # Lưu file
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(final_data, f, ensure_ascii=False, indent=2)

    print("=== Conversion Completed ===")
    print(f"Total samples generated: {len(final_data)}")
    print(f"Positive samples: {stats['positive']}")
    print(f"Negative samples: {stats['negative']}")
    print("Relation Types Breakdown:", stats["relations"])
    return final_data

# --- CHẠY VỚI DỮ LIỆU CỦA BẠN ---
# Lưu đoạn JSON bạn đưa vào file tên 'dataset2.json' trước khi chạy
convert_label_studio_to_re_format("full.json", "full_data.json", negative_ratio=1.0)

=== Conversion Completed ===
Total samples generated: 4721
Positive samples: 2337
Negative samples: 2384
Relation Types Breakdown: {'CAUSES': 537, 'TREATED_WITH': 459, 'HAS_MANIFESTATION': 1120, 'REVEALS': 200, 'NO_RELATION': 21}


[{'text': 'tuy nhiên , adalimumab , infliximab mang một số nguy cơ nhiễm trùng , bao gồm cả bệnh lao và nhiễm nấm nghiêm trọng .',
  'relation': 'CAUSES',
  'subj': {'text': 'adalimumab', 'start': 12, 'end': 22, 'type': 'TREATMENT'},
  'obj': {'text': 'bệnh lao', 'start': 81, 'end': 89, 'type': 'DISEASE'}},
 {'text': 'tuy nhiên , adalimumab , infliximab mang một số nguy cơ nhiễm trùng , bao gồm cả bệnh lao và nhiễm nấm nghiêm trọng .',
  'relation': 'CAUSES',
  'subj': {'text': 'adalimumab', 'start': 12, 'end': 22, 'type': 'TREATMENT'},
  'obj': {'text': 'nhiễm nấm', 'start': 93, 'end': 102, 'type': 'DISEASE'}},
 {'text': 'tuy nhiên , adalimumab , infliximab mang một số nguy cơ nhiễm trùng , bao gồm cả bệnh lao và nhiễm nấm nghiêm trọng .',
  'relation': 'CAUSES',
  'subj': {'text': 'infliximab', 'start': 25, 'end': 35, 'type': 'TREATMENT'},
  'obj': {'text': 'bệnh lao', 'start': 81, 'end': 89, 'type': 'DISEASE'}},
 {'text': 'tuy nhiên , adalimumab , infliximab mang một số nguy cơ nhiễ

In [10]:
with open('train_data_fixed.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

labels = [item['relation'] for item in data]

# 2. Split train/temp (80/20)
train_data, temp_data = train_test_split(
    data, 
    test_size=0.2, 
    random_state=42, 
    stratify=labels
)

# 3. Split dev/test (50/50)
temp_labels = [item['relation'] for item in temp_data]
dev_data, test_data = train_test_split(
    temp_data, 
    test_size=0.5, 
    random_state=42, 
    stratify=temp_labels
)

# 4. Save
def save_json(data, filename):
    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

save_json(train_data, 'train.json')
save_json(dev_data, 'dev.json')
save_json(test_data, 'test.json')

print(f"Train: {len(train_data)}, Dev: {len(dev_data)}, Test: {len(test_data)}")

Train: 948, Dev: 118, Test: 119


In [6]:
import json
from pathlib import Path
from itertools import permutations
from collections import Counter, defaultdict

# ====== 4 RELATION SCHEMA DUY NHẤT ======
# (subj_type, obj_type) -> relation_name
REL_SCHEMA = {
    ("DISEASE", "SYMPTOM"): "HAS_MANIFESTATION",
    ("CAUSE", "DISEASE"): "CAUSES",
    ("DISEASE", "TREATMENT"): "TREATED_WITH",
    ("DIAGNOSTIC", "DISEASE"): "REVEALS",
}

POS_REL_SET = set(REL_SCHEMA.values())
NO_REL = "No_Relation"


def load_ls(path: str):
    data = json.loads(Path(path).read_text(encoding="utf-8"))
    if isinstance(data, dict):
        data = [data]
    if not isinstance(data, list):
        raise ValueError("Top-level JSON phải là list tasks hoặc 1 task dict.")
    return data


def parse_task(task: dict):
    """Trả về text, entities_map, positive_pairs_set"""
    data_block = task.get("data", {}) or {}
    text = data_block.get("text") or data_block.get("sentence")
    if not text:
        return None, {}, set(), []

    anns = task.get("annotations") or []
    if not anns:
        return text, {}, set(), []

    result = (anns[0] or {}).get("result") or []

    # entity_id -> {text,start,end,type}
    entities = {}
    for item in result:
        if item.get("type") == "labels":
            eid = item.get("id")
            val = item.get("value") or {}
            labels = val.get("labels") or []
            if not eid or not labels:
                continue
            entities[eid] = {
                "text": val.get("text"),
                "start": val.get("start"),
                "end": val.get("end"),
                "type": labels[0],
            }

    # Positive relation pairs (from_id,to_id) chỉ tính khi label thuộc 4 relation schema
    positive_pairs = set()
    positive_items = []  # để xuất positive sample đúng label gốc nếu muốn

    for item in result:
        if item.get("type") != "relation":
            continue
        from_id = item.get("from_id")
        to_id = item.get("to_id")
        labels = item.get("labels") or []
        rel = labels[0] if labels else None

        if from_id in entities and to_id in entities and rel in POS_REL_SET:
            positive_pairs.add((from_id, to_id))
            positive_items.append((from_id, to_id, rel))

    return text, entities, positive_pairs, positive_items


def generate_dataset_full_no_relation(
    input_full="full.json",
    output_dataset="full_data.json",
    include_positive=True,
    include_valid_but_unlabeled_as_no_relation=False,
):
    """
    - No_Relation được generate cho mọi pair:
        + type1 == type2 OR (type1,type2) không nằm trong REL_SCHEMA
      và pair đó không nằm trong positive_pairs (đã gán relation thật).
    - Nếu include_valid_but_unlabeled_as_no_relation=True:
        + thêm cả những pair-type HỢP LỆ (nằm trong schema) nhưng KHÔNG được annotate
          -> cũng gán No_Relation (dùng khi bạn muốn model học "không phải lúc nào cặp hợp lệ cũng có quan hệ").
    """
    tasks = load_ls(input_full)
    final_data = []

    stats = {
        "positive": 0,
        "no_relation": 0,
        "no_relation_by_type_pair": Counter(),
        "positive_by_rel": Counter(),
        "skipped_tasks_no_text": 0,
        "skipped_tasks_no_entities": 0,
    }

    for task in tasks:
        text, entities, positive_pairs, positive_items = parse_task(task)

        if text is None:
            stats["skipped_tasks_no_text"] += 1
            continue
        if not entities:
            stats["skipped_tasks_no_entities"] += 1
            continue

        # 1) (Tuỳ chọn) add positive samples (giữ nguyên như LS đã gán)
        if include_positive:
            for from_id, to_id, rel in positive_items:
                final_data.append({
                    "text": text,
                    "relation": rel,
                    "subj": entities[from_id],
                    "obj": entities[to_id],
                })
                stats["positive"] += 1
                stats["positive_by_rel"][rel] += 1

        # 2) Generate No_Relation theo rule schema + same-type
        entity_ids = list(entities.keys())
        for e1_id, e2_id in permutations(entity_ids, 2):
            if (e1_id, e2_id) in positive_pairs:
                continue  # đã là positive thì không generate No_Relation

            t1 = entities[e1_id]["type"]
            t2 = entities[e2_id]["type"]

            same_type = (t1 == t2)
            in_schema = ((t1, t2) in REL_SCHEMA)

            # Rule bạn yêu cầu:
            # - nếu same-type OR NOT in_schema => No_Relation
            # - nếu in_schema nhưng chưa annotate:
            #     + mặc định: KHÔNG gán (để dataset không phình vì cặp hợp lệ chưa chắc là No_Relation)
            #     + nếu flag True: gán No_Relation luôn
            make_no_rel = same_type or (not in_schema) or (include_valid_but_unlabeled_as_no_relation and in_schema)

            if not make_no_rel:
                continue

            final_data.append({
                "text": text,
                "relation": NO_REL,
                "subj": entities[e1_id],
                "obj": entities[e2_id],
            })
            stats["no_relation"] += 1
            stats["no_relation_by_type_pair"][f"{t1}-{t2}"] += 1

    Path(output_dataset).write_text(json.dumps(final_data, ensure_ascii=False, indent=2), encoding="utf-8")

    print("=== DONE ===")
    print(f"Output: {output_dataset}")
    print(f"Total samples: {len(final_data)}")
    print(f"Positive: {stats['positive']}")
    print(f"No_Relation: {stats['no_relation']}")
    print("Positive breakdown:", dict(stats["positive_by_rel"]))
    print("Top No_Relation type-pairs:", stats["no_relation_by_type_pair"].most_common(15))
    print("Skipped (no text):", stats["skipped_tasks_no_text"])
    print("Skipped (no entities):", stats["skipped_tasks_no_entities"])

    return final_data, stats


if __name__ == "__main__":
    generate_dataset_full_no_relation(
        input_full="full.json",
        output_dataset="full_data.json",
        include_positive=True,
        include_valid_but_unlabeled_as_no_relation=False,  # đổi True nếu bạn muốn cả cặp hợp lệ nhưng không gán cũng là No_Relation
    )


=== DONE ===
Output: full_data.json
Total samples: 27777
Positive: 2316
No_Relation: 25461
Positive breakdown: {'CAUSES': 537, 'TREATED_WITH': 459, 'HAS_MANIFESTATION': 1120, 'REVEALS': 200}
Top No_Relation type-pairs: [('SYMPTOM-SYMPTOM', 7082), ('DISEASE-DISEASE', 4838), ('SYMPTOM-DISEASE', 2908), ('TREATMENT-TREATMENT', 2602), ('CAUSE-CAUSE', 1736), ('TREATMENT-DISEASE', 1286), ('DISEASE-CAUSE', 1260), ('DIAGNOSTIC-DIAGNOSTIC', 934), ('DISEASE-DIAGNOSTIC', 697), ('SYMPTOM-TREATMENT', 386), ('TREATMENT-SYMPTOM', 386), ('SYMPTOM-CAUSE', 308), ('CAUSE-SYMPTOM', 307), ('SYMPTOM-DIAGNOSTIC', 139), ('DIAGNOSTIC-SYMPTOM', 136)]
Skipped (no text): 0
Skipped (no entities): 0


In [7]:
import json
import random
from collections import defaultdict, Counter
from pathlib import Path

NO_REL = "No_Relation"

def is_same_type(sample):
    return sample["subj"]["type"] == sample["obj"]["type"]

def type_pair(sample):
    return f'{sample["subj"]["type"]}-{sample["obj"]["type"]}'

def stratified_sample(groups, total_needed, seed=42):
    """
    groups: dict[key] -> list[samples]
    Lấy gần đều theo group key.
    """
    rng = random.Random(seed)
    keys = list(groups.keys())
    for k in keys:
        rng.shuffle(groups[k])

    # Nếu total_needed <= 0
    if total_needed <= 0:
        return []

    # Round-robin lấy đều
    picked = []
    idx = 0
    while len(picked) < total_needed and keys:
        k = keys[idx % len(keys)]
        if groups[k]:
            picked.append(groups[k].pop())
        else:
            keys.remove(k)
            # không tăng idx để tránh skip
            continue
        idx += 1

    return picked

def balance_dataset(
    input_file="full_data.json",
    output_file="full_data_balanced.json",
    neg_pos_ratio=2.0,          # NoRel = 2 * Pos
    same_type_fraction=0.3,     # 30% NoRel từ same-type bucket
    seed=42
):
    data = json.loads(Path(input_file).read_text(encoding="utf-8"))

    pos = [s for s in data if s.get("relation") != NO_REL]
    neg = [s for s in data if s.get("relation") == NO_REL]

    pos_n = len(pos)
    target_neg = int(pos_n * neg_pos_ratio)

    # Chia neg thành 2 bucket
    neg_same = [s for s in neg if is_same_type(s)]
    neg_cross = [s for s in neg if not is_same_type(s)]

    target_same = int(target_neg * same_type_fraction)
    target_cross = target_neg - target_same

    # group theo type-pair để stratified
    g_same = defaultdict(list)
    for s in neg_same:
        g_same[type_pair(s)].append(s)

    g_cross = defaultdict(list)
    for s in neg_cross:
        g_cross[type_pair(s)].append(s)

    picked_same = stratified_sample(g_same, min(target_same, len(neg_same)), seed=seed)
    picked_cross = stratified_sample(g_cross, min(target_cross, len(neg_cross)), seed=seed+1)

    # Nếu thiếu (pool không đủ), bù từ bucket còn lại
    picked = picked_same + picked_cross
    missing = target_neg - len(picked)

    if missing > 0:
        # bù từ những gì còn lại trong cross trước, rồi same
        rest_cross = []
        for lst in g_cross.values():
            rest_cross.extend(lst)
        rest_same = []
        for lst in g_same.values():
            rest_same.extend(lst)

        rng = random.Random(seed+2)
        rng.shuffle(rest_cross)
        rng.shuffle(rest_same)

        take = min(missing, len(rest_cross))
        picked += rest_cross[:take]
        missing -= take

        if missing > 0:
            take2 = min(missing, len(rest_same))
            picked += rest_same[:take2]
            missing -= take2

    balanced = pos + picked
    random.Random(seed).shuffle(balanced)

    # Stats
    c = Counter(s["relation"] for s in balanced)
    neg_pairs = Counter(type_pair(s) for s in balanced if s["relation"] == NO_REL)
    same_count = sum(1 for s in balanced if s["relation"] == NO_REL and is_same_type(s))

    Path(output_file).write_text(json.dumps(balanced, ensure_ascii=False, indent=2), encoding="utf-8")

    print("=== BALANCED DONE ===")
    print(f"Input: {input_file} | Output: {output_file}")
    print(f"Pos: {len(pos)}")
    print(f"Neg kept: {len(picked)} (target {target_neg})")
    print(f"  - same-type neg kept: {same_count} (~{same_count/max(1,len(picked)):.1%})")
    print("Relation breakdown:", dict(c))
    print("Top NoRel type-pairs:", neg_pairs.most_common(15))

if __name__ == "__main__":
    balance_dataset(
        input_file="full_data.json",
        output_file="full_data_balanced.json",
        neg_pos_ratio=2.0,       # thử 1.0 / 2.0 / 3.0
        same_type_fraction=0.3,  # thử 0.2 - 0.4
        seed=42
    )


=== BALANCED DONE ===
Input: full_data.json | Output: full_data_balanced.json
Pos: 2316
Neg kept: 4632 (target 4632)
  - same-type neg kept: 1389 (~30.0%)
Relation breakdown: {'No_Relation': 4632, 'HAS_MANIFESTATION': 1120, 'TREATED_WITH': 459, 'CAUSES': 537, 'REVEALS': 200}
Top NoRel type-pairs: [('TREATMENT-DISEASE', 317), ('DISEASE-CAUSE', 317), ('DISEASE-DIAGNOSTIC', 316), ('TREATMENT-SYMPTOM', 316), ('SYMPTOM-TREATMENT', 316), ('SYMPTOM-DISEASE', 315), ('SYMPTOM-CAUSE', 308), ('CAUSE-SYMPTOM', 307), ('TREATMENT-TREATMENT', 278), ('SYMPTOM-SYMPTOM', 278), ('CAUSE-CAUSE', 278), ('DISEASE-DISEASE', 278), ('DIAGNOSTIC-DIAGNOSTIC', 277), ('SYMPTOM-DIAGNOSTIC', 139), ('DIAGNOSTIC-SYMPTOM', 136)]


In [9]:
import json
from collections import Counter
from pathlib import Path

INPUT_FILE = "full_data_balanced.json"
NO_REL = "No_Relation"

data = json.loads(Path(INPUT_FILE).read_text(encoding="utf-8"))

total = len(data)
rel_counter = Counter(s["relation"] for s in data)

print("=== FULL RELATION COUNT ===")
print("Total samples:", total)
print("\nRelation breakdown:")
for rel, cnt in rel_counter.most_common():
    ratio = cnt / total * 100
    print(f"{rel:20s} {cnt:6d} ({ratio:5.1f}%)")

print("\nPositive total:", total - rel_counter[NO_REL])
print("No_Relation total:", rel_counter[NO_REL])


=== FULL RELATION COUNT ===
Total samples: 6948

Relation breakdown:
No_Relation            4632 ( 66.7%)
HAS_MANIFESTATION      1120 ( 16.1%)
CAUSES                  537 (  7.7%)
TREATED_WITH            459 (  6.6%)
REVEALS                 200 (  2.9%)

Positive total: 2316
No_Relation total: 4632


In [10]:
import json
import random
from collections import defaultdict, Counter
from pathlib import Path

INPUT_FILE = "full_data_balanced.json"
OUT_TRAIN = "train_model.json"
OUT_DEV = "dev_model.json"
OUT_TEST = "test_model.json"

TRAIN_RATIO = 0.8
DEV_RATIO = 0.1
TEST_RATIO = 0.1

SEED = 42


def stratified_split(data, label_key="relation",
                     train_ratio=0.8, dev_ratio=0.1, seed=42):
    """
    Chia stratified theo label (relation).
    """
    rng = random.Random(seed)

    by_label = defaultdict(list)
    for item in data:
        by_label[item[label_key]].append(item)

    train, dev, test = [], [], []

    for label, items in by_label.items():
        rng.shuffle(items)
        n = len(items)

        n_train = int(n * train_ratio)
        n_dev = int(n * dev_ratio)
        n_test = n - n_train - n_dev  # phần còn lại

        train.extend(items[:n_train])
        dev.extend(items[n_train:n_train + n_dev])
        test.extend(items[n_train + n_dev:])

    rng.shuffle(train)
    rng.shuffle(dev)
    rng.shuffle(test)

    return train, dev, test


def save_json(path, data):
    Path(path).write_text(
        json.dumps(data, ensure_ascii=False, indent=2),
        encoding="utf-8"
    )


def print_stats(name, data):
    c = Counter(x["relation"] for x in data)
    print(f"\n{name}")
    print(f"  total: {len(data)}")
    for k, v in c.most_common():
        print(f"  {k:20s} {v}")


def main():
    data = json.loads(Path(INPUT_FILE).read_text(encoding="utf-8"))

    train, dev, test = stratified_split(
        data,
        train_ratio=TRAIN_RATIO,
        dev_ratio=DEV_RATIO,
        seed=SEED
    )

    save_json(OUT_TRAIN, train)
    save_json(OUT_DEV, dev)
    save_json(OUT_TEST, test)

    print("=== SPLIT DONE ===")
    print_stats("TRAIN", train)
    print_stats("DEV", dev)
    print_stats("TEST", test)


if __name__ == "__main__":
    main()


=== SPLIT DONE ===

TRAIN
  total: 5557
  No_Relation          3705
  HAS_MANIFESTATION    896
  CAUSES               429
  TREATED_WITH         367
  REVEALS              160

DEV
  total: 693
  No_Relation          463
  HAS_MANIFESTATION    112
  CAUSES               53
  TREATED_WITH         45
  REVEALS              20

TEST
  total: 698
  No_Relation          464
  HAS_MANIFESTATION    112
  CAUSES               55
  TREATED_WITH         47
  REVEALS              20
