## 路徑設定及安裝

In [None]:
import os
from pathlib import Path
from groq import Groq
from pathlib import Path
import whisperx
import torch
import json
from tqdm import tqdm  # ← 這樣匯入的是函數，而非整個模組
import difflib
import re

# Root directory; every other path below is derived from it.
base_path = Path(r"your_path")

# Dataset file stored directly under the root directory.
Validation_Dataset_Formal_entity = base_path.joinpath("Validation_Dataset_Formal_entity.json")

# Task 1 files: timestamp JSON files under the root, answer files under "submission".
task1_answer_timestamps = base_path.joinpath("task1_answer_timestamps.json")
task1_answer_timestamps_ZH = base_path.joinpath("task1_answer_timestamps_ZH.json")
submission_task1_answer = base_path.joinpath("submission", "task1_answer.txt")
submission_task1_answer_S = base_path.joinpath("submission", "task1_answer_S.txt")
submission_task1_answer_L = base_path.joinpath("submission", "task1_answer_L.txt")

# Task 2 answer files (S part, L part, merged file, and the LLM2 outputs).
submission_task2_answer_S = base_path.joinpath("submission", "task2_answer_S.txt")
submission_task2_answer_L = base_path.joinpath("submission", "task2_answer_L.txt")
submission_task2_answer = base_path.joinpath("submission", "task2_answer.txt")
submission_task2_answer_LLM2 = base_path.joinpath("submission", "task2_answer_LLM2.txt")
submission_task2_answer_LLM2_reasoning = base_path.joinpath("submission", "task2_answer_LLM2_reasoning.txt")

# Every file below sits next to task2_answer.txt and differs only in its name.

# Invalid entries cleaned
submission_task2_answer_clean_invalid = submission_task2_answer.with_name("task2_answer_clean_invalid.txt")

# Converted to the correct category
submission_task2_answer_corrected = submission_task2_answer.with_name("task2_answer_corrected.txt")

# Rule 1
submission_task2_answer_rule1 = submission_task2_answer.with_name("task2_answer_rule1.txt")

# Rule 2
submission_task2_answer_rule2 = submission_task2_answer.with_name("task2_answer_rule2.txt")

# Fully cleaned
submission_task2_answer_cleaned = submission_task2_answer.with_name("task2_answer_cleaned.txt")

# 5x annotation volume
submission_task2_answer_duplicated = submission_task2_answer.with_name("task2_answer_duplicated.txt")

# Timestamps aligned
submission_task2_answer_alignment = submission_task2_answer.with_name("task2_answer_alignment.txt")

# Sorted
submission_task2_answer_sort = submission_task2_answer.with_name("task2_answer_sort.txt")

# Final output
submission_task2_answer_finally = submission_task2_answer.with_name("task2_answer_finally.txt")


# Use the GPU when torch reports that CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"device: {device}")

# Category configuration: the PHI category labels, concatenated group by group
# in the same order as the original flat list.
train_phi_category = (
    ['PATIENT', 'DOCTOR', 'USERNAME', 'FAMILYNAME', "PERSONALNAME", 'PROFESSION']
    + ['ROOM', 'DEPARTMENT', 'HOSPITAL', 'ORGANIZATION', 'STREET', 'CITY']
    + ['DISTRICT', 'COUNTY', 'STATE', 'COUNTRY', 'ZIP', 'LOCATION-OTHER']
    + ['AGE']
    + ['DATE', 'TIME', 'DURATION', 'SET']
    + ['PHONE', 'FAX', 'EMAIL', 'URL', 'IPADDRESS']
    + ['SOCIAL_SECURITY_NUMBER', 'MEDICAL_RECORD_NUMBER', 'HEALTH_PLAN_NUMBER', 'ACCOUNT_NUMBER']
    + ['LICENSE_NUMBER', 'VEHICLE_ID', 'DEVICE_ID', 'BIOMETRIC_ID', 'ID_NUMBER']
    + ['OTHER']
)

# 確保資料夾存在
# wav_dir.mkdir(parents=True, exist_ok=True)


## task2

## LLM1_prompt

In [None]:
import openai
import time

# Extra labelling guidance for the first LLM pass. The text is interpolated
# verbatim into `system_prompt` under the "Special note:" heading, so every
# character here is part of the prompt sent to the model.
special_note = """
- Extract exact numeric time expressions as TIME only if they refer to a specific clock time (e.g., "2:00", "two"). Do not label time spans like "three hours" or "six months" as TIME—those must be labeled as DURATION.
- Always label "today", "yesterday", "now", "tomorrow", "Monday", "this week", and "June" as DATE.
- Label full expressions like "Monday morning" or "Friday night" as TIME. Do not split them.
- If a number clearly refers to a time point (e.g., "two", "maybe two"), extract it as TIME even if casual.
- Label "three hours", "several weeks", "six months", "a couple of minutes", etc. as DURATION. These refer to a time span, not a point in time.
- Label "every morning", "twice a week", "once a day" as SET. These refer to recurring patterns and must include recurrence indicators like "every" or "once".
- Do not label vague frequency adverbs like "frequently", "occasionally", "sometimes", or "regularly" as SET.
- Do not label event-triggered phrases (e.g., "after three hours", "before dinner", "when I got home", "once I arrived") as TIME, DURATION, or any category. These are conditionals, not independent time expressions.
- Prioritize labeling based on the phrase’s standalone semantic meaning, not on its surrounding context.
- Extract all valid TIME, DATE, DURATION, or SET expressions that clearly indicate a specific time, even if embedded in broken or casual sentences.
- Time-related phrases must independently and concretely express a time point, span, or recurrence. Do not extract phrases that rely on another action to be meaningful.
- TIME expressions must refer to a clock time, time of day, or specific point (e.g., "3 PM", "Friday morning", "noon", "late at night").
- DURATION expressions must refer to a measurable span (e.g., "four days", "two hours", "a long time") with a time unit.
- DATE expressions must refer to a calendar reference (e.g., "Friday", "August 12", "yesterday", "last year").
- SET expressions must refer to a recurring schedule and include recurrence indicators (e.g., "every Monday", "twice a day").
- AGE expressions must contain a clearly stated number that represents a person's age (e.g., "65", "three", "twenty-one"). The age must be explicit and quantifiable.
- FAMILYNAME, PERSONALNAME, DOCTOR, and PATIENT: Only extract full, proper given names (e.g., "Emily", "John"). Do not extract roles or relational phrases (e.g., "Ivan's dad").
- If the full name is unknown or missing, do not extract FAMILYNAME or PERSONALNAME under any condition.
- Label people based on role:
  - DOCTOR: Named individuals providing care or diagnosis.
  - PATIENT: Named individuals receiving care or diagnosis.
  - FAMILYNAME: Named relatives of the patient.
  - PERSONALNAME: Named unrelated individuals not acting as doctor, patient, or family.
- If the role is unclear, default to PERSONALNAME.
- Do not extract LOCATION-OTHER unless it matches a known list. If uncertain, try CITY, STATE, COUNTY, or DISTRICT. If still unclear, do not label.
- Label CITY only if the name clearly refers to a real-world city or town. Ambiguous or fictional locations must not be labeled.
- Label each entity occurrence individually. If the same entity appears multiple times in the sentence or paragraph, each occurrence must be labeled separately, even if the category and entity are identical.
- For each entity occurrence (token), assign exactly one PHI category.
- Do not infer or guess entities. Only extract if explicitly stated and clearly match the defined criteria.
- Preserve the exact original casing, spacing, and punctuation of extracted entities.
- Output format must be strict: CATEGORY: entity. Do not add explanations, justifications, or notes.
- Minor errors are acceptable. Missing valid entities is a serious mistake.
- If unsure whether an entity exists, attempt extraction. Output PHI:NULL only if no candidate fits after careful checking.
- After extraction, verify that all PHI categories present in the sentence have been labeled. Missing eligible entities is considered a major mistake.
- If no valid PHI entity can be extracted, output exactly: PHI:NULL.
"""

# Main labelling rules for the first LLM pass. The text is interpolated
# verbatim into `system_prompt` under the "Rules:" heading, so every character
# here is part of the prompt sent to the model.
# 'ROOM' is listed among the allowed categories because this same prompt
# defines a ROOM rule and the few-shot examples emit "ROOM: A3"; without it the
# "use only the following categories" instruction contradicted both.
# NOTE(review): the LOCATION-OTHER rules refer to "the above list" / a
# "predefined whitelist", but no such list appears in this prompt - confirm
# whether one was meant to be included.
Rules = """
- Extract each entity exactly as it appears in the text, preserving the original casing, spaces, and punctuation. No normalization, expansion, or abbreviation is allowed.
- For each occurrence (token), assign exactly one PHI category. Do not label the same token with multiple categories.
- Each entity must be assigned to only one PHI category based on context. Do not label the same entity with multiple categories.
- Extract every occurrence of an entity, even if the category and entity are identical and appear multiple times in the same sentence or paragraph. Do not skip, deduplicate, or merge repeated mentions—each instance must be labeled individually.
- If the same entity appears multiple times (even at different positions), each occurrence must be extracted separately. Do not deduplicate.
- Exclude titles like "Dr" or "Dr." when extracting DOCTOR names; only extract the actual name (e.g., "James" from "Dr. James").
- Do not extract generic words like "hospital", "phone", or "address" unless they have specific identifying information (e.g., "Bamaga Hospital", "911").
- Use only the following PHI categories:
    'PATIENT', 'DOCTOR', 'PERSONALNAME', 'FAMILYNAME', 'PROFESSION', 'ROOM',
    'DEPARTMENT', 'HOSPITAL', 'ORGANIZATION', 'STREET', 'CITY', 'STATE',
    'COUNTRY', 'COUNTY', 'ZIP', 'LOCATION-OTHER', 'DISTRICT', 'AGE', 'DATE',
    'TIME', 'DURATION', 'SET', 'PHONE', 'MEDICAL_RECORD_NUMBER', 'ID_NUMBER'
- Format output strictly as: CATEGORY: entity.
- PATIENT: if referring to the patient, including by full name.
- DOCTOR: if referring to the doctor, including by full name.
- FAMILYNAME: for named family members of the patient (e.g., "John", "Maria") when their family role is clear (e.g., parent, sibling).
- PERSONALNAME: for named unrelated persons or bystanders not identified as patient, doctor, or family.
- If the role of a person is unclear, assign PERSONALNAME by default.
- If a location contains multiple components (e.g., street, city, state), extract each part under the appropriate category.
- CITY: Extract only if the location name clearly refers to a real-world city or town, and is not ambiguous, generic, or a district, state, or country (e.g., "Chicago", "Miami", "Hamden", "San Antonio", "Austin"). Do not classify as CITY if the entity contains numbers. Use ZIP if address-like.
- STATE: First-level administrative regions within a country (e.g., "Delaware", "Montana", "Oregon", "South Australia", "Western Australia", "Texas", "RI", "QLD").
- COUNTRY: Use for names of recognized sovereign nations or independent countries (e.g., "Madagascar", "Denmark", "France", "Japan", "Brazil", "Australia", "USA").
- COUNTY: Mid-level administrative divisions often below state level (e.g., "Cheshire").
- DISTRICT: Smaller administrative or geographic areas within cities or counties (e.g., "Greenwich").
- STREET: Full street names with or without numbers (e.g., "Main Street", "456 Maple Avenue", "Oxendon").
- Absolutely no inference, guessing, semantic matching, or partial similarity is allowed. Only exact string matches are valid.
- If the location name does not exactly match the above list, do not extract it as LOCATION-OTHER under any condition.
- LOCATION-OTHER must match a predefined whitelist. Otherwise, classify as CITY, STATE, COUNTY, or DISTRICT, or discard.
- Locations that cannot be confidently categorized as CITY, STATE, COUNTRY, COUNTY, DISTRICT, STREET, or exactly match LOCATION-OTHER must not be extracted at all.
- Label a location as CITY only if it is explicitly and clearly indicated as a real-world city or town based on the context. If the location is ambiguous, fictional, incomplete, or contextually unclear, do not label it.
- HOSPITAL: Extract the full name of any hospital, medical center, health service, or healthcare institution as HOSPITAL. The name must be specific and identifiable. Generic terms without specific names (e.g., "hospital", "health center") must not be extracted. Classify as HOSPITAL if entity contains keywords like “Health Service”, “District Health”, “Hospital”, “Clinic”, “Medical Centre”.
- ORGANIZATION: Extract the names of institutions, companies, libraries, businesses, or organized groups only if they are explicitly and clearly identifiable as organizations based on the text. Example entries include Subway, Divinity School Library, Career Services, seekers workshop, Cambridge, Starbucks, Michaels, Google, Datsun, and Orizia.
- Do not extract any health service, hospital, or medical center names as ORGANIZATION. Such entities must be classified as HOSPITAL if specific naming is present.
- ZIP: Treat "postal code", "postcode", "zip code", "ZIP", "ZIP number", "mail code", "delivery code", and "postal ZIP" as ZIP. Use "area code" or "address code" only if clearly referring to a mailing address.
- AGE: Extract only the numeric age of a person. Convert simple number words (e.g., "five") to digits ("5"). Exclude words like "years" or "old". Ignore vague phrases (e.g., "a couple of years") and non-numeric terms (e.g., "young", "teenager").
- DURATION: Extract only when the expression clearly refers to a measurable length or span of time with an explicit time unit such as "seconds", "minutes", "hours", "days", "weeks", "months", or "years".
  - Examples: "two days", "past few weeks", "10 minutes", "six months", "a long time", "a couple of hours"
  - Do not extract standalone numbers without time units.
  - Do not classify calendar expressions such as "last year", "next week", "this Saturday", or "last Friday" as DURATION — these should be classified as DATE.
  - Do not extract seasonal or event-based phrases such as "last summer" or "next winter" — these are also DATE.
- TIME: Extract when the expression refers to a specific time of day or clock time.
  - Examples: "3 PM", "2:30", "morning", "afternoon", "evening", "night", "last night", "middle of the night", "Monday morning", "next Friday morning"
  - Do not extract vague expressions such as "soon", "later", or "sometime".
  - Always reclassify expressions like "Friday morning" or "next Monday morning" as TIME, not DATE.
- DATE: Extract when the expression refers to a specific calendar point, named date, or time-referenced event.
  - Examples: "now", "on Friday", "Monday", "August 5", "May", "last year", "next week", "today", "yesterday", "tomorrow", "Christmas", "New Year's Eve"
  - Always classify "today", "yesterday", and "tomorrow" as DATE.
  - Days of the week like "Monday" and "Friday" are DATE unless clearly part of a recurring pattern (e.g., "every Monday", which is SET).
- SET: Extract only when the expression clearly refers to a repeated time pattern or schedule with explicit recurrence keywords.
  - Examples: "twice a week", "every Monday", "once a day", "three times a week", "every night"
  - Do not extract vague frequency terms like "sometimes", "occasionally", "frequently", "usually", "normally"
  - SET expressions must include recurrence indicators like "every", "once a", "twice per", or "three times each".
  - Do not confuse DURATION ("two weeks") with SET ("every two weeks").
- ROOM: Label room numbers or hospital room identifiers, such as “room 302”, “bed A”, or “A3”.
- PROFESSION: Extract explicit job titles and professional roles (e.g., "babysitter", "IT", "manager", "lawyer", "a lawyer", "engineer", "accountant").
  - Do not classify organization names (e.g., "Google", "Starbucks") or department names (e.g., "HR department", "Trauma team") as PROFESSION.
- DEPARTMENT: Extract all mentions of departments, teams, units, groups, wards, or divisions explicitly described (e.g., "HR department", "Intensive Care Unit", "Immunology Department", "Trauma team").
  - The mention must clearly include keywords such as "department", "team", "unit", "group", or "ward" to qualify.
  - Departments related to medical, administrative, academic, or organizational contexts are eligible.
  - Do not extract general locations, organization names, or professions as DEPARTMENT unless they explicitly include the structural keywords above.
- MEDICAL_RECORD_NUMBER: Use for alphanumeric codes that are clearly tied to a patient’s medical record, such as "1706458.VTX". Must be supported by context with terms like "MRN", "record number", or "medical file". If the text refers to a value as “medical record number” even without a decimal, still classify as MEDICAL_RECORD_NUMBER.
- ID_NUMBER: Use for general-purpose identifiers that are not explicitly linked to medical records. This includes episode numbers, lab numbers, form codes, and internal or personal IDs.
  - If the identifier is not clearly labeled as a medical record number (e.g., “MRN” or “medical record number”) and does not contain a decimal point, label it as ID_NUMBER.
  - Do not use ID_NUMBER for values that clearly represent date expressions (e.g., "17th, 2063", "17/06/2063", "21, 2063"). Use DATE instead unless context shows it is functioning as an identifier.
  - If a value is clearly referred to as an ID number, episode number, or lab number, it may be labeled as ID_NUMBER regardless of format.

Other rules:
- Do not infer or hallucinate missing entities.
- Preserve the original entity casing and spelling.
- If no PHI entities are found, output exactly: PHI:NULL.
"""


# Few-shot examples for the first LLM pass, interpolated verbatim into
# `system_prompt` under the "Few-shot examples:" heading. Each example is a
# "Sentence:" marker, the sentence itself, then one "CATEGORY: entity" line per
# expected label - the same line format the extraction loop parses from the
# model's reply.
fewshot_example = """
Sentence:
Dr. Connie examined patient Florrie Minion at Kangaroo Island Health Service on June 20, 1989. Her medical record number 4402074.WNE and lab ID 44B20748 were recorded in the Department of Cardiology, located at Blue Cow Street, Camden Haven, Western Australia, ZIP 5067.
DOCTOR: Connie
PATIENT: Florrie Minion
HOSPITAL: Kangaroo Island Health Service
DATE: June 20, 1989
MEDICAL_RECORD_NUMBER: 4402074.WNE
ID_NUMBER: 44B20748
DEPARTMENT: Department of Cardiology
STREET: Blue Cow Street
CITY: Camden Haven
STATE: Western Australia
ZIP: 5067

Sentence:
Ashley, a chiropractor, visited the HR department twice a week. The chiropractor worked three hours in the evening at the district office in Greenwich, then spoke with Carl and Ivan’s dad from Cheshire County.
PERSONALNAME: Ashley
PROFESSION: chiropractor
DEPARTMENT: HR department
SET: twice a week
PROFESSION: chiropractor
DURATION: three hours
TIME: evening
DISTRICT: Greenwich
PERSONALNAME: Carl
FAMILYNAME: Ivan
COUNTY: Cheshire

Sentence:
On Monday morning in Victoria, Western Australia, I met with the organization Orizia. We verified ID 57X22961, confirmed her age is 65, and she stayed previously at P.O. Box 15.
TIME: Monday morning
STATE: Victoria
STATE: Western Australia
ORGANIZATION: Orizia
ID_NUMBER: 57X22961
AGE: 65
LOCATION-OTHER: P.O. Box 15

Sentence:
He came in last year, stayed for three weeks, returned two hours later, then again every Monday morning around two, left yesterday afternoon, and usually came back two hours later once a week for tests. Push him to A3.
DATE: last year
DURATION: three weeks
DURATION: two hours
SET: every Monday
TIME: Monday morning
TIME: two
TIME: yesterday afternoon
DURATION: two hours
SET: once a week
ROOM: A3

Sentence:
Beaconsfield District Health Service confirmed Ramona's follow-up was at 9:11 a.m. on Oxendon Street, Kyabram, ZIP 7000, where her medical record 4402074.WNE was reviewed again. Her lab number was 44B20748.
HOSPITAL: Beaconsfield District Health Service
PATIENT: Ramona
TIME: 9:11 a.m.
STREET: Oxendon Street
CITY: Kyabram
ZIP: 7000
MEDICAL_RECORD_NUMBER: 4402074.WNE
ID_NUMBER: 44B20748
"""

## 設定

In [None]:
import time
import openai
from tqdm import tqdm
from openai import OpenAI

# Imported here so this cell does not rely on an earlier cell having run.
import os

# The API key is taken from the GROQ_API_KEY environment variable when it is
# set, so the secret does not have to be written into the notebook. The
# placeholder literal is kept as the fallback and can still be edited in place.
Groq_key = os.environ.get("GROQ_API_KEY", "your_API_key")

# OpenAI SDK client pointed at Groq's OpenAI-compatible base URL; the chat
# helper functions in this notebook use this module-level `client`.
client = OpenAI(
    api_key=Groq_key,
    base_url="https://api.groq.com/openai/v1"
)


## 測試 有checkpoint的，LLM1

In [None]:
import time
import os
import openai
from tqdm import tqdm
from openai import OpenAI
import random

# ======== Wrapper: stop on the first error ========
def chat_with_stop_on_error(messages, fid, model="llama3-70b-8192", max_tokens=512, temperature=0.0):
    """Send one chat-completion request and return the reply text, stripped.

    Uses the module-level ``client``.

    Args:
        messages: Chat messages passed unchanged to the completions API.
        fid: Identifier of the item being processed; used only in error text.
        model: Model name passed to the API.
        max_tokens: Completion token limit passed to the API.
        temperature: Sampling temperature passed to the API.

    Returns:
        The content of the first choice with surrounding whitespace removed.

    Raises:
        RuntimeError: If the request or reading the reply fails for any
            reason. The original exception is attached as ``__cause__`` so
            the real failure stays visible in the traceback.
    """
    try:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        error_message = str(e)
        print(f"[FATAL ERROR] Error at fid {fid}: {error_message}")
        # Chain explicitly so callers that catch RuntimeError can still
        # inspect what actually went wrong.
        raise RuntimeError(f"Program stopped due to error at fid {fid}.") from e

# ======== On start-up, find the last fid in the answer file and remove its rows ========
# The main loop resumes AT this fid (it is processed again), so its existing
# rows are deleted first - presumably so a partially written fid is not duplicated.
restart_fid = None
if os.path.exists(submission_task2_answer_S):
    with open(submission_task2_answer_S, encoding="utf-8") as f:
        lines = [text for text in (raw.strip() for raw in f) if text]

    if lines:
        last_line = lines[-1]
        # Only a tab-separated row carries a fid in its first column.
        if "\t" in last_line:
            restart_fid = last_line.split("\t")[0]

    if restart_fid:
        fid_prefix = restart_fid + "\t"
        with open(submission_task2_answer_S, encoding="utf-8") as fin:
            all_lines = fin.readlines()
        # Rewrite the file keeping every row that does not belong to restart_fid.
        with open(submission_task2_answer_S, "w", encoding="utf-8") as fout:
            fout.writelines(row for row in all_lines if not row.startswith(fid_prefix))

# ======== Core prompt setup ========
# System prompt = role sentence, then the special notes, the rules and the
# few-shot examples, each under its own heading.
system_prompt = (
    "\nYou are an expert at extracting PHI (Protected Health Information) entities from doctor-patient conversations or Daily conversation.\n"
    "\nSpecial note:\n"
    + special_note
    + "\nRules:\n"
    + Rules
    + "\nFew-shot examples:\n"
    + fewshot_example
    + "\n"
)

# ======== Main program: process line by line, stop on the first error ========
# Reads "fid<TAB>sentence" rows from the task 1 answer file, asks the model for
# PHI entities, and appends "fid<TAB>category<TAB>entity" rows to the task 2
# answer file.
try:
    with open(submission_task1_answer_S, encoding="utf-8") as fin, \
         open(submission_task2_answer_S, "a", encoding="utf-8") as fout:

        system_messages = [{"role": "system", "content": system_prompt}]
        # Without a restart point every row is processed; otherwise rows are
        # skipped until restart_fid is reached (that fid itself is processed).
        start_processing = restart_fid is None

        pbar = tqdm(fin, desc="Extracting PHI entities")
        for line in pbar:
            # Rows without a tab have no "fid<TAB>sentence" shape; ignore them.
            if "\t" not in line:
                continue

            # The tab check above guarantees exactly two parts here.
            fid, sentence = line.split("\t", 1)

            pbar.set_description(f"Extracting PHI entities (FID {fid})")
            if not start_processing:
                if fid == restart_fid:
                    start_processing = True
                else:
                    continue

            user_message = [{"role": "user", "content": f"Sentence:\n{sentence}"}]
            messages = system_messages + user_message

            prediction = chat_with_stop_on_error(messages, fid=fid)
            written_preds = []

            for pred_line in prediction.splitlines():
                # Compare with all whitespace removed so variants such as
                # "PHI: NULL" or " PHI:NULL " are recognised as "no entity"
                # instead of being written out as a bogus PHI/NULL record.
                if "".join(pred_line.split()).upper() == "PHI:NULL":
                    continue
                if ":" in pred_line:
                    category, entity = pred_line.split(":", 1)
                    fout.write(f"{fid}\t{category.strip()}\t{entity.strip()}\n")
                    # Flush per row so finished work survives an abort.
                    fout.flush()
                    written_preds.append(f"{category.strip()}: {entity.strip()}")

            if written_preds:
                print(f"[WRITE] FID {fid}: {written_preds}")
            else:
                print(f"[WRITE] FID {fid}: (No entities extrated)")
            # Pause between requests, with some random jitter added.
            time.sleep(35 + random.uniform(1, 30))

except RuntimeError as e:
    print(f"[EXIT] {e}")
    exit(1)

## S+L

In [None]:
import os

def load_lines(filepath):
    """Return the non-blank lines of a UTF-8 text file.

    Each returned line has its surrounding whitespace stripped; lines that are
    empty after stripping are dropped. File order is preserved.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        stripped = (raw.strip() for raw in handle)
        return [text for text in stripped if text]

# Load the S and L parts of the task 2 answers and join them, S rows first.
lines_s_2 = load_lines(submission_task2_answer_S)
lines_l_2 = load_lines(submission_task2_answer_L)
combined_lines_2 = [*lines_s_2, *lines_l_2]

def sort_key(line):
    """Return the sort key for one tab-separated answer line.

    The first column (the fid) is compared as an integer so that, for
    example, fid 9 sorts before fid 10.

    Returns:
        The fid as ``int``, or ``float('inf')`` when the first column is not
        an integer, which places malformed lines last without disturbing the
        order of the valid ones.
    """
    parts = line.split("\t")
    try:
        return int(parts[0])
    except ValueError:
        # Only a non-numeric fid is expected here; anything else (including
        # KeyboardInterrupt) should not be swallowed.
        return float('inf')



# --- Task 2: order the combined rows by fid and write the merged answer file ---
combined_sorted_2 = list(combined_lines_2)
combined_sorted_2.sort(key=sort_key)  # stable sort: rows with equal fid keep their order

with open(submission_task2_answer, 'w', encoding='utf-8') as out_file:
    out_file.writelines(row + '\n' for row in combined_sorted_2)

print("Task 2 合併完成（未去重），輸出至:", submission_task2_answer)

# --- Task 1: same merge for the S and L parts of the task 1 answers ---
lines_s_1 = load_lines(submission_task1_answer_S)
lines_l_1 = load_lines(submission_task1_answer_L)
combined_lines_1 = [*lines_s_1, *lines_l_1]

combined_sorted_1 = list(combined_lines_1)
combined_sorted_1.sort(key=sort_key)

with open(submission_task1_answer, 'w', encoding='utf-8') as out_file:
    out_file.writelines(row + '\n' for row in combined_sorted_1)

print("Task 1 合併完成（未去重），輸出至:", submission_task1_answer)


## 先處理錯誤的標籤/人名全小寫直接去除

In [None]:
import re
from collections import defaultdict

# Running totals and log messages filled in by the cleaning pass below.
modified_count, removed_count = 0, 0
log_entries = list()

# === Helper that trims repeated words (keeps order, only caps the count) ===
def limit_repeated_phrases(text, max_repeat=3):
    """Keep at most ``max_repeat`` occurrences of each whitespace-separated token.

    Tokens are compared case-sensitively and kept in their original order;
    occurrences beyond the limit are dropped. The result is re-joined with
    single spaces.
    """
    occurrences = {}
    kept = []
    for word in text.split():
        count = occurrences.get(word, 0) + 1
        occurrences[word] = count
        if count > max_repeat:
            continue
        kept.append(word)
    return " ".join(kept)

# === Read the merged answer file and clean it row by row ===
with open(submission_task2_answer, "r", encoding="utf-8") as fin:
    lines = fin.readlines()

cleaned_lines = []
for line in lines:
    line = line.rstrip("\n")
    if line.count("\t") < 2:
        # Fewer than three tab-separated fields: the row cannot be cleaned,
        # so it is kept exactly as it is.
        cleaned_lines.append(line + "\n")
        continue

    fid, category, entity = line.split("\t", 2)
    original_category = category
    original_entity = entity

    # === Strip a leading "-" (and any whitespace after it) from the category ===
    if category.startswith("-"):
        category = category[1:].lstrip()

    # === Remove NULL tokens from the entity ===
    raw_tokens = entity.split()
    entity_tokens = [t for t in raw_tokens if t.upper() != "NULL"]
    entity = " ".join(entity_tokens)
    # Log only when a NULL token was really dropped. Comparing the strings
    # would also fire on mere whitespace normalisation and report a removal
    # that never happened.
    if len(entity_tokens) != len(raw_tokens):
        log_entries.append(
            f"[FID {fid}] Removed 'NULL' from entity: \"{original_entity}\" → \"{entity}\""
        )

    # === If the entity is empty, drop the whole row and log it ===
    if not entity.strip():
        removed_count += 1
        log_entries.append(
            f"[FID {fid}] Removed entry: entity was NULL or became empty after cleaning"
        )
        continue

    # === Cap how often a word may repeat inside the entity (order kept) ===
    cleaned_entity = limit_repeated_phrases(entity, max_repeat=3)
    if cleaned_entity != entity:
        log_entries.append(
            f"[FID {fid}] Trimmed repetitive entity: \"{entity}\" → \"{cleaned_entity}\""
        )
        entity = cleaned_entity

    # === Special filter: ID_NUMBER / MEDICAL_RECORD_NUMBER ===
    if category in {"ID_NUMBER", "MEDICAL_RECORD_NUMBER"}:
        # A comma in an identifier is treated as invalid and the row is dropped.
        if "," in entity:
            removed_count += 1
            log_entries.append(
                f"[FID {fid}] Removed {category} because entity contains a comma: \"{entity}\""
            )
            continue
        # Identifiers of four characters or fewer are dropped as well.
        if len(entity.strip()) <= 4:
            removed_count += 1
            log_entries.append(
                f"[FID {fid}] Removed {category} with short entity (≤ 4 chars): \"{entity}\""
            )
            continue

    # === Drop PERSONALNAME / FAMILYNAME rows whose entity is all lowercase ===
    if category in {"PERSONALNAME", "FAMILYNAME"} and entity.islower():
        removed_count += 1
        log_entries.append(
            f"[FID {fid}] Removed {category} with all-lowercase entity: \"{entity}\""
        )
        continue  # this row is not kept

    # === Record category corrections (only for rows that are kept) ===
    if category != original_category:
        modified_count += 1
        log_entries.append(
            f"[FID {fid}] Cleaned CATEGORY: \"{original_category}\" → \"{category}\""
        )

    cleaned_lines.append(f"{fid}\t{category}\t{entity}\n")

# === Write the result back over the original file ===
with open(submission_task2_answer, "w", encoding="utf-8") as fout:
    fout.writelines(cleaned_lines)

# === Print the log ===
print(f"[SUMMARY] {modified_count} category values cleaned.")
print(f"[SUMMARY] {removed_count} entries removed (NULL/empty, short/invalid ID or MRN, all-lowercase name).\n")

for entry in log_entries:
    print(entry)


## 測試，第二個LLM

In [None]:
# Categories the second LLM pass is allowed to assign. Kept as a set for
# membership tests; process_batch only accepts predicted categories that are
# in this set.
VALID_PHI_CATEGORIES = {
    'PROFESSION',
    'DEPARTMENT', 'HOSPITAL', 'ORGANIZATION', 'STREET', 'CITY', 'STATE',
    'COUNTRY', 'COUNTY', 'ZIP', 'LOCATION-OTHER', 'DISTRICT', 'AGE', 'DATE',
    'TIME', 'DURATION', 'SET', 'PHONE', 'MEDICAL_RECORD_NUMBER', 'ID_NUMBER'
}

# Rule text for the second LLM pass. The category list is rendered with
# sorted(): interpolating the set directly produced a different ordering from
# one interpreter run to the next (string hashing is randomised), so the same
# notebook sent different prompts on different runs.
LLM2_Rule = f"""
Valid PHI categories (only choose from these):
{sorted(VALID_PHI_CATEGORIES)}

Category definitions:
- DATE: Refers to a full day or longer period (e.g., today, tomorrow, this week, August 21)
- TIME: Refers to a time within a day (e.g., night, morning, 12:50)
- DURATION: A period of time (e.g., 15 minutes, several months)
- SET: Repeating or scheduled time expressions (e.g., every day, once a week)
- ORGANIZATION: Corporate or institutional names (e.g., Sealed Air Corporation)
- HOSPITAL: Medical facilities (e.g., Eidsvold Multipurpose Health Service)
- DEPARTMENT: Medical or functional departments (e.g., Surgical Unit, Pharmacy)
- CITY / STATE / COUNTRY / ZIP / DISTRICT / STREET: As expected

Keyword-based classification rules:
- DURATION: Time spans like "ages", "15 minutes", "3 hours", "several days", "half an hour".
- SET: Recurring phrases like "every day", "twice a week", "one day a week", "weekly", "monthly".
- DATE: Calendar terms like "yesterday", "Monday", "Sunday", "May", "Easter", "today", "tomorrow", "this weekend", "September 12, 2062", weekdays, months, or "last week".
- TIME: less day like "last night", "morning", "afternoon", "tonight", or clock times (e.g., "12:50", "2:30PM").

Location-based classification rules:
- DEPARTMENT: Phrases with "department", "unit", "ward", "division", "section", "center", "clinic", "rooms", "central".
- HOSPITAL: Phrases with "hospital", "medical center", "health", "healthy", "health center", "service", "services", "centre", "healthcare".
- ORGANIZATION: Names with "Inc.", "Corp.", "Ltd.", "Group", "Corporation", or agencies.

- CITY: Ends with "City" or matches a known city name, (e.g., Chicago, Bowen).
- STATE: Subnational administrative divisions such as states, territories, or regions (e.g., TAS, NT, Western Australia, California).
- COUNTRY: Known country names, (e.g., United States, Germany, India).
- ZIP: numeric postal codes (e.g., "90210" or "8003").
- DISTRICT: Contains "District", or sub-city regions, (e.g., Greenwich)
- COUNTY: Contains "County" (e.g., "Cheshire").
- STREET: Ends with "St.", "Ave", "Road", "Boulevard", "Lane", etc, (e.g., Legend Manor Street).

- PROFESSION: If the entity is a specific job title such as "lawyer", "chiropractor", "manager" or "cashier", classify it as PROFESSION.

Examples:
TIME: this morning
<think>"this morning" is a time span within the day, so it should be TIME.</think>

DATE: last night
<think>"last night" refers to a night, which is within a day. It should be TIME.</think>

CITY: Eidsvold Multipurpose Health Service
<think>This is not a city, but a hospital. Correct category is HOSPITAL.</think>

HOSPITAL: Timbun and District Healthcare Services
<think>"Timbun and District Healthcare Services" contains the keywords 'healthcare' and 'services', which match the HOSPITAL category, not ORGANIZATION.</think>

DEPARTMENT: 3HR State Surgical Unit
<think>"3HR State Surgical Unit" contains the keyword 'unit', which is used to describe a department within a hospital, not the hospital itself.</think>

DATE: now
<think>"now" is like 'today'—it refers to the current date, not time of day.</think>

DEPARTMENT: clinic
<think>"clinic" refers to a specific unit or sub-division within a healthcare facility, not a hospital as a whole. It should be categorized as DEPARTMENT, not HOSPITAL.</think>

ORGANIZATION: YMCA
<think>"YMCA" is a non-profit organization, not a hospital or department. It should be classified as ORGANIZATION.</think>
"""


In [None]:
import time
import os
import openai
from tqdm import tqdm
from openai import OpenAI

# ======== Wrapper: stop on the first error ========
def chat_with_stop_on_error_LLM2(messages, fid, model="llama3-70b-8192", max_tokens=512, temperature=0.0):
    """Send one chat-completion request for the second pass and return the reply.

    Uses the module-level ``client``. On any failure a ``[FATAL]`` message is
    printed and the program is terminated via ``exit(1)``; nothing is raised
    to the caller.

    Args:
        messages: Chat messages passed unchanged to the completions API.
        fid: Identifier used only in the printed error messages.
        model: Model name passed to the API.
        max_tokens: Completion token limit passed to the API.
        temperature: Sampling temperature passed to the API.

    Returns:
        The content of the first choice with surrounding whitespace removed.
    """
    try:
        completion = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
        )
        content = completion.choices[0].message.content
        if content is not None:
            return content.strip()
        print(f"[FATAL] Empty response at fid {fid} → result is None")
        exit(1)

    except Exception as e:
        error_message = str(e).lower()

        # Pick the label from substrings of the lower-cased error text;
        # every case ends the program the same way.
        rate_limit_markers = ("429", "rate limit", "too many requests")
        token_limit_markers = ("maximum context length", "maximum tokens", "token limit")
        if any(marker in error_message for marker in rate_limit_markers):
            label = "429 Rate Limit Error"
        elif any(marker in error_message for marker in token_limit_markers):
            label = "Token Limit Error"
        else:
            label = "Unexpected Error"

        print(f"[FATAL] {label} at fid {fid} → {error_message}")
        exit(1)

# Run one buffered batch through LLM2 and write one output row per input row
def _append_llm2_reasoning(message):
    """Append *message* to the LLM2 reasoning log file."""
    with open(submission_task2_answer_LLM2_reasoning, "a", encoding="utf-8") as fr:
        fr.write(message)


def process_batch(buffer, fout):
    """Ask LLM2 to re-check the categories of *buffer* and write the results.

    Args:
        buffer: List of ``(fid, category, entity)`` tuples. It is cleared
            in place once every row has been written.
        fout: Open text file that receives one ``fid<TAB>category<TAB>entity``
            line per buffered row.

    Only the category returned by the model is used; the entity text written
    out is always the original one. A returned line is accepted for a row
    when its entity equals the row's entity exactly and its category is in
    ``VALID_PHI_CATEGORIES``; otherwise the original category is kept and the
    mismatch is logged.
    """
    if not buffer:
        # Nothing to send; avoids indexing buffer[0] below.
        return

    numbered = [f"{i+1}. {cat}: {ent}" for i, (_, cat, ent) in enumerate(buffer)]
    user_prompt = f"There are {len(buffer)} lines. Return {len(buffer)} corrected lines:\n\n" + "\n".join(numbered)
    messages = system_messages_LLM2 + [{"role": "user", "content": user_prompt}]

    result = chat_with_stop_on_error_LLM2(messages, fid=buffer[0][0])

    # Parse "N. CATEGORY: ENTITY" reply lines into (category, entity) pairs.
    corrected_pool = []
    for raw_line in result.strip().splitlines():
        line = raw_line.strip()
        if not line:
            continue
        line = re.sub(r"^\d+\.\s*", "", line)
        # Skip tag lines (e.g. "<think>...") and chatty introductions.
        if ":" not in line or line.startswith("<") or line.lower().startswith("here are"):
            continue
        # A line containing ":" always splits into exactly two parts here.
        category, entity = map(str.strip, line.split(":", 1))
        corrected_pool.append((category, entity))

    for fid, category_orig, entity_orig in buffer:
        # First unused reply whose entity matches and whose category is legal.
        match_index = next(
            (
                j
                for j, (category_pred, entity_pred) in enumerate(corrected_pool)
                if entity_pred == entity_orig and category_pred in VALID_PHI_CATEGORIES
            ),
            None,
        )

        if match_index is None:
            fout.write(f"{fid}\t{category_orig}\t{entity_orig}\n")
            fout.flush()
            print(f"[FALLBACK] {fid}: {category_orig}: {entity_orig}")
            _append_llm2_reasoning(
                f"[MISMATCH] {fid}\t{category_orig}\t{entity_orig} ← LLM2 returned unmatched category or entity\n"
            )
            continue

        # Consume the reply so duplicate entities each use their own line.
        category_pred, _ = corrected_pool.pop(match_index)
        fout.write(f"{fid}\t{category_pred}\t{entity_orig}\n")
        fout.flush()
        print(f"[WRITE] {fid}: {category_pred}: {entity_orig}")

        if category_pred != category_orig:
            _append_llm2_reasoning(
                f"[REPLACED] {fid}\t{category_orig} → {category_pred}\t{entity_orig}\n"
            )

    buffer.clear()
    # Pause between batches before the next request is sent.
    time.sleep(1.5)

# ======== On startup: find the last written fid and drop its rows from the answer file ========
# The fid of the last non-empty row is where the run resumes. All rows of
# that fid are removed so that it can be written again from scratch.
restart_fid_LLM2 = None
if os.path.exists(submission_task2_answer_LLM2):
    with open(submission_task2_answer_LLM2, encoding="utf-8") as answer_file:
        all_lines = answer_file.readlines()

    lines = [row.strip() for row in all_lines if row.strip()]
    if lines and "\t" in lines[-1]:
        restart_fid_LLM2 = lines[-1].split("\t")[0]

    if restart_fid_LLM2:
        stale_prefix = restart_fid_LLM2 + "\t"
        kept_rows = [row for row in all_lines if not row.startswith(stale_prefix)]
        with open(submission_task2_answer_LLM2, "w", encoding="utf-8") as answer_file:
            answer_file.writelines(kept_rows)

# ======== Core prompt settings ========
# Number of entities sent to the LLM in a single request.
batch_size = 10

# System prompt for the category-correction pass. The rule text held in
# LLM2_Rule is interpolated at the end of the prompt.
system_prompt_LLM2 = f"""
You are a PHI category corrector.

Each input is in the format: CATEGORY: ENTITY
Correct only the CATEGORY if it's in this list:
DATE, TIME, DURATION, SET, ORGANIZATION, HOSPITAL, DEPARTMENT, CITY, STATE, COUNTRY, DISTRICT, ZIP, STREET

Return exactly one corrected line per input, in the same order.
Do not change the ENTITY.
Do not add explanations, summaries, or introductions.
Output only the corrected lines. No extra text.

Strict Output Constraints:
- You must return exactly one line per input, no more, no less.
- DO NOT merge, summarize, omit, or skip any input line.
- Even if two lines appear similar, treat them as independent and return both.
- Output must preserve the input order exactly.
- If unsure, copy the CATEGORY unchanged.
- Do not omit similar-looking inputs. Return every line, even if redundant.
- Each input must be treated independently, and classification must strictly follow the Rules.
- You are strictly prohibited from using your own judgment to classify entities that appear in the Rules section. If an ENTITY matches any keyword or phrase listed in the Rules, you must assign the exact category as specified. Do not rely on context. Do not reinterpret. Do not override. These mappings are absolute, authoritative, and must be applied exactly as written.
- Ignore general knowledge or real-world semantics if they contradict these rules.

Output format:
CATEGORY: ENTITY

Rules:
{LLM2_Rule}
"""

# Only rows whose category is in this set are sent to the LLM.
# NOTE(review): 'LOCATION-OTHER' is in this set but not in the category list
# inside the prompt above; confirm that is intended.
target_categories = {
    'DATE', 'TIME', 'DURATION', 'SET',
    'ORGANIZATION', 'HOSPITAL', 'DEPARTMENT',
    'CITY', 'STATE', 'COUNTRY', 'DISTRICT', 'ZIP', 'LOCATION-OTHER', 'STREET'
}

buffer = []
# ======== Main program ========
# Stream the first-pass answers. Rows outside target_categories are copied
# through unchanged; the rest are batched and re-checked by LLM2.
# NOTE(review): pass-through rows are written immediately while target rows
# wait in `buffer`, so after a crash the last written fid can be later than
# the fid of rows still buffered; confirm the resume logic tolerates that.
try:
    with open(submission_task2_answer, encoding="utf-8") as llm1_file, \
         open(submission_task2_answer_LLM2, "a", encoding="utf-8") as fout:

        system_messages_LLM2 = [{"role": "system", "content": system_prompt_LLM2}]
        # When resuming, skip rows until the restart fid is seen again.
        start_processing_LLM2 = restart_fid_LLM2 is None

        for line in tqdm(llm1_file, desc="Processing in batches"):
            # Rows without at least fid, category and entity are ignored
            # (this also covers rows that contain no tab at all).
            parts = line.strip().split("\t")
            if len(parts) < 3:
                continue
            fid, category, entity = parts[0], parts[1], parts[2]

            if not start_processing_LLM2:
                if fid != restart_fid_LLM2:
                    continue
                start_processing_LLM2 = True

            if category not in target_categories:
                fout.write(f"{fid}\t{category}\t{entity}\n")
                fout.flush()
                continue

            buffer.append((fid, category, entity))

            if len(buffer) >= batch_size:
                process_batch(buffer, fout)

        # Flush the final, partially filled batch.
        if buffer:
            process_batch(buffer, fout)

except Exception as e:
    print(f"[EXIT] {e}")
    # Raise SystemExit directly instead of calling the interactive `exit()`
    # helper, which is not guaranteed to stop execution in every environment.
    raise SystemExit(1) from e

# 後處理

## 清理submission_task2_answer

## 先清理無效的列

In [None]:
# Lookup table that normalises lower-case category names to canonical labels
category_mapping = {
    # Address-related
    "address": "STREET",
    "street": "STREET",
    "street name": "STREET",
    "road": "STREET",
    "road name": "STREET",
    "city": "CITY",
    "state": "STATE",
    "zip": "ZIP",
    "zip code": "ZIP",
    "postal code": "ZIP",
    "country": "COUNTRY",
    "county": "COUNTY",
    "district": "DISTRICT",
    "location": "LOCATION-OTHER",
    "location-other": "LOCATION-OTHER",

    # Person-related
    "name": "PERSONALNAME",
    "person name": "PERSONALNAME",
    "personal name": "PERSONALNAME",
    "family name": "FAMILYNAME",
    "patient": "PATIENT",
    "doctor": "DOCTOR",
    "username": "USERNAME",
    "profession": "PROFESSION",

    # Contact details
    "phone": "PHONE",
    "phone number": "PHONE",
    "telephone": "PHONE",
    "fax": "FAX",
    "email": "EMAIL",
    "email address": "EMAIL",
    "url": "URL",
    "ip": "IPADDRESS",
    "ip address": "IPADDRESS",

    # Medical and identity numbers
    "ssn": "SOCIAL_SECURITY_NUMBER",
    "social security number": "SOCIAL_SECURITY_NUMBER",
    "medical record number": "MEDICAL_RECORD_NUMBER",
    "health plan number": "HEALTH_PLAN_NUMBER",
    "account number": "ACCOUNT_NUMBER",
    "license number": "LICENSE_NUMBER",
    "id": "ID_NUMBER",
    "id number": "ID_NUMBER",
    "lab_number": "ID_NUMBER",
    "device id": "DEVICE_ID",
    "vehicle id": "VEHICLE_ID",
    "biometric id": "BIOMETRIC_ID",

    # Other types
    "date of birth": "DATE",
    "birthdate": "DATE",
    "dob": "DATE",
    "date": "DATE",
    "age": "AGE",
    "time": "TIME",
    "duration": "DURATION",
    "set": "SET",
    "room": "ROOM",
    "department": "DEPARTMENT",
    "hospital": "HOSPITAL",
    "organization": "ORGANIZATION",
    "other": "OTHER",
}

# Rows that were rejected (tuples) or repaired (dicts); reported after the pass
invalid_entries = []

# Values shaped like "<entity> is not a valid <category>" are repaired below.
not_valid_marker = re.compile(r"is not a valid", re.IGNORECASE)

with open(submission_task2_answer_LLM2, encoding="utf-8") as fin, \
     open(submission_task2_answer_clean_invalid, "w", encoding="utf-8") as fout:

    for line in fin:
        line = line.strip()
        if not line or line.upper() == "PHI:NULL":
            continue

        if "\t" not in line:
            invalid_entries.append(("InvalidFormat", line))
            continue

        parts = line.split("\t")
        if len(parts) != 3:
            invalid_entries.append(("InvalidParts", line))
            continue

        id_, category, value = parts

        marker = not_valid_marker.search(value)
        if marker is None:
            fout.write(line + "\n")
            continue

        # First word after the marker names the category. Trailing punctuation
        # ("date.") is dropped so it does not end up inside the label.
        following_words = value[marker.end():].split()
        raw_category = following_words[0].lower().strip(".,;:!?\"'") if following_words else ""
        if not raw_category:
            # Nothing usable follows the marker: keep the row untouched
            # instead of failing on an empty word list.
            invalid_entries.append(("MissingCategory", line))
            fout.write(line + "\n")
            continue

        # Slice the ORIGINAL value so the entity keeps its casing; only the
        # marker search is case-insensitive.
        new_value = value[:marker.start()].strip()
        new_category = category_mapping.get(raw_category, raw_category.upper())
        fout.write(f"{id_}\t{new_category}\t{new_value}\n")
        # Record the repair so it shows up in the report.
        invalid_entries.append({
            "type": f"FIXED({new_category})",
            "original": value,
            "corrected": f"{id_}\t{new_category}\t{new_value}"
        })

# Report rejected and repaired rows
if invalid_entries:
    print("錯誤列：")
    print(f"總共 {len(invalid_entries)} 行錯誤列")
    for entry in invalid_entries:
        if isinstance(entry, dict):  # repaired row
            print(f"[{entry['type']}]")
            print(f"  原始：{entry['original']}")
            print(f"  修改：{entry['corrected']}")
        else:  # format problem
            category, content = entry
            print(f"[{category}] {content}")


## re+去除category == entity相關

## STREET, CITY, STATE, ZIP

In [None]:
import re
from collections import defaultdict

# Labels assigned, in order, to the four capture groups of every address regex
tags = ['STREET', 'CITY', 'STATE', 'ZIP']

class OptimizedAddressExtractor:
    """Regex-based extractor for STREET / CITY / STATE / ZIP address parts.

    All regular expressions are compiled once in ``__init__``. A sentence is
    first normalised by ``preprocess_sentence`` and then matched against
    ``regex_patterns`` in order; the first pattern whose groups pass
    validation wins.
    """

    def __init__(self):
        # Address patterns. Each has four groups: street, city, state, zip.
        self.regex_patterns = [
            # Standard form: residing at STREET, CITY, STATE ZIP
            re.compile(
                r"residing at\s+([\w\s'.-]{2,40}?),\s*([\w\s'.-]{2,30}?),\s*([A-Z][a-zA-Z\s]+|[A-Z]{2,})(?:[,\s]+(\d{4}))?",
                re.IGNORECASE
            ),
            # residing on STREET in CITY, STATE ZIP
            re.compile(
                r"residing on\s+([\w\s'.-]{2,40}?)\s+in\s+([\w\s'.-]{2,30}?),\s*([A-Z][a-zA-Z\s]+|[A-Z]{2,})(?:[,\s]+(\d{4}))?",
                re.IGNORECASE
            ),
            # Generic form: STREET, CITY, STATE ZIP (anchored to avoid
            # swallowing long sentence openings)
            re.compile(
                r"(?:^|\.\s+|\b(?:at|in)\s+)([\w\s'.-]{2,40}?),\s*([\w\s'.-]{2,30}?),\s*([A-Z]{2,}|[A-Z][a-zA-Z\s]+)[,\s]+(\d{4})",
                re.IGNORECASE
            ),
            # residing at STREET in CITY, STATE ZIP
            re.compile(
                r"residing at\s+([\w\s'.-]{2,40}?)\s+in\s+([\w\s'.-]{2,30}?),\s*([A-Z]{2,}|[A-Z][a-zA-Z\s]+)(?:[,\s]+(\d{4}))?",
                re.IGNORECASE
            ),
            # NOTE(review): preprocessing removes every "postal code", so this
            # pattern cannot match a preprocessed sentence; kept as-is.
            re.compile(
                r"residing at\s+([\w\s'.-]{2,40}?)\s+in\s+([\w\s'.-]{2,30}?),\s*([A-Z]{2,}|[A-Z][a-zA-Z\s]+)[,\s]+with the postal code\s+(\d{4})",
                re.IGNORECASE
            ),
        ]

        # Validation patterns
        self.state_pattern = re.compile(r"^(?:[A-Z]{2,}|[A-Z][a-z]+(?: [A-Z][a-z]+)*)$")
        self.zip_pattern = re.compile(r"^\d{4}$")

        # STREET validation
        self.street_junk_pattern = re.compile(r"\b(resides? at|resides? on|resides? in|located|lives?|residences? in|specimen|collected|department|diagnostic|pathology)\b", re.IGNORECASE)
        self.street_invalid_pattern = re.compile(r"^\d{4}$|^\d{1,4}\s*$|Department|Unit|Service|Hospital|Centre|Center|specimen|collected|patient|diagnostic", re.IGNORECASE)
        self.street_valid_pattern = re.compile(r"^[\w\s'.-]{2,40}$")
        self.street_cleanup_pattern = re.compile(r"\.\s*He\s+resides.*")

        # A bare four-digit number is treated as a year, not a street
        self.year_pattern = re.compile(r"^\d{4}$")

        # Number/code shapes that are not valid streets
        self.code_pattern = re.compile(r"^\d+[A-Z]+\d+$|^[A-Z]+\d+[A-Z]+$|^\d{2,}[A-Z]{1,3}\d{2,}$", re.IGNORECASE)

        # Medical and time vocabulary that disqualifies a street candidate
        self.medical_time_pattern = re.compile(r"\b(specimen|collected|january|february|march|april|may|june|july|august|september|october|november|december|tissue|pathology|diagnostic|oncology|at\s+\d+\.\d+)\b", re.IGNORECASE)

        self.zip_fullmatch_pattern = re.compile(r"\d{4}")

        # Sentence normalisation, applied in list order.
        self.preprocessing_patterns = [
            (re.compile(r"\bresides at\b", re.IGNORECASE), "residing at"),
            (re.compile(r"\bresides on\b", re.IGNORECASE), "residing on"),
            (re.compile(r"\blives at\b", re.IGNORECASE), "residing at"),
            (re.compile(r"\blocated at\b", re.IGNORECASE), "residing at"),
            (re.compile(r"\baddress is\b", re.IGNORECASE), "residing at"),
            # Must run BEFORE the bare "postal code" removal below; otherwise
            # the phrase is already gone and this rule can never match.
            (re.compile(r"with postal code\s*(\d{4})", re.IGNORECASE), r"\1"),
            (re.compile(r"postal code", re.IGNORECASE), ""),
            (re.compile(r"ZIP\s*(\d{4})", re.IGNORECASE), r"\1"),
            (re.compile(r"near\s+([A-Z][a-zA-Z\s]+|[A-Z]{2,})", re.IGNORECASE), r"\1")
        ]

    def is_valid_state(self, text):
        """Return True when the stripped text is an all-caps code or Title Case words."""
        return bool(self.state_pattern.match(text.strip()))

    def is_valid_zip(self, text):
        """Return True when the stripped text is exactly four digits."""
        return bool(self.zip_pattern.match(text.strip()))

    def preprocess_sentence(self, sentence):
        """Apply every preprocessing substitution in order and return the result."""
        sentence_norm = sentence
        for pattern, replacement in self.preprocessing_patterns:
            sentence_norm = pattern.sub(replacement, sentence_norm)
        return sentence_norm

    def validate_and_clean_street(self, value, fid, log):
        """Validate a street candidate and return its cleaned form.

        Args:
            value: Candidate street text.
            fid: Record id, used only in log messages.
            log: List that receives one message per rejected candidate.

        Returns:
            The cleaned street string, or ``None`` when the candidate is
            rejected.
        """
        stripped = value.strip()

        # A bare four-digit number is a year, not a street
        if self.year_pattern.match(stripped):
            log.append(f"{fid} - ⚠️ Skip STREET (年份): {value}")
            return None

        # Number/code shapes
        if self.code_pattern.match(stripped):
            log.append(f"{fid} - ⚠️ Skip STREET (編號/代碼): {value}")
            return None

        # Medical or time vocabulary
        if self.medical_time_pattern.search(value):
            log.append(f"{fid} - ⚠️ Skip STREET (醫療/時間詞彙): {value}")
            return None

        # Known invalid shapes and keywords
        if self.street_invalid_pattern.search(value):
            log.append(f"{fid} - ⚠️ Skip STREET (無效模式): {value}")
            return None

        # Cleanup: drop a trailing ". He resides ..." clause and a leading at/in
        value = self.street_cleanup_pattern.sub("", value).strip()
        value = re.sub(r"^(at|in)\s+", "", value, flags=re.IGNORECASE).strip()

        # Junk words are checked after the cleanup
        if self.street_junk_pattern.search(value):
            log.append(f"{fid} - ⚠️ Skip STREET (包含垃圾詞): {value}")
            return None

        # Basic character/length format
        if not self.street_valid_pattern.match(value):
            log.append(f"{fid} - ⚠️ Skip STREET (格式不符): {value}")
            return None

        # Too long to be a street (probably a whole sentence)
        if len(value) > 50:
            log.append(f"{fid} - ⚠️ Skip STREET (過長): {value}")
            return None

        return value

    def categorize_entity(self, tag, value):
        """Return a one-element list holding the upper-cased tag; value is ignored."""
        return [tag.upper()]

    def extract_entities_from_sentence(self, fid, sentence, log):
        """Extract address entities from one sentence.

        Patterns are tried in order. A pattern is accepted when every present
        group passes validation and at least three entities were collected;
        otherwise the next pattern is tried.

        Args:
            fid: Record id, used only in log messages.
            sentence: Raw sentence text.
            log: List that receives diagnostic messages.

        Returns:
            A list of ``(tag, value)`` tuples, or an empty list when no
            pattern yields a valid address.
        """
        sentence_norm = self.preprocess_sentence(sentence)

        for regex in self.regex_patterns:
            match = regex.search(sentence_norm)
            if not match:
                continue

            local_entities = []
            valid = True

            for tag, value in zip(tags, match.groups()):
                if not value:
                    # Optional group (e.g. ZIP) did not participate
                    continue
                value = value.strip()

                if tag == "STREET":
                    cleaned_value = self.validate_and_clean_street(value, fid, log)
                    if not cleaned_value:
                        valid = False
                        break
                    value = cleaned_value

                elif tag == "STATE" and not self.is_valid_state(value):
                    log.append(f"{fid} - ❌ Invalid STATE: {value}")
                    valid = False
                    break

                elif tag == "ZIP" and value and not self.is_valid_zip(value):
                    log.append(f"{fid} - ❌ Invalid ZIP: {value}")
                    valid = False
                    break

                local_entities.append((tag, value))

            if valid and len(local_entities) >= 3:
                values_only = [val for _, val in local_entities]
                log.append(f"{fid} - ✅ Extracted: {' | '.join(values_only)}")
                return local_entities  # first valid pattern wins

        return []  # no pattern produced a valid address

    def correct_invalid_tag(self, tag, val, fid, log):
        """Re-map LOCATION-OTHER / DISTRICT tags using keywords in the value.

        Args:
            tag: Original tag (any casing).
            val: Entity text.
            fid: Record id, used only in log messages.
            log: List that receives a message when the tag is changed.

        Returns:
            The upper-cased tag, replaced by DEPARTMENT, STREET, CITY, STATE
            or ZIP when the tag was LOCATION-OTHER or DISTRICT and a rule
            matched.
        """
        tag = tag.upper()
        corrected_tag = tag

        if tag in {"LOCATION-OTHER", "DISTRICT"}:
            val_lower = val.lower()
            reason = ""

            # Facility keywords take precedence over the address keywords
            if re.search(r"\b(department|unit|service|hospital|centre|center|clinic|medical)\b", val_lower):
                corrected_tag = "DEPARTMENT"
                reason = "醫療機構關鍵詞，強制轉為 DEPARTMENT"

            elif "street" in val_lower or "road" in val_lower or "avenue" in val_lower:
                corrected_tag = "STREET"
                reason = "含街道關鍵詞"
            elif "city" in val_lower or "town" in val_lower or "village" in val_lower:
                corrected_tag = "CITY"
                reason = "含城市關鍵詞"
            elif "territory" in val_lower or "state" in val_lower or (val.isupper() and len(val) <= 5):
                corrected_tag = "STATE"
                reason = "含地區/州關鍵詞或疑似縮寫"
            elif self.zip_fullmatch_pattern.fullmatch(val.strip()):
                corrected_tag = "ZIP"
                reason = "4碼郵遞區號"

            # Every branch that changes the tag also sets a reason
            if corrected_tag != tag:
                log.append(f"{fid} - 🔁 {val}: {tag} → {corrected_tag}（{reason}）")

        return corrected_tag

# Main execution: extract address entities from the Task 1 sentences, put
# them in front of the existing Task 2 rows, and rewrite the file in place.
extractor = OptimizedAddressExtractor()

print("讀取檔案中...")

# === Load inputs ===
try:
    with open(submission_task1_answer, "r", encoding="utf-8") as f:
        task1_lines = f.readlines()

    with open(submission_task2_answer_clean_invalid, "r", encoding="utf-8") as f:
        original_invalid_lines = [line.strip() for line in f.readlines()]
except FileNotFoundError as e:
    print(f"❌ 檔案讀取錯誤: {e}")
    # Nothing below can run without both inputs; re-raise instead of
    # continuing into a NameError on the undefined line lists.
    raise

print(f"待處理 Task1 資料: {len(task1_lines)} 筆")
print(f"待處理無效標籤: {len(original_invalid_lines)} 筆")

# === Keep a copy of the existing rows ===
print("存儲原本的 invalid 資料...")
original_invalid_data = list(original_invalid_lines)

# === Extract address entities from Task 1 ===
print("從 Task1 擷取地址實體中...")
fid_entities = defaultdict(list)
log = []

processed_count = 0
for line in task1_lines:
    parts = line.strip().split('\t')
    if len(parts) != 2:
        continue

    fid, sentence = parts
    entities = extractor.extract_entities_from_sentence(fid, sentence, log)

    if entities:
        fid_entities[fid].extend(entities)

    processed_count += 1

    # Progress report every 5000 processed rows
    if processed_count % 5000 == 0:
        progress = (processed_count / len(task1_lines)) * 100
        print(f"   進度: {processed_count}/{len(task1_lines)} ({progress:.1f}%)")

# === Assemble the result: newly extracted rows go first ===
print("組合最終結果（新抓到的放前面）...")

# 1. Newly extracted entities
print("加入新擷取的實體...")
final_lines = [
    f"{fid}\t{tag}\t{val}"
    for fid, entities in fid_entities.items()
    for tag, val in entities
]

# 2. Existing rows, with LOCATION-OTHER / DISTRICT tags re-mapped
print("修正並加入原本的無效標籤...")
for line_data in original_invalid_data:
    parts = line_data.strip().split('\t')
    if len(parts) != 3:
        # Rows that are not fid/tag/value are passed through untouched
        final_lines.append(line_data)
        continue

    fid, tag, val = parts
    corrected_tag = extractor.correct_invalid_tag(tag, val, fid, log)
    final_lines.append(f"{fid}\t{corrected_tag}\t{val}")

# === Write the file ===
print("寫入結果檔案...")
with open(submission_task2_answer_clean_invalid, "w", encoding="utf-8") as f:
    for line in final_lines:
        f.write(line + "\n")

# === Statistics and report ===
new_entities_count = sum(len(v) for v in fid_entities.values())

# Per-tag counts of the newly extracted entities
category_stats = defaultdict(int)
detailed_category_stats = defaultdict(int)  # not used in this cell

for entities in fid_entities.values():
    for tag, val in entities:
        category_stats[tag] += 1

print("\n" + "="*60)
print("處理完成報告")
print("="*60)
print(f"最終總筆數: {len(final_lines)}")
print(f"Task1 新增筆數: {new_entities_count}")

print(f"\n新增實體類別統計:")
for tag, count in sorted(category_stats.items()):
    print(f"   {tag}: {count} 筆")

# === Processing log ===
if log:
    print(f"\n處理日誌 (共 {len(log)} 筆):")
    print("-" * 60)
    for entry in log:
        print(f"   {entry}")

print(f"\n✅ 處理完成！結果已寫入: {submission_task2_answer_clean_invalid}")

In [None]:
import re
import json

# ======== Regular-expression definitions ========

# ---- MEDICAL_RECORD_NUMBER pattern ----
MEDICAL_RECORD_NUMBER_patterns = [
    r'\b\d{6,8}\.?[A-Za-z]{2,5}\.?\b',  # original shape, e.g. 7533626.bkf
]

# ---- ID_NUMBER pattern ----
ID_NUMBER_patterns = [
    r'\b\d{2}[A-Z][A-Za-z0-9]{4,10}\.?\b',  # original shape

    # Keyword-introduced IDs. The lookahead requires a digit inside the ID
    # token itself; `(?=.*\d)` would accept a plain word whenever any digit
    # appears later in the sentence.
    r'\b(?:Episode\sNumber|Lab\sNo\.?|Lab\sNumbers?|Laboratory\sNumbers?|ID\snumber|medical\sepisode\sidentified\sas|identification\snumbers?|episode|lab\snumbers?\sare)[:\s]+(?P<id>(?=[A-Za-z0-9]*\d)[A-Za-z0-9]{4,15})\b'
]


# -------- DURATION definitions --------
number_words = [
    "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
    "eleven", "twelve", "thirteen", "fourteen", "fifteen",
    "sixteen", "seventeen", "eighteen", "nineteen", "twenty"
]
duration_units = [
    "secs", "seconds", "mins", "minutes", "hours", "days", "weeks", "months", "years"
]
unit_pattern = "|".join(duration_units)
number_pattern = rf"(?:\d+(?:\.\d+)?|{'|'.join(number_words)})"
# The negative lookahead keeps "5 years old" from being read as a duration.
DURATION_pattern = rf"\b{number_pattern}\s?(?:{unit_pattern})(?!\s?old)\b"

# -------- AGE definitions --------
base_number_words = [
    "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
    "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
    "seventeen", "eighteen", "nineteen", "twenty"
]
tens_words = ["twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
composite_number_words_dash = [f"{tens}-{unit}" for tens in tens_words for unit in base_number_words[:9]]
composite_number_words_space = [f"{tens} {unit}" for tens in tens_words for unit in base_number_words[:9]]
# Bare tens ("thirty") are included so that "thirty years old" is recognised;
# dict.fromkeys drops the duplicate "twenty" while keeping the order.
all_number_words = list(dict.fromkeys(
    base_number_words + tens_words + composite_number_words_dash + composite_number_words_space
))
# Regex alternation takes the first alternative that works, not the longest,
# so longer words must come first ("sixteen" before "six", "twenty-one"
# before "twenty").
number_word_pattern = (
    r"(?:"
    + "|".join(re.escape(word) for word in sorted(all_number_words, key=len, reverse=True))
    + r")"
)

AGE_patterns = [
    r"\b(?P<age1>\d{1,3})(?:\s|-)?(?:year|yr)s?(?:\s|-)?old\b",
    rf"\b(?P<age2>{number_word_pattern})(?:\s|-)?(?:year|yr)s?(?:\s|-)?old\b",
    r"\b(?P<age3>\d{2}s)\b",
    # The \b after the captured number stops partial captures such as "123"
    # out of "12345" or "six" out of "sixth".
    rf"\b(?:he|she|they|i)\s+(?:was|is|were|am)\s+(?P<age4>(\d{{1,3}}|{number_word_pattern}))\b(?!\s+(of|among|amongst|between|the)\b)",
    rf"\b(?:he's|she's|they're|i'm)\s+(?P<age5>(\d{{1,3}}|{number_word_pattern}))\b(?!\s+(of|among|amongst|between|the)\b)"
]

# ======== TIME / ZIP / DOCTOR pattern ========
TIME_patterns = [
    r'\b(?:[01]?\d|2[0-3]):[0-5]\d\b',  # 24hr format
    r'\b(?:[01]?\d|2[0-3])(?:[:.][0-5]\d)?\s?(?:a\.?\s?m\.?|p\.?\s?m\.?)\b',  # am/pm
    r"\b(?:[1-9]|1[0-2])\s*o'?clock\b"  # e.g. 9 o'clock
]


ZIP_patterns = [
    r'\b(?P<zip>[A-Za-z]\d[A-Za-z][ -]?\d[A-Za-z]\d)\b',  # letter-digit alternating shape
    r'\b(?:ZIP\s?code(?:\s+of)?|Postal\s?code(?:\s+of)?):?\s?(?P<zip>\d{3,10})\b',  # keyword + digits
]

# Title followed by a first name or initial, an optional middle initial, and
# a (possibly hyphenated) surname.
DOCTOR_pattern = r'\b(?:Dr\.?|Doctor|Prof\.?|Professor|Associate\s+Professor)\s+(?:[A-Z]\.|\b[A-Z][a-z]+)(?:\s+[A-Z]\.)?(?:\s+[A-Z][a-z]+(?:-[A-Z][a-z]+)?)\b'

# === Read the existing clean_invalid rows before the file is overwritten ===
with open(submission_task2_answer_clean_invalid, encoding="utf-8") as f:
    existing_lines = [line.strip() for line in f if line.strip()]

# ======== Counters and collected records ========
added_counter = {
    "MEDICAL_RECORD_NUMBER": 0,
    "ID_NUMBER": 0,
    "DURATION": 0,
    "AGE": 0,
    "TIME": 0,
    "ZIP": 0,
    "DOCTOR": 0
}
added_records = []

# entity -> upper-cased category, keyed by the entity as given and lower-cased
entity_category_mapping = {}
with open(Validation_Dataset_Formal_entity, encoding="utf-8") as f:
    entities_data = json.load(f)
    for item in entities_data:
        for category, entities in item.items():
            for entity in entities:
                entity_category_mapping[entity] = category.upper()
                entity_category_mapping[entity.lower()] = category.upper()


def _named_or_whole(match, group_name):
    """Return the named group when the pattern defines it, else the whole match."""
    if group_name in match.groupdict():
        return match.group(group_name)
    return match.group()


def _record_extraction(fout, fid, label, entity):
    """Write one extracted row and update the per-label statistics."""
    fout.write(f"{fid}\t{label}\t{entity}\n")
    added_counter[label] += 1
    added_records.append((fid, label, entity))


# ======== Extract regex-based annotations from the Task 1 sentences ========
with open(submission_task1_answer, encoding="utf-8") as fin, \
     open(submission_task2_answer_clean_invalid, "w", encoding="utf-8") as fout:  # truncates the file

    for line in fin:
        line = line.strip()
        if not line or "\t" not in line:
            continue

        fid, sentence = line.split("\t", 1)

        # ---- MEDICAL_RECORD_NUMBER ----
        for pattern in MEDICAL_RECORD_NUMBER_patterns:
            for match in re.finditer(pattern, sentence, re.IGNORECASE):
                _record_extraction(fout, fid, "MEDICAL_RECORD_NUMBER", _named_or_whole(match, "mrn"))

        # ---- ID_NUMBER ----
        for pattern in ID_NUMBER_patterns:
            for match in re.finditer(pattern, sentence, re.IGNORECASE):
                _record_extraction(fout, fid, "ID_NUMBER", _named_or_whole(match, "id"))

        # ---- DURATION ("one day" is never reported) ----
        for duration in re.findall(DURATION_pattern, sentence, re.IGNORECASE):
            if duration.lower() != "one day":
                _record_extraction(fout, fid, "DURATION", duration)

        # ---- AGE ----
        for pattern in AGE_patterns:
            for match in re.finditer(pattern, sentence, flags=re.IGNORECASE):
                groups = match.groupdict()
                # Report the first age group that captured something
                for key in ("age1", "age2", "age3", "age4", "age5"):
                    if groups.get(key):
                        _record_extraction(fout, fid, "AGE", groups[key])
                        break

        # ---- TIME ----
        for pattern in TIME_patterns:
            for match in re.finditer(pattern, sentence, re.IGNORECASE):
                _record_extraction(fout, fid, "TIME", match.group())

        # ---- ZIP ----
        for pattern in ZIP_patterns:
            for match in re.finditer(pattern, sentence, re.IGNORECASE):
                _record_extraction(fout, fid, "ZIP", _named_or_whole(match, "zip"))

        # ---- DOCTOR (case-sensitive on purpose: names are capitalised) ----
        for doctor in re.findall(DOCTOR_pattern, sentence):
            _record_extraction(fout, fid, "DOCTOR", doctor)

added_total = len(added_records)

# ======== Filter the previously existing rows and append them back ========
removal_logs = []
with open(submission_task2_answer_clean_invalid, "a", encoding="utf-8") as fout:
    for line in existing_lines:
        if "\t" not in line:
            removal_logs.append(("MissingTab", line))
            continue

        parts = line.split("\t")
        if len(parts) < 3:
            removal_logs.append(("TooFewParts", line))
            continue

        fid = parts[0]
        label = parts[1].strip().upper()
        # Extra tabs are treated as part of the entity text
        entity = "\t".join(parts[2:]).strip()

        entity_lower = entity.lower()
        label_lower = label.lower()

        # Drop rows whose entity is just the label itself ...
        if entity_lower == label_lower:
            removal_logs.append((fid, label, entity, "entity == label"))
            continue

        # ... or an article/determiner followed by the label
        tokens = entity_lower.split()
        if len(tokens) == 2 and tokens[0] in {"the", "a", "an", "this", "that", "those"} and tokens[1] == label_lower:
            removal_logs.append((fid, label, entity, "entity = 冠詞+label"))
            continue

        fout.write(f"{fid}\t{label}\t{entity}\n")

# === Report the filtered rows ===
if removal_logs:
    print("\n被過濾項目：")
    for removed in removal_logs:
        if len(removed) == 2:
            # Format problems are stored as (kind, raw line); indexing [2]
            # and [3] on them would raise IndexError.
            print(f"[{removed[0]}] {removed[1]}")
        else:
            print(f"[{removed[0]}] {removed[1]} | {removed[2]}  →  {removed[3]}")

# ======== Statistics ========
print(f"\n✅ 完成 MRN / ID / DURATION / AGE 抽取（已允許重複），結果已寫入：{submission_task2_answer_clean_invalid}")
print(f"新增總筆數：{added_total} 筆")
print(f"各類別新增統計：")
for label, count in added_counter.items():
    print(f"    {label}: {count} 筆")

print("\n新增的資料列表：")
for fid, label, entity in added_records:
    print(f"{fid}\t{label}\t{entity}")

## 防範沒有出現過的東西出現在比賽中，先寫起來

In [None]:
import re

# ======== Regexes for contact details and box-style LOCATION-OTHER ========
contact_patterns = {
    "EMAIL": r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b",
    "URL": r"\b(?:https?://|www\.)[a-zA-Z0-9\-._~:/?#\[\]@!$&'()*+,;=%]+\b",
    "IPADDRESS": r"\b(?:\d{1,3}\.){3}\d{1,3}\b",
    "LOCATION-OTHER": r"\b(?:P\.?\s?O\.?\s?BOX|GPO\s?BOX|Locked\s?Bag)[-\s]?\d+\b",
}

added_contact_records = []
added_contact_counter = dict.fromkeys(contact_patterns, 0)
added_contact_counter["ID_NUMBER"] = 0  # counts PHONE rows relabelled below
added_contact_total = 0

# ======== Step 1: keep the current file content in memory ========
with open(submission_task2_answer_clean_invalid, encoding="utf-8") as f:
    original_invalid_lines = f.readlines()

# ======== Step 2: truncate the file and write the regex hits first ========
with open(submission_task1_answer, encoding="utf-8") as fin, \
     open(submission_task2_answer_clean_invalid, "w", encoding="utf-8") as fout:

    for raw_row in fin:
        row = raw_row.strip()
        # Empty rows contain no tab either, so one test covers both cases
        if "\t" not in row:
            continue

        fid, sentence = row.split("\t", 1)

        for label, pattern in contact_patterns.items():
            hits = re.findall(pattern, sentence, flags=re.IGNORECASE)
            for hit in hits:
                fout.write(f"{fid}\t{label}\t{hit}\n")
                added_contact_records.append((fid, label, hit))
                added_contact_counter[label] += 1
                added_contact_total += 1

# ======== Step 3: append the previous rows (relabel PHONE, drop LOCATION-OTHER) ========
with open(submission_task2_answer_clean_invalid, "a", encoding="utf-8") as fout:
    for raw_row in original_invalid_lines:
        row = raw_row.strip()
        if not row:
            continue

        fields = row.split("\t")
        if len(fields) != 3:
            # Rows of any other shape are copied through unchanged
            fout.write(row + "\n")
            continue

        fid, label, value = fields

        # Previously existing LOCATION-OTHER rows are discarded
        if label == "LOCATION-OTHER":
            continue

        # A PHONE value that is not exactly NNNN-NNNN is relabelled ID_NUMBER
        if label == "PHONE" and re.fullmatch(r"\d{4}-\d{4}", value) is None:
            print(f"📌 PHONE 改為 ID_NUMBER：{fid}\t{label}\t{value}")
            label = "ID_NUMBER"
            added_contact_counter["ID_NUMBER"] += 1

        fout.write(f"{fid}\t{label}\t{value}\n")

# ======== Report ========
print(f"\n📬 CONTACT / LOCATION-OTHER 修正抽取完成，新增總筆數：{added_contact_total}")
print("各類別統計：")
for label, count in added_contact_counter.items():
    print(f"    {label}: {count} 筆")

print("\n📌 新增資料：")
for rec_fid, rec_label, rec_value in added_contact_records:
    print(f"{rec_fid}\t{rec_label}\t{rec_value}")


## 清理，並轉換人名相關類別

In [None]:
import re
import json
from collections import defaultdict

# === Load the entity vocabulary JSON ===
with open(Validation_Dataset_Formal_entity, encoding="utf-8") as vocab_file:
    entities_data = json.load(vocab_file)

# Build the entity -> category lookup. Every entity is registered twice: once
# as written (whitespace-trimmed) and once lower-cased. Later entries overwrite
# earlier ones that share a key.
entity_category_mapping = {}
for group in entities_data:
    for raw_category, members in group.items():
        upper_category = raw_category.upper()
        for member in members:
            trimmed = member.strip()
            entity_category_mapping.update(
                {trimmed: upper_category, trimmed.lower(): upper_category}
            )

# === Cleaning and conversion logic ===
# Reads the "clean_invalid" predictions, normalises entity text and categories,
# and writes one "<fid>\t<category>\t<entity>" row per kept prediction.
correction_logs = []
conversion_count = defaultdict(int)  # conversions counted per (from, to) category pair

# Categories that hold person names and may be relabelled.
_PERSON_NAME_CATEGORIES = {"PATIENT", "FAMILYNAME", "PERSONALNAME"}

# Title detection. The title must end on a word boundary so that names which
# merely start with the same letters ("Drew", "Drake") are left alone, and the
# longer alternatives come first so "Professor X" is not cut to "essor X".
_DR_PREFIX_RE = re.compile(r"^dr\b\.?\s*", re.IGNORECASE)
_DOCTOR_TITLE_RE = re.compile(
    r"^(?:associate\s+professor|professor|prof|dr)\b\.?\s*",
    re.IGNORECASE,
)

with open(submission_task2_answer_clean_invalid, encoding="utf-8") as fin, \
     open(submission_task2_answer_corrected, "w", encoding="utf-8") as fout:

    for line in fin:
        line = line.strip()
        if not line or "\t" not in line:
            continue

        parts = line.split("\t")
        if len(parts) != 3:
            continue

        fid, category, entity = parts
        corrected_category = category.strip().upper()
        entity = entity.strip()

        # === Remove <think> / </think> tags ===
        original_entity = entity
        entity = re.sub(r"</?think>", "", entity, flags=re.IGNORECASE).strip()
        if entity != original_entity:
            correction_logs.append((fid, original_entity, corrected_category, corrected_category, f"移除 <think> 標籤，轉為 {entity}"))

        # === Remove stray '>' characters ===
        if ">" in entity:
            new_entity = entity.replace(">", "").strip()
            if new_entity != entity:
                correction_logs.append((fid, entity, corrected_category, corrected_category, f"移除單獨的 '>' 符號，轉為 {new_entity}"))
                entity = new_entity

        # === A person-name category whose entity starts with the title "Dr" is forced to DOCTOR ===
        if corrected_category in _PERSON_NAME_CATEGORIES and _DR_PREFIX_RE.match(entity):
            original_category = corrected_category
            corrected_category = "DOCTOR"
            correction_logs.append((fid, entity, original_category, corrected_category, "開頭為 Dr.，原為人名類別 → 強制轉為 DOCTOR"))

        # === DOCTOR: strip a leading doctor/professor title (case-insensitive) ===
        if corrected_category == "DOCTOR":
            original_entity = entity
            entity = _DOCTOR_TITLE_RE.sub("", entity).strip()

            if entity == "":
                # Nothing but the title was predicted: drop the row.
                correction_logs.append((fid, original_entity, corrected_category, "REMOVED", "DOCTOR 類別僅為稱謂，已移除"))
                continue
            elif entity != original_entity:
                correction_logs.append((fid, original_entity, corrected_category, corrected_category, f"DOCTOR 去除前綴，轉為 {entity}"))

        # === ZIP: keep only the first standalone 4-digit number ===
        if corrected_category == "ZIP":
            original_entity = entity
            match = re.search(r"\b\d{4}\b", entity)
            if match:
                entity = match.group(0)
                if entity != original_entity:
                    correction_logs.append((fid, original_entity, corrected_category, corrected_category, f"ZIP 清理為 {entity}"))
            else:
                correction_logs.append((fid, original_entity, corrected_category, "REMOVED", "ZIP 類別無 4 位數郵遞區號，已移除"))
                continue

        # === MRN / ID: relabel when the vocabulary lists the entity as a person name ===
        if corrected_category in {"MEDICAL_RECORD_NUMBER", "ID_NUMBER"}:
            mapped_category = entity_category_mapping.get(entity.lower(), "")
            if mapped_category in _PERSON_NAME_CATEGORIES:
                correction_logs.append((fid, entity, corrected_category, mapped_category, f"{corrected_category} 出現在 {mapped_category} 類別，轉為 {mapped_category}"))
                conversion_count[(corrected_category, mapped_category)] += 1
                corrected_category = mapped_category

        # === PERSONALNAME: relabel when the vocabulary lists it as PATIENT or FAMILYNAME ===
        if corrected_category == "PERSONALNAME":
            mapped_category = entity_category_mapping.get(entity.lower(), "")
            if mapped_category in {"PATIENT", "FAMILYNAME"}:
                correction_logs.append((fid, entity, "PERSONALNAME", mapped_category, f"PERSONALNAME 出現在 {mapped_category} 類別，轉為 {mapped_category}"))
                conversion_count[("PERSONALNAME", mapped_category)] += 1
                corrected_category = mapped_category

        # === Write the result ===
        fout.write(f"{fid}\t{corrected_category}\t{entity}\n")

# === Show the correction log ===
if correction_logs:
    print("\n✅ 清理修正紀錄：")
    for log in correction_logs:
        print(f"[{log[0]}] {log[1]} → {log[3]}：{log[4]}")

# === Show the conversion statistics ===
if conversion_count:
    print("\n類別轉換統計：")
    for (from_cat, to_cat), count in sorted(conversion_count.items()):
        print(f"{from_cat} → {to_cat}: {count} 筆")


## 規則一

In [None]:
import re
from collections import Counter

# Time-unit vocabularies (all lowercase).
ALL_UNITS = [
    "minute", "hour", "day", "week", "weekend", "month", "year",
    "night", "morning", "afternoon", "evening",
]
WEEKDAYS = [
    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
]
PLURAL_UNITS = ["minute", "hour", "day", "week", "month", "year"]


def _fill(template, units):
    """Return *template* formatted once per unit, preserving unit order."""
    return [template.format(unit=unit) for unit in units]


# SET: recurring expressions ("every monday", "twice a week", "3 times a day", ...).
_set_phrases = _fill("every {unit}", ALL_UNITS + WEEKDAYS)
for _prefix in ("per", "each", "once a", "twice a"):
    _set_phrases += _fill(_prefix + " {unit}", PLURAL_UNITS)
for _count in (3, 4, 5):
    _set_phrases += _fill(str(_count) + " times a {unit}", PLURAL_UNITS)
_set_phrases += ["couple of times", "daily", "weekly", "monthly", "yearly"]

# DURATION: vague spans ("several weeks", "past couple of days", "all night", ...).
_duration_phrases = []
for _template in (
    "several {unit}s",
    "couple {unit}s",
    "few {unit}s",
    "past {unit}",
    "past several {unit}s",
    "past couple of {unit}s",
    "past couple {unit}s",
):
    _duration_phrases += _fill(_template, PLURAL_UNITS)
for _template in ("all {unit}", "long {unit}", "whole {unit}"):
    _duration_phrases += _fill(_template, ALL_UNITS)
_duration_phrases += ["a short time", "a long time", "whole time", "short time", "long time"]

# DATE: calendar-relative expressions.
_date_phrases = ["today", "yesterday", "tomorrow"]
for _period in ("weekend", "week", "month", "year"):
    _date_phrases += [f"{_relative} {_period}" for _relative in ("this", "next", "last")]
_date_phrases.append("right now")

# Category -> phrase list (all lowercase). Key order and list order are kept
# because consumers iterate them in order.
CATEGORY_PHRASES = {
    "SET": _set_phrases,
    "DURATION": _duration_phrases,
    "DATE": _date_phrases,
    "TIME": [
        "this morning", "this afternoon", "this evening",
        "tonight", "last night",
    ],
}

# === Load the task-1 transcripts (one "<fid>\t<sentence>" row per line) ===
# The row is stripped BEFORE it is split: a row such as "fid\t\n" loses its
# trailing tab when stripped, so checking for the tab on the raw line and then
# splitting the stripped one would yield a single field and break the two-way
# unpacking below. Rows without both fields are skipped.
task1_lines = []
with open(submission_task1_answer, encoding="utf-8") as f_task1:
    for line in f_task1:
        fid, sep, sentence = line.strip().partition("\t")
        if not sep:
            continue
        task1_lines.append([fid, sentence])

# Compile every phrase pattern once instead of once per sentence.
# Each item is (category, text to emit, compiled whole-phrase pattern);
# "right now" under DATE is emitted as "now".
_rule1_patterns = [
    (
        cat,
        "now" if cat == "DATE" and phrase == "right now" else phrase,
        re.compile(rf"\b{re.escape(phrase)}\b"),
    )
    for cat, phrase_list in CATEGORY_PHRASES.items()
    for phrase in phrase_list
]

# === Write submission_task2_answer_rule1 and record every capture ===
rule1_logs = []

with open(submission_task2_answer_rule1, "w", encoding="utf-8") as fout:

    for fid, sentence in task1_lines:
        lowered = sentence.lower()

        for cat, actual_phrase, compiled in _rule1_patterns:
            # One output row per occurrence of the phrase in the sentence.
            for _ in compiled.finditer(lowered):
                fout.write(f"{fid}\t{cat}\t{actual_phrase}\n")
                rule1_logs.append((fid, cat, actual_phrase))

    # === Then append the rows produced by the correction step ===
    with open(submission_task2_answer_corrected, encoding="utf-8") as fin:
        for line in fin:
            fout.write(line)

# === Show statistics ===
stats = Counter(cat for _, cat, _ in rule1_logs)

for cat in ["SET", "DURATION", "DATE", "TIME"]:
    print(f"{cat} 類別共抓取：{stats[cat]} 筆")

print("\n詳細抓取清單：")
for fid, cat, phrase in rule1_logs:
    print(f"[{fid}] 抓取 {phrase} 為 {cat}")


## 轉換+清理 規則二

## 區域及組織相關互轉

In [None]:
# ======== Entity lookup table; it is empty here and keys are added when the vocabulary is loaded ========
entity_category_mapping = {}

# Categories that are force-captured from the transcripts using the JSON
# vocabulary (high mislabel risk, and the vocabulary is controllable).
force_whitelist_categories = {
    "PROFESSION",
    "LOCATION-OTHER",
    "DISTRICT",
    "COUNTY",
    "COUNTRY",
}

# Group 1: mutually convertible categories, compared in lower case.
group1_mutual_convert = {
    "PROFESSION",
    "DEPARTMENT",
    "ORGANIZATION",
    "HOSPITAL",
    "LOCATION-OTHER",
    "DISTRICT",
}

# Group 2: categories that require an exact, case-sensitive match when converting.
# It is the five geographic categories plus every Group 1 category.
group2_strict_match = {"COUNTRY", "COUNTY", "STATE", "CITY", "STREET"} | group1_mutual_convert

# ================================================================
# group3_category_rules:
# Relabelling rules between the time-related categories
# (TIME / DATE / DURATION / SET).
# Each key is a TARGET category: when a rule's condition holds, the entity's
# current category is replaced by that key.
# Every rule holds:
#   - trigger_categories: source categories this rule may convert
#   - keywords: substrings that trigger the rule
#   - blocklist: substrings that forbid the conversion (e.g. "sundays")
#   - condition: callable(entity) -> bool implementing the check
#   - note: description recorded in the correction log
# ================================================================
def _is_recurring_time(entity):
    """True when the lower-cased entity contains any SET keyword."""
    lowered = entity.lower()
    for keyword in group3_category_rules["SET"]["keywords"]:
        if keyword in lowered:
            return True
    return False


def _is_plural_duration(entity):
    """True when a DURATION keyword is present and no blocklisted term is."""
    lowered = entity.lower()
    duration_rule = group3_category_rules["DURATION"]
    for blocked in duration_rule["blocklist"]:
        if blocked in lowered:
            return False
    for keyword in duration_rule["keywords"]:
        if keyword in lowered:
            return True
    return False


group3_category_rules = {
    "SET": {
        "trigger_categories": {"TIME", "DATE", "DURATION"},
        "keywords": {
            "every", "once", "twice", "thrice",
            "each", "daily", "weekly", "times",
        },
        "condition": _is_recurring_time,
        "note": "Group3互轉：出現重複性時間詞 → 強制轉為 SET",
    },
    "DURATION": {
        "trigger_categories": {"TIME", "DATE"},
        "keywords": {
            "seconds", "minutes", "hours", "days",
            "weeks", "months", "years",
        },
        "blocklist": {
            "sunday", "monday", "tuesday", "wednesday",
            "thursday", "friday", "saturday",
            "weekdays", "weekends", "holidays",
            "days off", "sundays", "mondays",
        },
        "condition": _is_plural_duration,
        "note": "Group3互轉：包含複數時間單位，且不在日曆詞黑名單中 → 強制轉為 DURATION",
    },
}

# ================================================================
# group4_category_rules:
# Keyword-based relabelling rules for organisation-like categories
# (HOSPITAL, DEPARTMENT, ...).
# Same layout as group3_category_rules: each key is a TARGET category that
# replaces the current category when the rule's condition holds.
#
# Every rule holds:
#   - trigger_categories: source categories this rule may convert
#   - keywords: substrings that make an entity match
#   - condition: callable(entity) -> bool checking for a keyword hit
#   - note: description recorded in correction_logs for analysis and debugging
# Unlike group3, each rule lists its own target among the trigger categories,
# because group4 is also used for the removal decision.
def _group4_keyword_condition(target_category):
    """Build the condition for *target_category*: any keyword in the lower-cased entity."""
    def _has_keyword(entity):
        lowered = entity.lower()
        keywords = group4_category_rules[target_category]["keywords"]
        return any(keyword in lowered for keyword in keywords)
    return _has_keyword


group4_category_rules = {
    "DEPARTMENT": {
        "trigger_categories": {
            "HOSPITAL", "DEPARTMENT", "ORGANIZATION", "LOCATION-OTHER", "ROOM",
        },
        "keywords": {
            "department", "unit", "ward", "division", "section",
            "clinic", "rooms", "central", "pathology",
        },
        "condition": _group4_keyword_condition("DEPARTMENT"),
        "note": "Group4互轉：包含部門單位關鍵詞 → 強制轉為 DEPARTMENT",
    },
    "HOSPITAL": {
        "trigger_categories": {
            "HOSPITAL", "DEPARTMENT", "ORGANIZATION",
            "LOCATION-OTHER", "PROFESSION", "ROOM",
        },
        "keywords": {
            "hospital", "medical center", "health", "healthy",
            "health center", "service", "centre", "center",
        },
        "condition": _group4_keyword_condition("HOSPITAL"),
        "note": "Group4互轉：包含醫療機構關鍵詞 → 強制轉為 HOSPITAL",
    },
}

# Vocabulary per category for the two matching groups, used later for lookups
# and forced capture:
#   group1_entities: category -> set of lower-cased entities
#   group2_entities: category -> set of entities exactly as written in the JSON
group1_entities = {}
group2_entities = {}

with open(Validation_Dataset_Formal_entity, encoding="utf-8") as f:
    entities_data = json.load(f)

for group in entities_data:
    for raw_category, members in group.items():
        upper_category = raw_category.upper()
        belongs_to_group1 = upper_category in group1_mutual_convert
        belongs_to_group2 = upper_category in group2_strict_match

        for member in members:
            lowered_member = member.lower()

            # Register both the exact and the lower-cased spelling.
            entity_category_mapping[member] = upper_category
            entity_category_mapping[lowered_member] = upper_category

            if belongs_to_group1:
                group1_entities.setdefault(upper_category, set()).add(lowered_member)
            if belongs_to_group2:
                group2_entities.setdefault(upper_category, set()).add(member)

# ======== Calendar words used for the special TIME -> DATE conversion ========
_relative_periods = {
    f"{relative} {period}"
    for relative in ("this", "last", "next")
    for period in ("week", "month", "year", "weekend")
}
_weekday_names = {
    "monday", "tuesday", "wednesday", "thursday",
    "friday", "saturday", "sunday",
}
_month_names = {
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
}
date_like_keywords = (
    {"today", "tomorrow", "yesterday", "now"}
    | _relative_periods
    | {"weekend", "week", "sundays"}
    | _weekday_names
    | {"easter", "same day", "same date"}
    | _month_names
)

# ======== Time-of-day words used for the special DATE -> TIME conversion ========
time_like_keywords = {
    "midmorning", "mid-morning", "mid-afternoon",
    "night", "nights", "early night", "midnight", "tonight",
    "noon", "midday", "last light",
    "this morning", "yesterday morning", "tomorrow night",
}
for _part_of_day in ("morning", "afternoon", "evening"):
    time_like_keywords |= {
        _part_of_day,
        _part_of_day + "s",
        "late " + _part_of_day,
        "early " + _part_of_day,
    }

# ======== Correction pass: fix categories; no de-duplication, every prediction is kept ========
# Reads the rule-1 output and writes "<fid>\t<category>\t<entity>" rows to the
# rule-2 file. Every category change is recorded in correction_logs as
# (fid, entity, old category, new category, reason).
correction_logs = []

# Common mislabels mapped to the category they should have been.
# Built once here rather than on every input row.
category_correction_map = {
    "LAB_NUMBER": "ID_NUMBER",
    "ADDRESS": "STREET",
    "DAY": "DATE",
    "DATETIME": "DATE",
}

with open(submission_task2_answer_rule1, encoding="utf-8") as fin, \
     open(submission_task2_answer_rule2, "w", encoding="utf-8") as fout:

    for line in fin:
        line = line.strip()
        if not line or "\t" not in line:
            continue

        parts = line.split("\t")
        if len(parts) != 3:
            continue

        fid, category, entity = parts
        category = category.strip().upper()
        entity = entity.strip()
        corrected_category = category

        # ===== Relabel common mislabels =====
        if corrected_category in category_correction_map:
            new_cat = category_correction_map[corrected_category]
            correction_logs.append((fid, entity, corrected_category, new_cat, f"誤標為 {corrected_category}，轉為 {new_cat}"))
            corrected_category = new_cat

        # ===== FAMILYNAME / PERSONALNAME asymmetric fix (only vocabulary entries may be FAMILYNAME) =====
        mapped = entity_category_mapping.get(entity) or entity_category_mapping.get(entity.lower())

        # Predicted FAMILYNAME but the vocabulary does not list it as one -> PERSONALNAME.
        if category == "FAMILYNAME" and mapped != "FAMILYNAME":
            correction_logs.append((fid, entity, category, "PERSONALNAME", "FAMILYNAME → PERSONALNAME"))
            corrected_category = "PERSONALNAME"

        # Predicted PERSONALNAME but the vocabulary lists it as FAMILYNAME -> FAMILYNAME.
        elif category == "PERSONALNAME" and mapped == "FAMILYNAME":
            correction_logs.append((fid, entity, category, "FAMILYNAME", "PERSONALNAME → FAMILYNAME"))
            corrected_category = "FAMILYNAME"

        # ===== Group3: keyword rules between the time-related categories =====
        for target_cat, rule in group3_category_rules.items():
            if corrected_category in rule["trigger_categories"]:
                if rule["condition"](entity):
                    # Find a keyword that triggered the rule, for the log message only.
                    matched_kw = None
                    if "keywords" in rule:
                        for kw in rule["keywords"]:
                            if kw in entity.lower():
                                matched_kw = kw
                                break
                    reason = f"{rule['note']}（關鍵詞: '{matched_kw}'）" if matched_kw else rule["note"]
                    correction_logs.append((fid, entity, corrected_category, target_cat, reason))
                    corrected_category = target_cat

        # ===== TIME whose text is a calendar word -> DATE =====
        if corrected_category == "TIME" and entity.lower() in date_like_keywords:
            correction_logs.append((fid, entity, corrected_category, "DATE", "包含日曆詞，TIME → DATE"))
            corrected_category = "DATE"

        # ===== DATE that contains a time-of-day word -> TIME =====
        if corrected_category == "DATE" and any(k in entity.lower() for k in time_like_keywords):
            correction_logs.append((fid, entity, corrected_category, "TIME", "包含時間詞，DATE → TIME"))
            corrected_category = "TIME"

        # ===== Split a 4-digit ZIP out of a CITY prediction =====
        if category == "CITY":
            zip_match = re.search(r"\b(\d{4})\b", entity)
            if zip_match:
                zip_code = zip_match.group(1)
                entity_city = entity.replace(zip_code, "").strip(" ,")
                correction_logs.append((fid, entity, category, "ZIP", f"CITY 含 4 位數字 ZIP，拆分 → CITY + ZIP（ZIP: {zip_code}）"))
                # Emit the ZIP as its own row; previously it was collected in a
                # list that was never written, so the ZIP was silently lost.
                fout.write(f"{fid}\tZIP\t{zip_code}\n")
                if not entity_city:
                    # The prediction was only a ZIP: there is no city text left,
                    # and a row with an empty entity would not parse as 3 columns
                    # once stripped by later steps.
                    correction_logs.append((fid, entity, category, "REMOVED", "CITY 拆出 ZIP 後無剩餘文字，已移除"))
                    continue
                # The original row keeps only the city text.
                entity = entity_city

        # ===== Group1: lower-case vocabulary conversion =====
        converted_by_group1 = False
        if corrected_category in group1_entities:
            mapped_category = entity_category_mapping.get(entity.lower())
            if (
                mapped_category
                and mapped_category in group1_entities
                and mapped_category != corrected_category
            ):
                correction_logs.append((fid, entity, corrected_category, mapped_category, "Group1互轉：依 JSON 詞表修正"))
                corrected_category = mapped_category
                converted_by_group1 = True

        # ===== Group2: exact-case vocabulary conversion =====
        converted_by_group2 = False
        if not converted_by_group1 and corrected_category in group2_entities:
            mapped_category = entity_category_mapping.get(entity)
            if (
                mapped_category
                and mapped_category in group2_entities
                and mapped_category != corrected_category
            ):
                correction_logs.append((fid, entity, corrected_category, mapped_category, "Group2互轉：依 JSON 詞表修正"))
                corrected_category = mapped_category
                converted_by_group2 = True

        # ===== Vocabulary membership for the final category (nothing is removed here) =====
        in_whitelist_by_group1 = (
            corrected_category in group1_entities and entity.lower() in group1_entities[corrected_category]
        )
        in_whitelist_by_group2 = (
            corrected_category in group2_entities and entity in group2_entities[corrected_category]
        )

        # ===== "X of Y" split =====
        # When the entity hits the keywords of at least two group4 targets, try
        # to split it on "of", then on "and", into two separately labelled parts.
        split_written = False
        if " of " in entity.lower():
            entity_lc = entity.lower()
            keyword_hits = {
                cat for cat, rule in group4_category_rules.items()
                if rule["condition"](entity_lc)
            }

            if len(keyword_hits) >= 2:
                for connector in ["of", "and"]:
                    pieces = re.split(rf'\s+{connector}\s+', entity, flags=re.IGNORECASE)
                    if len(pieces) != 2:
                        continue

                    left_part = pieces[0].strip()
                    right_part = pieces[1].strip()

                    # The last matching rule wins for each side.
                    left_cat, right_cat = None, None
                    for cat, rule in group4_category_rules.items():
                        if rule["condition"](left_part.lower()):
                            left_cat = cat
                        if rule["condition"](right_part.lower()):
                            right_cat = cat

                    if left_cat and right_cat:
                        fout.write(f"{fid}\t{left_cat}\t{left_part}\n")
                        fout.write(f"{fid}\t{right_cat}\t{right_part}\n")
                        correction_logs.append(
                            (fid, entity, category, f"{left_cat} + {right_cat}",
                             f"雙詞拆分處理{connector}: left={left_cat}, right={right_cat}")
                        )
                        split_written = True
                        break

        # A `continue` inside the connector loop only advances that inner loop;
        # the flag is what actually moves on to the next input row, so the
        # unsplit entity is not written in addition to its two parts.
        if split_written:
            continue

        # ===== Group4 keyword-based conversion =====
        converted_by_group4 = False

        # Special case: any entity containing "department" is forced to DEPARTMENT.
        if "department" in entity.lower():
            if corrected_category != "DEPARTMENT":
                correction_logs.append((fid, entity, corrected_category, "DEPARTMENT", "包含 'department' 字串 → 強制轉為 DEPARTMENT"))
            corrected_category = "DEPARTMENT"
            converted_by_group4 = True

        # Otherwise run the rules once in dict order; later hits overwrite
        # earlier ones, so the last matching rule wins.
        if not converted_by_group4:
            for target_cat, rule in group4_category_rules.items():
                if corrected_category in rule["trigger_categories"]:
                    if rule["condition"](entity):
                        old_cat = corrected_category
                        corrected_category = target_cat
                        reason = rule["note"]
                        correction_logs.append((fid, entity, old_cat, target_cat, reason))
                        converted_by_group4 = True

        # ===== Final screening: group1 categories that no rule or vocabulary confirmed =====
        if corrected_category in group1_mutual_convert:
            if not (
                converted_by_group1
                or converted_by_group2
                or converted_by_group4
                or in_whitelist_by_group1
                or in_whitelist_by_group2
            ):
                # Unconfirmed rows are kept (competition setting) and only logged.
                correction_logs.append((fid, entity, corrected_category, "比賽保留", "未命中任何條件"))

        # ===== Final write: keep every prediction, no de-duplication =====
        fout.write(f"{fid}\t{corrected_category}\t{entity}\n")

# ======== Show the correction results ========
print(f"✅ 分類校正完成，基本結果寫入：{submission_task2_answer_rule2}")
print(f"⚡ 總共修正錯誤筆數：{len(correction_logs)}")
if correction_logs:
    print("修正清單（fid, entity, 原本類別, 修正後類別, 原因）：")
    for log in correction_logs:
        print(log)


## 人名相關

In [None]:
import json
import re

# ========= PERSONALNAME -> PATIENT / DOCTOR context-based relabelling =========
medical_trigger_categories = {"DOCTOR", "HOSPITAL", "MEDICAL_RECORD_NUMBER", "ID_NUMBER"}
patient_logs = []

# Full-match patterns that identify an abbreviated name:
#   one or two dotted initials ("J.", "J. K."),
#   exactly two letters ("JK"),
#   one or two dotted initials followed by a word ("J. Smith", "J.K.Rowling").
_INITIAL_NAME_PATTERNS = (
    r"([A-Za-z]\.\s?){1,2}",
    r"[A-Za-z]{2}",
    r"([A-Za-z]\.\s?){1,2}[A-Za-z][a-z]+",
)


def is_short_initial_name(entity):
    """Return True when the trimmed *entity* fully matches an abbreviated-name pattern."""
    candidate = entity.strip()
    return any(
        re.fullmatch(pattern, candidate) is not None
        for pattern in _INITIAL_NAME_PATTERNS
    )

# === Use the PERSONALNAME entries of the vocabulary JSON as a whitelist ===
with open(Validation_Dataset_Formal_entity, encoding="utf-8") as vocab_file:
    raw_entities = json.load(vocab_file)

# Whitespace-trimmed names from every key that upper-cases to "PERSONALNAME".
json_personalnames = {
    name.strip()
    for group in raw_entities
    for label, names in group.items()
    if label.upper() == "PERSONALNAME"
    for name in names
}

# === Read the rule-2 output once, keeping only well-formed 3-column rows ===
# A row that does not split into exactly (fid, category, entity) would break the
# three-way unpacking, so such rows are skipped, as the earlier steps do.
rule2_rows = []
skipped_malformed = 0
with open(submission_task2_answer_rule2, encoding="utf-8") as fin:
    for line in fin:
        line = line.strip()
        if not line:
            continue
        parts = line.split("\t")
        if len(parts) != 3:
            skipped_malformed += 1
            continue
        fid, category, entity = parts
        rule2_rows.append((fid, category.strip().upper(), entity.strip()))

if skipped_malformed:
    print(f"⚠️ 略過 {skipped_malformed} 筆格式不符（非 3 欄）的資料")

# === Step 1: record, per fid, the first medical-context category seen ===
fid_triggered = {}
for fid, category, _ in rule2_rows:
    if category in medical_trigger_categories:
        fid_triggered.setdefault(fid, category)

# === Step 2: relabel PERSONALNAME rows ===
new_lines = []
for fid, original_category, entity in rule2_rows:

    # Every non-PERSONALNAME row is kept unchanged.
    if original_category != "PERSONALNAME":
        new_lines.append(f"{fid}\t{original_category}\t{entity}\n")
        continue

    # Whitelisted PERSONALNAME entries are kept unchanged.
    if entity in json_personalnames:
        new_lines.append(f"{fid}\t{original_category}\t{entity}\n")
        continue

    if is_short_initial_name(entity):
        # Abbreviated name -> DOCTOR.
        new_category = "DOCTOR"
        reason = "姓名縮寫，PERSONALNAME → DOCTOR"
    elif fid in fid_triggered:
        # The file contains a medical-context category -> PATIENT.
        new_category = "PATIENT"
        reason = f"{fid} 中有出現過 {fid_triggered[fid]}，PERSONALNAME → PATIENT"
    else:
        # Every remaining PERSONALNAME also becomes PATIENT.
        new_category = "PATIENT"
        reason = "PERSONALNAME → PATIENT"

    new_lines.append(f"{fid}\t{new_category}\t{entity}\n")
    patient_logs.append((fid, entity, "PERSONALNAME", new_category, reason))

# === Write the rows back to the same file ===
with open(submission_task2_answer_rule2, "w", encoding="utf-8") as fout:
    fout.writelines(new_lines)

# === Append every whitelisted PERSONALNAME found in the task-1 transcripts ===
# Sorted once so the appended rows come out in the same order on every run
# (set iteration order is not stable between runs); empty names are dropped
# because an empty string is a substring of every sentence.
_whitelist_names = sorted(name for name in json_personalnames if name)

with open(submission_task1_answer, encoding="utf-8") as fin, \
     open(submission_task2_answer_rule2, "a", encoding="utf-8") as fout:

    for line in fin:
        # Strip before splitting: "fid\t\n" loses its tab when stripped, so the
        # tab check must run on the stripped text to keep the unpacking safe.
        fid, sep, sentence = line.strip().partition("\t")
        if not sep:
            continue

        for name in _whitelist_names:
            # Plain substring test against the sentence (case-sensitive).
            if name in sentence:
                fout.write(f"{fid}\tPERSONALNAME\t{name}\n")
                print(f"[{fid}] 補寫 PERSONALNAME：{name}")


# === Show the log ===
print(f"✅ PERSONALNAME 語意修正完成（含縮寫 → DOCTOR），處理筆數：{len(patient_logs)}")
if patient_logs:
    print("修正清單（fid, entity, 原本類別, 修正後類別, 原因）：")
    for log in patient_logs:
        print(log)


## 直接抓取Group1+Group2 規則二

In [None]:
import json
import re

# ======== Force-capture whitelisted-category entities from the task-1 text: every
# occurrence is written, and the captured rows go in front of the existing rows ========

# Running total and record of the force-captured rows.
forced_added = 0
forced_added_records = []

# ======== Rebuild group2_entities from the raw JSON: category -> list of entities,
# limited to the force-whitelist categories ========
group2_entities = {}
with open(Validation_Dataset_Formal_entity, encoding="utf-8") as vocab_file:
    raw_entities = json.load(vocab_file)

for group in raw_entities:
    for raw_category, members in group.items():
        cat_upper = raw_category.upper()
        if cat_upper not in force_whitelist_categories:
            continue
        bucket = group2_entities.setdefault(cat_upper, [])
        bucket.extend(members)

# ======== Snapshot the current rule-2 rows before the file is truncated ========
with open(submission_task2_answer_rule2, "r", encoding="utf-8") as fin:
    original_lines = fin.readlines()

# ======== Rewrite the file: force-captured rows first, then the snapshot ========
with open(submission_task2_answer_rule2, "w", encoding="utf-8") as fout, \
     open(submission_task1_answer, encoding="utf-8") as fin_task1:

    for raw_row in fin_task1:
        row = raw_row.strip()
        # An empty row has no tab either, so one test covers both cases.
        if "\t" not in row:
            continue

        fid, sentence = row.split("\t", 1)
        sentence_lower = sentence.lower()
        # Punctuation replaced by spaces, for the looser Group2 comparison.
        sentence_cleaned = re.sub(r"[^\w\s]", " ", sentence).lower()

        # ===== Group1: lower-case match (word boundary, optional 's) =====
        for forced_cat, entity_set in group1_entities.items():
            if forced_cat in force_whitelist_categories:
                for entity in entity_set:
                    pattern = rf"\b{re.escape(entity)}(?=\b|'s|\s|$)"
                    hit_count = len(re.findall(pattern, sentence_lower))
                    if hit_count:
                        # One row and one record per occurrence.
                        fout.write(f"{fid}\t{forced_cat}\t{entity}\n" * hit_count)
                        forced_added_records.extend(
                            [(fid, forced_cat, entity, "Group1補抓（小寫詞幹比對）")] * hit_count
                        )
                        forced_added += hit_count

        # ===== Group2: lower-case match against the punctuation-free sentence =====
        for forced_cat, entity_list in group2_entities.items():
            if forced_cat in force_whitelist_categories:
                for entity in entity_list:
                    pattern = rf"\b{re.escape(entity.lower())}(?=\b|'s|\s|$)"
                    hit_count = len(re.findall(pattern, sentence_cleaned))
                    if hit_count:
                        fout.write(f"{fid}\t{forced_cat}\t{entity}\n" * hit_count)
                        forced_added_records.extend(
                            [(fid, forced_cat, entity, "Group2補抓（清除標點小寫詞幹比對）")] * hit_count
                        )
                        forced_added += hit_count

    # ===== Then the rows that were in the file before =====
    fout.writelines(original_lines)

# Debug: vocabulary size per force-whitelist category.
for whitelist_cat in force_whitelist_categories:
    print(f"[CHECK] group2_entities['{whitelist_cat}'] size = {len(group2_entities.get(whitelist_cat, []))}")

# ======== Show the force-captured rows ========
print(f"\n✅ 額外強制抓取完成，共新增 {forced_added} 筆來自 task1 的 entity！")
if forced_added_records:
    print("📌 強制新增的清單（fid, 類別, entity, 補抓依據）：")
    for record in forced_added_records:
        print(record)


## 強制改標籤

In [None]:
import re
from collections import Counter

# === Load the rule-2 output; it is relabelled and written back in place ===
with open(submission_task2_answer_rule2, encoding="utf-8") as fin:
    lines = fin.readlines()

fixed_lines = []
change_log = []
delete_counter = Counter()  # never incremented in this cell; only read by the report below
converted_count = 0
district_to_street_count = 0

for raw_row in lines:
    row = raw_row.strip()
    # Rows without a tab (which includes blank rows) are dropped.
    if "\t" not in row:
        continue

    fields = row.split("\t")
    if len(fields) == 3:
        fid, label, entity = fields
        normalized_label = label.upper()

        if normalized_label == "COUNTRY" and entity.strip().lower() == "wales":
            # COUNTRY -> STREET, only for the entity "wales"
            change_log.append(f"{fid} - COUNTRY → STREET (entity = {entity})")
            label = "STREET"
            converted_count += 1
        elif normalized_label == "DISTRICT":
            # DISTRICT -> STREET, unconditionally
            change_log.append(f"{fid} - DISTRICT → STREET (entity = {entity})")
            label = "STREET"
            district_to_street_count += 1

        fixed_lines.append(f"{fid}\t{label}\t{entity}")
    else:
        # Rows that do not have exactly three fields are passed through untouched.
        fixed_lines.append(row)

# === Write the relabelled rows back to the same file ===
with open(submission_task2_answer_rule2, "w", encoding="utf-8") as fout:
    fout.writelines(f"{fixed_row}\n" for fixed_row in fixed_lines)

# === Report ===
print("✅ 修正完成：")
print(f"  COUNTRY → STREET 的修正筆數：{converted_count}")
print(f"  DISTRICT → STREET 的修正筆數：{district_to_street_count}")
print(f"  含數字而被移除的筆數：{sum(delete_counter.values())}")
for removed_label, removed_count in delete_counter.items():
    print(f"    {removed_label}: {removed_count} 筆")

print("\n修正/移除紀錄：")
for log_entry in change_log:
    print(f"  {log_entry}")


## 刪除不必要的entity

In [None]:
# ======== Valid categories ========
# A row whose (upper-cased) category is not in this set is rejected by the cleaning loop.
valid_categories = {
    'PATIENT', 'DOCTOR', 'USERNAME', 'FAMILYNAME', 'PERSONALNAME', 'PROFESSION',
    'ROOM', 'DEPARTMENT', 'HOSPITAL', 'ORGANIZATION', 'STREET', 'CITY',
    'DISTRICT', 'COUNTY', 'STATE', 'COUNTRY', 'ZIP', 'LOCATION-OTHER',
    'AGE', 'DATE', 'TIME', 'DURATION', 'SET', 'PHONE', 'FAX', 'EMAIL',
    'URL', 'IPADDRESS', 'SOCIAL_SECURITY_NUMBER', 'MEDICAL_RECORD_NUMBER',
    'HEALTH_PLAN_NUMBER', 'ACCOUNT_NUMBER', 'LICENSE_NUMBER', 'VEHICLE_ID',
    'DEVICE_ID', 'BIOMETRIC_ID', 'ID_NUMBER', 'OTHER'
}

# ======== Categories that are easily mislabelled ========
# For these categories an entity that is a pronoun or a vague description is treated as an error.
invalid_categories = {
    'DOCTOR',          # easily mis-captures words like "he", "she" or "someone"
    'PATIENT',         # often mis-captured as family-member terms (e.g., mom, dad)
    'FAMILYNAME',      # easily captures pronouns or common words (e.g., mine, he)
    'PERSONALNAME',    # easily mixed with non-names
    'PROFESSION',      # easily mis-captures vague descriptions such as "psychiatrist", "friend"
    'DURATION'         # easily mis-captured as meaningless length-of-time words
}

# ======== Pronouns / generic words that make an entity invalid ========
# A row is removed when its category is in invalid_categories and its lower-cased
# entity is exactly one of these strings.
invalid_pronouns = {
    # personal pronouns
    "his", "him", "her", "he", "she", "i", "mine", "you", "they", "their", "that", "husband", "wife",
    "myself", "yourself", "dr", "dr.",
    # vague gender descriptions / generic human terms
    "male", "female",
    # roles that do not identify a specific person
    "phone", "psychiatrist", "psychiatrists",
    "doctor", "doctors", "anesthetist",
    "gp", "professor", "associate professor",
    # easily confused words
    "age", "city", "early", "supercuts", "facebook", "regular", "long", "short",
    "medical", "profession", "professions", "anesthesiologist", "nurse", "professional"
}

# Exclusion lists used to filter entity words that the model captured by mistake.
# ======== Vague-word lists (person categories vs. other categories) ========
# Vague words for person-related categories; matched per token by the cleaning loop.
person_keywords = {
    "friend", "started", "shrink", "younger", "older", "you", "my", "your",
    "someone", "mom", "dad", "parent", "husband", "sister", "brother", "daughter", "niece", "cousin", "always",
    "mother", "wife", "grandma", "grandpa", "uncle", "aunt",
    "courses", "psychiatrist", "medical", "%", "anesthesiologist", "professional", "surgeon", "specialists",
    "intern", "worker",
}

# Vague words for the other categories (per the original note: mostly time-related).
# "millimeter" is listed twice below; harmless because this is a set.
other_keywords = {
    "mm", "millimeter", "bed", "work", "psychiatrist",
    "than",
    "dinner",
    "$", "little",
    "page",
    "pages",
    "enough",
    "old",
    "always",
    "regular",
    "millimeter",
    "senior",
    "junior",
    "through",
    "step",
}

# Common model-generated filler words; removed globally.
# NOTE(review): the cleaning loop tests these as SUBSTRINGS of the lower-cased entity,
# so e.g. "there" also rejects an entity such as "Theresa" — confirm this is intended.
global_keywords = {
    "default",
    "expression",
    "none",
    "specific",
    "calculated",
    "mentioned",
    "explicitly",
    "implied",
    "null",
    "there",
    "think",
    "name",
    "awaiting",
    "extracted",
    "exactly",
    "ongoing",
    "number",
    "united states",
    "roche-cobas",
    "tanzania",
    "doctor",
    "profession",
    "foundation",
    "%",
    "milliliter",
    "long time",
    "few day",
    "three days",
    "couple",
}

# ======== Build the known-entity table from Validation_Dataset_Formal_entity.json ========
# Result: {UPPER-CASED category: set of lower-cased entity strings}, merged over every
# object in the JSON list.
valid_entities_by_category = {}
with open(Validation_Dataset_Formal_entity, encoding="utf-8") as f:
    data = json.load(f)
    for record in data:
        for raw_category, names in record.items():
            bucket = valid_entities_by_category.setdefault(raw_category.upper(), set())
            bucket.update(name.lower() for name in names)

# ======== Cleaning pass ========
# Reads the rule-2 output, applies the rejection rules below in order, and writes the
# surviving rows (category upper-cased, entity stripped) to the "cleaned" file.
# Rejected rows are collected as (fid, category, entity) tuples for the report.
invalid_entries = []

with open(submission_task2_answer_rule2, encoding="utf-8") as fin, \
     open(submission_task2_answer_cleaned, "w", encoding="utf-8") as fout:

    for line in fin:
        line = line.strip()
        if not line or "\t" not in line:
            continue

        parts = line.split("\t")
        # Exactly three fields are required. The original only rejected rows with
        # fewer than three, so a row with an extra tab crashed the 3-way unpack below
        # with ValueError; such rows are now skipped like other malformed rows.
        if len(parts) != 3:
            continue

        fid, category, entity = parts
        category = category.strip().upper()
        entity = entity.strip()
        entity_lower = entity.lower()

        # === Rule: the category must be a known one ===
        if category not in valid_categories:
            invalid_entries.append((fid, category, entity))
            continue

        # === Rule: pronouns / generic words in easily-mislabelled categories ===
        if category in invalid_categories and entity_lower in invalid_pronouns:
            invalid_entries.append((fid, category, entity))
            continue

        # === Rule: date-like strings must not be labelled as an ID or MRN ===
        if category in {"MEDICAL_RECORD_NUMBER", "ID_NUMBER"}:
            # English month name (case-sensitive, whole word).
            if re.search(r"\b(January|February|March|April|May|June|July|August|September|October|November|December)\b", entity):
                invalid_entries.append((fid, category, entity))
                continue
            # A 1-2 digit day (optionally with an ordinal suffix) followed by a 4-digit year.
            if re.match(r".*\b\d{1,2}(st|nd|rd|th)?\b[,/\s]*\d{4}\b", entity):
                invalid_entries.append((fid, category, entity))
                continue

        # === Split the entity into tokens: letter runs, digit runs and "$" ===
        tokens = re.findall(r"[a-zA-Z]+|\d+|\$", entity_lower)

        # === Rule: a "Dr." prefix in a non-doctor person category means a wrong label ===
        if category in ["PATIENT", "PERSONALNAME", "FAMILYNAME", "PROFESSION", "AGE"]:
            if re.match(r"^dr\.?\s+", entity_lower):
                invalid_entries.append((fid, category, entity))
                continue

        # === Rule: vague person words (e.g. mom, dad) in person categories ===
        # Entities listed in the validation entity table for this category are exempt.
        if category in ["PATIENT", "DOCTOR", "PERSONALNAME", "FAMILYNAME", "PROFESSION", "AGE"]:
            if entity_lower not in valid_entities_by_category.get(category, set()):
                # rstrip('s') is a crude plural check; it removes every trailing "s".
                if any(t in person_keywords or t.rstrip('s') in person_keywords for t in tokens):
                    invalid_entries.append((fid, category, entity))
                    continue

        # === Rule: other vague words ===
        # Applies to the easily-mislabelled categories and to every category that is
        # not in the list below.
        if category in invalid_categories or category not in [
            "PATIENT", "DOCTOR", "PERSONALNAME", "FAMILYNAME",
            'PROFESSION', 'ROOM', 'DEPARTMENT', 'HOSPITAL', 'ORGANIZATION',
            'STREET', 'CITY', 'DISTRICT', 'COUNTY', 'STATE', 'COUNTRY', 'LOCATION-OTHER',
            "MEDICAL_RECORD_NUMBER", "ID_NUMBER"]:
            if any(t in other_keywords for t in tokens):
                invalid_entries.append((fid, category, entity))
                continue

        # === Rule: global exclusion (substring test on the lower-cased entity) ===
        if any(gk in entity_lower for gk in global_keywords):
            invalid_entries.append((fid, category, entity))
            continue

        # === Passed every rule: keep the row ===
        fout.write(f"{fid}\t{category}\t{entity}\n")

# ======== Report ========
print(f"✅ 清理完成，合法結果寫入：{submission_task2_answer_cleaned}")
print(f"被移除的不合法條目數：{len(invalid_entries)}")
if invalid_entries:
    print("範例錯誤條目:")
    for x in invalid_entries:
        print(x)

## 若 precision 遠高於 recall，可將每筆標註重複 5 倍（repeat_n = 5）以提高 recall；否則設為 1 倍，維持原樣

In [None]:
# How many times the whole cleaned row list is written out (1 = unchanged).
repeat_n = 1

with open(submission_task2_answer_cleaned, encoding="utf-8") as fin:
    # Stripped, non-empty rows in their original order.
    lines = list(filter(None, map(str.strip, fin)))

with open(submission_task2_answer_duplicated, "w", encoding="utf-8") as fout:
    # The full list is repeated back to back, not row by row.
    fout.writelines(f"{row}\n" for row in lines * repeat_n)
print(f"已乘 {repeat_n} 倍數到{submission_task2_answer_duplicated}")

## 加上時間戳

In [None]:
import unicodedata
import json
import re
from difflib import SequenceMatcher
from num2words import num2words

# ====== 工具函式 ======
def normalize(text):
    """Return *text* in a canonical form for comparison.

    The text is NFKC-normalised and lower-cased, possessive "'s" / "’s" is
    collapsed to "s", a left single quote "‘" is dropped, the punctuation
    characters “ ” " . , : ; ! ? are removed, and surrounding whitespace is
    stripped.
    """
    folded = unicodedata.normalize("NFKC", text).lower()
    for old, new in (("’s", "s"), ("'s", "s"), ("‘", "")):
        folded = folded.replace(old, new)
    without_punct = re.sub(r"[“”\".,:;!?]", "", folded)
    return without_punct.strip()

def clean_entity(entity, category):
    """Strip role prefixes and dangling trailing words from an entity string.

    Removes a leading "patient" or "doctor" word (case-insensitive) together
    with any commas/whitespace after it, and a trailing "with" or "received"
    word together with any dots/commas/whitespace before it.

    The patterns are anchored on word boundaries so that only whole words are
    removed: "Doctors Hospital" and "Beckwith" are left intact (without the
    boundary they were cut to "s Hospital" and "Beck").

    Args:
        entity: Raw entity text.
        category: Entity category; accepted for call compatibility, not used.

    Returns:
        The cleaned entity with surrounding whitespace stripped.
    """
    entity = entity.strip()
    entity = re.sub(r"^patient\b[,\s]*", "", entity, flags=re.I)
    entity = re.sub(r"^doctor\b[,\s]*", "", entity, flags=re.I)
    entity = re.sub(r"[.,\s]*\b(with|received)$", "", entity, flags=re.I)
    return entity.strip()

def is_similar(a, b, threshold=0.85):
    """Return True when the difflib similarity ratio of *a* and *b* is at least *threshold*."""
    ratio = SequenceMatcher(isjunk=None, a=a, b=b).ratio()
    return ratio >= threshold

def is_digit_entity(entity):
    """Return True when *entity* consists only of digits, upper-case A-Z and hyphens.

    An empty string yields False.
    """
    return re.match(r'^[\dA-Z-]+$', entity) is not None

def convert_number_to_words(number):
    """Turn an identifier string into the list of tokens to look for in a transcript.

    An all-digit string is converted with num2words and split on whitespace;
    any other string is returned as the list of its alphanumeric characters,
    in their original case.

    NOTE(review): the characters keep their case, while match_number_sequence
    compares against normalize()-d (lower-cased) words, so upper-case letters
    look like they can never match — confirm whether they should be lower-cased.
    """
    if not number.isdigit():
        return list(filter(str.isalnum, number))
    return num2words(int(number)).split()

def match_number_sequence(words, entity_words, max_gap=5, max_total_duration=6.0):
    """Find *entity_words* as an in-order subsequence of the timestamped *words*.

    The words are scanned once, left to right; each entity token is consumed
    by the first remaining word whose normalize()-d text contains it.

    Args:
        words: Sequence of dicts read for their 'word', 'start' and 'end' keys.
        entity_words: Tokens to find, in order.
        max_gap: Largest allowed difference between one matched word's 'end'
            and the next matched word's 'start'.
        max_total_duration: Largest allowed difference between the first
            matched word's 'start' and the last matched word's 'end'.

    Returns:
        The list of matched word dicts, or None when the tokens are not all
        found, a limit is exceeded, or *entity_words* is empty.
    """
    # Nothing to look for. Without this guard an empty token list (e.g. from the
    # entity "-") fell through to matched[-1] and raised IndexError.
    if not entity_words:
        return None

    idx = 0
    matched = []
    for w in words:
        # Equality is a special case of containment, so one test covers both.
        if entity_words[idx] in normalize(w['word']):
            matched.append(w)
            idx += 1
            if idx == len(entity_words):
                break

    if idx < len(entity_words):
        return None
    if matched[-1]['end'] - matched[0]['start'] > max_total_duration:
        return None
    for previous, current in zip(matched, matched[1:]):
        if current['start'] - previous['end'] > max_gap:
            return None
    return matched

# Revised overlaps(): a span counts as taken when it nearly coincides with an existing
# span (both ends within the tolerance) or shares enough of its own length with one.
def overlaps(start, end, span_list, tolerance=0.05, min_overlap_ratio=0.2):
    """Return True when the span (start, end) collides with any span in *span_list*.

    A collision is either both endpoints lying strictly within *tolerance* of
    an existing span's endpoints, or a positive intersection that covers at
    least *min_overlap_ratio* of the length of (start, end).
    """
    length = end - start
    for other_start, other_end in span_list:
        nearly_identical = (
            abs(start - other_start) < tolerance and abs(end - other_end) < tolerance
        )
        if nearly_identical:
            return True
        shared = min(end, other_end) - max(start, other_start)
        if shared <= 0 or length <= 0:
            continue
        if shared / length >= min_overlap_ratio:
            return True
    return False

# ====== Load data ======
# Word-level timestamps: one JSON object per line, read for its "filename" and "words" keys.
timestamp_map = {}
with open(task1_answer_timestamps, encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines in the JSON-lines file
        record = json.loads(line)  # parse once per line (previously every line was parsed twice)
        timestamp_map[record["filename"]] = record["words"]

# Transcripts: "<fid>\t<sentence>" per line.
sentence_map = {}
with open(submission_task1_answer, encoding="utf-8") as f:
    for line in f:
        # Strip BEFORE looking for the tab: a line such as "fid\t\n" contains a tab
        # but loses it when stripped, which used to crash the 2-way unpack below.
        line = line.strip()
        if "\t" not in line:
            continue
        fid, sentence = line.split("\t", 1)
        sentence_map[fid] = sentence

not_found = []   # (fid, category, entity, reason) for every row that was not aligned
used_spans = {}  # fid -> list of (start, end) spans already assigned in that file

# ====== Alignment ======
# Each "<fid>\t<category>\t<entity>" row is aligned to a time span by trying, in order:
# exact n-gram match, fuzzy single-word match, spelled-out number match, fuzzy n-gram match.
# A span already taken in the same file (see overlaps()) is never assigned twice.
with open(submission_task2_answer_duplicated, encoding="utf-8") as fin, \
     open(submission_task2_answer_alignment, "w", encoding="utf-8") as fout:

    total = success = fallback = 0

    for line in fin:
        line = line.strip()
        if not line:
            continue  # blank rows are not counted
        total += 1

        parts = line.split("\t")
        if len(parts) != 3:
            # Report the row instead of crashing on the 3-way unpack.
            not_found.append((line, "", "", "Malformed line"))
            continue
        fid, category, entity = parts

        sentence = sentence_map.get(fid, "")
        if not sentence:
            not_found.append((fid, category, entity, "No sentence"))
            continue

        words = timestamp_map.get(fid, [])
        if not words:
            not_found.append((fid, category, entity, "No timestamp"))
            continue

        entity_clean = clean_entity(entity, category)
        entity_norm = normalize(entity_clean)

        # An entity that normalises to "" (e.g. just "patient") is contained in every
        # word, so the fuzzy stage would pin it to the first free word of the file.
        if not entity_norm:
            not_found.append((fid, category, entity, "Cannot align"))
            continue

        spans = used_spans.setdefault(fid, [])

        matched = False
        was_overlapped = False

        # === Stage 1: exact match on 1- to 5-word windows ===
        for window in range(1, 6):
            for i in range(len(words) - window + 1):
                segment = words[i:i + window]
                segment_text = " ".join(w["word"] for w in segment)
                if normalize(segment_text) == entity_norm:
                    start = float(segment[0]["start"])
                    end = float(segment[-1]["end"])
                    if overlaps(start, end, spans):
                        was_overlapped = True
                        continue
                    fout.write(f"{fid}\t{category}\t{start:.3f}\t{end:.3f}\t{entity}\n")
                    spans.append((start, end))
                    matched = True
                    break
            if matched:
                break

        # === Stage 2: fuzzy single-word match ===
        if not matched:
            for w in words:
                word_norm = normalize(w["word"])
                # Containment also covers the prefix case the original tested separately.
                if entity_norm in word_norm or is_similar(word_norm, entity_norm):
                    start = float(w["start"])
                    end = float(w["end"])
                    if overlaps(start, end, spans):
                        was_overlapped = True
                        continue
                    fout.write(f"{fid}\t{category}\t{start:.3f}\t{end:.3f}\t{entity}\n")
                    spans.append((start, end))
                    matched = True
                    break

        # === Stage 3: numeric / identifier entities matched token by token ===
        if not matched and is_digit_entity(entity):
            entity_words = convert_number_to_words(entity.replace('-', ''))
            matched_seq = match_number_sequence(words, entity_words)
            if matched_seq:
                start = float(matched_seq[0]["start"])
                end = float(matched_seq[-1]["end"])
                if overlaps(start, end, spans):
                    was_overlapped = True
                else:
                    fout.write(f"{fid}\t{category}\t{start:.3f}\t{end:.3f}\t{entity}\n")
                    spans.append((start, end))
                    matched = True

        # === Stage 4 (fallback): fuzzy match on 1- to 7-word windows, ratio >= 0.75 ===
        if not matched:
            for window in range(1, 8):  # upper bound is the adjustable maximum window length
                for i in range(len(words) - window + 1):
                    segment = words[i:i + window]
                    segment_text = " ".join(w["word"] for w in segment)
                    sim_ratio = SequenceMatcher(None, normalize(segment_text), entity_norm).ratio()

                    if sim_ratio >= 0.75:
                        start = float(segment[0]["start"])
                        end = float(segment[-1]["end"])

                        if overlaps(start, end, spans):
                            continue

                        fout.write(f"{fid}\t{category}\t{start:.3f}\t{end:.3f}\t{entity}\n")
                        spans.append((start, end))
                        fallback += 1
                        matched = True
                        break
                if matched:
                    break

        if not matched:
            reason = "時間戳重疊" if was_overlapped else "Cannot align"
            not_found.append((fid, category, entity, reason))
        else:
            success += 1

# ====== Report ======
print(f"\n✅ 對齊完成！總筆數: {total}，成功: {success}，fallback: {fallback}，失敗: {len(not_found)}")
print("📌 無法對齊或被排除的項目:")
for item in not_found:
    print(item)

## 硬抓中文時間戳

In [None]:
import json
import re
# Path of the validation entity list. It stays a Path for the whole cell: the parsed
# JSON goes into its own variable below. Previously the parsed data was assigned back
# to this name, so any cell that later opened it as a file failed until the path was
# re-created.
Validation_Dataset_Formal_entity = base_path / "Validation_Dataset_Formal_entity.json"

# === Regex patterns that are force-captured, per category ===
# PROFESSION and COUNTRY are filled in from the entity list further down.
hardcoded_patterns = {
    "SET": [r"[一二三四五六七八九十]次"],
    "PROFESSION": [],
    "COUNTRY": [],
    "DURATION": [r"[幾几一二两三四五六七八九十]+(?:年半|天)"],
    "ROOM": [r"[一二两三四五六七八九十]+(?:台|床)"]
}

# === Hand-written name patterns ===
# A listed surname character directly followed by 太太 or 先生.
partial_familyname_patterns = [r"[赵钱孙李吴郑王冯陈褚卫蒋沉韩杨朱秦尤许何吕施张劉孔曹严华金魏陶姜](太太|先生)"]
partial_personalname_patterns = []

# === Load the validation entity list ===
with open(Validation_Dataset_Formal_entity, "r", encoding="utf-8") as f:
    formal_entity_records = json.load(f)


def extract_list_by_key(entity_list, key):
    """Return the value stored under *key* in the first item of *entity_list* that has it.

    Returns an empty list when no item contains *key*. Only the first matching
    item is used; later items with the same key are ignored.
    """
    for item in entity_list:
        if key in item:
            return item[key]
    return []


# Literal (regex-escaped) patterns built from the entity list.
full_familyname_patterns = [re.escape(name) for name in extract_list_by_key(formal_entity_records, "FAMILYNAME")]
full_personalname_patterns = [re.escape(name) for name in extract_list_by_key(formal_entity_records, "PERSONALNAME")]
hardcoded_patterns["PROFESSION"] = [re.escape(name) for name in extract_list_by_key(formal_entity_records, "PROFESSION")]
hardcoded_patterns["COUNTRY"] = [re.escape(name) for name in extract_list_by_key(formal_entity_records, "COUNTRY")]

# Categories this cell regenerates itself from the patterns above.
special_categories = {"FAMILYNAME", "PERSONALNAME", "COUNTRY", "PROFESSION", "SET", "DURATION", "ROOM"}
# Single characters treated as surnames.
surname_set = set("赵钱孙李吴郑王冯陈褚卫蒋沉韩杨朱秦尤许何吕施张劉孔曹严华金魏陶姜")

# === Snapshot the current alignment file (stripped, non-empty rows) ===
with open(submission_task2_answer_alignment, "r", encoding="utf-8") as f:
    original_lines = [row for row in (raw.strip() for raw in f) if row]

# === Group the well-formed (5-field) rows by file id ===
original_by_fid = {}
for row in original_lines:
    fields = row.split("\t")
    if len(fields) == 5:
        original_by_fid.setdefault(fields[0], []).append(row)

# === Main pass: rebuild the tag rows of every file listed in the Chinese timestamp file ===
# Files with fid < 80000 keep their existing rows. For the others, rows whose category
# is in special_categories are discarded and regenerated from the regex patterns.
new_outputs_by_fid = {}
processed_fids = set()

# Professions checked against the two words that precede them (see the PROFESSION branch).
# Built once here instead of once per regex match.
name_preceded_professions = {
    "法官", "律師", "老師", "導師", "法師", "牧師", "講師",
    "律师", "老师", "导师", "法师", "牧师", "讲师"
}
# Generic titles that get the extra surname check in the PROFESSION branch.
surname_required_professions = {
    "作家", "校長", "記者", "委員", "主管", "顧問", "立委", "司機", "議員",
    "校长", "记者", "委员", "顾问", "司机", "议员"
}

with open(task1_answer_timestamps_ZH, "r", encoding="utf-8") as fin:
    for line in fin:
        if not line.strip():
            continue  # blank lines are not records

        # Reset per record: if json.loads itself fails, the handler below must not read
        # the previous record (wrong filename in the message) or an undefined/foreign
        # `data` (NameError / AttributeError inside the handler).
        data = None
        try:
            data = json.loads(line)
            fid = data["filename"]
            fid_int = int(fid)
            words = data.get("words", [])

            word_list = [w["word"] for w in words]
            word_spans = [(w["start"], w["end"]) for w in words]
            # Character offsets used below index into this concatenation of all words.
            full_text = "".join(word_list)

            output_lines = []
            skip_positions = set()
            profession_spans = []

            def is_within_existing_span(start, end, spans):
                """Return True when [start, end] lies inside any (s, e) pair of *spans*."""
                for s, e in spans:
                    if start >= s and end <= e:
                        return True
                return False

            def write_tag(tag, match_text, start_char, end_char):
                """Append a tag row for the character span [start_char, end_char) of full_text.

                The span is mapped to the word holding its first character and the
                word in which it ends; the row uses that first word's start time and
                that last word's end time. Nothing is appended when the span cannot
                be mapped to words.
                """
                char_index = 0
                start_index = end_index = None
                for idx, word in enumerate(word_list):
                    word_len = len(word)
                    next_index = char_index + word_len
                    if start_index is None and start_char < next_index:
                        start_index = idx

                    if start_char < next_index and end_char <= next_index:
                        end_index = idx
                        break

                    char_index = next_index
                if start_index is not None and end_index is not None:
                    start_time = word_spans[start_index][0]
                    end_time = word_spans[end_index][1]
                    line = f"{fid}\t{tag}\t{start_time:.3f}\t{end_time:.3f}\t{match_text}"
                    output_lines.append(line)
                    if tag in special_categories:
                        print(f"[{fid}] {tag}：{match_text}（{start_time:.3f}s ~ {end_time:.3f}s）")

            if fid_int < 80000:
                output_lines.extend(original_by_fid.get(fid, []))
            else:
                processed_fids.add(fid)
                original_tags = original_by_fid.get(fid, [])
                # Keep only the rows this cell does not regenerate itself.
                retained_lines = [l for l in original_tags if l.split("\t")[1] not in special_categories]
                output_lines.extend(retained_lines)

                for category, patterns in hardcoded_patterns.items():
                    if category not in special_categories:
                        continue
                    for pattern in patterns:
                        for match in re.finditer(pattern, full_text):
                            match_text = match.group()
                            start = match.start()
                            end = match.end()

                            if category == "PROFESSION":
                                if match_text in name_preceded_professions:
                                    # Find the word that starts exactly at the match and tag
                                    # what precedes it as a PERSONALNAME: the previous word
                                    # alone when it is a surname character, otherwise the
                                    # previous two words.
                                    char_pos = 0
                                    for i in range(len(word_list)):
                                        if char_pos == start:
                                            if i >= 2:
                                                name_candidate = word_list[i-2] + word_list[i-1]
                                                if word_list[i-1] in surname_set:
                                                    name_start = sum(len(word_list[j]) for j in range(i-1))
                                                    name_end = sum(len(word_list[j]) for j in range(i))
                                                    write_tag("PERSONALNAME", word_list[i-1], name_start, name_end)
                                                else:
                                                    name_start = sum(len(word_list[j]) for j in range(i-2))
                                                    name_end = sum(len(word_list[j]) for j in range(i))
                                                    write_tag("PERSONALNAME", name_candidate, name_start, name_end)

                                                # Recording the span makes the check right below
                                                # skip the profession itself.
                                                skip_positions.add((start, end))
                                                profession_spans.append((start, end))
                                                break
                                        char_pos += len(word_list[i])

                                if is_within_existing_span(start, end, profession_spans):
                                    continue

                                if start > 0:
                                    prev_char = full_text[start - 1]
                                    if prev_char in surname_set:
                                        # Surname + title is emitted as one PERSONALNAME.
                                        combined = prev_char + match_text
                                        skip_positions.add((start - 1, end))
                                        profession_spans.append((start - 1, end))
                                        write_tag("PERSONALNAME", combined, start - 1, end)
                                        continue

                                    elif match_text in surname_required_professions:
                                        # NOTE(review): prior_text is the same single character
                                        # as prev_char, which is already known not to be a
                                        # surname here, so this branch always skips the match.
                                        prior_text = full_text[max(0, start - 1):start]
                                        if not any(c in surname_set for c in prior_text):
                                            continue

                                profession_spans.append((start, end))
                                write_tag("PROFESSION", match_text, start, end)
                                continue

                            # Every other category is written as matched.
                            if (start, end) in skip_positions:
                                continue
                            write_tag(category, match_text, start, end)

                for pattern in partial_familyname_patterns:
                    for match in re.finditer(pattern, full_text):
                        # Only the first character of the match is tagged.
                        write_tag("FAMILYNAME", match.group()[0], match.start(), match.start() + 1)
                for pattern in full_familyname_patterns:
                    for match in re.finditer(pattern, full_text):
                        write_tag("FAMILYNAME", match.group(), match.start(), match.end())

                for pattern in partial_personalname_patterns:
                    for match in re.finditer(pattern, full_text):
                        write_tag("PERSONALNAME", match.group()[0], match.start(), match.start() + 1)
                for pattern in full_personalname_patterns:
                    for match in re.finditer(pattern, full_text):
                        write_tag("PERSONALNAME", match.group(), match.start(), match.end())


            new_outputs_by_fid[fid] = output_lines

        except Exception as e:
            # Best effort: report the record and move on to the next one.
            failed_name = data.get("filename", "<UNKNOWN>") if isinstance(data, dict) else "<UNKNOWN>"
            print(f"[ERROR] 處理 {failed_name} 失敗：{e}")
            continue

# === Restore files with fid < 80000 that never appeared in the timestamp file ===
for orig_fid, orig_rows in original_by_fid.items():
    if int(orig_fid) >= 80000:
        continue
    # Only fills the gap; rows produced by the main pass are left untouched.
    new_outputs_by_fid.setdefault(orig_fid, orig_rows)

# === Overwrite the alignment file with the rebuilt rows, grouped by file id ===
with open(submission_task2_answer_alignment, "w", encoding="utf-8") as fout:
    fout.writelines(
        f"{row}\n"
        for rows in new_outputs_by_fid.values()
        for row in rows
    )

print(f"\n✅ 寫入：{submission_task2_answer_alignment}")


## 去重

In [None]:
import json

# === Word lists per file, from the JSON-lines timestamp file ===
with open(task1_answer_timestamps, encoding="utf-8") as f:
    timestamp_map = {rec["filename"]: rec["words"] for rec in map(json.loads, f)}

# Counters for the final report.
fallback_added = failed = 0
# Accepted rows, one dict each, so their time spans can be compared.
output_entries = []

# === Share of the shorter span that is covered by the overlap ===
def time_overlap_ratio(a_start, a_end, b_start, b_end):
    """Return the overlap of spans a and b divided by the shorter span's length.

    Returns 0 when the shorter span has no positive length.
    """
    shorter = min(a_end - a_start, b_end - b_start)
    if shorter <= 0:
        return 0
    shared = min(a_end, b_end) - max(a_start, b_start)
    return max(0, shared) / shorter

# === Accept a row unless it duplicates one that was already accepted ===
def add_if_not_overlap(new_entry):
    """Append *new_entry* to output_entries unless it is a duplicate; return whether it was added.

    A duplicate is an accepted entry with the same fid, category and entity
    whose time span overlaps the new one with a time_overlap_ratio above 0.4.
    """
    identity = (new_entry["fid"], new_entry["category"], new_entry["entity"])
    for kept in output_entries:
        if (kept["fid"], kept["category"], kept["entity"]) != identity:
            continue
        shared_ratio = time_overlap_ratio(
            new_entry["start"], new_entry["end"],
            kept["start"], kept["end"]
        )
        if shared_ratio > 0.4:
            return False  # overlaps an identical tag too much

    output_entries.append(new_entry)
    return True

# === Walk the aligned rows and keep only the non-duplicated ones ===
with open(submission_task2_answer_alignment, encoding="utf-8") as fin:
    for raw in fin:
        line = raw.strip()
        if line == "" or line.upper() == "PHI:NULL":
            continue

        fields = line.split("\t")
        if len(fields) != 5:
            print(f"[ERROR] 格式錯誤：{line}")
            failed += 1
            continue

        fid, category, start, end, entity = fields

        # Rows whose timestamps parse as floats are taken as they are.
        try:
            start_f, end_f = float(start), float(end)
        except ValueError:
            pass  # fall through to the timestamp lookup below
        else:
            add_if_not_overlap({
                "fid": fid,
                "category": category,
                "start": start_f,
                "end": end_f,
                "entity": entity,
                "raw_line": line,
            })
            continue

        # Fallback: look the entity up in the word list of its file.
        words = timestamp_map.get(fid, [])
        if not words:
            print(f"[SKIP] 無對應 timestamp：{fid}")
            failed += 1
            continue

        # One token per character (written for Chinese text).
        entity_tokens = list(entity)
        token_count = len(entity_tokens)

        # First position where consecutive words spell the entity character by character.
        for begin in range(len(words)):
            window = words[begin:begin + token_count]
            if [w["word"] for w in window] != entity_tokens:
                continue

            new_start = float(window[0]["start"])
            new_end = float(window[-1]["end"])
            candidate = {
                "fid": fid,
                "category": category,
                "start": new_start,
                "end": new_end,
                "entity": entity,
                "raw_line": f"{fid}\t{category}\t{new_start:.3f}\t{new_end:.3f}\t{entity}",
            }
            if add_if_not_overlap(candidate):
                fallback_added += 1
                print(f"[FALLBACK] {fid} {entity} \u2794 {new_start:.3f}s ~ {new_end:.3f}s")
            break
        else:
            print(f"[MISS] 無法補時間：{fid}\t{entity}")
            failed += 1

# === Write the de-duplicated rows, sorted as plain strings ===
sorted_rows = sorted(kept["raw_line"] for kept in output_entries)
with open(submission_task2_answer_sort, "w", encoding="utf-8") as fout:
    fout.write("\n".join(sorted_rows) + "\n")

# === Report ===
print(f"\n✅ 完成！最終輸出筆數：{len(output_entries)}")
print(f"🔁 fallback 成功補時間：{fallback_added}")
print(f"❌ 無法處理（格式錯或無 timestamp）：{failed}")


## 排序+計算數量

In [None]:
from collections import Counter

# ======== Order in which the per-category counts are printed ========
category_order = [
    'PATIENT', 'DOCTOR', 'USERNAME', 'PERSONALNAME', 'FAMILYNAME', 'PROFESSION',
    'ROOM', 'DEPARTMENT', 'HOSPITAL', 'ORGANIZATION', 'STREET', 'CITY', 'STATE',
    'COUNTRY', 'COUNTY', 'ZIP', 'LOCATION-OTHER', 'DISTRICT', 'AGE', 'DATE',
    'TIME', 'DURATION', 'SET', 'PHONE', 'FAX', 'EMAIL', 'URL', 'IPADDRESS',
    'OTHER', 'SOCIAL_SECURITY_NUMBER', 'MEDICAL_RECORD_NUMBER', 'HEALTH_PLAN_NUMBER',
    'ACCOUNT_NUMBER', 'LICENSE_NUMBER', 'VEHICLE_ID', 'DEVICE_ID', 'BIOMETRIC_ID',
    'ID_NUMBER'
]

# ======== Read the de-duplicated rows; only 5-field rows (fid, category, start, end, entity) count ========
with open(submission_task2_answer_sort, encoding="utf-8") as f:
    five_field_rows = [
        (raw, fields)
        for raw, fields in ((raw, raw.strip().split("\t")) for raw in f)
        if len(fields) == 5
    ]

# ======== Occurrences per (upper-cased) category ========
category_counter = Counter(fields[1].strip().upper() for _, fields in five_field_rows)

for cat in category_order:
    print(f"{cat}: {category_counter.get(cat, 0)}")

# ======== Parse the numeric sort keys; rows that do not parse are reported and dropped ========
entries = []
for line, (fid, category, start, end, entity) in five_field_rows:
    try:
        entries.append((int(fid), float(start), float(end), fid, category, entity))
    except ValueError:
        print(f"[SKIP] 非法 fid/start/end 無法轉為整數：{line}")

# ======== Sort by fid, then start, then end, and write the final file ========
entries.sort(key=lambda row: row[:3])

with open(submission_task2_answer_finally, "w", encoding="utf-8") as fout:
    fout.writelines(
        f"{fid}\t{category}\t{start_num}\t{end_num}\t{entity}\n"
        for _, start_num, end_num, fid, category, entity in entries
    )

print(f"\nTOTAL: {sum(category_counter.values())}")
print(f"\n\n{submission_task2_answer_finally}排序已完成")