## 路徑設定及安裝

In [None]:
# !pip install git+https://github.com/openai/whisper.git

In [None]:
import os
from pathlib import Path
from groq import Groq
from pathlib import Path
import whisperx
import torch
import json
from tqdm import tqdm  # ← 這樣匯入的是函數，而非整個模組
import difflib
import re

# Project root; every input/output file used below is resolved against it.
base_path = Path(r"your_path")

# Task 1 answer file: read later as tab-separated "<fid>\t<sentence>" lines.
submission_task1_answer_L = base_path / "submission" / "task1_answer_L.txt"

# Task 2 answer file: PHI rows are appended to it and it doubles as a checkpoint.
submission_task2_answer_L = base_path / "submission" / "task2_answer_L.txt"


# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"device: {device}")

# PHI category names (order kept as in the original list).
train_phi_category = [
    # names and professions
    'PATIENT', 'DOCTOR', 'USERNAME', 'FAMILYNAME', "PERSONALNAME", 'PROFESSION',
    # locations
    'ROOM', 'DEPARTMENT', 'HOSPITAL', 'ORGANIZATION', 'STREET', 'CITY',
    'DISTRICT', 'COUNTY', 'STATE', 'COUNTRY', 'ZIP', 'LOCATION-OTHER',
    # age
    'AGE',
    # temporal expressions
    'DATE', 'TIME', 'DURATION', 'SET',
    # contact information
    'PHONE', 'FAX', 'EMAIL', 'URL', 'IPADDRESS',
    # identifiers
    'SOCIAL_SECURITY_NUMBER', 'MEDICAL_RECORD_NUMBER', 'HEALTH_PLAN_NUMBER', 'ACCOUNT_NUMBER',
    'LICENSE_NUMBER', 'VEHICLE_ID', 'DEVICE_ID', 'BIOMETRIC_ID', 'ID_NUMBER',
    # catch-all
    'OTHER',
]

# 確保資料夾存在
# wav_dir.mkdir(parents=True, exist_ok=True)


## task2

## LLM1_prompt

In [None]:
import openai
import time

# Supplementary labelling guidance that is interpolated into the system prompt
# under the "Special note:" heading. It mainly disambiguates the temporal
# categories (TIME / DATE / DURATION / SET) and the person-name categories.
# NOTE: this text is sent to the model verbatim, so any edit changes model behaviour.
special_note = """
- Extract exact numeric time expressions as TIME only if they refer to a specific clock time (e.g., "2:00", "two"). Do not label time spans like "three hours" or "six months" as TIME—those must be labeled as DURATION.
- Always label "today", "yesterday", "now", "tomorrow", "Monday", "this week", and "June" as DATE.
- Label full expressions like "Monday morning" or "Friday night" as TIME. Do not split them.
- If a number clearly refers to a time point (e.g., "two", "maybe two"), extract it as TIME even if casual.
- Label "three hours", "several weeks", "six months", "a couple of minutes", etc. as DURATION. These refer to a time span, not a point in time.
- Label "every morning", "twice a week", "once a day" as SET. These refer to recurring patterns and must include recurrence indicators like "every" or "once".
- Do not label vague frequency adverbs like "frequently", "occasionally", "sometimes", or "regularly" as SET.
- Do not label event-triggered phrases (e.g., "after three hours", "before dinner", "when I got home", "once I arrived") as TIME, DURATION, or any category. These are conditionals, not independent time expressions.
- Prioritize labeling based on the phrase’s standalone semantic meaning, not on its surrounding context.
- Extract all valid TIME, DATE, DURATION, or SET expressions that clearly indicate a specific time, even if embedded in broken or casual sentences.
- Time-related phrases must independently and concretely express a time point, span, or recurrence. Do not extract phrases that rely on another action to be meaningful.
- TIME expressions must refer to a clock time, time of day, or specific point (e.g., "3 PM", "Friday morning", "noon", "late at night").
- DURATION expressions must refer to a measurable span (e.g., "four days", "two hours", "a long time") with a time unit.
- DATE expressions must refer to a calendar reference (e.g., "Friday", "August 12", "yesterday", "last year").
- SET expressions must refer to a recurring schedule and include recurrence indicators (e.g., "every Monday", "twice a day").
- AGE expressions must contain a clearly stated number that represents a person's age (e.g., "65", "three", "twenty-one"). The age must be explicit and quantifiable.
- FAMILYNAME, PERSONALNAME, DOCTOR, and PATIENT: Only extract full, proper given names (e.g., "Emily", "John"). Do not extract roles or relational phrases (e.g., "Ivan's dad").
- If the full name is unknown or missing, do not extract FAMILYNAME or PERSONALNAME under any condition.
- Label people based on role:
  - DOCTOR: Named individuals providing care or diagnosis.
  - PATIENT: Named individuals receiving care or diagnosis.
  - FAMILYNAME: Named relatives of the patient.
  - PERSONALNAME: Named unrelated individuals not acting as doctor, patient, or family.
- If the role is unclear, default to PERSONALNAME.
- Do not extract LOCATION-OTHER unless it matches a known list. If uncertain, try CITY, STATE, COUNTY, or DISTRICT. If still unclear, do not label.
- Label CITY only if the name clearly refers to a real-world city or town. Ambiguous or fictional locations must not be labeled.
- Label each entity occurrence individually. If the same entity appears multiple times in the sentence or paragraph, each occurrence must be labeled separately, even if the category and entity are identical.
- For each entity occurrence (token), assign exactly one PHI category.
- Do not infer or guess entities. Only extract if explicitly stated and clearly match the defined criteria.
- Preserve the exact original casing, spacing, and punctuation of extracted entities.
- Output format must be strict: CATEGORY: entity. Do not add explanations, justifications, or notes.
- Minor errors are acceptable. Missing valid entities is a serious mistake.
- If unsure whether an entity exists, attempt extraction. Output PHI:NULL only if no candidate fits after careful checking.
- After extraction, verify that all PHI categories present in the sentence have been labeled. Missing eligible entities is considered a major mistake.
- If no valid PHI entity can be extracted, output exactly: PHI:NULL.
"""

# Main rule set interpolated into the system prompt under the "Rules:" heading.
# It defines the allowed PHI categories, the per-category extraction criteria
# and the strict "CATEGORY: entity" output format.
# 'ROOM' is part of the allowed-category whitelist because this prompt also
# carries a dedicated "ROOM:" rule and the few-shot examples expect ROOM labels;
# leaving it out of the whitelist gave the model contradictory instructions.
# NOTE: this text is sent to the model verbatim, so any edit changes model behaviour.
Rules = """
- Extract each entity exactly as it appears in the text, preserving the original casing, spaces, and punctuation. No normalization, expansion, or abbreviation is allowed.
- For each occurrence (token), assign exactly one PHI category. Do not label the same token with multiple categories.
- Each entity must be assigned to only one PHI category based on context. Do not label the same entity with multiple categories.
- Extract every occurrence of an entity, even if the category and entity are identical and appear multiple times in the same sentence or paragraph. Do not skip, deduplicate, or merge repeated mentions—each instance must be labeled individually.
- If the same entity appears multiple times (even at different positions), each occurrence must be extracted separately. Do not deduplicate.
- Exclude titles like "Dr" or "Dr." when extracting DOCTOR names; only extract the actual name (e.g., "James" from "Dr. James").
- Do not extract generic words like "hospital", "phone", or "address" unless they have specific identifying information (e.g., "Bamaga Hospital", "911").
- Use only the following PHI categories:
    'PATIENT', 'DOCTOR', 'PERSONALNAME', 'FAMILYNAME', 'PROFESSION', 'ROOM',
    'DEPARTMENT', 'HOSPITAL', 'ORGANIZATION', 'STREET', 'CITY', 'STATE',
    'COUNTRY', 'COUNTY', 'ZIP', 'LOCATION-OTHER', 'DISTRICT', 'AGE', 'DATE',
    'TIME', 'DURATION', 'SET', 'PHONE', 'MEDICAL_RECORD_NUMBER', 'ID_NUMBER'
- Format output strictly as: CATEGORY: entity.
- PATIENT: if referring to the patient, including by full name.
- DOCTOR: if referring to the doctor, including by full name.
- FAMILYNAME: for named family members of the patient (e.g., "John", "Maria") when their family role is clear (e.g., parent, sibling).
- PERSONALNAME: for named unrelated persons or bystanders not identified as patient, doctor, or family.
- If the role of a person is unclear, assign PERSONALNAME by default.
- If a location contains multiple components (e.g., street, city, state), extract each part under the appropriate category.
- CITY: Extract only if the location name clearly refers to a real-world city or town, and is not ambiguous, generic, or a district, state, or country (e.g., "Chicago", "Miami", "Hamden", "San Antonio", "Austin"). Do not classify as CITY if the entity contains numbers. Use ZIP if address-like.
- STATE: First-level administrative regions within a country (e.g., "Delaware", "Montana", "Oregon", "South Australia", "Western Australia", "Texas", "RI", "QLD").
- COUNTRY: Use for names of recognized sovereign nations or independent countries (e.g., "Madagascar", "Denmark", "France", "Japan", "Brazil", "Australia", "USA").
- COUNTY: Mid-level administrative divisions often below state level (e.g., "Cheshire").
- DISTRICT: Smaller administrative or geographic areas within cities or counties (e.g., "Greenwich").
- STREET: Full street names with or without numbers (e.g., "Main Street", "456 Maple Avenue", "Oxendon").
- Absolutely no inference, guessing, semantic matching, or partial similarity is allowed. Only exact string matches are valid.
- If the location name does not exactly match the above list, do not extract it as LOCATION-OTHER under any condition.
- LOCATION-OTHER must match a predefined whitelist. Otherwise, classify as CITY, STATE, COUNTY, or DISTRICT, or discard.
- Locations that cannot be confidently categorized as CITY, STATE, COUNTRY, COUNTY, DISTRICT, STREET, or exactly match LOCATION-OTHER must not be extracted at all.
- Label a location as CITY only if it is explicitly and clearly indicated as a real-world city or town based on the context. If the location is ambiguous, fictional, incomplete, or contextually unclear, do not label it.
- HOSPITAL: Extract the full name of any hospital, medical center, health service, or healthcare institution as HOSPITAL. The name must be specific and identifiable. Generic terms without specific names (e.g., "hospital", "health center") must not be extracted. Classify as HOSPITAL if entity contains keywords like “Health Service”, “District Health”, “Hospital”, “Clinic”, “Medical Centre”.
- ORGANIZATION: Extract the names of institutions, companies, libraries, businesses, or organized groups only if they are explicitly and clearly identifiable as organizations based on the text. Example entries include Subway, Divinity School Library, Career Services, seekers workshop, Cambridge, Starbucks, Michaels, Google, Datsun, and Orizia.
- Do not extract any health service, hospital, or medical center names as ORGANIZATION. Such entities must be classified as HOSPITAL if specific naming is present.
- ZIP: Treat "postal code", "postcode", "zip code", "ZIP", "ZIP number", "mail code", "delivery code", and "postal ZIP" as ZIP. Use "area code" or "address code" only if clearly referring to a mailing address.
- AGE: Extract only the numeric age of a person. Convert simple number words (e.g., "five") to digits ("5"). Exclude words like "years" or "old". Ignore vague phrases (e.g., "a couple of years") and non-numeric terms (e.g., "young", "teenager").
- DURATION: Extract only when the expression clearly refers to a measurable length or span of time with an explicit time unit such as "seconds", "minutes", "hours", "days", "weeks", "months", or "years".
  - Examples: "two days", "past few weeks", "10 minutes", "six months", "a long time", "a couple of hours"
  - Do not extract standalone numbers without time units.
  - Do not classify calendar expressions such as "last year", "next week", "this Saturday", or "last Friday" as DURATION — these should be classified as DATE.
  - Do not extract seasonal or event-based phrases such as "last summer" or "next winter" — these are also DATE.
- TIME: Extract when the expression refers to a specific time of day or clock time.
  - Examples: "3 PM", "2:30", "morning", "afternoon", "evening", "night", "last night", "middle of the night", "Monday morning", "next Friday morning"
  - Do not extract vague expressions such as "soon", "later", or "sometime".
  - Always reclassify expressions like "Friday morning" or "next Monday morning" as TIME, not DATE.
- DATE: Extract when the expression refers to a specific calendar point, named date, or time-referenced event.
  - Examples: "now", "on Friday", "Monday", "August 5", "May", "last year", "next week", "today", "yesterday", "tomorrow", "Christmas", "New Year's Eve"
  - Always classify "today", "yesterday", and "tomorrow" as DATE.
  - Days of the week like "Monday" and "Friday" are DATE unless clearly part of a recurring pattern (e.g., "every Monday", which is SET).
- SET: Extract only when the expression clearly refers to a repeated time pattern or schedule with explicit recurrence keywords.
  - Examples: "twice a week", "every Monday", "once a day", "three times a week", "every night"
  - Do not extract vague frequency terms like "sometimes", "occasionally", "frequently", "usually", "normally"
  - SET expressions must include recurrence indicators like "every", "once a", "twice per", or "three times each".
  - Do not confuse DURATION ("two weeks") with SET ("every two weeks").
- ROOM: Label room numbers or hospital room identifiers, such as “room 302”, “bed A”, or “A3”.
- PROFESSION: Extract explicit job titles and professional roles (e.g., "babysitter", "IT", "manager", "lawyer", "a lawyer", "engineer", "accountant").
  - Do not classify organization names (e.g., "Google", "Starbucks") or department names (e.g., "HR department", "Trauma team") as PROFESSION.
- DEPARTMENT: Extract all mentions of departments, teams, units, groups, wards, or divisions explicitly described (e.g., "HR department", "Intensive Care Unit", "Immunology Department", "Trauma team").
  - The mention must clearly include keywords such as "department", "team", "unit", "group", or "ward" to qualify.
  - Departments related to medical, administrative, academic, or organizational contexts are eligible.
  - Do not extract general locations, organization names, or professions as DEPARTMENT unless they explicitly include the structural keywords above.
- MEDICAL_RECORD_NUMBER: Use for alphanumeric codes that are clearly tied to a patient’s medical record, such as "1706458.VTX". Must be supported by context with terms like "MRN", "record number", or "medical file". If the text refers to a value as “medical record number” even without a decimal, still classify as MEDICAL_RECORD_NUMBER.
- ID_NUMBER: Use for general-purpose identifiers that are not explicitly linked to medical records. This includes episode numbers, lab numbers, form codes, and internal or personal IDs.
  - If the identifier is not clearly labeled as a medical record number (e.g., “MRN” or “medical record number”) and does not contain a decimal point, label it as ID_NUMBER.
  - Do not use ID_NUMBER for values that clearly represent date expressions (e.g., "17th, 2063", "17/06/2063", "21, 2063"). Use DATE instead unless context shows it is functioning as an identifier.
  - If a value is clearly referred to as an ID number, episode number, or lab number, it may be labeled as ID_NUMBER regardless of format.

Other rules:
- Do not infer or hallucinate missing entities.
- Preserve the original entity casing and spelling.
- If no PHI entities are found, output exactly: PHI:NULL.
"""


# Worked examples interpolated into the system prompt under the
# "Few-shot examples:" heading. Each example is a "Sentence:" marker, the
# sentence itself, then one "CATEGORY: entity" line per expected label.
# NOTE: this text is sent to the model verbatim, so any edit changes model behaviour.
fewshot_example = """
Sentence:
Dr. Connie examined patient Florrie Minion at Kangaroo Island Health Service on June 20, 1989. Her medical record number 4402074.WNE and lab ID 44B20748 were recorded in the Department of Cardiology, located at Blue Cow Street, Camden Haven, Western Australia, ZIP 5067.
DOCTOR: Connie
PATIENT: Florrie Minion
HOSPITAL: Kangaroo Island Health Service
DATE: June 20, 1989
MEDICAL_RECORD_NUMBER: 4402074.WNE
ID_NUMBER: 44B20748
DEPARTMENT: Department of Cardiology
STREET: Blue Cow Street
CITY: Camden Haven
STATE: Western Australia
ZIP: 5067

Sentence:
Ashley, a chiropractor, visited the HR department twice a week. The chiropractor worked three hours in the evening at the district office in Greenwich, then spoke with Carl and Ivan’s dad from Cheshire County.
PERSONALNAME: Ashley
PROFESSION: chiropractor
DEPARTMENT: HR department
SET: twice a week
PROFESSION: chiropractor
DURATION: three hours
TIME: evening
DISTRICT: Greenwich
PERSONALNAME: Carl
FAMILYNAME: Ivan
COUNTY: Cheshire

Sentence:
On Monday morning in Victoria, Western Australia, I met with the organization Orizia. We verified ID 57X22961, confirmed her age is 65, and she stayed previously at P.O. Box 15.
TIME: Monday morning
STATE: Victoria
STATE: Western Australia
ORGANIZATION: Orizia
ID_NUMBER: 57X22961
AGE: 65
LOCATION-OTHER: P.O. Box 15

Sentence:
He came in last year, stayed for three weeks, returned two hours later, then again every Monday morning around two, left yesterday afternoon, and usually came back two hours later once a week for tests. Push him to A3.
DATE: last year
DURATION: three weeks
DURATION: two hours
SET: every Monday
TIME: Monday morning
TIME: two
TIME: yesterday afternoon
DURATION: two hours
SET: once a week
ROOM: A3

Sentence:
Beaconsfield District Health Service confirmed Ramona's follow-up was at 9:11 a.m. on Oxendon Street, Kyabram, ZIP 7000, where her medical record 4402074.WNE was reviewed again. Her lab number was 44B20748.
HOSPITAL: Beaconsfield District Health Service
PATIENT: Ramona
TIME: 9:11 a.m.
STREET: Oxendon Street
CITY: Kyabram
ZIP: 7000
MEDICAL_RECORD_NUMBER: 4402074.WNE
ID_NUMBER: 44B20748
"""

## 設定

In [None]:
import time
import openai
from tqdm import tqdm
from openai import OpenAI

# API key for the Groq endpoint. The GROQ_API_KEY environment variable wins when
# it is set, so the secret does not have to be stored in the notebook; the
# original placeholder literal stays as the fallback for backward compatibility.
Groq_key = os.environ.get("GROQ_API_KEY", "your_API_key")

# OpenAI-SDK client pointed at the Groq OpenAI-compatible base URL.
client = OpenAI(
    api_key=Groq_key,
    base_url="https://api.groq.com/openai/v1",
)


## 測試 有checkpoint的，LLM1

In [None]:
import time
import os
import openai
from tqdm import tqdm
from openai import OpenAI
import random

# ======== Wrapper: abort the run as soon as a request fails ========
def chat_with_stop_on_error(messages, fid, model="llama3-70b-8192", max_tokens=512, temperature=0.0):
    """Send one chat-completion request and return the stripped reply text.

    Args:
        messages: Chat messages passed unchanged to
            ``client.chat.completions.create``.
        fid: Identifier of the item being processed; used only in the error
            report.
        model: Model name forwarded to the API.
        max_tokens: Completion token cap forwarded to the API.
        temperature: Sampling temperature forwarded to the API.

    Returns:
        The content of the first returned choice with surrounding whitespace
        removed.

    Raises:
        RuntimeError: If anything fails while sending the request or reading
            the reply. The underlying exception is attached as ``__cause__``.
    """
    try:
        response = client.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Deliberately broad: every failure is converted into the single
        # exception type that the calling loop catches to stop the run.
        print(f"[FATAL ERROR] Error at fid {fid}: {e}")
        # "from e" keeps the real cause explicit in the traceback.
        raise RuntimeError(f"Program stopped due to error at fid {fid}.") from e

# ======== On start-up: find the last fid in the answer file and purge its rows ========
# The fid of the last non-blank row becomes the restart point. All rows of that
# fid are removed from the file because the fid is processed again on resume.
restart_fid = None
if os.path.exists(submission_task2_answer_L):
    with open(submission_task2_answer_L, encoding="utf-8") as answer_file:
        raw_rows = answer_file.readlines()

    # Blank rows are ignored when looking for the most recent entry.
    non_blank = [row.strip() for row in raw_rows if row.strip()]
    if non_blank and "\t" in non_blank[-1]:
        restart_fid = non_blank[-1].split("\t")[0]

    if restart_fid:
        fid_prefix = restart_fid + "\t"
        with open(submission_task2_answer_L, "w", encoding="utf-8") as answer_file:
            answer_file.writelines(
                row for row in raw_rows if not row.startswith(fid_prefix)
            )

# ======== Core prompt configuration ========
# System message sent with every request: a fixed role description followed by
# the three module-level prompt sections (special_note, Rules, fewshot_example),
# each under its own heading.
system_prompt = f"""
You are an expert at extracting PHI (Protected Health Information) entities from doctor-patient conversations or Daily conversation.

Special note:
{special_note}
Rules:
{Rules}
Few-shot examples:
{fewshot_example}
"""

# ======== Main routine: handle one sentence at a time and stop at the first error ========
# Reads "<fid>\t<sentence>" lines from the task 1 answer file, asks the model for
# PHI labels and appends "<fid>\t<category>\t<entity>" rows to the task 2 file.
try:
    with open(submission_task1_answer_L, encoding="utf-8") as fin, \
         open(submission_task2_answer_L, "a", encoding="utf-8") as fout:

        system_messages = [{"role": "system", "content": system_prompt}]
        # Without a checkpoint we start right away; otherwise every line is
        # skipped until the checkpointed fid is reached again.
        start_processing = restart_fid is None

        pbar = tqdm(fin, desc="Extracting PHI entities")
        for line in pbar:
            if "\t" not in line:
                continue
            # partition() never raises. After strip() the separator is gone
            # when the fid or the sentence is empty (e.g. "123\t\n"), a case in
            # which the former two-value unpacking failed with ValueError.
            fid, sep, sentence = line.strip().partition("\t")
            if not sep:
                print(f"[SKIP] Line without fid or sentence: {line!r}")
                continue

            pbar.set_description(f"Extracting PHI entities (FID {fid})")
            if not start_processing:
                if fid != restart_fid:
                    continue
                start_processing = True

            user_message = [{"role": "user", "content": f"Sentence:\n{sentence}"}]
            messages = system_messages + user_message

            prediction = chat_with_stop_on_error(messages, fid=fid)
            written_preds = []

            for pred_line in prediction.splitlines():
                pred_line = pred_line.strip()
                # Accept spacing variants of the "no entity" marker such as
                # "PHI: NULL" instead of writing them out as a label row.
                if "".join(pred_line.split()).upper() == "PHI:NULL":
                    continue
                category, colon, entity = pred_line.partition(":")
                category = category.strip()
                entity = entity.strip()
                # Lines without a colon, or with an empty category/entity
                # (e.g. an echoed "Sentence:" header), are not labels.
                if not (colon and category and entity):
                    continue
                fout.write(f"{fid}\t{category}\t{entity}\n")
                written_preds.append(f"{category}: {entity}")

            # Push the rows of this fid to disk before the long pause so the
            # answer file can serve as a checkpoint.
            fout.flush()

            if written_preds:
                print(f"[WRITE] FID {fid}: {written_preds}")
            else:
                print(f"[WRITE] FID {fid}: (No entities extrated)")
            time.sleep(35 + random.uniform(1, 30))  # randomised pause between requests (jitter)

except RuntimeError as e:
    print(f"[EXIT] {e}")
    exit(1)