In [None]:
# Mount Google Drive at /content/drive; the input and output paths used by this
# notebook all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Log in to the Hugging Face Hub with the value stored under the 'HF_API' key of
# Colab's userdata store, so the token is not written into the notebook itself.
from huggingface_hub import login
from google.colab import userdata
huggingface = userdata.get('HF_API')
login(huggingface)

In [None]:
!pip install python.Levenshtein

Collecting python.Levenshtein
  Downloading python_levenshtein-0.27.3-py3-none-any.whl.metadata (3.9 kB)
Collecting Levenshtein==0.27.3 (from python.Levenshtein)
  Downloading levenshtein-0.27.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (3.7 kB)
Collecting rapidfuzz<4.0.0,>=3.9.0 (from Levenshtein==0.27.3->python.Levenshtein)
  Downloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (12 kB)
Downloading python_levenshtein-0.27.3-py3-none-any.whl (9.5 kB)
Downloading levenshtein-0.27.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (153 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.3/153.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m88.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packa

In [None]:
import json
import numpy as np
import pandas as pd
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform
import Levenshtein
from transformers import pipeline

# ========== Hyperparameters ==========
NER_THRESHOLD = 0.8  # minimum NER confidence for an entity to be kept (0.0 ~ 1.0)
CLUSTER_THRESHOLD = 1.5  # distance cut-off used when clustering names
# =====================================

# ========== File paths ==========
# Input: JSON file holding the transcript segments.
input_json_path = '/content/drive/MyDrive/1106_오후회의_whisper.json'

# Outputs: annotated segments, name clusters, representative names, check transcript.
output_json_path = '/content/drive/MyDrive/1106_오후회의_stt_segments_with_name_nf.json'
cluster_json_path = '/content/drive/MyDrive/1106_오후회의_name_clusters_nf.json'
final_namelist_path = '/content/drive/MyDrive/1106_오후회의_final_namelist_nf.txt'
output_txt_path = '/content/drive/MyDrive/1106_오후회의_name_check_transcript_nf.txt'
# ================================

print("=" * 60, "이름 추출 및 군집화 파이프라인 시작", "=" * 60, sep="\n")

# ========== Load the model ==========
print("\n[1/5] NER 모델 로딩 중...")
ner = pipeline(
    task="token-classification",
    model="seungkukim/korean-pii-masking",
    aggregation_strategy="simple",
    device_map="auto",
)
print("✓ 모델 로드 완료")

# ========== Function definitions ==========
def extract_person_names(ner_results, threshold=0.85):
    """Collect the person-name entities from a token-classification result.

    Args:
        ner_results: Sequence of entity dicts; each one is read for its
            ``'score'``, ``'entity_group'`` and ``'word'`` keys. A falsy value
            (``None`` or an empty list) produces an empty result.
        threshold: Minimum ``'score'`` (inclusive) an entity must reach.

    Returns:
        A list of ``{'name': <word>, 'score': <score>}`` dicts, in input order,
        for the entities whose group is ``'PS_NAME'``.
    """
    if not ner_results:
        return []

    return [
        {'name': found['word'], 'score': found['score']}
        for found in ner_results
        if found['score'] >= threshold and found['entity_group'] == 'PS_NAME'
    ]

def cluster_names(name_score_dict, threshold=1.5):
    """Group similar names by Levenshtein distance and pick a representative per group.

    The names are clustered hierarchically (average linkage) on their pairwise
    Levenshtein distances and the tree is cut at ``threshold``. Inside each
    cluster, the name with the highest score becomes the representative.

    Args:
        name_score_dict: Mapping of name -> score; its keys are the names to cluster.
        threshold: Distance at which the linkage tree is cut
            (``fcluster`` with ``criterion='distance'``).

    Returns:
        Dict keyed by representative name. A cluster with a single member maps
        to that name as a plain string; a cluster with several members maps to
        the list of all its members (representative included), sorted by
        descending score.
    """
    names = list(name_score_dict.keys())
    total = len(names)

    # Degenerate inputs: nothing to cluster, or one name that represents itself.
    if total == 0:
        return {}
    if total == 1:
        return {names[0]: names[0]}

    # Symmetric pairwise distance matrix; the diagonal stays at zero.
    pairwise = np.zeros((total, total))
    for row in range(1, total):
        for col in range(row):
            gap = Levenshtein.distance(names[col], names[row])
            pairwise[col][row] = gap
            pairwise[row][col] = gap

    # linkage() expects the condensed (upper-triangle) form of the matrix.
    tree = linkage(squareform(pairwise), method='average')
    labels = fcluster(tree, threshold, criterion='distance')

    # Collect the members of every cluster label, keeping input order.
    members_by_label = {}
    for name, label in zip(names, labels):
        members_by_label.setdefault(label, []).append(name)

    name_clusters = {}
    for members in members_by_label.values():
        # Highest score first; the front of the ranking is the representative.
        ranked = sorted(members, key=lambda member: name_score_dict[member], reverse=True)
        if len(members) == 1:
            name_clusters[ranked[0]] = members[0]
        else:
            name_clusters[ranked[0]] = ranked

    return name_clusters

# ========== STEP 1: Run NER ==========
print("\n[2/5] NER 수행 중...")
with open(input_json_path, 'r', encoding='utf-8') as f:
    segments = json.load(f)

print(f"  - 총 {len(segments)}개 세그먼트 처리")
print(f"  - NER 임계값: {NER_THRESHOLD}")

segments_with_names = []
all_names = []
name_scores = {}  # highest score seen so far for each name

for position, segment in enumerate(segments, start=1):
    # Report progress every 100 segments.
    if position % 100 == 0:
        print(f"  - 진행: {position}/{len(segments)}")

    text, start_time, end_time = segment['text'], segment['start'], segment['end']

    detected = extract_person_names(ner(text), threshold=NER_THRESHOLD)
    person_names = [hit['name'] for hit in detected]

    # Remember the best score observed for every name.
    for hit in detected:
        best_so_far = name_scores.get(hit['name'])
        if best_so_far is None or hit['score'] > best_so_far:
            name_scores[hit['name']] = hit['score']

    # Segments without any detected name carry None instead of an empty list.
    segments_with_names.append({
        'text': text,
        'start': start_time,
        'end': end_time,
        'name': person_names or None,
    })
    all_names.extend(person_names)

unique_names = sorted(set(all_names))
print(f"✓ NER 완료: {len(unique_names)}개 고유 이름 추출")

# ========== STEP 2: Cluster the names ==========
print("\n[3/5] 이름 군집화 중...")
print(f"  - 군집화 임계값: {CLUSTER_THRESHOLD}")

if not unique_names:
    # Nothing was extracted, so there is nothing to cluster.
    name_clusters = {}
    final_namelist = []
    print("  - 추출된 이름 없음")
else:
    name_clusters = cluster_names(name_scores, threshold=CLUSTER_THRESHOLD)
    final_namelist = sorted(name_clusters)

    # Count the clusters whose value is a list with more than one member.
    multi_clusters = len([
        members for members in name_clusters.values()
        if isinstance(members, list) and len(members) > 1
    ])
    print(f"✓ 군집화 완료: {len(unique_names)} → {len(final_namelist)}개 대표명")
    print(f"  - 유사 이름 군집: {multi_clusters}개")

# ========== STEP 3: Build the name-check transcript ==========
print("\n[4/5] 이름 체크 트랜스크립트 생성 중...")

# Set form of unique_names for fast membership tests.
unique_names_set = set(unique_names)

output_lines = []
name_found_count = 0

for segment in segments_with_names:
    text = segment['text']
    names = segment.get('name')

    # Normalise the 'name' field into a list of candidates: a list is used
    # as-is, a non-empty string counts as one name, anything else as none.
    if isinstance(names, list):
        candidates = names
    elif isinstance(names, str) and names != '':
        candidates = [names]
    else:
        candidates = []

    # A segment is marked when at least one candidate is a known unique name.
    has_valid_name = any(candidate in unique_names_set for candidate in candidates)
    if has_valid_name:
        name_found_count += 1

    check_mark = 'v' if has_valid_name else ' '
    output_lines.append(f"[{check_mark}] '{text}'")

print(f"✓ 트랜스크립트 생성 완료: {name_found_count}/{len(segments_with_names)}개 세그먼트에서 이름 발견")

# ========== STEP 4: Save the result files ==========
print("\n[5/5] 결과 파일 저장 중...")

# 1. Segments annotated with the detected names (always written).
with open(output_json_path, 'w', encoding='utf-8') as f:
    json.dump(segments_with_names, f, ensure_ascii=False, indent=2)
print(f"  ✓ {output_json_path}")

# 2. Name clusters (written only when at least one cluster exists).
if name_clusters:
    with open(cluster_json_path, 'w', encoding='utf-8') as f:
        json.dump(name_clusters, f, ensure_ascii=False, indent=2)
    print(f"  ✓ {cluster_json_path}")

# 3. Representative names, one per line (written only when non-empty).
if final_namelist:
    with open(final_namelist_path, 'w', encoding='utf-8') as f:
        f.writelines(representative + '\n' for representative in final_namelist)
    print(f"  ✓ {final_namelist_path}")

# 4. Name-check transcript, one line per segment (always written).
with open(output_txt_path, 'w', encoding='utf-8') as f:
    f.writelines(transcript_line + '\n' for transcript_line in output_lines)
print(f"  ✓ {output_txt_path}")

# ========== Final statistics ==========
print("\n" + "="*60)
print("처리 완료!")
print("="*60)
print(f"NER 추출 이름: {len(unique_names)}개")
print(f"최종 대표명: {len(final_namelist)}개")

# An input file without segments gives a zero denominator; report 0.0% in that
# case instead of raising ZeroDivisionError after all files were already saved.
total_segments = len(segments_with_names)
found_percent = name_found_count / total_segments * 100 if total_segments else 0.0
print(f"이름 발견 세그먼트: {name_found_count}/{total_segments}개 ({found_percent:.1f}%)")
print("="*60)