In [10]:
from pydub import AudioSegment
from pydub.utils import make_chunks
import time
import io

# 1. 오디오 파일 불러오기
audio = AudioSegment.from_wav("data1.wav")

# 2. 청크 단위로 자르기 (예: 1초 = 1000ms)
chunk_length_ms = 1000  # 1초 단위
chunks = make_chunks(audio, chunk_length_ms)

# 3. 스트리밍 시뮬레이션
print("🔊 Start streaming...\n")

for i, chunk in enumerate(chunks):
    # 실제 스트리밍이라면 여기에 전송 코드가 들어감
    buf = io.BytesIO()
    chunk.export(buf, format="wav")
    
    # 예시: 전송받은 chunk를 처리하는 부분 (여기서는 그냥 출력)
    print(f"✅ Sent chunk {i+1}/{len(chunks)} - Duration: {len(chunk)} ms")
    
    # 1초 간격으로 전송 (실시간처럼)
    time.sleep(chunk_length_ms / 1000.0)

print("\n🚀 All chunks sent.")



🔊 Start streaming...

✅ Sent chunk 1/88 - Duration: 1000 ms
✅ Sent chunk 2/88 - Duration: 1000 ms
✅ Sent chunk 3/88 - Duration: 1000 ms
✅ Sent chunk 4/88 - Duration: 1000 ms
✅ Sent chunk 5/88 - Duration: 1000 ms
✅ Sent chunk 6/88 - Duration: 1000 ms
✅ Sent chunk 7/88 - Duration: 1000 ms
✅ Sent chunk 8/88 - Duration: 1000 ms
✅ Sent chunk 9/88 - Duration: 1000 ms
✅ Sent chunk 10/88 - Duration: 1000 ms
✅ Sent chunk 11/88 - Duration: 1000 ms
✅ Sent chunk 12/88 - Duration: 1000 ms
✅ Sent chunk 13/88 - Duration: 1000 ms
✅ Sent chunk 14/88 - Duration: 1000 ms
✅ Sent chunk 15/88 - Duration: 1000 ms
✅ Sent chunk 16/88 - Duration: 1000 ms
✅ Sent chunk 17/88 - Duration: 1000 ms
✅ Sent chunk 18/88 - Duration: 1000 ms
✅ Sent chunk 19/88 - Duration: 1000 ms
✅ Sent chunk 20/88 - Duration: 1000 ms
✅ Sent chunk 21/88 - Duration: 1000 ms
✅ Sent chunk 22/88 - Duration: 1000 ms
✅ Sent chunk 23/88 - Duration: 1000 ms
✅ Sent chunk 24/88 - Duration: 1000 ms
✅ Sent chunk 25/88 - Duration: 1000 ms
✅ Sent chunk

In [11]:
import time
import numpy as np
import torch
import torchaudio
from resemblyzer import VoiceEncoder
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# 파일명 및 설정
wav_path = "data1.wav"
chunk_ms = 1000
step_ms = 700
sr_target = 16000

# silero-vad 모델 불러오기
model, utils = torch.hub.load('snakers4/silero-vad', 'silero_vad', trust_repo=True)
(get_speech_ts, _, _, _, _) = utils

# Resemblyzer 초기화
encoder = VoiceEncoder()

# 오디오 전체 로드
wav, sr = torchaudio.load(wav_path)
if sr != sr_target:
    wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sr_target)(wav)
    sr = sr_target
wav = wav.mean(dim=0).numpy()

# 스트리밍 시뮬레이션 시작
chunk_size = int(sr * chunk_ms / 1000)
step_size = int(sr * step_ms / 1000)
start_idx = 0

buffered_embeddings = []
segment_times = []

print("🎙️ 실시간 스트리밍 및 화자 분리 시작...\n")

while start_idx + chunk_size < len(wav):
    end_idx = start_idx + chunk_size
    chunk = wav[start_idx:end_idx]
    audio_tensor = torch.tensor(chunk)

    # 음성 유무 판단
    speech_ts = get_speech_ts(audio_tensor, model, sampling_rate=sr)
    if len(speech_ts) > 0:
        embed = encoder.embed_utterance(chunk, return_partials=False)
        buffered_embeddings.append(embed)
        segment_times.append((start_idx / sr, end_idx / sr))

        # 최소 3개 이상이 쌓였을 때부터 클러스터링
        if len(buffered_embeddings) >= 3:
            emb_arr = np.vstack(buffered_embeddings)
            scaled = StandardScaler().fit_transform(emb_arr)
            labels = AgglomerativeClustering(n_clusters=2).fit_predict(scaled)

            print("🧠 화자 분리 결과:")
            for i, (s, e) in enumerate(segment_times):
                print(f"  [화자{labels[i]+1}] {s:.2f}s ~ {e:.2f}s")

            print("-" * 30)

    start_idx += step_size
    time.sleep(step_ms / 1000.0)  # 실시간 시뮬레이션을 위한 sleep

print("\n✅ 스트리밍 종료")


Using cache found in C:\Users\fluffycat/.cache\torch\hub\snakers4_silero-vad_master
  checkpoint = torch.load(weights_fpath, map_location="cpu")


Loaded the voice encoder model on cuda in 4.66 seconds.
🎙️ 실시간 스트리밍 및 화자 분리 시작...



  "class": algorithms.Blowfish,


🧠 화자 분리 결과:
  [화자1] 0.00s ~ 1.00s
  [화자1] 0.70s ~ 1.70s
  [화자2] 1.40s ~ 2.40s
------------------------------
🧠 화자 분리 결과:
  [화자1] 0.00s ~ 1.00s
  [화자1] 0.70s ~ 1.70s
  [화자2] 1.40s ~ 2.40s
  [화자2] 2.10s ~ 3.10s
------------------------------
🧠 화자 분리 결과:
  [화자1] 0.00s ~ 1.00s
  [화자1] 0.70s ~ 1.70s
  [화자2] 1.40s ~ 2.40s
  [화자2] 2.10s ~ 3.10s
  [화자1] 2.80s ~ 3.80s
------------------------------
🧠 화자 분리 결과:
  [화자1] 0.00s ~ 1.00s
  [화자1] 0.70s ~ 1.70s
  [화자2] 1.40s ~ 2.40s
  [화자2] 2.10s ~ 3.10s
  [화자1] 2.80s ~ 3.80s
  [화자1] 4.20s ~ 5.20s
------------------------------
🧠 화자 분리 결과:
  [화자1] 0.00s ~ 1.00s
  [화자1] 0.70s ~ 1.70s
  [화자2] 1.40s ~ 2.40s
  [화자2] 2.10s ~ 3.10s
  [화자1] 2.80s ~ 3.80s
  [화자1] 4.20s ~ 5.20s
  [화자2] 4.90s ~ 5.90s
------------------------------
🧠 화자 분리 결과:
  [화자2] 0.00s ~ 1.00s
  [화자2] 0.70s ~ 1.70s
  [화자1] 1.40s ~ 2.40s
  [화자1] 2.10s ~ 3.10s
  [화자2] 2.80s ~ 3.80s
  [화자2] 4.20s ~ 5.20s
  [화자1] 4.90s ~ 5.90s
  [화자1] 5.60s ~ 6.60s
------------------------------
🧠 화자 분리 결과:
  [화

In [12]:
import time
import csv
import numpy as np
import torch
import torchaudio
from resemblyzer import VoiceEncoder
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

# 파일 경로 및 설정
wav_path = "data1.wav"
chunk_ms = 1000
step_ms = 700
sr_target = 16000

# silero-vad 로드
model, utils = torch.hub.load('snakers4/silero-vad', 'silero_vad', trust_repo=True)
(get_speech_ts, _, _, _, _) = utils

# Resemblyzer
encoder = VoiceEncoder()

# 오디오 불러오기 및 리샘플링
wav, sr = torchaudio.load(wav_path)
if sr != sr_target:
    wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sr_target)(wav)
    sr = sr_target
wav = wav.mean(dim=0).numpy()

# 설정
chunk_size = int(sr * chunk_ms / 1000)
step_size = int(sr * step_ms / 1000)
start_idx = 0

# 저장용 버퍼
buffered_embeddings = []
segment_times = []
diarization_output = []

print("🎙️ 실시간 스트리밍 및 화자 분리 시작...\n")

# 스트리밍 루프
while start_idx + chunk_size < len(wav):
    end_idx = start_idx + chunk_size
    chunk = wav[start_idx:end_idx]
    audio_tensor = torch.tensor(chunk)

    # VAD로 음성 검출
    speech_ts = get_speech_ts(audio_tensor, model, sampling_rate=sr)
    if len(speech_ts) > 0:
        embed = encoder.embed_utterance(chunk, return_partials=False)
        buffered_embeddings.append(embed)
        segment_times.append((start_idx / sr, end_idx / sr))

        # 최소 3개 이상일 때 클러스터링 수행
        if len(buffered_embeddings) >= 3:
            emb_arr = np.vstack(buffered_embeddings)
            scaled = StandardScaler().fit_transform(emb_arr)
            labels = AgglomerativeClustering(n_clusters=2).fit_predict(scaled)

            print("🧠 화자 분리 결과:")
            diarization_output.clear()  # 매번 덮어쓰기 위해 초기화
            for i, (s, e) in enumerate(segment_times):
                speaker_id = labels[i] + 1
                print(f"  [화자{speaker_id}] {s:.2f}s ~ {e:.2f}s")
                diarization_output.append({
                    "speaker": speaker_id,
                    "start_sec": round(s, 2),
                    "end_sec": round(e, 2),
                    "duration_sec": round(e - s, 2)
                })
            print("-" * 30)

    # 스트리밍 시뮬레이션
    start_idx += step_size
    time.sleep(step_ms / 1000.0)

print("\n✅ 스트리밍 종료")

# diarization_result.csv 저장
with open("diarization_result.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["speaker", "start_sec", "end_sec", "duration_sec"])
    writer.writeheader()
    writer.writerows(diarization_output)

print("📁 diarization_result.csv 저장 완료.")


Using cache found in C:\Users\fluffycat/.cache\torch\hub\snakers4_silero-vad_master
  checkpoint = torch.load(weights_fpath, map_location="cpu")


Loaded the voice encoder model on cuda in 0.32 seconds.
🎙️ 실시간 스트리밍 및 화자 분리 시작...

🧠 화자 분리 결과:
  [화자1] 0.00s ~ 1.00s
  [화자1] 0.70s ~ 1.70s
  [화자2] 1.40s ~ 2.40s
------------------------------
🧠 화자 분리 결과:
  [화자1] 0.00s ~ 1.00s
  [화자1] 0.70s ~ 1.70s
  [화자2] 1.40s ~ 2.40s
  [화자2] 2.10s ~ 3.10s
------------------------------
🧠 화자 분리 결과:
  [화자1] 0.00s ~ 1.00s
  [화자1] 0.70s ~ 1.70s
  [화자2] 1.40s ~ 2.40s
  [화자2] 2.10s ~ 3.10s
  [화자1] 2.80s ~ 3.80s
------------------------------
🧠 화자 분리 결과:
  [화자1] 0.00s ~ 1.00s
  [화자1] 0.70s ~ 1.70s
  [화자2] 1.40s ~ 2.40s
  [화자2] 2.10s ~ 3.10s
  [화자1] 2.80s ~ 3.80s
  [화자1] 4.20s ~ 5.20s
------------------------------
🧠 화자 분리 결과:
  [화자1] 0.00s ~ 1.00s
  [화자1] 0.70s ~ 1.70s
  [화자2] 1.40s ~ 2.40s
  [화자2] 2.10s ~ 3.10s
  [화자1] 2.80s ~ 3.80s
  [화자1] 4.20s ~ 5.20s
  [화자2] 4.90s ~ 5.90s
------------------------------
🧠 화자 분리 결과:
  [화자2] 0.00s ~ 1.00s
  [화자2] 0.70s ~ 1.70s
  [화자1] 1.40s ~ 2.40s
  [화자1] 2.10s ~ 3.10s
  [화자2] 2.80s ~ 3.80s
  [화자2] 4.20s ~ 5.20s
  [화자1] 

In [13]:
print("\n💾 화자별 음성 저장 중...")

# 음성 전체를 AudioSegment로 불러오기 (타입 변환 필수)
wav_full = AudioSegment(
    data=wav.astype(np.float32).tobytes(),
    frame_rate=sr,
    sample_width=4,  # float32 = 4 bytes
    channels=1
)

# 화자별 segment 모으기
speaker_segments = {}
for seg in diarization_output:
    speaker = seg["speaker"]
    start_ms = int(seg["start_sec"] * 1000)
    end_ms = int(seg["end_sec"] * 1000)
    chunk = wav_full[start_ms:end_ms]
    
    if speaker not in speaker_segments:
        speaker_segments[speaker] = chunk
    else:
        speaker_segments[speaker] += chunk  # 이어붙이기

# 저장
for speaker, audio in speaker_segments.items():
    filename = f"speaker_{speaker}.wav"
    audio.export(filename, format="wav")
    print(f"✅ {filename} 저장 완료")

print("📁 모든 화자별 음원 저장 완료.")


💾 화자별 음성 저장 중...
✅ speaker_1.wav 저장 완료
✅ speaker_2.wav 저장 완료
📁 모든 화자별 음원 저장 완료.


In [15]:
import time
import csv
import numpy as np
import torch
import torchaudio
from resemblyzer import VoiceEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from pydub import AudioSegment

# 설정
wav_path = "data1.wav"
chunk_ms = 1000
step_ms = 700
sr_target = 16000
n_speakers = 2
SIM_THRESHOLD = 0.75

# 모델 로드
model, utils = torch.hub.load('snakers4/silero-vad', 'silero_vad', trust_repo=True)
(get_speech_ts, _, _, _, _) = utils
encoder = VoiceEncoder()

# 오디오 로딩
wav, sr = torchaudio.load(wav_path)
if sr != sr_target:
    wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sr_target)(wav)
    sr = sr_target
wav = wav.mean(dim=0).numpy()

# 스트리밍 설정
chunk_size = int(sr * chunk_ms / 1000)
step_size = int(sr * step_ms / 1000)
start_idx = 0

# centroid 초기화용
centroids = [None] * n_speakers
clusters = {i + 1: [] for i in range(n_speakers)}
diarization_output = []

print(f"🎙️ 고정 화자 수: {n_speakers}명 - 실시간 분리 시작...\n")

# 전체 오디오를 AudioSegment로 변환
wav_full = AudioSegment(
    data=wav.astype(np.float32).tobytes(),
    frame_rate=sr,
    sample_width=4,
    channels=1
)

# 초기화 flag
centroid_initialized = False
init_chunks_per_speaker = 3

# 루프
while start_idx + chunk_size < len(wav):
    end_idx = start_idx + chunk_size
    chunk = wav[start_idx:end_idx]
    audio_tensor = torch.tensor(chunk)

    speech_ts = get_speech_ts(audio_tensor, model, sampling_rate=sr)
    if len(speech_ts) > 0:
        embed = encoder.embed_utterance(chunk, return_partials=False).reshape(1, -1)

        if not centroid_initialized:
            # 초기 centroid 설정
            for i in range(n_speakers):
                if len(clusters[i + 1]) < init_chunks_per_speaker:
                    clusters[i + 1].append(embed[0])
                    print(f"▶️ 초기화 중: 화자{i+1} → chunk {len(clusters[i+1])}")
                    break
            if all(len(clusters[i + 1]) >= init_chunks_per_speaker for i in range(n_speakers)):
                for i in range(n_speakers):
                    centroids[i] = np.mean(clusters[i + 1], axis=0)
                centroid_initialized = True
                print("\n✅ centroid 초기화 완료!\n")
            start_idx += step_size
            time.sleep(step_ms / 1000.0)
            continue

        # 유사도 기반 할당
        sims = cosine_similarity(embed, np.vstack(centroids)).flatten()
        speaker_id = np.argmax(sims) + 1

        # centroid 업데이트
        clusters[speaker_id].append(embed[0])
        centroids[speaker_id - 1] = np.mean(clusters[speaker_id], axis=0)

        # 결과 저장
        start_sec = round(start_idx / sr, 2)
        end_sec = round(end_idx / sr, 2)
        diarization_output.append({
            "speaker": speaker_id,
            "start_sec": start_sec,
            "end_sec": end_sec,
            "duration_sec": round(end_sec - start_sec, 2)
        })
        print(f"[화자{speaker_id}] {start_sec:.2f}s ~ {end_sec:.2f}s")

    start_idx += step_size
    time.sleep(step_ms / 1000.0)

print("\n✅ 스트리밍 종료")

# diarization_result.csv 저장
with open("diarization_result.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["speaker", "start_sec", "end_sec", "duration_sec"])
    writer.writeheader()
    writer.writerows(diarization_output)

print("📁 diarization_result.csv 저장 완료.")

# 화자별 음성 저장
print("💾 화자별 음성 저장 중...")
speaker_segments = {}
for seg in diarization_output:
    speaker = seg["speaker"]
    start_ms = int(seg["start_sec"] * 1000)
    end_ms = int(seg["end_sec"] * 1000)
    chunk = wav_full[start_ms:end_ms]
    if speaker not in speaker_segments:
        speaker_segments[speaker] = chunk
    else:
        speaker_segments[speaker] += chunk

for speaker, audio in speaker_segments.items():
    filename = f"speaker_{speaker}.wav"
    audio.export(filename, format="wav")
    print(f"✅ {filename} 저장 완료")

print("📁 모든 화자별 음원 저장 완료.")


Using cache found in C:\Users\fluffycat/.cache\torch\hub\snakers4_silero-vad_master
  checkpoint = torch.load(weights_fpath, map_location="cpu")


Loaded the voice encoder model on cuda in 0.03 seconds.
🎙️ 고정 화자 수: 2명 - 실시간 분리 시작...

▶️ 초기화 중: 화자1 → chunk 1
▶️ 초기화 중: 화자1 → chunk 2
▶️ 초기화 중: 화자1 → chunk 3
▶️ 초기화 중: 화자2 → chunk 1
▶️ 초기화 중: 화자2 → chunk 2
▶️ 초기화 중: 화자2 → chunk 3

✅ centroid 초기화 완료!

[화자2] 4.90s ~ 5.90s
[화자2] 5.60s ~ 6.60s
[화자1] 6.30s ~ 7.30s
[화자1] 7.00s ~ 8.00s
[화자1] 7.70s ~ 8.70s
[화자1] 8.40s ~ 9.40s
[화자1] 9.10s ~ 10.10s
[화자1] 9.80s ~ 10.80s
[화자1] 10.50s ~ 11.50s
[화자1] 11.20s ~ 12.20s
[화자1] 11.90s ~ 12.90s
[화자1] 12.60s ~ 13.60s
[화자1] 13.30s ~ 14.30s
[화자1] 14.00s ~ 15.00s
[화자1] 14.70s ~ 15.70s
[화자1] 15.40s ~ 16.40s
[화자1] 16.10s ~ 17.10s
[화자1] 17.50s ~ 18.50s
[화자1] 18.20s ~ 19.20s
[화자1] 18.90s ~ 19.90s
[화자2] 19.60s ~ 20.60s
[화자1] 20.30s ~ 21.30s
[화자1] 21.00s ~ 22.00s
[화자1] 21.70s ~ 22.70s
[화자2] 22.40s ~ 23.40s
[화자2] 23.10s ~ 24.10s
[화자2] 23.80s ~ 24.80s
[화자1] 24.50s ~ 25.50s
[화자1] 25.20s ~ 26.20s
[화자1] 25.90s ~ 26.90s
[화자1] 26.60s ~ 27.60s
[화자1] 27.30s ~ 28.30s
[화자1] 28.00s ~ 29.00s
[화자1] 28.70s ~ 29.70s
[화자1] 29.40s ~

In [16]:
import time
import csv
import numpy as np
import torch
import torchaudio
from resemblyzer import VoiceEncoder
from sklearn.metrics.pairwise import cosine_similarity
from pydub import AudioSegment
from collections import defaultdict

# 설정
wav_path = "data1.wav"
chunk_ms = 1000
step_ms = 700
sr_target = 16000
n_speakers = 2
SIM_THRESHOLD = 0.75

# 모델 로드
model, utils = torch.hub.load('snakers4/silero-vad', 'silero_vad', trust_repo=True)
(get_speech_ts, _, _, _, _) = utils
encoder = VoiceEncoder()

# 오디오 로딩 및 리샘플링
wav, sr = torchaudio.load(wav_path)
if sr != sr_target:
    wav = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sr_target)(wav)
    sr = sr_target
wav = wav.mean(dim=0).numpy()

# int16 변환 (울림 방지)
wav_int16 = np.int16(wav * 32767)
wav_full = AudioSegment(
    data=wav_int16.tobytes(),
    frame_rate=sr,
    sample_width=2,  # int16
    channels=1
)

# 스트리밍 설정
chunk_size = int(sr * chunk_ms / 1000)
step_size = int(sr * step_ms / 1000)
start_idx = 0

# centroid 및 클러스터 초기화
centroids = [None] * n_speakers
clusters = {i + 1: [] for i in range(n_speakers)}
diarization_output = []

print(f"🎙️ 고정 화자 수: {n_speakers}명 - 실시간 분리 시작...\n")

# 초기화 flag
centroid_initialized = False
init_chunks_per_speaker = 3

# 루프
while start_idx + chunk_size < len(wav):
    end_idx = start_idx + chunk_size
    chunk = wav[start_idx:end_idx]
    audio_tensor = torch.tensor(chunk)

    speech_ts = get_speech_ts(audio_tensor, model, sampling_rate=sr)
    if len(speech_ts) > 0:
        embed = encoder.embed_utterance(chunk, return_partials=False).reshape(1, -1)

        if not centroid_initialized:
            # centroid 초기화
            for i in range(n_speakers):
                if len(clusters[i + 1]) < init_chunks_per_speaker:
                    clusters[i + 1].append(embed[0])
                    print(f"▶️ 초기화 중: 화자{i+1} → chunk {len(clusters[i+1])}")
                    break
            if all(len(clusters[i + 1]) >= init_chunks_per_speaker for i in range(n_speakers)):
                for i in range(n_speakers):
                    centroids[i] = np.mean(clusters[i + 1], axis=0)
                centroid_initialized = True
                print("\n✅ centroid 초기화 완료!\n")
            start_idx += step_size
            time.sleep(step_ms / 1000.0)
            continue

        # 유사도 기반 화자 분리
        sims = cosine_similarity(embed, np.vstack(centroids)).flatten()
        speaker_id = np.argmax(sims) + 1

        # centroid 업데이트
        clusters[speaker_id].append(embed[0])
        centroids[speaker_id - 1] = np.mean(clusters[speaker_id], axis=0)

        # 결과 저장
        start_sec = round(start_idx / sr, 2)
        end_sec = round(end_idx / sr, 2)
        diarization_output.append({
            "speaker": speaker_id,
            "start_sec": start_sec,
            "end_sec": end_sec,
            "duration_sec": round(end_sec - start_sec, 2)
        })
        print(f"[화자{speaker_id}] {start_sec:.2f}s ~ {end_sec:.2f}s")

    start_idx += step_size
    time.sleep(step_ms / 1000.0)

print("\n✅ 스트리밍 종료")

# diarization_result.csv 저장
with open("diarization_result.csv", "w", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=["speaker", "start_sec", "end_sec", "duration_sec"])
    writer.writeheader()
    writer.writerows(diarization_output)

print("📁 diarization_result.csv 저장 완료.")

# 💾 화자별 음성 저장 (중복 제거 포함)
print("💾 화자별 음성 저장 중...")

speaker_segments = defaultdict(list)
last_end_ms = {spk: -1 for spk in range(1, n_speakers + 1)}

for seg in diarization_output:
    speaker = seg["speaker"]
    start_ms = int(seg["start_sec"] * 1000)
    end_ms = int(seg["end_sec"] * 1000)

    # 🔒 중복 제거
    if start_ms <= last_end_ms[speaker]:
        start_ms = last_end_ms[speaker] + 1
    if start_ms >= end_ms:
        continue

    chunk = wav_full[start_ms:end_ms]
    speaker_segments[speaker].append(chunk)
    last_end_ms[speaker] = end_ms

# 저장
for speaker, chunks in speaker_segments.items():
    merged = sum(chunks[1:], chunks[0]) if len(chunks) > 1 else chunks[0]
    filename = f"speaker_{speaker}.wav"
    merged.export(filename, format="wav")
    print(f"✅ {filename} 저장 완료")

print("📁 모든 화자별 음원 저장 완료.")


Using cache found in C:\Users\fluffycat/.cache\torch\hub\snakers4_silero-vad_master
  checkpoint = torch.load(weights_fpath, map_location="cpu")


Loaded the voice encoder model on cuda in 0.03 seconds.
🎙️ 고정 화자 수: 2명 - 실시간 분리 시작...

▶️ 초기화 중: 화자1 → chunk 1
▶️ 초기화 중: 화자1 → chunk 2
▶️ 초기화 중: 화자1 → chunk 3
▶️ 초기화 중: 화자2 → chunk 1
▶️ 초기화 중: 화자2 → chunk 2
▶️ 초기화 중: 화자2 → chunk 3

✅ centroid 초기화 완료!

[화자2] 4.90s ~ 5.90s
[화자2] 5.60s ~ 6.60s
[화자1] 6.30s ~ 7.30s
[화자1] 7.00s ~ 8.00s
[화자1] 7.70s ~ 8.70s
[화자1] 8.40s ~ 9.40s
[화자1] 9.10s ~ 10.10s
[화자1] 9.80s ~ 10.80s
[화자1] 10.50s ~ 11.50s
[화자1] 11.20s ~ 12.20s
[화자1] 11.90s ~ 12.90s
[화자1] 12.60s ~ 13.60s
[화자1] 13.30s ~ 14.30s
[화자1] 14.00s ~ 15.00s
[화자1] 14.70s ~ 15.70s
[화자1] 15.40s ~ 16.40s
[화자1] 16.10s ~ 17.10s
[화자1] 17.50s ~ 18.50s
[화자1] 18.20s ~ 19.20s
[화자1] 18.90s ~ 19.90s
[화자2] 19.60s ~ 20.60s
[화자1] 20.30s ~ 21.30s
[화자1] 21.00s ~ 22.00s
[화자1] 21.70s ~ 22.70s
[화자2] 22.40s ~ 23.40s
[화자2] 23.10s ~ 24.10s
[화자2] 23.80s ~ 24.80s
[화자1] 24.50s ~ 25.50s
[화자1] 25.20s ~ 26.20s
[화자1] 25.90s ~ 26.90s
[화자1] 26.60s ~ 27.60s
[화자1] 27.30s ~ 28.30s
[화자1] 28.00s ~ 29.00s
[화자1] 28.70s ~ 29.70s
[화자1] 29.40s ~