In [None]:
# Environment setup for this notebook (IPython shell escapes; `-q` keeps pip quiet).
# torch / torchaudio 2.1.0 CUDA 11.8 builds, resolved via the PyTorch wheel index.
!pip install torch==2.1.0+cu118 torchaudio==2.1.0+cu118 -f https://download.pytorch.org/whl/torch_stable.html
# Speaker-embedding model library, clustering, and plotting packages.
!pip install -q speechbrain
!pip install -q scikit-learn
!pip install -q matplotlib seaborn
# Version pins below are deliberate; NOTE(review): the reason for each pin is
# not recorded here — presumably mutual compatibility, confirm before bumping.
!pip install numpy==1.26.4
!pip install faster-whisper
!pip install transformers==4.30.2
!pip install tokenizers==0.13.3
# Web-serving stack. NOTE(review): nest_asyncio is imported by the server cell
# but is not installed here — presumably preinstalled in the runtime; confirm.
!pip install -q fastapi pyngrok uvicorn

Looking in links: https://download.pytorch.org/whl/torch_stable.html
Collecting torch==2.1.0+cu118
  Downloading https://download.pytorch.org/whl/cu118/torch-2.1.0%2Bcu118-cp311-cp311-linux_x86_64.whl (2325.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 GB[0m [31m980.6 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchaudio==2.1.0+cu118
  Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.1.0%2Bcu118-cp311-cp311-linux_x86_64.whl (3.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m104.0 MB/s[0m eta [36m0:00:00[0m
Collecting triton==2.1.0 (from torch==2.1.0+cu118)
  Downloading triton-2.1.0-0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)
Downloading triton-2.1.0-0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (89.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.2/89.2 MB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling c

Collecting faster-whisper
  Downloading faster_whisper-1.1.1-py3-none-any.whl.metadata (16 kB)
Collecting ctranslate2<5,>=4.0 (from faster-whisper)
  Downloading ctranslate2-4.6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting onnxruntime<2,>=1.14 (from faster-whisper)
  Downloading onnxruntime-1.22.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting av>=11 (from faster-whisper)
  Downloading av-14.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.6 kB)
Collecting coloredlogs (from onnxruntime<2,>=1.14->faster-whisper)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime<2,>=1.14->faster-whisper)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Downloading faster_whisper-1.1.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m

In [None]:
from fastapi import FastAPI, UploadFile, File, Form
from fastapi.responses import JSONResponse
import shutil
import os
import torch
import torchaudio
from pyngrok import ngrok
import nest_asyncio
import uvicorn

# Process-wide cache of loaded models, keyed by a short name. The models are
# expensive to construct, so each one is built on first use and then reused
# by later calls instead of being reloaded for every request.
_MODEL_CACHE = {}


def _get_cached(key, factory):
    """Return the cached object for *key*, building it with *factory* once."""
    if key not in _MODEL_CACHE:
        _MODEL_CACHE[key] = factory()
    return _MODEL_CACHE[key]


def _load_mono_16k(audio_path):
    """Load audio, down-mix to one channel and resample to 16 kHz.

    Returns ``(waveform, sample_rate, was_resampled)``.
    """
    import torchaudio

    waveform, sr = torchaudio.load(audio_path)
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    was_resampled = sr != 16000
    if was_resampled:
        waveform = torchaudio.functional.resample(waveform, sr, 16000)
        sr = 16000
    return waveform, sr, was_resampled


def _diarize(waveform, sr, segment_duration=2.3, stride_duration=0.4,
             silence_rms=0.005):
    """Cluster sliding-window speaker embeddings into speaker turns.

    Returns a list of ``(speaker, start_seconds, end_seconds)`` tuples. When
    fewer than two usable windows exist (audio too short or silent), a single
    turn for speaker 0 spanning the whole waveform is returned.
    """
    import logging

    import numpy as np
    import torch
    from scipy.linalg import eigh
    from scipy.sparse import csgraph
    from sklearn.cluster import SpectralClustering
    from sklearn.metrics.pairwise import pairwise_distances
    from speechbrain.inference.speaker import EncoderClassifier

    log = logging.getLogger(__name__)
    total_samples = waveform.shape[1]
    segment_samples = int(segment_duration * sr)
    stride_samples = int(stride_duration * sr)

    classifier = _get_cached(
        "ecapa",
        lambda: EncoderClassifier.from_hparams(
            source="speechbrain/spkrec-ecapa-voxceleb", savedir="tmp_ecapa"
        ),
    )

    # Embeddings and their start times are appended together so the two lists
    # stay aligned even when a window is skipped.
    embeddings, frame_times = [], []
    with torch.no_grad():
        for offset in range(0, total_samples - segment_samples + 1, stride_samples):
            window = waveform[:, offset:offset + segment_samples]
            if torch.sqrt(torch.mean(window ** 2)).item() < silence_rms:
                continue  # skip near-silent windows
            try:
                emb = classifier.encode_batch(window).squeeze(0).detach().cpu().numpy()
            except Exception:
                # Best effort: one bad window must not abort the whole file.
                log.warning("speaker embedding failed at %.2fs", offset / sr,
                            exc_info=True)
                continue
            if np.isnan(emb).any():
                continue
            embeddings.append(emb)
            frame_times.append(offset / sr)

    if len(embeddings) < 2:
        # Not enough data to cluster; treat the recording as one speaker.
        return [(0, 0.0, total_samples / sr)]

    embedding_matrix = np.vstack(embeddings)
    dist_matrix = pairwise_distances(embedding_matrix, metric='cosine')
    dist_matrix = np.nan_to_num(dist_matrix, nan=1.0, posinf=1.0, neginf=1.0)
    affinity = 1 - np.clip(dist_matrix, 0.0, 1.0)

    laplacian = csgraph.laplacian(affinity, normed=True)
    laplacian = np.nan_to_num(laplacian, nan=0.0, posinf=0.0, neginf=0.0)

    # Pick the cluster count at the largest gap among the smallest
    # (at most ten) Laplacian eigenvalues.
    max_index = min(9, len(embedding_matrix) - 1)
    eigenvals = eigh(laplacian, eigvals_only=True, subset_by_index=[0, max_index])
    best_k = int(np.argmax(np.diff(eigenvals))) + 1

    if best_k == 1:
        labels = np.zeros(len(embedding_matrix), dtype=int)
    else:
        clustering = SpectralClustering(n_clusters=best_k, affinity='precomputed')
        labels = clustering.fit_predict(affinity)

    # Collapse runs of identical labels into turns. A turn is closed at the
    # end of the first window of the following speaker.
    turns = []
    current_speaker = labels[0]
    start_time = frame_times[0]
    for label, frame_start in zip(labels[1:], frame_times[1:]):
        if label != current_speaker:
            turns.append((current_speaker, start_time, frame_start + segment_duration))
            current_speaker = label
            start_time = frame_start
    turns.append((current_speaker, start_time, frame_times[-1] + segment_duration))
    return turns


def _transcribe_baseline(audio_path):
    """Transcribe with the stock faster-whisper model; return its segments."""
    from faster_whisper import WhisperModel

    model = _get_cached(
        "faster_whisper",
        lambda: WhisperModel("medium", device="cuda", compute_type="float16"),
    )
    segments, _ = model.transcribe(audio_path, beam_size=5)
    return list(segments)


def _assign_speakers(asr_segments, turns):
    """Return, per ASR segment, the speaker whose turn overlaps it most.

    Segments that overlap no turn get the placeholder ``"?"``.
    """
    speakers = []
    for seg in asr_segments:
        best_overlap = 0.0
        best_speaker = "?"
        for speaker, turn_start, turn_end in turns:
            overlap = max(0.0, min(seg.end, turn_end) - max(seg.start, turn_start))
            if overlap > best_overlap:
                best_overlap = overlap
                best_speaker = speaker
        speakers.append(best_speaker)
    return speakers


def _group_by_speaker(asr_segments, speakers):
    """Join the text of consecutive segments that share a speaker."""
    grouped = []
    prev_speaker = None
    buffer_text = ""
    for seg, speaker in zip(asr_segments, speakers):
        text = seg.text.strip()
        if speaker == prev_speaker:
            buffer_text += " " + text
        else:
            if prev_speaker is not None:
                grouped.append((prev_speaker, buffer_text.strip()))
            buffer_text = text
            prev_speaker = speaker
    if buffer_text:
        grouped.append((prev_speaker, buffer_text.strip()))
    return grouped


def _transcribe_finetuned(audio_path):
    """Transcribe with the fine-tuned Whisper checkpoint; return its text.

    The Hugging Face access token is read from the ``HF_TOKEN`` (or
    ``HUGGING_FACE_HUB_TOKEN``) environment variable; credentials must not be
    written into the source.
    """
    import os

    from transformers import pipeline as hf_pipeline

    def build():
        token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
        return hf_pipeline(
            "automatic-speech-recognition",
            model="urewui/ktf",
            device=0,  # GPU 0
            chunk_length_s=15,
            use_auth_token=token,
            framework="pt",
        )

    asr_pipeline = _get_cached("finetuned_asr", build)
    ft_result = asr_pipeline(audio_path)
    if "text" not in ft_result:
        raise ValueError("파인튜닝 모델에서 텍스트 결과 없음")
    return ft_result["text"].strip()


def _best_sentence_match(whisper_text, ft_sentences, max_combo=3):
    """Return the run of 1..max_combo consecutive sentences closest to the text.

    Similarity is ``difflib.SequenceMatcher.ratio``. If nothing scores above
    zero (including an empty sentence list) *whisper_text* is returned as is.
    """
    from difflib import SequenceMatcher

    best_score = 0.0
    best_text = whisper_text
    total = len(ft_sentences)
    for i in range(total):
        for j in range(i + 1, min(total, i + max_combo) + 1):
            combo = " ".join(ft_sentences[i:j])
            score = SequenceMatcher(None, whisper_text, combo).ratio()
            if score > best_score:
                best_score = score
                best_text = combo
    return best_text


def _speaker_id_to_letter(speaker_id):
    """Map speaker index 0, 1, ... to "A", "B", ...; pass through non-integers."""
    try:
        return chr(ord("A") + int(speaker_id))
    except (TypeError, ValueError):
        return str(speaker_id)


def run_full_pipeline(audio_path):
    """Diarize and transcribe an audio file.

    Steps: load and resample the audio to 16 kHz mono, cluster speaker
    embeddings into speaker turns, transcribe with faster-whisper and attach a
    speaker to every segment, then replace each speaker block's text with the
    best-matching sentences from the fine-tuned Whisper transcript.

    Args:
        audio_path: Path of the audio file to process.

    Returns:
        ``(full_transcript, speaker_json)`` where ``full_transcript`` is the
        space-joined text of all blocks and ``speaker_json`` is a list of
        ``{"speaker": <letter>, "text": <str>}`` dicts in spoken order.

    Raises:
        FileNotFoundError: If *audio_path* does not point to a file.
        ValueError: If the fine-tuned model returns no ``"text"`` field.
    """
    import os
    import re
    import tempfile

    # Fail fast, before any model is loaded.
    if not os.path.isfile(audio_path):
        raise FileNotFoundError(f"audio file not found: {audio_path}")

    # 1. Load and resample audio
    waveform, sr, was_resampled = _load_mono_16k(audio_path)

    asr_path = audio_path
    temp_wav = None
    try:
        if was_resampled:
            import torchaudio

            # A unique temp file instead of a fixed name in the working
            # directory, so calls cannot overwrite each other's audio.
            fd, temp_wav = tempfile.mkstemp(suffix=".wav")
            os.close(fd)
            torchaudio.save(temp_wav, waveform, sr)
            asr_path = temp_wav

        # 2. Speaker diarization
        turns = _diarize(waveform, sr)

        # 3. Whisper transcript + speaker mapping
        asr_segments = _transcribe_baseline(asr_path)
        speakers = _assign_speakers(asr_segments, turns)
        merged_segments = _group_by_speaker(asr_segments, speakers)

        # 4. Overwrite sentences with the fine-tuned Whisper output
        ft_text = _transcribe_finetuned(asr_path)
    finally:
        if temp_wav is not None:
            try:
                os.remove(temp_wav)
            except OSError:
                pass

    ft_sentences = [m.group().strip() for m in re.finditer(r"[^.?!]+[.?!]", ft_text)]
    updated_segments = [
        (speaker, _best_sentence_match(old_text, ft_sentences))
        for speaker, old_text in merged_segments
    ]

    # Full transcript
    full_transcript = " ".join(text for _, text in updated_segments)

    # Per-speaker JSON
    speaker_json = [
        {"speaker": _speaker_id_to_letter(s), "text": t} for s, t in updated_segments
    ]

    return full_transcript, speaker_json

app = FastAPI()


@app.post("/api/analyze-audio")
async def analyze_audio(audio: UploadFile = File(...), userId: str = Form(...)):
    """Run diarization + transcription on an uploaded audio file.

    Form fields:
        audio: The uploaded audio file.
        userId: Required by the form contract; not used by this handler.

    Returns:
        200 with ``{"transcript": ..., "speakers": [...]}`` on success, or
        500 with ``{"error": ...}`` if saving or processing fails.
    """
    import tempfile

    # The client-supplied filename is untrusted: using it as a path would let
    # a name like "../../x" write outside the working directory. Only its
    # extension is kept, as the suffix of the temp file.
    suffix = os.path.splitext(os.path.basename(audio.filename or ""))[1]
    save_path = None
    try:
        # 1. Save the upload
        try:
            with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as f:
                save_path = f.name
                shutil.copyfileobj(audio.file, f)
        except Exception as e:
            return JSONResponse(status_code=500, content={"error": f"파일 저장 실패: {str(e)}"})

        # 2. Run diarization + transcription
        try:
            transcript, speaker_segments = run_full_pipeline(save_path)
        except Exception as e:
            return JSONResponse(status_code=500, content={"error": f"오디오 처리 실패: {str(e)}"})
    finally:
        # Uploaded audio is removed once the request is done.
        if save_path is not None:
            try:
                os.remove(save_path)
            except OSError:
                pass

    # 3. Build the JSON result
    result = {
        "transcript": transcript,
        "speakers": speaker_segments
    }

    return JSONResponse(content=result)

# Connect the ngrok tunnel. The auth token is read from the NGROK_AUTHTOKEN
# environment variable; credentials must not be written into the source.
_ngrok_token = os.environ.get("NGROK_AUTHTOKEN")
if not _ngrok_token:
    raise RuntimeError("NGROK_AUTHTOKEN environment variable is not set")
ngrok.set_auth_token(_ngrok_token)
public_url = ngrok.connect(8000, domain="tops-beetle-vocal.ngrok-free.app")
print(f"API 주소: {public_url}")

# Patch the event loop with nest_asyncio, then start uvicorn (Colab setup)
nest_asyncio.apply()
uvicorn.run(app, host="0.0.0.0", port=8000)

API 주소: NgrokTunnel: "https://tops-beetle-vocal.ngrok-free.app" -> "http://localhost:8000"


INFO:     Started server process [1488]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8000 (Press CTRL+C to quit)
INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Using symlink found at '/content/tmp_ecapa/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in tmp_ecapa.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Using symlink found at '/content/tmp_ecapa/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /content/tmp_ecapa/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Using symlink found at '/content/tmp_ecapa/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.pa

INFO:     44.227.217.144:0 - "POST /api/analyze-audio HTTP/1.1" 200 OK


INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Using symlink found at '/content/tmp_ecapa/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in tmp_ecapa.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Using symlink found at '/content/tmp_ecapa/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /content/tmp_ecapa/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Using symlink found at '/content/tmp_ecapa/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["mean_var_norm_emb"] = /content/tmp_ecapa/mean_var_norm_emb.ckpt
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Using symlink found at '/content/tmp_ecapa/classifier.ckpt'
DEBUG:speec

INFO:     44.227.217.144:0 - "POST /api/analyze-audio HTTP/1.1" 500 Internal Server Error


INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Using symlink found at '/content/tmp_ecapa/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in tmp_ecapa.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Using symlink found at '/content/tmp_ecapa/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /content/tmp_ecapa/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Using symlink found at '/content/tmp_ecapa/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["mean_var_norm_emb"] = /content/tmp_ecapa/mean_var_norm_emb.ckpt
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Using symlink found at '/content/tmp_ecapa/classifier.ckpt'
DEBUG:speec

INFO:     44.227.217.144:0 - "POST /api/analyze-audio HTTP/1.1" 500 Internal Server Error


INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Using symlink found at '/content/tmp_ecapa/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in tmp_ecapa.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Using symlink found at '/content/tmp_ecapa/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /content/tmp_ecapa/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Using symlink found at '/content/tmp_ecapa/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["mean_var_norm_emb"] = /content/tmp_ecapa/mean_var_norm_emb.ckpt
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Using symlink found at '/content/tmp_ecapa/classifier.ckpt'
DEBUG:speec

INFO:     44.227.217.144:0 - "POST /api/analyze-audio HTTP/1.1" 200 OK


INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Using symlink found at '/content/tmp_ecapa/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in tmp_ecapa.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Using symlink found at '/content/tmp_ecapa/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /content/tmp_ecapa/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Using symlink found at '/content/tmp_ecapa/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["mean_var_norm_emb"] = /content/tmp_ecapa/mean_var_norm_emb.ckpt
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Using symlink found at '/content/tmp_ecapa/classifier.ckpt'
DEBUG:speec

INFO:     44.227.217.144:0 - "POST /api/analyze-audio HTTP/1.1" 200 OK
INFO:     27.0.238.187:0 - "GET / HTTP/1.1" 404 Not Found


INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Using symlink found at '/content/tmp_ecapa/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in tmp_ecapa.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Using symlink found at '/content/tmp_ecapa/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /content/tmp_ecapa/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Using symlink found at '/content/tmp_ecapa/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["mean_var_norm_emb"] = /content/tmp_ecapa/mean_var_norm_emb.ckpt
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Using symlink found at '/content/tmp_ecapa/classifier.ckpt'
DEBUG:speec

INFO:     44.227.217.144:0 - "POST /api/analyze-audio HTTP/1.1" 200 OK


INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Using symlink found at '/content/tmp_ecapa/hyperparams.yaml'
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
DEBUG:speechbrain.utils.parameter_transfer:Collecting files (or symlinks) for pretraining in tmp_ecapa.
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Using symlink found at '/content/tmp_ecapa/embedding_model.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["embedding_model"] = /content/tmp_ecapa/embedding_model.ckpt
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Using symlink found at '/content/tmp_ecapa/mean_var_norm_emb.ckpt'
DEBUG:speechbrain.utils.parameter_transfer:Set local path in self.paths["mean_var_norm_emb"] = /content/tmp_ecapa/mean_var_norm_emb.ckpt
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Using symlink found at '/content/tmp_ecapa/classifier.ckpt'
DEBUG:speec

INFO:     44.227.217.144:0 - "POST /api/analyze-audio HTTP/1.1" 200 OK


INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [1488]
