In [1]:
import os

# Recording to process; the path is relative to the notebook's working directory.
input_file = "./241118_1543.mp3"
# Derive the WAV path by swapping only the trailing extension. A plain
# str.replace(".mp3", ".wav") would also rewrite ".mp3" appearing earlier in
# the path (e.g. inside a directory name) and would leave an upper-case
# ".MP3" suffix unchanged, making output_file equal to input_file.
output_file = os.path.splitext(input_file)[0] + ".wav"

In [2]:
"""
import ffmpeg

try:
    stream = ffmpeg.input(input_file)
    stream = ffmpeg.output(stream, output_file)
    ffmpeg.run(stream)
    print("Conversion successful")
except ffmpeg.Error as e:
    print(f"Error: {e.stderr.decode()}")
"""

'\nimport ffmpeg\n\ntry:\n    stream = ffmpeg.input(input_file)\n    stream = ffmpeg.output(stream, output_file)\n    ffmpeg.run(stream)\n    print("Conversion successful")\nexcept ffmpeg.Error as e:\n    print(f"Error: {e.stderr.decode()}")\n'

In [3]:
import os
from huggingface_hub import login

# Authenticate with the Hugging Face Hub using the HF_TOKEN environment variable.
# To store your token, run this once in your terminal:
#   echo 'export HF_TOKEN="hf_*******************************"' >> $HOME/.bashrc
#
# If HF_TOKEN is not set, the token passed below is None and the login
# function will prompt a login interface instead.
login(token=os.getenv("HF_TOKEN"))

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [4]:
from pyannote.audio import Pipeline
import torch
import whisper
import pandas as pd


# Initialize the pyannote speaker-diarization pipeline.
# NOTE(review): at load time the library warns that this checkpoint was trained
# with much older pyannote.audio / torch versions than the installed ones; a
# checkpoint matching the installed versions may be preferable -- confirm
# before switching, since it changes the diarization output.
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization")
if pipeline is None:
    # from_pretrained reports a failed download (e.g. gated model whose terms
    # were not accepted, or an invalid token) by returning None rather than
    # raising; fail here with a clear message instead of an AttributeError
    # on the `.to(device)` call below.
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization'. Check that HF_TOKEN is "
        "valid and that the model's user conditions were accepted on Hugging Face."
    )

# Path of the audio to process (the WAV path derived from the input recording)
audio_file = output_file

# Pick the compute device: Apple MPS first, then CUDA, otherwise CPU
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using {device}")

# Send pipeline to the appropriate device
pipeline.to(device)

# Perform diarization
diarization = pipeline(audio_file)

# Load Whisper model and transcribe audio.
# No device is passed to load_model, so `device` above only applies to the
# pyannote pipeline.
model = whisper.load_model("base.en")
result = model.transcribe(audio_file)

# Containers filled by the following steps:
#   transcript        -- speaker label -> list of (start, end, text) tuples
#   assigned_sections -- ids of Whisper sections already given to a speaker
transcript = {}
assigned_sections = set()

def calculate_overlap(start1, end1, start2, end2):
    """Return the length shared by the intervals [start1, end1] and [start2, end2].

    Intervals that are disjoint, or that only touch at an endpoint, yield 0.
    """
    shared = min(end1, end2) - max(start1, start2)
    return shared if shared > 0 else 0

# Store overlaps for each section: every Whisper section receives one
# (speaker, overlap, overlap_fraction) entry per diarization turn.
for segment, _, speaker in diarization.itertracks(yield_label=True):
    # Make sure every diarized speaker has a list, even if no text ends up in it
    transcript.setdefault(speaker, [])

    for section in result["segments"]:
        overlap = calculate_overlap(segment.start, segment.end, section["start"], section["end"])
        section_duration = section["end"] - section["start"]
        # A section whose start equals its end would otherwise raise
        # ZeroDivisionError; it cannot overlap anything, so its fraction is 0.
        overlap_fraction = overlap / section_duration if section_duration > 0 else 0.0

        # Store overlap details
        section.setdefault("overlaps", []).append((speaker, overlap, overlap_fraction))

# Assign each transcribed section to the speaker who overlaps it the most.
#
# Overlap is summed per speaker first, because one speaker can contribute
# several diarization turns to the same section. Every section is kept:
# a section that no speaker overlaps is filed under UNKNOWN_SPEAKER instead of
# being dropped or attributed to an arbitrary speaker.
UNKNOWN_SPEAKER = "UNKNOWN"

for section in result["segments"]:
    if section["id"] in assigned_sections:
        continue

    # Total overlap per speaker; entries are (speaker, overlap, overlap_fraction)
    overlap_by_speaker = {}
    for speaker, overlap, _ in section.get("overlaps", []):
        overlap_by_speaker[speaker] = overlap_by_speaker.get(speaker, 0) + overlap

    # Ties resolve to the speaker encountered first
    best_speaker = max(overlap_by_speaker, key=overlap_by_speaker.get, default=None)
    if best_speaker is None or overlap_by_speaker[best_speaker] <= 0:
        best_speaker = UNKNOWN_SPEAKER

    transcript.setdefault(best_speaker, []).append(
        (section["start"], section["end"], section["text"])
    )
    assigned_sections.add(section["id"])

# Flatten the per-speaker transcript into one row per spoken section
rows = [
    (speaker_label, begin, finish, words)
    for speaker_label, spoken in transcript.items()
    for begin, finish, words in spoken
]
df = pd.DataFrame(rows, columns=["Speaker", "Start", "End", "Text"])

# Write the rows in chronological order; `df` itself keeps its original order
chronological = df.sort_values(by="Start").reset_index(drop=True)
chronological.to_csv("transcription_results.csv", index=False)

print("Transcription results saved to 'transcription_results.csv'")


INFO:speechbrain.utils.quirks:Applied quirks (see `speechbrain.utils.quirks`): [allow_tf32, disable_jit_profiling]
INFO:speechbrain.utils.quirks:Excluded quirks specified by the `SB_DISABLE_QUIRKS` environment (comma-separated list): []
Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../../.cache/torch/pyannote/models--pyannote--segmentation/snapshots/c4c8ceafcbb3a7a280c2d357aee9fbc9b0be7f9b/pytorch_model.bin`
INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.6.0. Bad things might happen unless you revert torch to 1.x.


  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)
INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch label_encoder.txt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: embedding_model, mean_var_norm_emb, classifier, label_encoder
INFO:speechbrain.utils.fetching:Fetch hyperparams.yaml: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch custom.py: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa

Using mps


INFO:speechbrain.utils.fetching:Fetch embedding_model.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch mean_var_norm_emb.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch classifier.ckpt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.fetching:Fetch label_encoder.txt: Fetching from HuggingFace Hub 'speechbrain/spkrec-ecapa-voxceleb' if not cached
INFO:speechbrain.utils.parameter_transfer:Loading pretrained files for: embedding_model, mean_var_norm_emb, classifier, label_encoder


Transcription results saved to 'transcription_results.csv'


In [None]:
# SLOWER alternative pipeline using https://github.com/yinruiqing/pyannote-whisper
# Disabled: the code is wrapped in a string literal, so running this cell does
# not execute it. It relies on `output_file` from the first cell and on a local
# `utils` module taken from the repository credited below.
"""
from pyannote.audio import Pipeline
import torch
import utils # CREDIT: https://github.com/yinruiqing/pyannote-whisper
import whisper
import pandas as pd

# Initialize Pyannote pipeline
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization")

# Load audio file
audio_file = output_file

model = whisper.load_model("base.en")
asr_result = model.transcribe(audio_file)
diarization_result = pipeline(audio_file)
final_result = utils.diarize_text(asr_result, diarization_result)

for seg, spk, sent in final_result:
    line = f'{seg.start:.2f} {seg.end:.2f} {spk} {sent}'
    print(line)
"""