In [None]:
# Install the notebook's dependencies. Run this cell once each time the runtime
# is started or restarted.
# Pinned CUDA 11.8 builds of the PyTorch stack, fetched via the extra PyTorch wheel index
!pip install -q torch==2.0.0+cu118 torchvision==0.15.1+cu118 torchaudio==2.0.1+cu118 torchtext==0.15.1 torchdata==0.6.0 --extra-index-url https://download.pytorch.org/whl/cu118 -U
# pydub: used below to load the audio file and down-mix it to mono (version not pinned)
!pip install pydub
# faster-whisper (transcription) and ipython-autotime, upgraded to the latest release
# NOTE(review): ipython-autotime is presumably what prints the `time:` lines — confirm where it is loaded
!pip install --upgrade -q faster-whisper ipython-autotime
# pyannote-audio installed from the GitHub repository (no tag/commit pinned); stdout is discarded
!pip install -q git+https://github.com/pyannote/pyannote-audio > /dev/null

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.3/2.3 GB[0m [31m738.3 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.1/6.1 MB[0m [31m73.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.4/4.4 MB[0m [31m81.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m82.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m92.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.3/63.3 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25

In [None]:
import datetime

import subprocess
import os

import torch
import pyannote.audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
# Speaker-embedding model used by segment_embedding() below.
# Run it on the GPU when CUDA is available and fall back to the CPU otherwise,
# so the notebook still loads on a runtime without a GPU.
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))

from pyannote.audio import Audio
from pyannote.core import Segment

import wave
import contextlib

from pydub import AudioSegment
from sklearn.cluster import AgglomerativeClustering
import numpy as np
from tqdm import tqdm


from faster_whisper import WhisperModel

Downloading (…)ain/hyperparams.yaml:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading embedding_model.ckpt:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

Downloading (…)an_var_norm_emb.ckpt:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading classifier.ckpt:   0%|          | 0.00/5.53M [00:00<?, ?B/s]

Downloading (…)in/label_encoder.txt:   0%|          | 0.00/129k [00:00<?, ?B/s]

In [None]:
# Path of the audio file to process; it is passed to the Whisper model's
# transcribe() call and to convert_to_mono() in later cells.
# NOTE(review): '.mp3' has no base name — looks like a placeholder; set a real file path before running.
audio = '.mp3'


time: 283 µs (started: 2023-10-30 16:57:52 +00:00)


In [None]:
# Number of speaker clusters requested from segment_embedding() (exposed as a Colab form field).
num_speakers = 9 #@param {type:"integer"}

time: 259 µs (started: 2023-10-30 16:57:52 +00:00)


In [None]:
def get_list(array_segments):
  """Turn transcription segment objects into a list of plain dictionaries.

  Each item of `array_segments` must expose the attributes named in
  `field_names`; they are copied, in that order, into one dict per segment.
  The input is iterated through `tqdm`, so a progress bar is displayed while
  it is being consumed.

  Returns:
    list[dict]: one dictionary per input segment, in input order.
  """
  field_names = (
      'id', 'seek', 'start', 'end', 'text', 'tokens',
      'temperature', 'avg_logprob', 'compression_ratio', 'no_speech_prob',
  )
  return [
      {name: getattr(item, name) for name in field_names}
      for item in tqdm(array_segments)
  ]

def convert_to_mono(audio_file):
    """Return the path of a WAV version of `audio_file`, down-mixed to mono if needed.

    Steps:
      1. The file is loaded with pydub to inspect its channel count.
      2. If its extension is not ``.wav`` (compared case-insensitively), ffmpeg
         converts it to ``audio.wav`` in the current working directory.
      3. If the loaded audio has more than one channel, a mono copy is exported
         next to the WAV file with a ``mono_`` prefix on the file name.

    Args:
      audio_file: path of the input audio file.

    Returns:
      str: path of the WAV file to use downstream (the original file, the
      converted ``audio.wav``, or the ``mono_``-prefixed export).

    Raises:
      subprocess.CalledProcessError: if the ffmpeg conversion fails. The exit
        status used to be ignored, which could leave a stale ``audio.wav``
        from an earlier run to be picked up silently.
    """
    audio = AudioSegment.from_file(audio_file)

    # Test the real extension, ignoring case, so 'clip.WAV' is not re-encoded
    extension = os.path.splitext(audio_file)[1].lower()
    if extension != '.wav':
        subprocess.run(['ffmpeg', '-i', audio_file, 'audio.wav', '-y'], check=True)
        audio_file = 'audio.wav'

    if audio.channels > 1:
        audio = audio.set_channels(1)

        # Prefix only the file name: 'mono_' + 'dir/x.wav' would produce the
        # invalid path 'mono_dir/x.wav'. Bare file names behave as before.
        directory, filename = os.path.split(audio_file)
        mono_audio_file = os.path.join(directory, 'mono_' + filename)
        audio.export(mono_audio_file, format="wav")
        return mono_audio_file
    return audio_file



def get_duration(path):
  """Return the length, in seconds, of the WAV file at `path`.

  The duration is the number of frames divided by the frame rate, both read
  with the standard-library `wave` module. The file is closed before returning.
  """
  wav_reader = wave.open(path, 'r')
  with contextlib.closing(wav_reader):
    frame_count = wav_reader.getnframes()
    frame_rate = float(wav_reader.getframerate())
    return frame_count / frame_rate


def segment_embedding(segments, duration, num_speakers, audio_path=None):
  """Label every segment with a speaker by clustering per-segment voice embeddings.

  For each segment the matching slice of the audio file is cropped, embedded
  with the module-level `embedding_model`, and the embeddings are grouped with
  agglomerative clustering. Each segment dict gets a 'speaker' key of the form
  'SPEAKER <n>' (numbered from 1). `segments` is modified in place and returned.

  Args:
    segments: list of dicts with at least 'start' and 'end' (seconds).
    duration: total audio length in seconds; segment ends are clamped to it.
    num_speakers: number of clusters requested. It is capped at the number of
      segments, because more clusters than samples cannot be extracted.
    audio_path: path of the audio file to crop. When omitted, the module-level
      `path` variable is used, which is how this function originally worked.

  Returns:
    list[dict]: the same `segments` list, with 'speaker' set on every item.
  """
  # Nothing to cluster: an empty list is returned untouched, and a single
  # segment can only belong to one speaker (clustering needs >= 2 samples).
  if not segments:
    return segments
  if len(segments) == 1:
    segments[0]['speaker'] = 'SPEAKER 1'
    return segments

  if audio_path is None:
    audio_path = path  # legacy behaviour: fall back to the notebook-level global

  audio = Audio()

  # Collect one embedding row per segment instead of pre-allocating a
  # hard-coded width, so the code does not depend on the model's dimension.
  rows = []
  for segment in segments:
    start = segment['start']
    # Whisper overshoots the end timestamp in the last segment
    end = min(duration, segment['end'])
    clip = Segment(start, end)
    waveform, _ = audio.crop(audio_path, clip)
    # waveform[None] adds a leading batch axis before calling the model
    rows.append(embedding_model(waveform[None]))

  # float64 matches the dtype of the array the embeddings used to be copied into
  embeddings = np.nan_to_num(np.vstack(rows).astype(np.float64))

  n_clusters = min(num_speakers, len(segments))
  labels = AgglomerativeClustering(n_clusters).fit(embeddings).labels_
  for segment, label in zip(segments, labels):
    segment['speaker'] = 'SPEAKER ' + str(label + 1)
  return segments

def time(secs):
  """Convert a number of seconds into a `datetime.timedelta` rounded to whole seconds."""
  whole_seconds = round(secs)
  return datetime.timedelta(seconds=whole_seconds)

def get_transcript(time, segments):
  """Write a speaker-labelled transcript to 'transcript.txt' and clean up temp audio.

  A header line '<speaker> <timestamp>' is written whenever the speaker differs
  from the previous segment's (and before the first segment), followed by the
  segments' text separated by single spaces.

  Args:
    time: callable that turns a start time in seconds into a printable value.
    segments: indexable sequence of dicts with 'speaker', 'start' and 'text'.

  Returns:
    str: the fixed message 'Transcript is Successfully Generated'.

  Side effects:
    Overwrites 'transcript.txt' in the current directory (UTF-8) and deletes
    '/content/audio.wav' and '/content/mono_audio.wav' if they exist.
  """
  # `with` guarantees the file is closed even if a segment is malformed;
  # UTF-8 is explicit so non-ASCII transcripts do not depend on the locale.
  with open("transcript.txt", "w", encoding="utf-8") as f:
    for (i, segment) in enumerate(segments):
      if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
        f.write("\n" + segment["speaker"] + ' ' + str(time(segment["start"])) + '\n')
      text = segment["text"]
      # Drop the single leading space a segment's text normally starts with, but
      # keep the first character when there is none (previously it was always cut).
      if text.startswith(' '):
        text = text[1:]
      f.write(text + ' ')

  # Remove the intermediate WAV files left behind by the conversion step
  file_paths = ['/content/audio.wav', '/content/mono_audio.wav']

  for file_path in file_paths:
    if os.path.exists(file_path):
        os.remove(file_path)
  return 'Transcript is Successfully Generated'

time: 2.55 ms (started: 2023-10-30 17:16:41 +00:00)


In [None]:
model_size = "large-v2"

# Run on the GPU with FP16 when CUDA is available; otherwise fall back to the
# CPU with INT8, since FP16 on a machine without CUDA would fail to load.
whisper_device = "cuda" if torch.cuda.is_available() else "cpu"
whisper_compute_type = "float16" if whisper_device == "cuda" else "int8"
fast_model = WhisperModel(model_size, device=whisper_device, compute_type=whisper_compute_type)
# NOTE: per the faster-whisper documentation `segments` is a lazy generator, so
# the transcription work happens when it is iterated (in get_list), not here.
segments, info = fast_model.transcribe(audio, beam_size=1)

time: 49 s (started: 2023-10-30 17:18:56 +00:00)


In [None]:
# Make sure the audio is a mono .wav file, converting it if necessary
path = convert_to_mono(audio)
# Duration of the converted file, in seconds
duration = get_duration(path)
# Materialize the transcription segments as a list of dicts.
# NOTE: this rebinds `segments`, so re-running this cell without first re-running
# the transcription cell fails (get_list reads attributes that dicts do not have).
segments = get_list(segments)
# Embed every segment and cluster the embeddings into speakers (uses the global `path`)
generated_segments = segment_embedding(segments, duration, num_speakers = num_speakers)
# Write transcript.txt and delete the intermediate .wav files
result = get_transcript(time, generated_segments)
print(result)

3232it [07:17,  7.39it/s]


Transcript is Successfully Generated
time: 8min 41s (started: 2023-10-30 17:20:10 +00:00)
