Please note: this notebook was designed to be used in a SageMaker environment.

In [None]:
!pip install ffmpeg-python

In [None]:
import warnings
from transformers.utils import logging
import os
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
import time
import boto3
from os.path import isfile, join
from pydub import AudioSegment
import io
import tempfile
import numpy as np
from io import BytesIO
import ffmpeg
import csv
import argparse
import string
from pathlib import Path
from tqdm import tqdm

In [None]:
# S3 bucket that the helper functions below read from.
# NOTE(review): the value looks like a placeholder -- replace it with the real bucket name.
BUCKET_NAME = 'your/bucket/name'

# Directory the notebook is executed from.
script_dir = os.getcwd()

In [6]:
# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings(action="ignore")

# Only let ERROR-level (and more severe) messages of the "transformers" logger through.
transformers_logger = logging.get_logger("transformers")
transformers_logger.setLevel(logging.ERROR)

In [None]:
def get_s3_client():
    """Create and return a boto3 client for the S3 service."""
    client = boto3.client('s3')
    return client


def list_s3_files_folder(folder_name):
    """Return the keys of all ``.wav`` objects stored under a bucket folder.

    Args:
        folder_name: Folder (key prefix) inside ``BUCKET_NAME``; a trailing
            ``/`` is appended when missing.

    Returns:
        A list of full object keys ending in ``.wav``, collected across every
        page of the listing.
    """
    prefix = folder_name if folder_name.endswith('/') else folder_name + '/'

    request = {'Bucket': BUCKET_NAME, 'Prefix': prefix}
    wav_keys = []

    while True:
        page = s3.list_objects_v2(**request)

        for entry in page.get('Contents', []):
            key = entry['Key']
            if key.endswith('.wav'):
                wav_keys.append(key)

        if not page.get('IsTruncated'):
            break

        # Carry the continuation token into the next request so the listing
        # resumes where this page stopped.
        token = page.get('NextContinuationToken')
        if token:
            request['ContinuationToken'] = token
        else:
            request.pop('ContinuationToken', None)

    return wav_keys


def get_file_from_folder(folder_name, file_name):
    """Download one object from ``BUCKET_NAME`` and return its raw bytes.

    Args:
        folder_name: Not used for the download; kept so existing callers keep
            working.
        file_name: Full S3 key of the object (for example a key returned by
            ``list_s3_files_folder``), not a name relative to ``folder_name``.

    Returns:
        The object's content as ``bytes``.
    """
    # The key is used as-is: the previous folder-name normalisation had no
    # effect on the download and only suggested the key was folder + file.
    buffer = BytesIO()
    s3.download_fileobj(BUCKET_NAME, file_name, buffer)

    return buffer.getvalue()


def get_reference(folder_name):
    """Fetch the reference STM file that belongs to a bucket folder.

    The reference is expected at ``<folder>/<id>_reference.stm`` where ``<id>``
    is the part of the folder name before its first ``.``. The folder
    ``testing_subset`` is special-cased to reuse the reference of
    ``1. nonnative-read``.

    Args:
        folder_name: Folder (key prefix) inside ``BUCKET_NAME``; a trailing
            ``/`` is appended when missing.

    Returns:
        The STM file decoded as UTF-8 text, or ``None`` when the expected key
        does not exist in the bucket.
    """
    if not folder_name.endswith('/'):
        folder_name += '/'

    if folder_name == 'testing_subset/':
        expected_key = "1. nonnative-read/1_reference.stm"
    else:
        folder_id = folder_name.rstrip('/').split('.')[0]
        expected_key = f"{folder_name}{folder_id}_reference.stm"

    # List with the exact key as prefix instead of the whole folder: a single
    # list_objects_v2 call returns at most 1000 keys, so in a folder full of
    # audio clips the reference could be missing from the first page and be
    # reported as absent even though it exists.
    response = s3.list_objects_v2(Bucket=BUCKET_NAME, Prefix=expected_key)
    if not any(obj['Key'] == expected_key for obj in response.get('Contents', [])):
        return None

    buffer = BytesIO()
    s3.download_fileobj(BUCKET_NAME, expected_key, buffer)
    return buffer.getvalue().decode("utf-8")


def list_folders():
    """Return the names of the top-level folders in ``BUCKET_NAME``.

    Returns:
        A list of common prefixes (delimiter ``/``) with the trailing ``/``
        removed, collected across every page of the listing.
    """
    folders = []
    request = {'Bucket': BUCKET_NAME, 'Delimiter': '/'}

    # Follow continuation tokens like list_s3_files_folder does; a single
    # response is capped, so an unpaginated call can silently drop folders.
    while True:
        response = s3.list_objects_v2(**request)
        folders.extend(
            p['Prefix'].rstrip('/') for p in response.get('CommonPrefixes', [])
        )

        token = response.get('NextContinuationToken')
        if not response.get('IsTruncated') or not token:
            return folders
        request['ContinuationToken'] = token


def get_duration(stm_text, s3_key):
    """Look up the duration of a clip in the text of a reference STM file.

    The first whitespace-separated field of each line is compared with the
    clip's file name (without directory and extension); the 4th and 5th fields
    are read as start and end times.

    Args:
        stm_text: Content of the STM file, or ``None``/empty when no reference
            is available.
        s3_key: S3 key (or file name) of the clip.

    Returns:
        ``end - start`` of the first matching line whose times parse as
        floats, or ``None`` when there is no reference text or no usable line.
    """
    # get_reference() returns None for a missing reference; report "unknown"
    # instead of failing on None.splitlines().
    if not stm_text:
        return None

    # File name without directory and extension.
    clip_base = os.path.splitext(os.path.basename(s3_key))[0]

    for line in stm_text.splitlines():
        fields = line.split()
        if len(fields) < 5 or fields[0] != clip_base:
            continue

        try:
            start = float(fields[3])
            end = float(fields[4])
        except ValueError:
            # Malformed times on this line; a later line may still match.
            continue
        return end - start

    return None

In [8]:
def make_pipeline(model_id, device=None, torch_dtype=None):
    """Create a pipeline for automatic speech recognition using the specified model.

    Args:
        model_id: Model identifier or path passed to ``from_pretrained``.
        device: Device to run on. Defaults to ``"cuda:0"`` when CUDA is
            available and ``"cpu"`` otherwise.
        torch_dtype: Dtype used to load and run the model. Defaults to
            ``torch.float16`` on a CUDA device and ``torch.float32`` otherwise.

    Returns:
        A ``transformers`` ASR pipeline configured with
        ``return_timestamps="word"``.
    """
    if device is None:
        # Fall back to CPU so the notebook still runs on a host without a GPU.
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
    if torch_dtype is None:
        # Keep half precision for GPU runs only; use full precision elsewhere.
        torch_dtype = torch.float16 if str(device).startswith("cuda") else torch.float32

    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)

    processor = AutoProcessor.from_pretrained(model_id)

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device,
        return_timestamps="word"
    )

    return pipe

In [None]:
def _chunks_to_ctm(file_id, chunks, clip_duration):
    """Convert word-level pipeline chunks of one clip into CTM lines.

    Args:
        file_id: Identifier written in the first CTM column.
        chunks: Word dictionaries with ``"text"`` and a ``"timestamp"`` pair of
            (start, end); either time may be ``None``.
        clip_duration: Reference duration of the clip, or ``None`` if unknown.
            Used as the end time of a final word that has none.

    Returns:
        A ``(ctm_lines, words)`` tuple: the formatted CTM lines and the
        stripped word texts, both in chunk order.
    """
    ctm_lines = []
    words = []
    last_index = len(chunks) - 1
    previous_end = 0.0

    for i, word_data in enumerate(chunks):
        raw_start = word_data["timestamp"][0]
        raw_end = word_data["timestamp"][1]

        # A missing start is assumed to continue where the previous word ended.
        start = float(raw_start) if raw_start is not None else previous_end

        if raw_end is not None:
            end = float(raw_end)
        else:
            # Missing end: prefer the next word's start, then the clip's
            # reference duration, and as a last resort emit a zero-length word.
            next_start = chunks[i + 1]["timestamp"][0] if i < last_index else None
            if next_start is not None:
                end = float(next_start)
            elif clip_duration is not None:
                end = float(clip_duration)
            else:
                end = start

        word = word_data["text"].strip()
        confidence = word_data.get("confidence", 1.0)

        ctm_lines.append(f"{file_id} 1 {start:.2f} {end - start:.2f} {word} {confidence:.4f}")
        words.append(word)
        previous_end = end

    return ctm_lines, words


def predict_clips_with_models_ctm(models):
    """Transcribe every clip in the bucket with each model and write CTM/TSV results.

    For each model, every bucket folder is processed in turn. Per folder the
    word-level output is written to ``results/<model>/ctm/<model>_<folder>.ctm``
    and one line per clip (file id, real-time factor, prediction) to
    ``results/<model>/tsv/<model>_<folder>.tsv``. Finished folders are recorded
    in ``results/<model>/progress.tsv`` and skipped on a later run.

    The real-time factor is written as ``nan`` when the clip's reference
    duration is missing or not positive.

    Args:
        models: Iterable of model identifiers/paths accepted by
            ``make_pipeline``; the last path component names the output
            directory.
    """
    for model_path in models:
        pipe = make_pipeline(model_path)
        model_name = model_path.split('/')[-1]

        # Set up output directories
        model_dir = Path("results") / model_name
        ctm_dir = model_dir / "ctm"
        tsv_dir = model_dir / "tsv"
        model_dir.mkdir(parents=True, exist_ok=True)
        ctm_dir.mkdir(exist_ok=True)
        tsv_dir.mkdir(exist_ok=True)

        # Progress tracking
        progress_file = model_dir / "progress.tsv"
        completed_folders = set()
        if progress_file.exists():
            with open(progress_file, "r", encoding="utf-8") as pf:
                completed_folders = {line.strip() for line in pf}

        folders = list_folders()

        for folder in tqdm(folders, desc=f"{model_name} - folders", unit="folder", dynamic_ncols=True, position=0):
            if folder in completed_folders:
                continue

            files = list_s3_files_folder(folder)
            ref_file = get_reference(folder)
            if ref_file is None:
                tqdm.write(f"No reference STM found for folder '{folder}'; RTF is reported as nan.")

            ctm_lines = []
            timing_lines = []

            # "\u00a0" is a non-breaking space used to indent the nested bar;
            # written as an escape so the file's encoding cannot garble it.
            for file in tqdm(files, desc=f"\u00a0 {folder}", unit="file", leave=False, dynamic_ncols=True, position=1):
                audio_file = get_file_from_folder(folder, file)

                start_time = time.perf_counter()
                result = pipe(audio_file, generate_kwargs={"language": "dutch"})  # set to english for primock
                execution_time = time.perf_counter() - start_time

                file_id = Path(file).stem

                # Look the reference duration up once per clip.
                clip_duration = get_duration(ref_file, file) if ref_file is not None else None

                file_ctm_lines, words = _chunks_to_ctm(file_id, result.get("chunks", []), clip_duration)
                ctm_lines.extend(file_ctm_lines)

                # Real-time factor = processing time / audio duration; it is
                # undefined without a positive reference duration.
                if clip_duration is not None and clip_duration > 0:
                    rtf = f"{(execution_time / clip_duration):.4f}"
                else:
                    rtf = "nan"

                prediction = " ".join(words)
                timing_lines.append(f"{file_id}\t{rtf}\t{prediction}")

            safe_folder_name = folder.split(".")[0]
            ctm_filename = f"{model_name}_{safe_folder_name}.ctm"
            tsv_filename = f"{model_name}_{safe_folder_name}.tsv"

            with open(ctm_dir / ctm_filename, "w", encoding="utf-8") as f:
                f.write("\n".join(ctm_lines) + "\n")

            with open(tsv_dir / tsv_filename, "w", encoding="utf-8") as f:
                f.write("file\tRTF\tprediction\n")  # TSV header
                f.write("\n".join(timing_lines) + "\n")

            # Mark this folder as completed
            with open(progress_file, "a", encoding="utf-8") as pf:
                pf.write(folder + "\n")

        # Drop this model before the next one is loaded so two models are not
        # held in memory at the same time.
        del pipe
        if torch.cuda.is_available():
            torch.cuda.empty_cache()


In [10]:
# Module-level S3 client; the S3 helper functions in this notebook use this global `s3`.
s3 = get_s3_client()

In [None]:
# Whisper checkpoints to evaluate; they are processed in the order listed.
models_to_test = [
    "openai/whisper-tiny",
    "openai/whisper-small",
    "openai/whisper-base",
    "openai/whisper-medium",
    "openai/whisper-large-v3",
    "openai/whisper-large-v3-turbo",
]

# Run every model over the bucket and write the CTM/TSV results.
predict_clips_with_models_ctm(models_to_test)