### Accent Classification with OpenAI Whisper - Research POC

In [None]:
! pip install git+https://github.com/openai/whisper.git

In [None]:
! pip install jiwer

### Setting up FFmpeg

In [None]:
import os
import shutil

# Directory that is expected to contain the ffmpeg executable.
homebrew_bin = "/opt/homebrew/bin"

# Add the directory only when it is missing, so re-running this cell does not
# keep growing PATH, and an unset PATH does not raise KeyError.
current_path = os.environ.get("PATH", "")
if homebrew_bin not in current_path.split(os.pathsep):
    os.environ["PATH"] = (
        f"{current_path}{os.pathsep}{homebrew_bin}" if current_path else homebrew_bin
    )

# Confirm ffmpeg is now discoverable (prints None when it is not found)
print("FFmpeg path:", shutil.which("ffmpeg"))

### Setting up Paths

In [None]:
# Audio clip used by the quick Whisper sanity check that follows.
audio_file_path = '/Users/zp3146/Desktop/projects_hamza/accent_classification/research/english_us.mp3'
# Name of the Whisper checkpoint passed to whisper.load_model.
whisper_model = 'turbo'
# Misspelled legacy name, kept as an alias because the model-loading cell
# still reads it.
whsiper_model = whisper_model

In [None]:
import whisper

model = whisper.load_model(whsiper_model)

# Load the audio and pad/trim it to fit Whisper's 30-second window.
audio = whisper.load_audio(audio_file_path)
audio = whisper.pad_or_trim(audio)

# Build the log-Mel spectrogram with the mel-bin count this checkpoint
# expects, then move it to the same device as the model.
mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(model.device)

# Detect the spoken language.
_, probs = model.detect_language(mel)
print(f"Detected language: {max(probs, key=probs.get)}")

# Decode the audio. DecodingOptions defaults to fp16=True, which is only
# appropriate on a CUDA device; on any other device request fp32 so that
# half-precision kernels are not required.
options = whisper.DecodingOptions(fp16=model.device.type == "cuda")
result = whisper.decode(model, mel, options)

# Print the recognized text.
print(result.text)

### End-to-End Accent Classification

In [None]:
import json
from dataclasses import dataclass
from typing import List, Optional

import torch
import torch.nn.functional as F
from tqdm import tqdm
import whisper
from whisper.audio import N_FRAMES, N_MELS, log_mel_spectrogram, pad_or_trim
from whisper.model import Whisper
from whisper.tokenizer import get_tokenizer




In [None]:
# Inputs for the accent classifier below; point both at files on your machine.
# Audio clip whose accent should be predicted.
audio_file_path = (
    "/Users/zp3146/Desktop/projects_hamza/accent_classification/research/english_uk.wav"
)
# Text file that read_class_names() turns into the list of candidate classes.
class_names_file_path = (
    "/Users/zp3146/Desktop/projects_hamza/accent_classification/research/class_names.txt"
)


In [None]:
@dataclass
class AudioData:
    """Record pairing an audio file path with an optional category."""

    # Path to the audio file.
    audio_path: str
    # Category associated with the clip; defaults to None when not supplied.
    # NOTE(review): presumably a ground-truth label for evaluation — nothing in
    # the visible code reads this field, so confirm against its callers.
    category: Optional[str] = None


def read_class_names(path: str) -> List[str]:
    """Read the candidate class names from a text file, one name per line.

    Surrounding whitespace is stripped from every line. Lines that are empty
    after stripping are skipped, so a stray blank line (for example at the end
    of the file) does not turn into an empty-string class.

    Args:
        path: Path to a UTF-8 text file listing one class name per line.

    Returns:
        The non-empty class names, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Explicit encoding keeps non-ASCII class names independent of the locale.
    with open(path, encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        return [name for name in stripped_lines if name]


@torch.no_grad()
def calculate_audio_features(audio_path: Optional[str], model: Whisper) -> torch.Tensor:
    """Embed one audio clip (or an all-zero input) with the model's audio encoder.

    The number of mel bins is taken from ``model.dims.n_mels`` rather than a
    fixed constant, so the spectrogram matches whichever checkpoint is loaded.

    Args:
        audio_path: Path to the audio file. When ``None``, an all-zero
            spectrogram of ``N_FRAMES`` frames is embedded instead.
            NOTE(review): presumably the ``None`` case is meant for estimating
            an audio-independent prior — no caller in view uses it; confirm.
        model: Loaded Whisper model; its device and mel-bin count are used.

    Returns:
        The result of ``model.embed_audio`` for a batch containing the single
        segment.
    """
    n_mels = model.dims.n_mels
    if audio_path is None:
        segment = torch.zeros((n_mels, N_FRAMES), dtype=torch.float32, device=model.device)
    else:
        mel = log_mel_spectrogram(audio_path, n_mels=n_mels)
        # Pad or cut the spectrogram to exactly N_FRAMES frames.
        segment = pad_or_trim(mel, N_FRAMES).to(model.device)
    # unsqueeze(0) adds the leading batch dimension.
    return model.embed_audio(segment.unsqueeze(0))


@torch.no_grad()
def calculate_average_logprobs(
    model: Whisper,
    audio_features: torch.Tensor,
    class_names: List[str],
    tokenizer,
) -> torch.Tensor:
    """Score each class name by the mean log-probability of its tokens.

    For every class name, the decoder is fed the tokenizer's start-of-transcript
    prompt (without timestamps), the tokens of ``" " + class_name`` and the
    end-of-text token. The log-probabilities the model assigns to the class-name
    tokens are averaged into a single score.

    Args:
        model: Whisper model providing ``logits`` and ``device``.
        audio_features: Encoded audio passed through to ``model.logits``.
        class_names: Candidate class names to score.
        tokenizer: Tokenizer providing ``sot_sequence_including_notimestamps``,
            ``eot`` and ``encode``.

    Returns:
        A 1-D tensor holding one average log-probability per class name, in the
        order of ``class_names``.
    """
    device = model.device
    prompt_ids = list(tokenizer.sot_sequence_including_notimestamps)
    prompt = torch.tensor([prompt_ids], device=device)
    end_of_text = torch.tensor([[tokenizer.eot]], device=device)
    # The logits at the last prompt position predict the first class-name token.
    first_scored = len(prompt_ids) - 1

    scores = []
    for class_name in class_names:
        label_ids = torch.tensor([tokenizer.encode(" " + class_name)], device=device)
        sequence = torch.cat((prompt, label_ids, end_of_text), dim=1)

        step_logits = model.logits(sequence, audio_features)  # (1, T, V)
        step_logprobs = F.log_softmax(step_logits, dim=-1).squeeze(0)  # (T, V)
        # Keep the positions that predict the class-name tokens and EOT.
        window = step_logprobs[first_scored:-1]
        # The index has one row per class-name token, so gather reads only those
        # rows; the trailing row (the EOT prediction) is not scored.
        picked = window.gather(-1, label_ids.view(-1, 1))
        scores.append(picked.mean().item())

    return torch.tensor(scores)


@torch.no_grad()
def classify(
    model: Whisper,
    audio_path: str,
    class_names: List[str],
    tokenizer,
    internal_lm_average_logprobs: Optional[torch.Tensor] = None,
    verbose: bool = True,
) -> str:
    """Return the class name the model scores highest for an audio file.

    The audio is embedded with ``calculate_audio_features`` and every candidate
    is scored with ``calculate_average_logprobs``. If
    ``internal_lm_average_logprobs`` is given, it is subtracted from the scores
    before ranking.

    Args:
        model: Loaded Whisper model.
        audio_path: Path to the audio file to classify.
        class_names: Candidate class names.
        tokenizer: Tokenizer forwarded to ``calculate_average_logprobs``.
        internal_lm_average_logprobs: Optional per-class scores to subtract.
        verbose: When true, print every class with its score, best first.

    Returns:
        The class name with the highest (adjusted) average log-probability.
    """
    features = calculate_audio_features(audio_path, model)
    scores = calculate_average_logprobs(
        model=model,
        audio_features=features,
        class_names=class_names,
        tokenizer=tokenizer,
    )

    if internal_lm_average_logprobs is not None:
        scores -= internal_lm_average_logprobs

    # Rank on plain Python floats; ties keep their original order.
    values = scores.tolist()
    ranking = sorted(range(len(class_names)), key=values.__getitem__, reverse=True)

    if verbose:
        print("🔍 Log probabilities for each class:")
        for idx in ranking:
            print(f"  {class_names[idx]}: {values[idx]:.3f}")

    best = ranking[0]
    return class_names[best]


In [None]:
# Load the model on the GPU when one is available, otherwise on the CPU.
model = whisper.load_model("large", device="cuda" if torch.cuda.is_available() else "cpu")

# Build the tokenizer from the loaded checkpoint instead of hard-coding its
# properties: the ids of the special prompt tokens depend on how many language
# tokens the checkpoint's vocabulary contains, so a mismatched tokenizer would
# feed the decoder the wrong start-of-transcript sequence.
tokenizer = get_tokenizer(
    model.is_multilingual,
    num_languages=model.num_languages,
    language="en",
    task="transcribe",
)

# Read class names
class_names = read_class_names(class_names_file_path)

# Classify
predicted_class = classify(
    model=model,
    audio_path=audio_file_path,
    class_names=class_names,
    tokenizer=tokenizer,
    verbose=True,
)

print(f"\n🎯 Final predicted class: {predicted_class}")
