# Imports

In [25]:
import os
import torch
import librosa
import numpy as np
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from typing import List, Tuple
# Expose the Hugging Face access token (stored in a local, uncommitted
# token.txt) through the HF_TOKEN environment variable. The context manager
# guarantees the file handle is closed as soon as the token has been read.
with open('token.txt', 'r') as token_file:
    os.environ['HF_TOKEN'] = token_file.read().strip()

In [26]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


# Initialize the Model and Processor

In [27]:
# Single source of truth for the checkpoint shared by processor and model.
WHISPER_CHECKPOINT = "openai/whisper-tiny"

# The processor turns raw audio into model inputs and decodes generated
# token ids back to text.
processor = WhisperProcessor.from_pretrained(WHISPER_CHECKPOINT)

# Load the weights, then move the model onto the selected device.
model = WhisperForConditionalGeneration.from_pretrained(WHISPER_CHECKPOINT)
model = model.to(device)

# Switch to evaluation mode; as the last expression of the cell this also
# displays the module summary.
model.eval()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


WhisperForConditionalGeneration(
  (model): WhisperModel(
    (encoder): WhisperEncoder(
      (conv1): Conv1d(80, 384, kernel_size=(3,), stride=(1,), padding=(1,))
      (conv2): Conv1d(384, 384, kernel_size=(3,), stride=(2,), padding=(1,))
      (embed_positions): Embedding(1500, 384)
      (layers): ModuleList(
        (0-3): 4 x WhisperEncoderLayer(
          (self_attn): WhisperSdpaAttention(
            (k_proj): Linear(in_features=384, out_features=384, bias=False)
            (v_proj): Linear(in_features=384, out_features=384, bias=True)
            (q_proj): Linear(in_features=384, out_features=384, bias=True)
            (out_proj): Linear(in_features=384, out_features=384, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=384, out_features=1536, bias=True)
          (fc2): Linear(in_features=1536, out_features=384, bias=True)
          

# Load and Resample Audio

In [29]:
def load_audio(file_path: str, target_sampling_rate: int = 16000) -> Tuple[np.ndarray, int]:
    """Load an audio file as a mono waveform at the requested sampling rate.

    Args:
        file_path: Path to the audio file on disk.
        target_sampling_rate: Sampling rate (Hz) passed to ``librosa.load`` as
            ``sr``; the audio is resampled to this rate on load.

    Returns:
        A ``(audio, sr)`` tuple: the waveform returned by ``librosa.load`` and
        the sampling rate it reports.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        RuntimeError: If loading/decoding fails for any other reason. The
            original exception is attached as ``__cause__``.
    """
    # Fail fast with a precise error instead of a decoder-specific one.
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Audio file not found: {file_path}")

    try:
        audio, sr = librosa.load(file_path, sr=target_sampling_rate, mono=True)
    except Exception as e:
        # Chain explicitly so the underlying decoder error stays visible in
        # the traceback as the direct cause.
        raise RuntimeError(f"Error loading audio file: {e}") from e

    return audio, sr


In [30]:
# Smoke-test the loader: the bare call makes the notebook display the
# returned (audio, sr) tuple for the sample file.
load_audio("sample_audio.mp3")

(array([ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00, ...,
        -9.9069439e-08,  1.4549005e-06,  4.4094631e-06], dtype=float32),
 16000)

# Chunking Audio

In [None]:
def chunk_audio(audio: np.ndarray, chunk_length: float = 30, sampling_rate: int = 16000) -> List[np.ndarray]:
    """Split a waveform into consecutive, non-overlapping chunks.

    Args:
        audio: Sliceable sequence of samples (e.g. a 1-D NumPy array).
        chunk_length: Length of each chunk in seconds. Fractional values are
            accepted; the chunk size is rounded to a whole number of samples.
        sampling_rate: Number of samples per second of ``audio``.

    Returns:
        A list of slices of ``audio`` in order. Every chunk holds
        ``chunk_length * sampling_rate`` samples except possibly the last,
        which holds the remainder. Empty input yields an empty list.

    Raises:
        ValueError: If ``chunk_length * sampling_rate`` rounds to fewer than
            one sample (zero or negative lengths/rates).
    """
    # range() needs an integer step, so normalise fractional durations.
    chunk_size = int(round(chunk_length * sampling_rate))
    if chunk_size < 1:
        raise ValueError(
            "chunk_length * sampling_rate must be at least one sample, "
            f"got chunk_length={chunk_length!r}, sampling_rate={sampling_rate!r}"
        )

    return [audio[start:start + chunk_size] for start in range(0, len(audio), chunk_size)]


# Transcribing a Chunk

In [None]:
def transcribe_chunk(chunk, sampling_rate=16000):
    """Transcribe one audio chunk and return the decoded text.

    Relies on the notebook-level ``processor``, ``model`` and ``device``.

    Args:
        chunk: Audio samples for a single chunk, passed straight to the
            processor.
        sampling_rate: Sampling rate of ``chunk``, forwarded to the processor.

    Returns:
        The first decoded string of the generated output, with special
        tokens stripped.
    """
    # Convert the raw samples to model input features on the model's device.
    encoded = processor(chunk, sampling_rate=sampling_rate, return_tensors="pt")
    features = encoded.input_features.to(device)

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        token_ids = model.generate(features)

    transcripts = processor.batch_decode(token_ids, skip_special_tokens=True)
    return transcripts[0]

