In [1]:
# Select the compute device. Running this notebook on a GPU is highly recommended;
# the CPU is used as a fallback when CUDA is not available.
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cpu')

## Feature Extractor, Tokenizer, Processor

In [2]:
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from transformers import WhisperFeatureExtractor, WhisperTokenizer

In [3]:
# The feature extractor and the tokenizer are loaded from the base
# "openai/whisper-small" checkpoint; the tokenizer is configured for Swahili
# transcription.
feature_extractor = WhisperFeatureExtractor.from_pretrained(
    "openai/whisper-small",
)
tokenizer = WhisperTokenizer.from_pretrained(
    "openai/whisper-small",
    language="Swahili",
    task="transcribe",
)
# NOTE(review): the processor is loaded from a different repository than the
# two objects above -- confirm that this mix is intentional.
processor = WhisperProcessor.from_pretrained(
    "Jayem-11/whisper-small-swahili-3",
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Model

In [4]:
# Load the final model weights from the "model/" path and move the model to
# the selected device.
model = WhisperForConditionalGeneration.from_pretrained("model/")
model = model.to(device)
# Decoder prompt ids for Swahili ("sw") transcription, taken from the processor.
forced_decoder_ids = processor.get_decoder_prompt_ids(
    language="sw",
    task="transcribe",
)

## Process Test dataset

In [5]:
# Import only the name that is used instead of a wildcard import, so it is
# clear where `VideoFileClip` comes from and the namespace is not polluted.
from moviepy.editor import VideoFileClip

# Load the video file. "add video path" is a placeholder: replace it with the
# path of the video to transcribe.
# The clip is opened as a context manager so that the underlying reader
# is closed once the audio has been written.
with VideoFileClip("add video path") as video:
    # Extract audio from the video; `video.audio` is None when the file has
    # no audio stream, so fail early with a clear message in that case.
    audio = video.audio
    if audio is None:
        raise ValueError("The video file has no audio track to transcribe.")

    # Save the audio to a temporary file that the following cells read back.
    audio.write_audiofile("temp_audio.wav")


MoviePy - Writing audio in temp_audio.wav


                                                                                                                       

MoviePy - Done.


In [6]:
import librosa
# Load the extracted audio at its native sampling rate (sr=None).
# librosa's default would first resample to 22050 Hz, and the waveform is
# resampled to 16 kHz in a later cell anyway; keeping the native rate here
# means the audio is resampled once instead of twice.
# `sr` holds the sampling rate of the returned waveform `y`.
y, sr = librosa.load("temp_audio.wav", sr=None)

In [68]:
# Convert the loaded waveform from its current sampling rate to 16 kHz,
# the rate that is passed to the feature extractor afterwards.
audio_resampled = librosa.resample(
    y,
    orig_sr=sr,
    target_sr=16000,
)

In [69]:
# Run the feature extractor on the 16 kHz waveform and keep the first entry
# of the returned `input_features` (presumably a batch of one, since a single
# waveform is passed -- TODO confirm).
input_id = feature_extractor(
    audio_resampled,
    sampling_rate=16000,
)["input_features"][0]

In [70]:
# numpy is not imported by any other visible cell, so import it here;
# without it this cell fails with a NameError on a fresh kernel.
import numpy as np

# Add a leading axis of size 1 (the batch axis expected by the model call
# further down) in front of the extracted features.
input_id_3d = np.expand_dims(input_id, axis=0)

### Evaluation metric

In [24]:
# Word Error Rate (WER) metric, loaded through the `evaluate` library.
# NOTE(review): `metric` is not used by any of the visible cells -- either
# compute the WER against a reference transcript or drop this cell.
import evaluate

metric = evaluate.load(path="wer")

Downloading builder script:   0%|          | 0.00/4.49k [00:00<?, ?B/s]

## Inference

In [72]:
import torch
# Wrap the batched NumPy feature array in a torch tensor so that it can be
# moved to the device and fed to the model.
inputs = torch.from_numpy(input_id_3d)

In [73]:
# Generate the transcription token ids for the prepared audio features.
# `forced_decoder_ids` (built from the processor for language="sw",
# task="transcribe" right after the model is loaded) was computed but never
# handed to the model; pass it here so that generation is actually forced to
# Swahili transcription.
# NOTE(review): newer transformers releases prefer `language=`/`task=`
# arguments on `generate` over `forced_decoder_ids` -- confirm against the
# installed version.
generated_tokens = (
    model.generate(
        input_features=inputs.to(device),
        forced_decoder_ids=forced_decoder_ids,
        max_new_tokens=255,
    )
    # Bring the result back to host memory as a NumPy array for decoding.
    .cpu()
    .numpy()
)


In [None]:
# Turn the generated token ids back into text, dropping special tokens,
# and show the resulting list of transcriptions.
decoded_preds = tokenizer.batch_decode(
    generated_tokens,
    skip_special_tokens=True,
)
print(decoded_preds)