In [None]:
!pip install onnx onnxruntime transformers soundfile
!pip install git+https://github.com/facebookresearch/fairseq.git
!pip install git+https://github.com/openai/whisper.git

Collecting git+https://github.com/openai/whisper.git
  Cloning https://github.com/openai/whisper.git to /tmp/pip-req-build-m91lxdgi
  Running command git clone --filter=blob:none --quiet https://github.com/openai/whisper.git /tmp/pip-req-build-m91lxdgi
  Resolved https://github.com/openai/whisper.git to commit ba3f3cd54b0e5b8ce1ab3de13e32122d0d5f98ab
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tiktoken (from openai-whisper==20231117)
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->openai-whisper==20231117)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==1

In [None]:
import torch
import numpy as np
from collections import OrderedDict
from pathlib import Path
import struct
from whisper import Whisper, ModelDimensions

# Convert a whisper ggml checkpoint into a PyTorch state dict.
#
# Reads, in order: the 12-int32 header, the mel filter bank, the tokenizer
# vocabulary, then every tensor record until end of file. The tensors are
# loaded into a freshly built whisper.Whisper model and its state dict is saved.

# Specify file paths directly
fname_inp = '/content/drive/MyDrive/Models/ggml-base.en.bin'
dir_out = '.'  # Current directory
fname_out = 'torch-model.pt'

# First int32 of a ggml file: the ASCII bytes "ggml" (0x67676d6c == 1734831468).
GGML_MAGIC = 0x67676D6C
# Per-tensor ftype codes this converter knows how to decode. Any other code
# (e.g. a quantized tensor) cannot be read as a plain array and must be rejected
# instead of being misinterpreted as float32.
GGML_FTYPE_TO_DTYPE = {0: np.float32, 1: np.float16}

# Open the ggml file
with open(fname_inp, "rb") as f:
    # Read magic number and hyperparameters (12 consecutive int32 values)
    (magic_number, n_vocab, n_audio_ctx, n_audio_state, n_audio_head,
     n_audio_layer, n_text_ctx, n_text_state, n_text_head, n_text_layer,
     n_mels, use_f16) = struct.unpack("12i", f.read(48))
    print(f"Magic number: {magic_number}")
    if magic_number != GGML_MAGIC:
        raise ValueError(
            f"{fname_inp} is not a ggml file: magic {magic_number:#x}, expected {GGML_MAGIC:#x}"
        )
    print(f"Vocab size: {n_vocab}")
    print(f"Audio context size: {n_audio_ctx}")
    print(f"Audio state size: {n_audio_state}")
    print(f"Audio head size: {n_audio_head}")
    print(f"Audio layer size: {n_audio_layer}")
    print(f"Text context size: {n_text_ctx}")
    print(f"Text head size: {n_text_head}")
    print(f"Mel size: {n_mels}")

    # Read mel filters: two int32 dimensions followed by that many float32 values
    filters_shape_0 = struct.unpack("i", f.read(4))[0]
    print(f"Filters shape 0: {filters_shape_0}")
    filters_shape_1 = struct.unpack("i", f.read(4))[0]
    print(f"Filters shape 1: {filters_shape_1}")

    # One bulk read instead of one struct.unpack per value; stored as float64,
    # row-major, exactly as the element-wise fill produced.
    mel_filters = (
        np.fromfile(f, dtype=np.float32, count=filters_shape_0 * filters_shape_1)
        .reshape(filters_shape_0, filters_shape_1)
        .astype(np.float64)
    )

    # Read tokenizer tokens: a count, then (length, raw bytes) per token.
    # The tokens are not used below, but they must be consumed to reach the tensors.
    num_tokens = struct.unpack("i", f.read(4))[0]
    tokens = {}
    for _ in range(num_tokens):
        token_len = struct.unpack("i", f.read(4))[0]
        token = f.read(token_len)
        tokens[token] = {}

    # Read model variables: each record is (n_dims, name_length, ftype), the
    # dimensions, the UTF-8 name, then the raw tensor data.
    model_state_dict = OrderedDict()
    while True:
        header = f.read(12)
        if not header:
            break  # Clean end of file
        if len(header) != 12:
            raise ValueError(
                f"{fname_inp} is truncated: incomplete tensor header ({len(header)} of 12 bytes)"
            )
        n_dims, name_length, ftype = struct.unpack("iii", header)

        # The file lists dimensions in the reverse of the order used for the
        # NumPy/PyTorch shape, so flip them.
        shape = struct.unpack(f"{n_dims}i", f.read(4 * n_dims))[::-1]
        name = f.read(name_length).decode("utf-8")

        if ftype not in GGML_FTYPE_TO_DTYPE:
            raise ValueError(
                f"Tensor {name!r} has unsupported ftype {ftype}; only f32 (0) and f16 (1) can be converted"
            )
        data = np.fromfile(
            f, dtype=GGML_FTYPE_TO_DTYPE[ftype], count=int(np.prod(shape))
        ).reshape(shape)

        if name in ("encoder.conv1.bias", "encoder.conv2.bias"):
            # These biases are stored 2-D in the file; keep column 0 to get a 1-D vector.
            data = data[:, 0]

        model_state_dict[name] = torch.from_numpy(data)

# Create Whisper model with the dimensions read from the header
dims = ModelDimensions(
    n_mels=n_mels,
    n_audio_ctx=n_audio_ctx,
    n_audio_state=n_audio_state,
    n_audio_head=n_audio_head,
    n_audio_layer=n_audio_layer,
    n_text_ctx=n_text_ctx,
    n_text_state=n_text_state,
    n_text_head=n_text_head,
    n_text_layer=n_text_layer,
    n_vocab=n_vocab,
)
model = Whisper(dims)

# Load the state dict into the model (strict: every key must match)
model.load_state_dict(model_state_dict)

# Save the model in PyTorch format
torch.save(model.state_dict(), fname_out)

print(f"Model successfully converted and saved to {fname_out}")

Magic number: 1734831468
Vocab size: 51864
Audio context size: 1500
Audio state size: 512
Audio head size: 8
Audio layer size: 6
Text context size: 448
Text head size: 8
Mel size: 80
Filters shape 0: 80
Filters shape 1: 201
Model successfully converted and saved to torch-model.pt


In [73]:
import torch
import numpy as np
from collections import OrderedDict
from pathlib import Path
import struct
from whisper import Whisper, ModelDimensions

# Convert a whisper ggml checkpoint into a PyTorch checkpoint that bundles the
# model dimensions together with the weights.
#
# Reads, in order: the 12-int32 header, the mel filter bank, the tokenizer
# vocabulary, then every tensor record until end of file.

# Specify file paths directly
fname_inp = '/content/drive/MyDrive/Models/ggml-base.en.bin'
dir_out = '.'  # Current directory
fname_out = 'torch-model1.pt'

# First int32 of a ggml file: the ASCII bytes "ggml" (0x67676d6c == 1734831468).
GGML_MAGIC = 0x67676D6C
# Per-tensor ftype codes this converter knows how to decode. Any other code
# (e.g. a quantized tensor) cannot be read as a plain array and must be rejected
# instead of being misinterpreted as float32.
GGML_FTYPE_TO_DTYPE = {0: np.float32, 1: np.float16}

# Open the ggml file
with open(fname_inp, "rb") as f:
    # Read magic number and hyperparameters (12 consecutive int32 values)
    (magic_number, n_vocab, n_audio_ctx, n_audio_state, n_audio_head,
     n_audio_layer, n_text_ctx, n_text_state, n_text_head, n_text_layer,
     n_mels, use_f16) = struct.unpack("12i", f.read(48))
    print(f"Magic number: {magic_number}")
    if magic_number != GGML_MAGIC:
        raise ValueError(
            f"{fname_inp} is not a ggml file: magic {magic_number:#x}, expected {GGML_MAGIC:#x}"
        )
    print(f"Vocab size: {n_vocab}")
    print(f"Audio context size: {n_audio_ctx}")
    print(f"Audio state size: {n_audio_state}")
    print(f"Audio head size: {n_audio_head}")
    print(f"Audio layer size: {n_audio_layer}")
    print(f"Text context size: {n_text_ctx}")
    print(f"Text head size: {n_text_head}")
    print(f"Mel size: {n_mels}")

    # Read mel filters: two int32 dimensions followed by that many float32 values
    filters_shape_0 = struct.unpack("i", f.read(4))[0]
    print(f"Filters shape 0: {filters_shape_0}")
    filters_shape_1 = struct.unpack("i", f.read(4))[0]
    print(f"Filters shape 1: {filters_shape_1}")

    # One bulk read instead of one struct.unpack per value; stored as float64,
    # row-major, exactly as the element-wise fill produced.
    mel_filters = (
        np.fromfile(f, dtype=np.float32, count=filters_shape_0 * filters_shape_1)
        .reshape(filters_shape_0, filters_shape_1)
        .astype(np.float64)
    )

    # Read tokenizer tokens: a count, then (length, raw bytes) per token.
    # The tokens are not used below, but they must be consumed to reach the tensors.
    num_tokens = struct.unpack("i", f.read(4))[0]
    tokens = {}
    for _ in range(num_tokens):
        token_len = struct.unpack("i", f.read(4))[0]
        token = f.read(token_len)
        tokens[token] = {}

    # Read model variables: each record is (n_dims, name_length, ftype), the
    # dimensions, the UTF-8 name, then the raw tensor data.
    model_state_dict = OrderedDict()
    while True:
        header = f.read(12)
        if not header:
            break  # Clean end of file
        if len(header) != 12:
            raise ValueError(
                f"{fname_inp} is truncated: incomplete tensor header ({len(header)} of 12 bytes)"
            )
        n_dims, name_length, ftype = struct.unpack("iii", header)

        # The file lists dimensions in the reverse of the order used for the
        # NumPy/PyTorch shape, so flip them.
        shape = struct.unpack(f"{n_dims}i", f.read(4 * n_dims))[::-1]
        name = f.read(name_length).decode("utf-8")

        if ftype not in GGML_FTYPE_TO_DTYPE:
            raise ValueError(
                f"Tensor {name!r} has unsupported ftype {ftype}; only f32 (0) and f16 (1) can be converted"
            )
        data = np.fromfile(
            f, dtype=GGML_FTYPE_TO_DTYPE[ftype], count=int(np.prod(shape))
        ).reshape(shape)

        if name in ("encoder.conv1.bias", "encoder.conv2.bias"):
            # These biases are stored 2-D in the file; keep column 0 to get a 1-D vector.
            data = data[:, 0]

        model_state_dict[name] = torch.from_numpy(data)

# Create Whisper model with the dimensions read from the header
dims = ModelDimensions(
    n_mels=n_mels,
    n_audio_ctx=n_audio_ctx,
    n_audio_state=n_audio_state,
    n_audio_head=n_audio_head,
    n_audio_layer=n_audio_layer,
    n_text_ctx=n_text_ctx,
    n_text_state=n_text_state,
    n_text_head=n_text_head,
    n_text_layer=n_text_layer,
    n_vocab=n_vocab,
)
model = Whisper(dims)

# Load the state dict into the model (strict: every key must match)
model.load_state_dict(model_state_dict)

# --- IMPORTANT: Save the model AND dimensions ---
# 'dims' is stored as a pickled ModelDimensions object, so whoever loads this
# file needs whisper importable and, on PyTorch versions whose torch.load
# defaults to weights_only=True, must pass weights_only=False.
torch.save({
    'dims': dims,        # Save the ModelDimensions object
    'model_state_dict': model.state_dict()
}, fname_out)

print(f"Model successfully converted and saved to {fname_out}")

Magic number: 1734831468
Vocab size: 51864
Audio context size: 1500
Audio state size: 512
Audio head size: 8
Audio layer size: 6
Text context size: 448
Text head size: 8
Mel size: 80
Filters shape 0: 80
Filters shape 1: 201
Model successfully converted and saved to torch-model1.pt


In [78]:
import whisper
import torch

# Export the converted Whisper model as two ONNX graphs: encoder.onnx and decoder.onnx.

# 1. Load the checkpoint dictionary
# weights_only=False is required because the checkpoint holds a pickled
# ModelDimensions object, which the restricted unpickler (the default in newer
# PyTorch releases) refuses. Full unpickling can execute code, so only load a
# file produced by this notebook.
checkpoint = torch.load("torch-model1.pt", weights_only=False)

# 2. Get the ModelDimensions object directly
dims = checkpoint['dims']  # <---- No need to unpack

# 3. Create the Whisper model using the loaded dims
model = whisper.Whisper(dims)

# 4. Load only the model weights
model.load_state_dict(checkpoint['model_state_dict'])

# Prepare dummy inputs (a real clip, so the traced graph sees realistic values)
audio_dummy = whisper.load_audio("/content/drive/MyDrive/Models/jfk.wav")
audio_dummy = whisper.pad_or_trim(audio_dummy)
mel_dummy = whisper.log_mel_spectrogram(audio_dummy).to(model.device)
mel_dummy = mel_dummy[None, :] # Add batch dimension

tokenizer = whisper.tokenizer.get_tokenizer(model.is_multilingual, language="en", task="transcribe")
decoder_input_ids = torch.tensor([[tokenizer.sot]], dtype=torch.long)

# Set the model to inference mode
model.eval()

# --- Export the encoder ---
# Only the batch axis is dynamic. Axis 1 of `mel` is the mel-channel axis, not
# time, and the encoder asserts that its input matches the fixed-size positional
# embedding ("incorrect audio shape"), so the frame axis cannot vary either.
torch.onnx.export(
    model.encoder,
    (mel_dummy,),
    "encoder.onnx",
    input_names=["mel"],
    output_names=["encoder_output"],
    dynamic_axes={
        "mel": {0: "batch"},
        "encoder_output": {0: "batch"}
    }
)
print("Encoder exported to encoder.onnx")

# --- Export the decoder ---
# The token sequence grows by one per decoding step, so its length axis is dynamic.
torch.onnx.export(
    model.decoder,
    (decoder_input_ids, model.encoder(mel_dummy)),
    "decoder.onnx",
    input_names=["tokens", "encoder_output"],
    output_names=["logits"],
    dynamic_axes={
        "tokens": {0: "batch", 1: "text_length"},
        "encoder_output": {0: "batch", 1: "encoder_sequence"},
        "logits": {0: "batch", 1: "text_length"}
    }
)
print("Decoder exported to decoder.onnx")

  assert x.shape[1:] == self.positional_embedding.shape, "incorrect audio shape"


Encoder exported to encoder.onnx
Decoder exported to decoder.onnx


In [83]:
import whisper
import torch
import onnx

# Export the whole Whisper model (encoder + decoder) as a single ONNX graph
# that maps (mel, decoder_input_ids) to logits.

# Load the Whisper model
model_path = "/content/torch-model1.pt"  # Replace with your model path
# weights_only=False is required because the checkpoint holds a pickled
# ModelDimensions object, which the restricted unpickler (the default in newer
# PyTorch releases) refuses. Full unpickling can execute code, so only load a
# file produced by this notebook.
checkpoint = torch.load(model_path, weights_only=False)
dims = checkpoint['dims']
model = whisper.Whisper(dims)
model.load_state_dict(checkpoint['model_state_dict'])

# Prepare dummy inputs (a real clip, so the traced graph sees realistic values)
audio_dummy = whisper.load_audio("/content/drive/MyDrive/Models/jfk.wav")
audio_dummy = whisper.pad_or_trim(audio_dummy)
mel_dummy = whisper.log_mel_spectrogram(audio_dummy).to(model.device)
mel_dummy = mel_dummy[None, :]  # Add batch dimension

tokenizer = whisper.tokenizer.get_tokenizer(model.is_multilingual, language="en", task="transcribe")
decoder_input_ids = torch.tensor([[tokenizer.sot]], dtype=torch.long)

# Set the model to inference mode
model.eval()

# --- Export the combined model ---
# Only the batch axis of `mel` is dynamic. Axis 1 is the mel-channel axis, not
# time, and the encoder asserts that its input matches the fixed-size positional
# embedding ("incorrect audio shape"), so the frame axis cannot vary either.
torch.onnx.export(
    model,
    (mel_dummy, decoder_input_ids),
    "whisper_combined.onnx",
    input_names=["mel", "decoder_input_ids"],
    output_names=["logits"],
    dynamic_axes={
        "mel": {0: "batch"},
        "decoder_input_ids": {0: "batch", 1: "text_length"},
        "logits": {0: "batch", 1: "text_length"}
    },
    opset_version=14
)
print("Combined Whisper model exported to whisper_combined.onnx")

  assert x.shape[1:] == self.positional_embedding.shape, "incorrect audio shape"


Combined Whisper model exported to whisper_combined.onnx


In [None]:
import whisper
from transformers import WhisperTokenizer

# Baseline run: transcribe the sample clip with the stock "base.en" weights
# obtained through whisper's own loader.

# --- Model ---
model = whisper.load_model("base.en")

# --- Tokenizer (Hugging Face) for the same checkpoint ---
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base.en")

# --- Audio -> log-mel spectrogram with a leading batch axis, on the model's device ---
audio_file = "/content/drive/MyDrive/Models/jfk.wav"
audio = whisper.pad_or_trim(whisper.load_audio(audio_file))
mel = whisper.log_mel_spectrogram(audio).to(model.device).unsqueeze(0)

# --- Decode with whisper's decode(); the result is indexed per batch item ---
options = whisper.DecodingOptions(language="en", without_timestamps=True)
results = whisper.decode(model, mel, options)

# --- Show the text of the first (only) batch item ---
print("PyTorch Transcription:", results[0].text)

PyTorch Transcription: And so my fellow Americans ask not what your country can do for you, ask what you can do for your country.


In [76]:
import torch
import whisper
from transformers import WhisperTokenizer

# Sanity check: transcribe the sample clip with the weights converted from ggml.
# The stock "base.en" model is deliberately not loaded here; the model is built
# from the checkpoint alone, so the result reflects only the converted weights.

# 1. Load the checkpoint dictionary
# weights_only=False is required because the checkpoint holds a pickled
# ModelDimensions object, which the restricted unpickler (the default in newer
# PyTorch releases) refuses. Full unpickling can execute code, so only load a
# file produced by this notebook.
checkpoint = torch.load("torch-model1.pt", weights_only=False)

# 2. Get the ModelDimensions object directly
dims = checkpoint['dims']  # <---- No need to unpack

# 3. Create the Whisper model using the loaded dims
model = whisper.Whisper(dims)

# 4. Load only the model weights
model.load_state_dict(checkpoint['model_state_dict'])

# --- Load Tokenizer ---
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base.en")

# --- Load and Preprocess Audio ---
audio_file = "/content/drive/MyDrive/Models/jfk.wav"
audio = whisper.load_audio(audio_file)
audio = whisper.pad_or_trim(audio)
mel = whisper.log_mel_spectrogram(audio).to(model.device)
mel = mel.unsqueeze(0)  # Add batch dimension

# --- Transcribe using Whisper's decode method ---
options = whisper.DecodingOptions(language="en", without_timestamps=True)
results = whisper.decode(model, mel, options)

# --- Print the transcription ---
print("PyTorch Transcription:", results[0].text)


PyTorch Transcription: And so my fellow Americans ask not what your country can do for you, ask what you can do for your country.


In [82]:
import time
import numpy as np
import onnxruntime
import whisper

# Greedy transcription with the separately exported encoder.onnx / decoder.onnx:
# the encoder runs once, then the decoder is re-run on the growing token list
# until it emits the end-of-text token or the token budget is exhausted.

# --- 1. Load ONNX models ---
sess_options = onnxruntime.SessionOptions()
sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
sess_encoder = onnxruntime.InferenceSession("encoder.onnx", sess_options=sess_options)
sess_decoder = onnxruntime.InferenceSession("decoder.onnx", sess_options=sess_options)

# --- 2. Load audio and prepare mel spectrogram ---
audio_file = "/content/drive/MyDrive/Models/jfk.wav"
audio = whisper.load_audio(audio_file)
audio = whisper.pad_or_trim(audio)
mel = whisper.log_mel_spectrogram(audio)
mel = mel.unsqueeze(0).numpy()  # Add batch dimension

# --- 3. Load tokenizer ---
tokenizer = whisper.tokenizer.get_tokenizer(multilingual=False, language="en", task="transcribe")

# --- 4. Set inference parameters ---
# The decoder has N_TEXT_CTX positions (the conversion cell reports
# "Text context size: 448" for this model). The start token plus everything
# generated must fit in that window; a larger budget would overrun the
# decoder's positional embedding on a long generation.
N_TEXT_CTX = 448
max_tokens = N_TEXT_CTX - 1  # one position is taken by the initial sot token
temperature = 0  # not read below: the loop always takes the argmax (greedy)
# ... add other parameters like beam_size, best_of, etc. as needed ...

# --- 5. ONNX Inference ---
start_time = time.time()

# Encode the audio (once; the result is reused for every decoding step)
encoder_output, = sess_encoder.run(["encoder_output"], {"mel": mel})

# Initialize decoder input with start of sequence (sot) token
tokens = [tokenizer.sot]

# Loop to generate tokens
for _ in range(max_tokens):
    # Prepare decoder input: the whole sequence so far, with a batch axis
    decoder_input = np.array([tokens], dtype=np.int64)

    # Run the decoder
    logits, = sess_decoder.run(["logits"], {"tokens": decoder_input, "encoder_output": encoder_output})

    # Sample the next token (greedy decoding for now); convert the NumPy scalar
    # to a plain int so `tokens` stays a homogeneous list of Python ints
    next_token = int(logits[0, -1].argmax())

    # Append the token to the sequence
    tokens.append(next_token)

    # Stop if end-of-sequence (eot) token is generated
    if next_token == tokenizer.eot:
        break

# Decode the generated tokens
transcription = tokenizer.decode(tokens)

# Strip the special-token markers that appear in the decoded text
transcription = transcription.replace("<|startoftranscript|>", "")
transcription = transcription.replace("<|notimestamps|>", "")
transcription = transcription.replace("<|endoftext|>", "")

end_time = time.time()
print(f"ONNX Inference Time: {end_time - start_time:.2f} seconds")
print(f"Transcription: {transcription}")

ONNX Inference Time: 10.85 seconds
Transcription:  And so my fellow Americans ask not what your country can do for you, ask what you can do for your country.


In [84]:
import time
import numpy as np
import onnxruntime
import whisper

# Greedy transcription with the single combined graph whisper_combined.onnx.
# Every decoding step feeds `mel` again alongside the growing token list, so
# the audio is re-processed on each step; expect this to be much slower than
# running the encoder once and reusing its output.

# --- 1. Load the combined ONNX model ---
sess_options = onnxruntime.SessionOptions()
sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
session = onnxruntime.InferenceSession("whisper_combined.onnx", sess_options=sess_options)

# --- 2. Load audio and prepare mel spectrogram ---
audio_file = "/content/drive/MyDrive/Models/jfk.wav" # Replace with your audio file
audio = whisper.load_audio(audio_file)
audio = whisper.pad_or_trim(audio)
mel = whisper.log_mel_spectrogram(audio)
mel = mel.unsqueeze(0).numpy()  # Add batch dimension

# --- 3. Load tokenizer ---
tokenizer = whisper.tokenizer.get_tokenizer(multilingual=False, language="en", task="transcribe")

# --- 4. Set inference parameters ---
# The decoder has N_TEXT_CTX positions (the conversion cell reports
# "Text context size: 448" for this model). The start token plus everything
# generated must fit in that window; a larger budget would overrun the
# decoder's positional embedding on a long generation.
N_TEXT_CTX = 448
max_tokens = N_TEXT_CTX - 1  # one position is taken by the initial sot token
temperature = 0  # not read below: the loop always takes the argmax (greedy)

# --- 5. ONNX Inference ---
start_time = time.time()

# Initialize decoder input with start of sequence (sot) token
tokens = [tokenizer.sot]

# Loop to generate tokens
for _ in range(max_tokens):
    # The whole sequence so far, with a batch axis
    decoder_input = np.array([tokens], dtype=np.int64)

    # Run the combined ONNX model
    logits, = session.run(["logits"], {"mel": mel, "decoder_input_ids": decoder_input})

    # Greedy pick; convert the NumPy scalar to a plain int so `tokens` stays a
    # homogeneous list of Python ints
    next_token = int(logits[0, -1].argmax())
    tokens.append(next_token)

    # Stop once the end-of-text token is produced
    if next_token == tokenizer.eot:
        break

# Decode the generated tokens
transcription = tokenizer.decode(tokens)

# --- 6. Remove special tokens ---
transcription = transcription.replace("<|startoftranscript|>", "")
transcription = transcription.replace("<|notimestamps|>", "")
transcription = transcription.replace("<|endoftext|>", "")

end_time = time.time()
print(f"ONNX Inference Time: {end_time - start_time:.2f} seconds")
print(f"Transcription: {transcription}")

ONNX Inference Time: 70.96 seconds
Transcription:  And so my fellow Americans ask not what your country can do for you, ask what you can do for your country.
