In [None]:
!pip install datasets
!pip install librosa
!pip install pydub

In [None]:
# Load model directly
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

# Pull the pretrained Whisper-tiny checkpoint through the generic Auto* classes.
# NOTE(review): the next cell rebinds `processor` and `model` using the
# Whisper-specific classes, so this load looks redundant — confirm before removing.
whisper_checkpoint = "openai/whisper-tiny"
model = AutoModelForSpeechSeq2Seq.from_pretrained(whisper_checkpoint)
processor = AutoProcessor.from_pretrained(whisper_checkpoint)

In [None]:
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import torch
import librosa  # For loading audio files
from pydub import AudioSegment  # For handling AAC files

# --- Model setup -------------------------------------------------------------
# Load the Whisper-tiny processor (feature extractor + tokenizer) and the model.
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
# Clear any forced decoder prompt stored on the model config; the language and
# task are passed explicitly to `generate()` further down.
model.config.forced_decoder_ids = None

# --- Audio loading -----------------------------------------------------------
# Decode the AAC file with pydub and write it out as WAV for librosa.
audio_file = "/content/test1.aac"  # Replace with your AAC file path
audio = AudioSegment.from_file(audio_file, format="aac")
# `export()` hands back the open output file object; close it so the WAV is
# fully flushed and the file descriptor is not leaked.
audio.export("temp.wav", format="wav").close()

# Load the temporary WAV file using librosa, resampling to 16 kHz.
audio_input, sample_rate = librosa.load("temp.wav", sr=16000)

# --- Feature extraction ------------------------------------------------------
# The Whisper feature extractor already pads (or truncates) the log-mel
# spectrogram to its fixed 30-second window, so no manual padding is required.
# Padding spectrogram values with the tokenizer's pad_token_id would be wrong
# anyway: that id is a vocabulary index, not an audio feature value.
# NOTE: audio longer than 30 seconds is truncated by the default settings.
# `return_attention_mask=True` yields a frame-level mask marking real audio
# versus padding, which is the mask `generate()` expects.
input_features = processor(
    audio_input,
    sampling_rate=sample_rate,
    return_tensors="pt",
    return_attention_mask=True,
)
attention_mask = input_features.attention_mask

# --- Generation --------------------------------------------------------------
# The language/task selection belongs to `generate()`; passing `language` to
# the processor call has no effect on decoding.
predicted_ids = model.generate(
    input_features.input_features,
    attention_mask=attention_mask,
    language="en",
    task="transcribe",
)

# --- Decoding ----------------------------------------------------------------
# Convert token ids back to text, dropping special tokens such as timestamps
# and language/task markers.
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
print(transcription)