<a href="https://colab.research.google.com/github/MikeCorv/WhisperFineTuning/blob/main/testing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the third-party packages this notebook depends on (-q keeps pip's output quiet).
!pip install -q transformers peft accelerate bitsandbytes librosa ffmpeg-python

In [None]:
# Install the punctuation-restoration package imported by the transcription cell.
!pip install -q deepmultilingualpunctuation

In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive into the Colab filesystem; ADAPTER_PATH points inside this mount.
drive.mount("/content/drive")

In [None]:
import torch
# Report whether a CUDA device is available before any model is loaded.
if torch.cuda.is_available():
    print(f"✅ GPU Detected: {torch.cuda.get_device_name(0)}")
else:
    # Nothing is aborted here: the notebook continues on CPU, only slower.
    print("⚠️ No GPU detected. It will work, but it will be slow.")

In [None]:
# Identifier of the base Whisper checkpoint handed to from_pretrained().
BASE_MODEL_ID = "openai/whisper-large-v3-turbo"
# Directory on the mounted Drive that is passed to PeftModel and AutoProcessor
# as the location of the fine-tuned adapter.
ADAPTER_PATH = "/content/drive/MyDrive/whisper-large-v3-turbo-italian-lora"

In [None]:
from transformers import BitsAndBytesConfig, AutoModelForSpeechSeq2Seq, AutoProcessor
from peft import PeftModel

In [None]:
# Quantization settings for the base model: 4-bit loading with a float16 compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    load_in_4bit=True,
)

In [None]:
# Load the base checkpoint using bnb_config for quantization;
# device_map="auto" leaves device placement to the library.
base_model = AutoModelForSpeechSeq2Seq.from_pretrained(
    BASE_MODEL_ID,
    device_map="auto",
    quantization_config=bnb_config,
)

In [None]:
# Wrap the base model with the adapter stored at ADAPTER_PATH.
fine_tuned_model = PeftModel.from_pretrained(
    model=base_model,
    model_id=ADAPTER_PATH,
)

In [None]:
# Load the processor from the adapter directory; its tokenizer and
# feature extractor are handed to the pipeline later on.
processor = AutoProcessor.from_pretrained(
    pretrained_model_name_or_path=ADAPTER_PATH,
)

In [None]:
from transformers import pipeline

In [None]:
# Build the speech-recognition pipeline around the fine-tuned model, reusing
# the tokenizer and feature extractor bundled in `processor`.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=fine_tuned_model,
    feature_extractor=processor.feature_extractor,
    tokenizer=processor.tokenizer,
    # Chunking / batching / output options.
    return_timestamps=True,
    batch_size=16,
    chunk_length_s=30,
)

In [None]:
# The test video is available at: https://www.youtube.com/shorts/Wwz1FenTxTY

In [None]:
# Transcribe the test clip with the fine-tuned model, report how long the
# pipeline call took, then print the raw and punctuation-restored text.
import time
from deepmultilingualpunctuation import PunctuationModel

AUDIO_FILE = "test.mp3"

# perf_counter() is a monotonic, high-resolution clock, so the measured
# duration cannot be skewed by system clock adjustments (unlike time.time()).
start_time = time.perf_counter()
result = pipe(
    AUDIO_FILE,
    generate_kwargs={"language": "italian"},
)
end_time = time.perf_counter()

# Created after the timed section so that only the pipeline call is measured.
punct_model = PunctuationModel(model="oliverguhr/fullstop-punctuation-multilang-large")

print("\n" + "="*30)
print(f"⏱️ Time taken: {end_time - start_time:.2f} seconds")
print("="*30)
print("\n📝 TRANSCRIPTION:\n")
print(result['text'])
print("\n" + "="*30)
print(f"Punctuated Output: {punct_model.restore_punctuation(result['text'])}")