In [1]:
!pip install openai-whisper

Collecting openai-whisper
  Downloading openai-whisper-20231117.tar.gz (798 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m798.6/798.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l- \ | / - \ | / done
[?25h  Getting requirements to build wheel ... [?25l- done
[?25h  Preparing metadata (pyproject.toml) ... [?25l- done
[?25hCollecting triton<3,>=2.0.0 (from openai-whisper)
  Downloading triton-2.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Collecting tiktoken (from openai-whisper)
  Downloading tiktoken-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading triton-2.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (168.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m168.1/168.1 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tiktoken-0.6.0-cp310-cp310-manylinu

In [2]:
import os
import pandas as pd
import whisper
import torch

In [3]:
def transcribe_audio(audio_file, model, device):
    """Transcribe a single audio file to Hindi text with a loaded Whisper model.

    The clip is decoded as one window: ``whisper.pad_or_trim`` pads or cuts the
    waveform to Whisper's fixed input length (30 seconds according to the
    Whisper documentation), so speech beyond that window is not transcribed.

    Args:
        audio_file: Path of the audio file to read.
        model: Whisper model, e.g. the object returned by ``whisper.load_model``.
        device: ``torch.device`` or device string the model lives on; the
            spectrogram is moved there before decoding.

    Returns:
        The decoded text (``result.text``) for the clip.
    """
    audio = whisper.load_audio(audio_file)
    audio = whisper.pad_or_trim(audio)

    # Use the mel-bin count the model was built with rather than the library
    # default, so checkpoints with a different mel size decode correctly.
    mel = whisper.log_mel_spectrogram(audio, n_mels=model.dims.n_mels).to(device)

    # DecodingOptions defaults to fp16=True; half precision is only kept off
    # the CPU, where it is unsupported or unreliable.
    use_fp16 = torch.device(device).type != "cpu"

    # Force Hindi so the decoder does not try to auto-detect the language.
    options = whisper.DecodingOptions(language="hi", fp16=use_fp16)
    result = whisper.decode(model, mel, options)

    return result.text

In [4]:
def main(
    audio_directory="/kaggle/input/gv-eval-3h/GV_Eval_3h/Audio",
    output_path="transcription.xlsx",
    model_name="base",
):
    """Transcribe every ``.mp3`` file in a directory and save the results to Excel.

    Files are processed in sorted name order so the rows of the spreadsheet are
    the same on every run (``os.listdir`` itself returns entries in arbitrary
    order).

    Args:
        audio_directory: Directory scanned (non-recursively) for ``.mp3`` files.
        output_path: Path of the Excel workbook to write.
        model_name: Whisper model name passed to ``whisper.load_model``.

    Returns:
        None. The function prints progress and writes ``output_path`` with the
        columns ``"Audio File"`` and ``"Transcription"``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"using {device} as accelerator")

    # Load the weights straight onto the chosen device.
    model = whisper.load_model(model_name, device=device)

    # Sort for a deterministic processing and output order.
    audio_files = sorted(
        name for name in os.listdir(audio_directory) if name.endswith(".mp3")
    )

    transcriptions = []
    for count, audio_file in enumerate(audio_files, start=1):
        audio_path = os.path.join(audio_directory, audio_file)
        transcription = transcribe_audio(audio_path, model, device)
        transcriptions.append((audio_file, transcription))
        # Report progress every tenth file.
        if count % 10 == 0:
            print(f"Total {count} files transcribed till now")

    print("\n\n\n\n-------------------------")
    print(f"A total of {len(transcriptions)} files were transcribed")

    df = pd.DataFrame(transcriptions, columns=["Audio File", "Transcription"])
    df.to_excel(output_path, index=False)

In [5]:
# Entry point: run the full transcription pipeline when this cell is executed
# as the main module.
if __name__ == "__main__":
    main()

using cuda as accelerator


100%|████████████████████████████████████████| 139M/139M [00:00<00:00, 165MiB/s]


Total 10 files transcribed till now
Total 20 files transcribed till now
Total 30 files transcribed till now
Total 40 files transcribed till now
Total 50 files transcribed till now
Total 60 files transcribed till now
Total 70 files transcribed till now
Total 80 files transcribed till now
Total 90 files transcribed till now
Total 100 files transcribed till now
Total 110 files transcribed till now
Total 120 files transcribed till now
Total 130 files transcribed till now
Total 140 files transcribed till now
Total 150 files transcribed till now
Total 160 files transcribed till now
Total 170 files transcribed till now
Total 180 files transcribed till now
Total 190 files transcribed till now
Total 200 files transcribed till now
Total 210 files transcribed till now
Total 220 files transcribed till now
Total 230 files transcribed till now
Total 240 files transcribed till now
Total 250 files transcribed till now
Total 260 files transcribed till now
Total 270 files transcribed till now
Total 280 