# Automatic Speech Recognition (ASR)

In [None]:
import os
import time
import json

## 1) Wav2Vec2: wav2vec2-xls-r-1b-english, wav2vec2-large-xlsr-53-english

In [3]:
from huggingsound import SpeechRecognitionModel

# Candidate checkpoints for this section; the index below picks the active one
wav2vec2_checkpoints = (
    "jonatasgrosman/wav2vec2-large-xlsr-53-english",
    "jonatasgrosman/wav2vec2-xls-r-1b-english",
)
model_name = wav2vec2_checkpoints[0]

# Build the recognizer for the selected checkpoint
model = SpeechRecognitionModel(model_name)

In [5]:
# Transcribe audio files from the training set
# Each group folder is processed and timed separately; one JSON file is
# written per audio file, named after the part of the filename before the first '.'.
for group in ['cc', 'cd']:
    print(group)
    path = 'data/ADReSS-IS2020/train/Full_wave_enhanced_audio/' + group + '/'
    path_save = 'data/ADReSS-IS2020/train/transcription_ASR_full_' + model_name.split('/')[1] + '/' + group + '/'
    # open(..., 'w') does not create missing folders, so make sure the output folder exists
    os.makedirs(path_save, exist_ok=True)
    files = os.listdir(path)
    tic = time.time()
    for file in files:
        filename = path + file
        # The file is passed as a one-element list
        transcriptions = model.transcribe([filename])
        with open(path_save + file.split('.')[0] + '.json', 'w') as f:
            json.dump(transcriptions, f)
    toc = time.time()
    # Elapsed wall-clock time for this group, in minutes
    print('Duration:', round((toc - tic) / 60, 2), 'min')

In [4]:
# Transcribe audio files from the test set
# One JSON file is written per audio file, named after the part of the
# filename before the first '.'.
path = 'data/ADReSS-IS2020/test/Full_wave_enhanced_audio/'
path_save = 'data/ADReSS-IS2020/test/transcription_ASR_full_' + model_name.split('/')[1] + '/'
# open(..., 'w') does not create missing folders, so make sure the output folder exists
os.makedirs(path_save, exist_ok=True)

files = os.listdir(path)
tic = time.time()
for file in files:
    filename = path + file
    # The file is passed as a one-element list
    transcriptions = model.transcribe([filename])
    with open(path_save + file.split('.')[0] + '.json', 'w') as f:
        json.dump(transcriptions, f)
toc = time.time()
# Elapsed wall-clock time for the whole test set, in minutes
print('Duration:', round((toc - tic) / 60, 2), 'min')

## 2.1) Whisper: base, large

In [None]:
import whisper

# Whisper checkpoints considered in this section; the index below picks the active one
whisper_sizes = ("base", "large")
model_name = whisper_sizes[0]

# Instantiate the selected Whisper model
model = whisper.load_model(model_name)

In [7]:
# Transcribe audio files from the training set
# Each group folder is processed and timed separately; one JSON file is
# written per audio file, named after the part of the filename before the first '.'.
# NOTE(review): the output folder name is hard-coded to 'whisper_large' and does
# not follow `model_name` -- confirm the loaded model matches before running.
for group in ['cc', 'cd']:
    print(group)
    path = 'data/ADReSS-IS2020/train/Full_wave_enhanced_audio/' + group + '/'
    path_save = 'data/ADReSS-IS2020/train/transcription_ASR_full_whisper_large/' + group + '/'
    # open(..., 'w') does not create missing folders, so make sure the output folder exists
    os.makedirs(path_save, exist_ok=True)
    files = os.listdir(path)
    tic = time.time()
    for file in files:
        filename = path + file
        transcriptions = model.transcribe(filename)
        with open(path_save + file.split('.')[0] + '.json', 'w') as f:
            json.dump(transcriptions, f)
    toc = time.time()
    # Elapsed wall-clock time for this group, in minutes
    print('Duration:', round((toc - tic) / 60, 2), 'min')

In [6]:
# Transcribe audio files from the test set
# One JSON file is written per audio file, named after the part of the
# filename before the first '.'.
# NOTE(review): the output folder name is hard-coded to 'whisper_large' and does
# not follow `model_name` -- confirm the loaded model matches before running.
path = 'data/ADReSS-IS2020/test/Full_wave_enhanced_audio/'
path_save = 'data/ADReSS-IS2020/test/transcription_ASR_full_whisper_large/'
# open(..., 'w') does not create missing folders, so make sure the output folder exists
os.makedirs(path_save, exist_ok=True)

files = os.listdir(path)
tic = time.time()
for file in files:
    filename = path + file
    transcriptions = model.transcribe(filename)
    with open(path_save + file.split('.')[0] + '.json', 'w') as f:
        json.dump(transcriptions, f)
toc = time.time()
# Elapsed wall-clock time for the whole test set, in minutes
print('Duration:', round((toc - tic) / 60, 2), 'min')

## 2.2) WhisperX: base, large

In [None]:
import whisperx

# Select model
model_name = "base"
# model_name = "large"

# Device shared by the ASR model and the alignment calls in the cells below,
# which reference `device` by name
device = "cuda"

# Load ASR model
model = whisperx.load_model(model_name, device=device)

In [8]:
# Transcribe audio files from the training set
# For every audio file two JSON files are written: '<name>_text.json' with the
# raw transcription result and '<name>_alignements.json' with the aligned word segments.
# NOTE(review): the output folder name is hard-coded to 'whisper_large_X' and does
# not follow `model_name` -- confirm the loaded model matches before running.

# Reuse `device` when an earlier cell defined it; otherwise fall back to "cuda",
# the device given to whisperx.load_model when the ASR model was created.
if 'device' not in globals():
    device = "cuda"

# Alignment models keyed by language code, so each one is loaded once
# instead of once per audio file
align_models = {}

for group in ['cc', 'cd']:
    print(group)
    path = 'data/ADReSS-IS2020/train/Full_wave_enhanced_audio/' + group + '/'
    path_save = 'data/ADReSS-IS2020/train/transcription_ASR_full_whisper_large_X/' + group + '/'
    # open(..., 'w') does not create missing folders, so make sure the output folder exists
    os.makedirs(path_save, exist_ok=True)
    files = os.listdir(path)
    tic = time.time()
    for file in files:
        filename = path + file

        # Transcribe audio with whisper
        result = model.transcribe(filename, language='en')
        with open(path_save + file.split('.')[0] + '_text.json', 'w') as f:
            json.dump(result, f)

        # Load (or reuse) the alignment model and metadata, then align whisper output
        language_code = result["language"]
        if language_code not in align_models:
            align_models[language_code] = whisperx.load_align_model(language_code=language_code, device=device)
        model_a, metadata = align_models[language_code]
        result_aligned = whisperx.align(result["segments"], model_a, metadata, filename, device)

        with open(path_save + file.split('.')[0] + '_alignements.json', 'w') as f:
            json.dump(result_aligned['word_segments'], f)

    toc = time.time()
    # Previously recorded durations: 12.06 min + 15.05 min (about 27 min in total)
    print('Duration:', round((toc - tic) / 60, 2), 'min')

In [9]:
# Transcribe audio files from the test set
# For every audio file two JSON files are written: '<name>_text.json' with the
# raw transcription result and '<name>_alignements.json' with the aligned word segments.
# NOTE(review): the output folder name is hard-coded to 'whisper_large_X' and does
# not follow `model_name` -- confirm the loaded model matches before running.
path = 'data/ADReSS-IS2020/test/Full_wave_enhanced_audio/'
path_save = 'data/ADReSS-IS2020/test/transcription_ASR_full_whisper_large_X/'
# open(..., 'w') does not create missing folders, so make sure the output folder exists
os.makedirs(path_save, exist_ok=True)

# Reuse `device` when an earlier cell defined it; otherwise fall back to "cuda",
# the device given to whisperx.load_model when the ASR model was created.
if 'device' not in globals():
    device = "cuda"

# Alignment models keyed by language code, so each one is loaded once
# instead of once per audio file
align_models = {}

files = os.listdir(path)
tic = time.time()
for file in files:
    filename = path + file

    # Transcribe audio with whisper
    result = model.transcribe(filename, language='en')
    with open(path_save + file.split('.')[0] + '_text.json', 'w') as f:
        json.dump(result, f)

    # Load (or reuse) the alignment model and metadata, then align whisper output
    language_code = result["language"]
    if language_code not in align_models:
        align_models[language_code] = whisperx.load_align_model(language_code=language_code, device=device)
    model_a, metadata = align_models[language_code]
    result_aligned = whisperx.align(result["segments"], model_a, metadata, filename, device)

    with open(path_save + file.split('.')[0] + '_alignements.json', 'w') as f:
        json.dump(result_aligned['word_segments'], f)

toc = time.time()
# Elapsed wall-clock time for the whole test set, in minutes
print('Duration:', round((toc - tic) / 60, 2), 'min')