In [1]:
!pip install opensmile gdown transformers soundfile --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/996.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m993.3/996.0 kB[0m [31m39.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m996.0/996.0 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/70.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.3/70.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/41.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

### Accessing Dataset

In [2]:
import os
import tarfile
from pathlib import Path
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import librosa
import librosa.display
from IPython.display import Audio, display
import pandas as pd
import numpy as np
import soundfile as sf
from transformers import pipeline
import torch

# install dataset using gdown
!gdown 1p4ZQOwbHkD2RAvq2K5ekY5gcMi24XLnS

tarball_filename = "ADReSSo21-diagnosis-train.tar"

# extract contents from tarball
with tarfile.open(tarball_filename, 'r:*') as tar:
    tar.extractall(path="./")

# remove tarball after extraction
os.remove(tarball_filename)

Downloading...
From (original): https://drive.google.com/uc?id=1p4ZQOwbHkD2RAvq2K5ekY5gcMi24XLnS
From (redirected): https://drive.google.com/uc?id=1p4ZQOwbHkD2RAvq2K5ekY5gcMi24XLnS&confirm=t&uuid=5cc67864-4e5e-4ef5-b19e-9a61902f2202
To: /content/ADReSSo21-diagnosis-train.tar
100% 1.75G/1.75G [00:23<00:00, 73.0MB/s]


### Helper Functions

In [5]:
# root folder of the extracted training split; every other location hangs off it
dataset_dir = Path('ADReSSo21/diagnosis/train')

# lookup table of dataset folders: one entry per top-level folder
# ('audio', 'segmentation') and one per 'ad' / 'cn' subfolder beneath it
paths = {
    key: dataset_dir.joinpath(*parts)
    for key, parts in (
        ('audio', ('audio',)),
        ('audio_ad', ('audio', 'ad')),
        ('audio_cn', ('audio', 'cn')),
        ('segmentation', ('segmentation',)),
        ('segmentation_ad', ('segmentation', 'ad')),
        ('segmentation_cn', ('segmentation', 'cn')),
    )
}

def count_files(directory):
    """
    count the regular files that sit directly inside `directory`.

    sub-directories are not counted and are not descended into.
    """
    # each isfile() result is a bool, so summing them yields the file count
    return sum(
        os.path.isfile(os.path.join(directory, entry_name))
        for entry_name in os.listdir(directory)
    )

def plot_waveform_data(waveform, sr, title="Waveform"):
    """
    draw an already-loaded waveform on a new 12x4 figure and show it.

    parameters:
      waveform: audio samples, passed straight to librosa.display.waveshow.
      sr: sampling rate of `waveform`.
      title (str): figure title; default is "Waveform".
    """
    _, ax = plt.subplots(figsize=(12, 4))
    librosa.display.waveshow(waveform, sr=sr, ax=ax)
    # set after waveshow so these labels are the ones that end up on the axes
    ax.set_title(title)
    ax.set_xlabel("time (s)")
    ax.set_ylabel("amplitude")
    plt.show()

def plot_waveform(file_path=None, folder=paths['audio_ad']):
    """
    load an audio file (default: first .wav in folder), print a short summary
    of it, and plot its waveform.

    parameters:
      file_path: path to the audio file; when None, the first .wav file of
        `folder` in sorted order is used.
      folder: directory searched for .wav files when file_path is None.

    returns:
      None. when `folder` holds no .wav file a message is printed and nothing
      is loaded or plotted.
    """
    if file_path is None:
        # glob order depends on the filesystem; sorting makes "first" reproducible
        wav_files = sorted(Path(folder).glob("*.wav"))
        if not wav_files:
            print(f"no .wav files found in {folder}")
            return
        file_path = wav_files[0]
    else:
        file_path = Path(file_path)

    # sr=None keeps the file's native sampling rate instead of resampling
    waveform, sr = librosa.load(str(file_path), sr=None)
    print("original waveform shape:", waveform.shape)
    print("sampling rate:", sr)
    print("duration (seconds):", len(waveform) / sr, "\n")

    # the original stopped after printing; draw the waveform as the name promises
    plot_waveform_data(waveform, sr, title=f"Waveform: {file_path.name}")

def load_audio_file(file_path: Path):
    """
    read an audio file with soundfile.

    returns:
      the (samples, sample rate) pair produced by soundfile.read.
    """
    audio, sample_rate = sf.read(file_path)
    return audio, sample_rate

def load_segmentation(seg_file: Path):
    """
    read a segmentation csv file into a pandas dataframe.

    parameters:
      seg_file (Path): location of the csv file.

    returns:
      the dataframe produced by pandas.read_csv with default options.
    """
    segmentation_df = pd.read_csv(seg_file)
    return segmentation_df

def extract_patient_segments(audio: np.ndarray, sr: int, seg_df: pd.DataFrame, speaker: str = "PAR"):
    """
    extract patient segments from the audio where speaker equals the specified speaker.

    parameters:
      audio (np.ndarray): audio samples, indexed by sample along the first axis.
      sr (int): sampling rate of `audio` in samples per second.
      seg_df (pd.DataFrame): segmentation table with 'speaker', 'begin' and
        'end' columns; 'begin' / 'end' are interpreted as milliseconds.
      speaker (str): value of the 'speaker' column to keep; default is "PAR".

    returns:
      - patient_mask: float array shaped like `audio` holding the selected
        samples, nan elsewhere.
      - concatenated: selected segments joined back to back (empty array when
        no row matches).
      - segments: list of (begin, end) sample index tuples, clamped to the
        bounds of `audio`.
    """
    patient_df = seg_df[seg_df['speaker'] == speaker]

    # nan only exists in floating dtypes; filling an integer array with nan
    # would silently store garbage values, so integer audio gets a float mask
    mask_dtype = audio.dtype if np.issubdtype(audio.dtype, np.floating) else np.float64
    patient_mask = np.full_like(audio, np.nan, dtype=mask_dtype)

    n_samples = len(audio)
    chunks = []
    segments = []

    for begin_ms, end_ms in zip(patient_df['begin'], patient_df['end']):
        # milliseconds -> sample index, clamped so that a negative or
        # past-the-end timestamp cannot wrap around or overrun the array
        begin_sample = min(max(int(float(begin_ms) * sr / 1000), 0), n_samples)
        end_sample = min(max(int(float(end_ms) * sr / 1000), 0), n_samples)
        patient_mask[begin_sample:end_sample] = audio[begin_sample:end_sample]
        chunks.append(audio[begin_sample:end_sample])
        segments.append((begin_sample, end_sample))

    concatenated = np.concatenate(chunks) if chunks else np.array([])
    return patient_mask, concatenated, segments

def plot_overlay_waveform(audio: np.ndarray, sr: int, patient_mask: np.ndarray,
                          orig_color: str = "#0000FF", patient_color: str = "#00FF00"):
    """
    plot the original waveform and overlay the patient-only segments.

    parameters:
      audio (np.ndarray): full recording.
      sr (int): sampling rate of `audio` in samples per second.
      patient_mask (np.ndarray): array of the same length as `audio`; nan
        entries leave gaps in the overlay line.
      orig_color (str): line colour of the full recording.
      patient_color (str): line colour of the overlay.
    """
    # sample i sits at i / sr seconds; linspace(0, n / sr, n) would include the
    # endpoint and stretch every timestamp by a factor of n / (n - 1)
    time_axis = np.arange(len(audio)) / sr
    plt.figure(figsize=(14, 4))
    plt.plot(time_axis, audio, color=orig_color, label="Original")
    plt.plot(time_axis, patient_mask, color=patient_color, label="Patient-Only")
    plt.title("Original Audio with Patient Speech Highlighted")
    plt.xlabel("time (s)")
    plt.ylabel("amplitude")
    plt.legend()
    plt.show()

def process_audio(audio_file: Path, seg_file: Path, plot: bool = False):
    """
    run the full per-recording pipeline: read the audio, read its
    segmentation csv, and pull out the patient segments.

    parameters:
      audio_file (Path): audio file to read.
      seg_file (Path): segmentation csv belonging to that audio file.
      plot (bool): when True, also draw the overlay plot; default is False.

    returns:
      the tuple (audio, sr, patient_mask, concatenated, segments).
    """
    audio, sr = load_audio_file(audio_file)
    extracted = extract_patient_segments(audio, sr, load_segmentation(seg_file))
    patient_mask, concatenated, segments = extracted

    if plot:
        plot_overlay_waveform(audio, sr, patient_mask)

    return audio, sr, patient_mask, concatenated, segments

def init_transcriber(model_name: str = "openai/whisper-large", device: int = -1):
    """
    build a hugging face transformers pipeline for automatic speech recognition.

    parameters:
      model_name (str): hugging face model identifier; default is "openai/whisper-large".
      device (int): where the model runs; -1 selects the cpu, 0 (or another index) a gpu.

    returns:
      the asr pipeline object created by transformers.pipeline.
    """
    return pipeline(
        task="automatic-speech-recognition",
        model=model_name,
        device=device,
    )

def transcribe_audio_file(file_path: Path, transcriber) -> str:
    """
    run one audio file through a transcription pipeline and return its text.

    parameters:
      file_path (Path): location of the .wav audio file.
      transcriber: callable asr pipeline; it is invoked with the path as a
        string and must return a dictionary-like result.

    returns:
      the value stored under "text" in the result, or "" when that key is absent.
    """
    audio_path = str(file_path)
    # timestamps are requested so that long recordings are supported
    result = transcriber(audio_path, return_timestamps=True)
    text = result.get("text", "")
    return text

def create_transcription_df(transcription_records: list) -> pd.DataFrame:
    """
    turn a list of transcription records into a pandas dataframe.

    every record is expected to be a dictionary, e.g. with the keys
    'file_name' and 'transcription'; each record becomes one row.

    parameters:
      transcription_records (list): the record dictionaries.

    returns:
      a dataframe built from the records.
    """
    transcription_df = pd.DataFrame(data=transcription_records)
    return transcription_df


### Transcription


In [6]:
# run on the first gpu when cuda is available, otherwise stay on the cpu (-1)
device = -1
if torch.cuda.is_available():
    device = 0

# build the whisper transcription pipeline on that device
transcriber = init_transcriber(model_name="openai/whisper-large", device=device)

# folder holding the audio files to sample from
audio_dir = Path("ADReSSo21/diagnosis/train/audio/ad")

# take whichever .wav file the directory listing yields first (None if there is none)
sample_file = next(audio_dir.glob("*.wav"), None)

if sample_file is not None:
    print("transcribing file:", sample_file.name)
    # return_timestamps is switched on inside transcribe_audio_file
    transcription_text = transcribe_audio_file(sample_file, transcriber)
    print("transcription:")
    print(transcription_text)

    # wrap the single result in a one-record list and turn it into a dataframe
    transcription_record = [
        {"file_name": sample_file.name, "transcription": transcription_text},
    ]
    df_transcriptions = create_transcription_df(transcription_record)

    # persist the dataframe as csv, without the index column
    csv_filename = "transcriptions.csv"
    df_transcriptions.to_csv(csv_filename, index=False)
    print(f"transcription saved to {csv_filename}")
else:
    print(f"no .wav files found in {audio_dir}")

Device set to use cuda:0


Transcribing file: adrso049.wav


Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


Transcription:
 And there's the picture. All the action that you can see. The little boy climbing up in some cookers out the cooking jar. And his little sister reaching for some. And the little boy standing on the lye. and his big sister washing the dishes at the sink. Big sis washing the dishes and she got dishes sitting on the sink. I think she's running water. Can you tell me anything else that's going on over there? And I say the little sister's reach. Johnny Johnny he is he's up on the ladder, getting some cookies and. This is the region that we should reach it up after. Anything he's passing down to her. Okay, anything else. And now they're about to turn over. Okay. Okay. Anything else going on over here? The cups, maybe she's under washing, but she got them sitting on the sink. And maybe running water on the sink. When the guy is, you gotta carry the pool that you might get some light in there. Since the dishes stacked up, they might be on this thing. Anything else? Nothing but 