In [2]:
!pip install opensmile gdown transformers soundfile --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/996.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m450.6/996.0 kB[0m [31m13.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m996.0/996.0 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.3/70.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.9/41.9 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.9/150.9 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

### Accessing Dataset

In [5]:
import os
import tarfile
from pathlib import Path
import os
import tempfile
import concurrent.futures
from pathlib import Path
import matplotlib.pyplot as plt
import librosa
import librosa.display
from IPython.display import Audio, display
import pandas as pd
import numpy as np
import soundfile as sf
from transformers import pipeline
import torch

# install dataset using gdown
!gdown 1p4ZQOwbHkD2RAvq2K5ekY5gcMi24XLnS

tarball_filename = "ADReSSo21-diagnosis-train.tar"

# extract contents from tarball
with tarfile.open(tarball_filename, 'r:*') as tar:
    tar.extractall(path="./")

# remove tarball after extraction
os.remove(tarball_filename)

Downloading...
From (original): https://drive.google.com/uc?id=1p4ZQOwbHkD2RAvq2K5ekY5gcMi24XLnS
From (redirected): https://drive.google.com/uc?id=1p4ZQOwbHkD2RAvq2K5ekY5gcMi24XLnS&confirm=t&uuid=b3ddf379-a455-4aa3-a747-bf6402e534b9
To: /content/ADReSSo21-diagnosis-train.tar
100% 1.75G/1.75G [00:18<00:00, 92.3MB/s]


In [13]:
import os
import tempfile
from pathlib import Path
import matplotlib.pyplot as plt
import librosa
import librosa.display
from IPython.display import Audio, display
import pandas as pd
import numpy as np
import soundfile as sf
from transformers import pipeline
import torch
import warnings

# -------------------------------------------------------------------------------
# Warning Suppression: Authentication and Whisper Deprecation Warnings
# -------------------------------------------------------------------------------
warnings.filterwarnings("ignore", message="Error while fetching `HF_TOKEN` secret value from your vault")
warnings.filterwarnings("ignore", message="Due to a bug fix in", category=UserWarning)
warnings.filterwarnings("ignore", message="The input name `inputs` is deprecated", category=FutureWarning)

# -------------------------------------------------------------------------------
# Helper Functions
# -------------------------------------------------------------------------------

def get_dataset_paths(dataset_root: str = "ADReSSo21/diagnosis/train"):
    """
    Return a dictionary with paths for audio and segmentation files.
    """
    dataset_dir = Path(dataset_root)
    return {
        "audio": dataset_dir / "audio",
        "audio_ad": dataset_dir / "audio" / "ad",
        "audio_cn": dataset_dir / "audio" / "cn",
        "segmentation": dataset_dir / "segmentation",
        "segmentation_ad": dataset_dir / "segmentation" / "ad",
        "segmentation_cn": dataset_dir / "segmentation" / "cn"
    }

def load_audio_file(file_path: Path):
    """Load audio waveform and sampling rate using soundfile."""
    return sf.read(file_path)

def load_segmentation(seg_file: Path):
    """Load the segmentation CSV into a Pandas DataFrame."""
    return pd.read_csv(seg_file)

def extract_patient_segments(audio: np.ndarray, sr: int, seg_df: pd.DataFrame, speaker: str = "PAR"):
    """
    Extract patient segments from the audio as specified in the segmentation CSV.

    Returns:
      - patient_mask: An array with patient segments (NaN elsewhere).
      - concatenated: Patient segments concatenated into one array.
      - segments: List of (begin, end) sample index tuples.
    """
    patient_df = seg_df[seg_df["speaker"] == speaker]
    patient_mask = np.full_like(audio, np.nan)
    segments_list = []
    segments = []
    for _, row in patient_df.iterrows():
        begin_sample = int(float(row["begin"]) * sr / 1000)
        end_sample = int(float(row["end"]) * sr / 1000)
        patient_mask[begin_sample:end_sample] = audio[begin_sample:end_sample]
        segments_list.append(audio[begin_sample:end_sample])
        segments.append((begin_sample, end_sample))
    concatenated = np.concatenate(segments_list) if segments_list else np.array([])
    return patient_mask, concatenated, segments

def process_audio(audio_file: Path, seg_file: Path, plot: bool = False):
    """
    Load an audio file and its corresponding segmentation CSV,
    extract the patient-only segments, and optionally plot the waveform.

    Returns:
      audio, sr, patient_mask, concatenated (patient-only audio), segments.
    """
    audio, sr = load_audio_file(audio_file)
    seg_df = load_segmentation(seg_file)
    patient_mask, concatenated, segments = extract_patient_segments(audio, sr, seg_df)
    if plot:
        time_axis = np.linspace(0, len(audio) / sr, num=len(audio))
        plt.figure(figsize=(14, 4))
        plt.plot(time_axis, audio, label="Original")
        plt.plot(time_axis, patient_mask, label="Patient-Only")
        plt.xlabel("Time (s)")
        plt.ylabel("Amplitude")
        plt.title("Patient Speech Isolation")
        plt.legend()
        plt.show()
    return audio, sr, patient_mask, concatenated, segments

def init_transcriber(model_name: str = "openai/whisper-large", device: int = 0):
    """
    Initialize the Hugging Face ASR pipeline.

    Set device to 0 to use your GPU.
    """
    return pipeline("automatic-speech-recognition", model=model_name, device=device)

def transcribe_audio_file(file_path: str, transcriber) -> str:
    """
    Transcribe an audio file given its file path.

    Returns the transcription text.
    """
    result = transcriber(file_path, return_timestamps=True)
    return result.get("text", "")

def create_transcription_df(transcription_records: list) -> pd.DataFrame:
    """
    Create a Pandas DataFrame from a list of transcription records.
    """
    return pd.DataFrame(transcription_records)

# -------------------------------------------------------------------------------
# Main Function: Get Transcripts for 30 Files on GPU
# -------------------------------------------------------------------------------
def get_transcripts():
    """
    Process 30 audio files from the dataset (from both AD and CN groups) sequentially on GPU,
    print the word count for each transcript, and save the sorted transcripts to a CSV file.
    """
    paths = get_dataset_paths()
    device = 0  # Use GPU (device index 0)
    model_name = "openai/whisper-large"

    # Gather audio files from both AD and CN groups, sort by file name, and select the first 30
    audio_files = sorted(list(paths["audio_ad"].glob("*.wav")) + list(paths["audio_cn"].glob("*.wav")), key=lambda f: f.name)[:30]
    print(f"Processing {len(audio_files)} audio files sequentially on GPU.")

    transcripts = []
    transcriber = init_transcriber(model_name=model_name, device=device)  # Initialize transcriber once
    for audio_file in audio_files:
        # Determine segmentation CSV based on the parent folder
        if "ad" in audio_file.parent.name.lower():
            seg_file = paths["segmentation_ad"] / f"{audio_file.stem}.csv"
        else:
            seg_file = paths["segmentation_cn"] / f"{audio_file.stem}.csv"

        # Process audio to extract the patient-only audio
        audio, sr, patient_mask, concatenated, segments = process_audio(audio_file, seg_file, plot=False)
        if concatenated.size == 0:
            transcript = "No patient speech segments found."
        else:
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
                temp_filename = tmp_file.name
            sf.write(temp_filename, concatenated, sr)
            transcript = transcribe_audio_file(temp_filename, transcriber)
            os.remove(temp_filename)
        transcripts.append({"file_name": audio_file.name, "transcription": transcript})
        word_count = len(transcript.split())
        print(f"File '{audio_file.name}': {word_count} words in transcript.")

    # Ensure the list of transcripts is sorted by file name (it should already be sorted but this guarantees it)
    transcripts = sorted(transcripts, key=lambda x: x["file_name"])

    df = create_transcription_df(transcripts)
    csv_filename = "sorted_patient_transcriptions.csv"
    df.to_csv(csv_filename, index=False)
    print(f"All sorted transcriptions saved to {csv_filename}")

### Transcription


In [14]:
get_transcripts()

Processing 30 audio files sequentially on GPU.


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cuda:0
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


File 'adrso002.wav': 171 words in transcript.


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


File 'adrso003.wav': 82 words in transcript.
File 'adrso005.wav': 141 words in transcript.
File 'adrso007.wav': 63 words in transcript.
File 'adrso008.wav': 137 words in transcript.
File 'adrso010.wav': 41 words in transcript.
File 'adrso012.wav': 137 words in transcript.
File 'adrso014.wav': 138 words in transcript.
File 'adrso015.wav': 99 words in transcript.


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


File 'adrso016.wav': 79 words in transcript.


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


File 'adrso017.wav': 117 words in transcript.
File 'adrso018.wav': 5 words in transcript.
File 'adrso019.wav': 5 words in transcript.
File 'adrso021.wav': 5 words in transcript.
File 'adrso022.wav': 5 words in transcript.
File 'adrso023.wav': 5 words in transcript.
File 'adrso024.wav': 182 words in transcript.
File 'adrso025.wav': 195 words in transcript.
File 'adrso027.wav': 83 words in transcript.
File 'adrso028.wav': 47 words in transcript.


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


File 'adrso031.wav': 90 words in transcript.
File 'adrso032.wav': 184 words in transcript.


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


File 'adrso033.wav': 72 words in transcript.


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


File 'adrso035.wav': 84 words in transcript.
File 'adrso036.wav': 40 words in transcript.
File 'adrso039.wav': 28 words in transcript.


Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


File 'adrso043.wav': 250 words in transcript.
File 'adrso045.wav': 291 words in transcript.
File 'adrso046.wav': 108 words in transcript.
File 'adrso047.wav': 94 words in transcript.
All sorted transcriptions saved to sorted_patient_transcriptions.csv
