In [1]:
!pip install opensmile gdown transformers soundfile --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/996.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m993.3/996.0 kB[0m [31m39.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m996.0/996.0 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/70.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.3/70.3 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/41.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

### Accessing Dataset

In [2]:
import os
import tarfile
import tempfile
from pathlib import Path
import matplotlib.pyplot as plt
import librosa
import librosa.display
from IPython.display import Audio, display
import pandas as pd
import numpy as np
import soundfile as sf
from transformers import pipeline
import torch


# install dataset using gdown
!gdown 1p4ZQOwbHkD2RAvq2K5ekY5gcMi24XLnS

tarball_filename = "ADReSSo21-diagnosis-train.tar"

# extract contents from tarball
with tarfile.open(tarball_filename, 'r:*') as tar:
    tar.extractall(path="./")

# remove tarball after extraction
os.remove(tarball_filename)

Downloading...
From (original): https://drive.google.com/uc?id=1p4ZQOwbHkD2RAvq2K5ekY5gcMi24XLnS
From (redirected): https://drive.google.com/uc?id=1p4ZQOwbHkD2RAvq2K5ekY5gcMi24XLnS&confirm=t&uuid=5cc67864-4e5e-4ef5-b19e-9a61902f2202
To: /content/ADReSSo21-diagnosis-train.tar
100% 1.75G/1.75G [00:23<00:00, 73.0MB/s]


### Helper Functions

In [12]:
dataset_dir = Path('ADReSSo21/diagnosis/train')
paths = {
    'audio': dataset_dir / 'audio',
    'audio_ad': dataset_dir / 'audio' / 'ad',
    'audio_cn': dataset_dir / 'audio' / 'cn',
    'segmentation': dataset_dir / 'segmentation',
    'segmentation_ad': dataset_dir / 'segmentation' / 'ad',
    'segmentation_cn': dataset_dir / 'segmentation' / 'cn'
}


def load_audio_file(file_path: Path):
    """Load and return the audio waveform and sample rate using soundfile."""
    return sf.read(file_path)

def load_segmentation(seg_file: Path):
    """Load and return the segmentation CSV as a Pandas DataFrame."""
    return pd.read_csv(seg_file)

def extract_patient_segments(audio: np.ndarray, sr: int, seg_df: pd.DataFrame, speaker: str = "PAR"):
    """
    Extract patient segments (where speaker == speaker) from the audio.

    Returns:
      - patient_mask: Array with patient segments (NaN elsewhere).
      - concatenated: Patient segments concatenated into one array.
      - segments: List of (begin, end) sample index tuples.
    """
    patient_df = seg_df[seg_df['speaker'] == speaker]
    patient_mask = np.full_like(audio, np.nan)
    segments_list = []
    segments = []

    for _, row in patient_df.iterrows():
        begin_sample = int(float(row['begin']) * sr / 1000)
        end_sample = int(float(row['end']) * sr / 1000)
        patient_mask[begin_sample:end_sample] = audio[begin_sample:end_sample]
        segments_list.append(audio[begin_sample:end_sample])
        segments.append((begin_sample, end_sample))

    concatenated = np.concatenate(segments_list) if segments_list else np.array([])
    return patient_mask, concatenated, segments

def process_audio(audio_file: Path, seg_file: Path, plot: bool = False):
    """
    Load an audio file and its corresponding segmentation CSV file,
    extract patient segments (concatenated), and optionally plot the overlay.

    Returns:
      audio, sr, patient_mask, concatenated (patient-only audio), segments
    """
    audio, sr = load_audio_file(audio_file)
    seg_df = load_segmentation(seg_file)
    patient_mask, concatenated, segments = extract_patient_segments(audio, sr, seg_df)

    if plot:
        time_axis = np.linspace(0, len(audio)/sr, num=len(audio))
        plt.figure(figsize=(14, 4))
        plt.plot(time_axis, audio, label="Original")
        plt.plot(time_axis, patient_mask, label="Patient-Only")
        plt.xlabel("Time (s)")
        plt.ylabel("Amplitude")
        plt.title("Patient Speech Isolation")
        plt.legend()
        plt.show()

    return audio, sr, patient_mask, concatenated, segments

def init_transcriber(model_name: str = "openai/whisper-large", device: int = -1):
    """
    Initialize and return the automatic speech recognition pipeline
    using Hugging Face's transformers.
    """
    transcriber = pipeline("automatic-speech-recognition", model=model_name, device=device)
    return transcriber

def transcribe_audio_file(file_path: str, transcriber) -> str:
    """
    Transcribe an audio file given by its file path using the provided transcriber.

    Parameters:
      file_path (str): Path to the audio file.
      transcriber: The initialized ASR pipeline.

    Returns:
      The transcription text.
    """
    transcription_result = transcriber(file_path, return_timestamps=True)
    return transcription_result.get("text", "")

def create_transcription_df(transcription_records: list) -> pd.DataFrame:
    """
    Create a Pandas DataFrame from a list of transcription records.
    """
    return pd.DataFrame(transcription_records)


### Transcription


In [17]:
if __name__ == "__main__":
    # using cpu by setting device to 0
    device = 0
    transcriber = init_transcriber(model_name="openai/whisper-large", device=device)

    # pick a sample audio file from the audio_ad group
    audio_file = next(paths['audio_ad'].glob("*.wav"), None)
    if audio_file is None:
        raise FileNotFoundError("no audio file found in " + str(paths['audio_ad']))

    # construct the corresponding segmentation csv file path
    seg_file = paths['segmentation_ad'] / f"{audio_file.stem}.csv"

    # process audio to extract patient-only segments
    audio, sr, patient_mask, concatenated, segments = process_audio(audio_file, seg_file, plot=False)

    if concatenated.size == 0:
        print("no patient speech segments found in the file.")
    else:
        print("transcribing patient-only speech for file:", audio_file.name)

        # write the concatenated patient audio to a temporary wav file
        import tempfile
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            temp_filename = tmp_file.name
        sf.write(temp_filename, concatenated, sr)

        # transcribe using the temporary file
        transcription_text = transcribe_audio_file(temp_filename, transcriber)
        print("patient-only transcription:")
        print(transcription_text)

        # remove the temporary file
        os.remove(temp_filename)

        # save the transcription in a csv file
        record = [{
            "file_name": audio_file.name,
            "transcription": transcription_text
        }]
        df = create_transcription_df(record)
        csv_filename = "patient_transcriptions.csv"
        df.to_csv(csv_filename, index=False)
        print(f"patient-only transcription saved to {csv_filename}")


Device set to use cuda:0


transcribing patient-only speech for file: adrso049.wav




patient-only transcription:
 The little boy climbing up in some cookers out the cooking job and his little sister reach of a song and they watch Dianna on the loud and it makes us to watch in the basement at the sink Big sis washing the dishes and she got dishes sitting on the sink. I think she's running water. And I say the little sister's reach. Johnny Johnny he's he's up on the ladder, get getting some cookies in the. This is the region that we should reach it up after. He's passing down to her. and the lather about to turn over. The cups, maybe she done washed them and she got them sitting on the sink. Maybe running water on the sink and she got a curry to pour. That she might get some light in there. Since the dishes stacked up, they might be on this thing. Nothing but alright.
patient-only transcription saved to patient_transcriptions.csv
