# Download needed audio recordings

Downloads the dataset ZIP from OneDrive into /content.

In [None]:
share_link = "https://mailaub-my.sharepoint.com/:u:/g/personal/bmh26_mail_aub_edu/IQAZZCjtyVumTIbCBrLVy6QGAcoUAQdenZjbzBtPd7_Mr_0?e=HQ5Mxy"
out_path = "/content/prosody_audios.zip"

direct = share_link + ("&download=1" if "?" in share_link else "?download=1")

!wget -O "{out_path}" -L "{direct}"
!ls -lh "{out_path}"


--2025-12-08 12:25:58--  https://mailaub-my.sharepoint.com/:u:/g/personal/bmh26_mail_aub_edu/IQAZZCjtyVumTIbCBrLVy6QGAcoUAQdenZjbzBtPd7_Mr_0?e=HQ5Mxy&download=1
Resolving mailaub-my.sharepoint.com (mailaub-my.sharepoint.com)... 52.105.158.55, 2a01:111:f402:f0b2::55
Connecting to mailaub-my.sharepoint.com (mailaub-my.sharepoint.com)|52.105.158.55|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /personal/bmh26_mail_aub_edu/Documents/project%20490/dataset/prosody%20audios.zip?ga=1 [following]
--2025-12-08 12:25:59--  https://mailaub-my.sharepoint.com/personal/bmh26_mail_aub_edu/Documents/project%20490/dataset/prosody%20audios.zip?ga=1
Reusing existing connection to mailaub-my.sharepoint.com:443.
HTTP request sent, awaiting response... 200 OK
Length: 473186818 (451M) [application/x-zip-compressed]
Saving to: ‘/content/prosody_audios.zip’


2025-12-08 12:26:52 (8.66 MB/s) - ‘/content/prosody_audios.zip’ saved [473186818/473186818]

-rw-r--r-- 1 root root 452M 

Unzips the dataset, then lists the .wav files needed.

In [None]:
import zipfile
from pathlib import Path

# Location of the downloaded archive and the folder it is unpacked into.
zip_path = Path("/content/prosody_audios.zip")
extract_root = Path("/content/prosody_extracted")
extract_root.mkdir(parents=True, exist_ok=True)

# Unpack every member of the archive under extract_root.
with zipfile.ZipFile(zip_path, "r") as archive:
    archive.extractall(extract_root)

# Folder inside the unpacked archive that holds the recordings used below.
wav_dir = extract_root.joinpath("prosody audios", "ESPONTANEA", "audios")
wav_dir_found = wav_dir.exists()

print("WAV dir:", wav_dir)
print("Exists?", wav_dir_found)

# Sorted list of .wav paths; left empty when the expected folder is missing.
if wav_dir_found:
    wav_files = sorted(wav_dir.glob("*.wav"))
else:
    wav_files = []
print("Num wav files:", len(wav_files))
print("First 5:", [wav.name for wav in wav_files[:5]])


WAV dir: /content/prosody_extracted/prosody audios/ESPONTANEA/audios
Exists? True
Num wav files: 74
First 5: ['HC_ESPONTANEA_0034.wav', 'HC_ESPONTANEA_0036.wav', 'HC_ESPONTANEA_0045.wav', 'HC_ESPONTANEA_0048.wav', 'HC_ESPONTANEA_0049.wav']


# F0 variability

## Definition: Fundamental Frequency (F0) Variability (Monotonicity)

F0 variability quantifies how much a speaker modulates pitch over an utterance, typically computed over voiced frames from the F0 contour using measures like F0 standard deviation (F0 SD) and speaker-normalized semitone standard deviation (STSD). STSD is commonly used because it reduces gender and speaker baseline differences, and SD-based variability is preferred over raw max–min range because range is very sensitive to outliers. In Parkinson's Disease, reduced F0 variability (monotone speech) is a hallmark feature reflecting impaired motor control of the vocal tract.

---

Looks at a sample of files and prints data-driven suggestions for F0_MIN/F0_MAX.

In [None]:
import numpy as np
import librosa

def quick_f0_percentiles(files, sr=16000, fmin=50, fmax=500, top_db=25, max_files=40):
    """Print the 5th/95th percentiles of voiced F0 over a sample of recordings.

    The percentiles are used to suggest a pitch search range (F0_MIN / F0_MAX).

    Args:
        files: Sequence of audio file paths.
        sr: Sample rate passed to ``librosa.load``.
        fmin: Lower bound (Hz) of the pYIN search range for this pass.
        fmax: Upper bound (Hz) of the pYIN search range for this pass.
        top_db: ``top_db`` threshold passed to ``librosa.effects.trim``.
        max_files: Maximum number of files to analyse. When ``files`` is
            longer, the sample is spread evenly over the whole list instead
            of taking its first ``max_files`` entries, so a sorted list does
            not contribute only the names that sort first.

    Returns:
        ``(p5, p95)`` in Hz as floats, or ``None`` when no voiced frame was
        found in the sample.
    """
    files = list(files)
    if max_files is not None and 0 < max_files < len(files):
        # Evenly spaced picks; step > 1 here, so the indices are distinct.
        step = len(files) / max_files
        sample = [files[int(i * step)] for i in range(max_files)]
    else:
        sample = files[:max_files]

    vals = []
    for p in sample:
        y, s = librosa.load(str(p), sr=sr, mono=True)
        y, _ = librosa.effects.trim(y, top_db=top_db)

        f0, voiced_flag, _ = librosa.pyin(y, fmin=fmin, fmax=fmax, sr=s,
                                          frame_length=2048, hop_length=256)
        if f0 is None:
            continue
        # Keep voiced frames only, and drop any non-finite estimates.
        f0 = f0[voiced_flag]
        f0 = f0[np.isfinite(f0)]
        if len(f0):
            vals.append(f0)

    if not vals:
        print("No voiced F0 found in sample.")
        return None

    all_f0 = np.concatenate(vals)
    p5, p95 = np.percentile(all_f0, [5, 95])
    print("F0 p5 =", float(p5), "Hz")
    print("F0 p95 =", float(p95), "Hz")
    print("Suggested: F0_MIN =", max(30.0, p5 - 10), "Hz , F0_MAX =", p95 + 20, "Hz")

    # A percentile within 1% of a search bound reflects the bound that was
    # searched rather than the speakers, so the suggestion above is not
    # data-driven on that side.
    if p5 <= fmin * 1.01:
        print("Note: p5 sits at the fmin search floor; re-run with a lower fmin before trusting F0_MIN.")
    if p95 >= fmax * 0.99:
        print("Note: p95 sits at the fmax search ceiling; re-run with a higher fmax before trusting F0_MAX.")
    return float(p5), float(p95)

# Run the range check on the dataset. The function prints its findings, so the
# returned (p5, p95) tuple is bound to `_` and not used further here.
_ = quick_f0_percentiles(wav_files)


F0 p5 = 50.0 Hz
F0 p95 = 228.41646786215435 Hz
Suggested: F0_MIN = 40.0 Hz , F0_MAX = 248.41646786215435 Hz


Computes F0 from voiced frames only, extracts variability statistics, and saves them to a CSV.

In [None]:
import pandas as pd
import numpy as np
import librosa

# Pitch search range in Hz, used as the default fmin/fmax of
# extract_f0_variability. Keep these defaults or replace them with the values
# printed by quick_f0_percentiles (presumably the "B3" step the original note
# referred to - TODO confirm).
F0_MIN = 40
F0_MAX = 250

# Silence threshold passed as top_db to librosa.effects.trim. 25 was judged a
# good value from the spectrograms in the phonation-feature extraction
# notebook, which covers the same speakers.
TOP_DB = 25
SR = 16000     # sample rate passed to librosa.load
HOP = 256      # hop_length passed to trim and pyin
FRAME = 2048   # frame_length passed to trim and pyin

def extract_f0_variability(wav_path, fmin=F0_MIN, fmax=F0_MAX, sr=SR, top_db=TOP_DB):
    """Compute F0 variability statistics for one recording.

    The audio is loaded as mono at ``sr``, leading/trailing silence is
    trimmed, and pYIN estimates F0 per frame. Statistics use only frames
    flagged as voiced whose F0 is finite.

    Args:
        wav_path: Path of the audio file.
        fmin: Lower bound (Hz) of the pYIN search range.
        fmax: Upper bound (Hz) of the pYIN search range.
        sr: Sample rate passed to ``librosa.load``.
        top_db: ``top_db`` threshold passed to ``librosa.effects.trim``.

    Returns:
        Tuple ``(mean_hz, std_hz, cv, std_semitones, voiced_ratio)``.
        All five entries are NaN when the trimmed audio is shorter than
        0.3 s or fewer than 5 usable voiced frames remain.
    """
    missing = (np.nan, np.nan, np.nan, np.nan, np.nan)

    audio, rate = librosa.load(str(wav_path), sr=sr, mono=True)
    audio, _ = librosa.effects.trim(audio, top_db=top_db, frame_length=FRAME, hop_length=HOP)
    min_samples = int(0.3 * rate)
    if len(audio) < min_samples:
        return missing

    contour, is_voiced, _ = librosa.pyin(
        audio, fmin=fmin, fmax=fmax, sr=rate,
        frame_length=FRAME, hop_length=HOP
    )
    if contour is None:
        return missing

    # Voiced frames only, without non-finite estimates.
    voiced_hz = contour[is_voiced]
    voiced_hz = voiced_hz[np.isfinite(voiced_hz)]
    if len(voiced_hz) < 5:
        return missing

    # Share of all analysed frames that were flagged as voiced.
    voiced_ratio = float(np.mean(is_voiced))

    # Variability in Hz (sample standard deviation) and relative to the mean.
    mean_hz = float(np.mean(voiced_hz))
    std_hz = float(np.std(voiced_hz, ddof=1))
    if mean_hz > 0:
        cv = float(std_hz / mean_hz)
    else:
        cv = np.nan

    # Variability in semitones relative to the recording's own median F0.
    semitones = 12.0 * np.log2(voiced_hz / np.median(voiced_hz))
    std_st = float(np.std(semitones, ddof=1))

    return mean_hz, std_hz, cv, std_st, voiced_ratio


# One record per recording: group label taken from the filename prefix,
# followed by the F0 variability statistics.
rows = []
for wav in wav_files:
    upper_name = wav.name.upper()
    if upper_name.startswith("PD"):
        label = "PD"
    elif upper_name.startswith("HC"):
        label = "HC"
    else:
        label = "UNK"

    stats = extract_f0_variability(wav)
    f0_mean, f0_std, f0_cv, f0_std_st, voiced_ratio = stats

    record = {
        "label": label,
        "file": wav.name,
        "path": str(wav),
        "f0_mean_hz": f0_mean,
        "f0_std_hz": f0_std,
        "f0_cv": f0_cv,
        "f0_std_semitones": f0_std_st,
        "voiced_ratio": voiced_ratio
    }
    rows.append(record)

df_f0 = pd.DataFrame(rows)

# Persist the table, then show its first rows as the cell output.
out_csv = "/content/f0_variability_espontanea.csv"
df_f0.to_csv(out_csv, index=False)
print("Saved:", out_csv)

df_f0.head()


Saved: /content/f0_variability_espontanea.csv


Unnamed: 0,label,file,path,f0_mean_hz,f0_std_hz,f0_cv,f0_std_semitones,voiced_ratio
0,HC,HC_ESPONTANEA_0034.wav,/content/prosody_extracted/prosody audios/ESPO...,131.785178,43.469357,0.32985,6.713546,0.507949
1,HC,HC_ESPONTANEA_0036.wav,/content/prosody_extracted/prosody audios/ESPO...,199.406728,47.945785,0.240442,6.478548,0.831203
2,HC,HC_ESPONTANEA_0045.wav,/content/prosody_extracted/prosody audios/ESPO...,90.161484,29.106784,0.322829,5.741967,0.903614
3,HC,HC_ESPONTANEA_0048.wav,/content/prosody_extracted/prosody audios/ESPO...,115.121411,41.331163,0.359022,7.381251,0.818085
4,HC,HC_ESPONTANEA_0049.wav,/content/prosody_extracted/prosody audios/ESPO...,139.305229,78.122161,0.560798,11.907384,0.778604


# Pause feature constants

In [None]:
import numpy as np
import pandas as pd
import librosa

# Sample rate used as the default `sr` of extract_pause_features.
SR = 16000

# Silence / pause detection settings (energy-based); these are the defaults
# of extract_pause_features.
TOP_DB = 25          # same silence threshold as in the F0 cell
FRAME_LEN = 2048     # default frame_length of extract_pause_features
HOP_LEN = 256        # default hop_length of extract_pause_features

# Gaps between speech intervals shorter than this are not counted as pauses.
MIN_PAUSE_SEC = 0.20  # ignore tiny gaps <200 ms (not real pauses)


## Definition: Pause Features (Disfluency)

Pause features describe the amount and structure of silence in speech, usually reported as number of pauses, pause durations (mean, median, standard deviation), and pause ratio (total pause time relative to total recording or task time). A common constant in automatic extraction is the minimum pause duration threshold (often around 0.20–0.25 seconds) to avoid counting very short closures as pauses. In Parkinson's Disease, increased pause frequency and longer pause durations reflect disrupted speech fluency and impaired motor timing.

---

In [None]:
def extract_pause_features(wav_path,
                           sr=SR,
                           top_db=TOP_DB,
                           frame_length=FRAME_LEN,
                           hop_length=HOP_LEN,
                           min_pause_sec=MIN_PAUSE_SEC):
    """Compute energy-based pause statistics for one recording.

    The audio is loaded as mono at ``sr``, leading/trailing silence is
    trimmed, and the remainder is split into non-silent intervals. A pause is
    a gap between two consecutive intervals lasting at least
    ``min_pause_sec``.

    Args:
        wav_path: Path of the audio file.
        sr: Sample rate passed to ``librosa.load``.
        top_db: ``top_db`` threshold used for both trimming and splitting.
        frame_length: Analysis frame length used for both trimming and
            splitting.
        hop_length: Hop length used for both trimming and splitting.
        min_pause_sec: Minimum gap duration (seconds) counted as a pause.

    Returns:
        Tuple ``(pause_count, pause_mean, pause_total, pause_ratio)``.
        ``pause_ratio`` is total pause time divided by the trimmed duration.
        With no qualifying pause, the count is 0 and the mean and total are
        0.0. All four entries are NaN when the trimmed audio is shorter than
        0.5 s or no non-silent interval is found.
    """
    no_result = (np.nan, np.nan, np.nan, np.nan)

    y, s = librosa.load(str(wav_path), sr=sr, mono=True)

    # Trim leading/trailing silence so pauses are mostly internal. The same
    # frame/hop settings as the split below are used, so both steps measure
    # silence on the same grid and the trimmed duration matches the one
    # computed by extract_speech_rate_proxy.
    y, _ = librosa.effects.trim(y, top_db=top_db,
                                frame_length=frame_length,
                                hop_length=hop_length)

    dur = len(y) / s
    if dur < 0.5:
        return no_result

    # Non-silent (speech-active) intervals as rows of [start, end] samples.
    intervals = librosa.effects.split(y, top_db=top_db,
                                      frame_length=frame_length,
                                      hop_length=hop_length)
    if len(intervals) == 0:
        return no_result

    # Gap between each interval's end and the next interval's start. The
    # subtraction is done on integer sample indices before converting to
    # seconds, which avoids rounding error at the threshold.
    gaps = (intervals[1:, 0] - intervals[:-1, 1]) / s
    pauses = gaps[gaps >= min_pause_sec]

    pause_count = int(pauses.size)
    pause_total = float(pauses.sum()) if pause_count else 0.0
    pause_mean = float(pauses.mean()) if pause_count else 0.0
    pause_ratio = float(pause_total / dur)  # dur >= 0.5 here

    return pause_count, pause_mean, pause_total, pause_ratio


In [None]:
# One record per recording: group label from the filename prefix plus the
# pause statistics.
rows = []

for wav in wav_files:
    upper_name = wav.name.upper()
    if upper_name.startswith("PD"):
        label = "PD"
    elif upper_name.startswith("HC"):
        label = "HC"
    else:
        label = "UNK"

    features = extract_pause_features(wav)
    pause_count, pause_mean, pause_total, pause_ratio = features

    record = {
        "label": label,
        "file": wav.name,
        "path": str(wav),
        "pause_count": pause_count,
        "pause_mean_sec": pause_mean,
        "pause_total_sec": pause_total,
        "pause_ratio": pause_ratio
    }
    rows.append(record)

df_pause = pd.DataFrame(rows)

# Persist the table, then show its first rows as the cell output.
out_csv = "/content/pause_features_espontanea.csv"
df_pause.to_csv(out_csv, index=False)
print("Saved:", out_csv)

df_pause.head()


Saved: /content/pause_features_espontanea.csv


Unnamed: 0,label,file,path,pause_count,pause_mean_sec,pause_total_sec,pause_ratio
0,HC,HC_ESPONTANEA_0034.wav,/content/prosody_extracted/prosody audios/ESPO...,28,0.930857,26.064,0.431866
1,HC,HC_ESPONTANEA_0036.wav,/content/prosody_extracted/prosody audios/ESPO...,9,0.627556,5.648,0.188166
2,HC,HC_ESPONTANEA_0045.wav,/content/prosody_extracted/prosody audios/ESPO...,5,0.6176,3.088,0.23253
3,HC,HC_ESPONTANEA_0048.wav,/content/prosody_extracted/prosody audios/ESPO...,21,0.853333,17.92,0.299786
4,HC,HC_ESPONTANEA_0049.wav,/content/prosody_extracted/prosody audios/ESPO...,19,1.327158,25.216,0.509373


# Speech rate

In [None]:
import numpy as np
import pandas as pd
import librosa

# Defaults of extract_speech_rate_proxy; same values as in the pause cell.
SR = 16000        # sample rate passed to librosa.load
TOP_DB = 25       # silence threshold (top_db) for trim and split
FRAME_LEN = 2048  # frame_length for trim and split
HOP_LEN = 256     # hop_length for trim and split


## Definition: Speech Rate (Articulation & Tempo)

Speech rate is formally defined as an output-per-time measure (commonly syllables per second); a related measure, articulation rate, is the tempo of speech excluding silent pauses. Without transcripts, many pipelines estimate rate via time-based proxies such as voiced/speech time versus total time or run-based timing, rather than true syllable counts. The cells below take this proxy approach: the `articulation_rate` column is the fraction of the trimmed recording that is speech-active, not a syllables-per-second rate. In Parkinson's Disease, speech rate is often reduced due to motor slowness (bradykinesia), and variability in articulation rate can indicate disease progression and severity.

---

In [None]:
def extract_speech_rate_proxy(wav_path,
                              sr=SR,
                              top_db=TOP_DB,
                              frame_length=FRAME_LEN,
                              hop_length=HOP_LEN):
    """Compute transcript-free, time-based speech-rate proxies for one file.

    The audio is loaded as mono at ``sr``, leading/trailing silence is
    trimmed, and the remainder is split into non-silent intervals.

    Args:
        wav_path: Path of the audio file.
        sr: Sample rate passed to ``librosa.load``.
        top_db: ``top_db`` threshold used for both trimming and splitting.
        frame_length: Analysis frame length used for trimming and splitting.
        hop_length: Hop length used for trimming and splitting.

    Returns:
        Tuple ``(dur, speech_time, articulation_rate, segment_rate,
        mean_segment)``:

        * ``dur`` - duration of the trimmed audio in seconds.
        * ``speech_time`` - summed length of the non-silent intervals (s).
        * ``articulation_rate`` - ``speech_time / dur``, i.e. the
          speech-active fraction (not a syllable rate).
        * ``segment_rate`` - number of non-silent intervals per second.
        * ``mean_segment`` - mean interval length in seconds.

        All five entries are NaN when the trimmed audio is shorter than
        0.5 s or no non-silent interval is found.
    """
    # Must have as many entries as the normal return value: callers unpack
    # five values, so a shorter tuple would raise ValueError on unpacking.
    no_result = (np.nan, np.nan, np.nan, np.nan, np.nan)

    y, s = librosa.load(str(wav_path), sr=sr, mono=True)

    # trim leading/trailing silence
    y, _ = librosa.effects.trim(y, top_db=top_db, frame_length=frame_length, hop_length=hop_length)
    dur = len(y) / s
    if dur < 0.5:
        return no_result

    # speech-active segments as [start, end] sample pairs
    intervals = librosa.effects.split(y, top_db=top_db, frame_length=frame_length, hop_length=hop_length)
    if len(intervals) == 0:
        return no_result

    seg_lens = np.array([(en - st) / s for st, en in intervals], dtype=float)

    speech_time = float(np.sum(seg_lens))
    n_segments = int(len(seg_lens))

    # dur >= 0.5 and n_segments >= 1 here, so the divisions are safe.
    articulation_rate = float(speech_time / dur)  # speech fraction
    segment_rate = float(n_segments / dur)        # bursts per sec
    mean_segment = float(np.mean(seg_lens))

    return float(dur), speech_time, articulation_rate, segment_rate, mean_segment


In [None]:
# One record per recording: group label from the filename prefix plus the
# time-based speech-rate proxies.
rows = []

for wav in wav_files:
    upper_name = wav.name.upper()
    if upper_name.startswith("PD"):
        label = "PD"
    elif upper_name.startswith("HC"):
        label = "HC"
    else:
        label = "UNK"

    proxies = extract_speech_rate_proxy(wav)
    dur, speech_time, art_rate, seg_rate, mean_seg = proxies

    record = {
        "label": label,
        "file": wav.name,
        "path": str(wav),
        "duration_sec": dur,
        "speech_time_sec": speech_time,
        # speech-active fraction: larger means less pausing
        "articulation_rate": art_rate,
        # speech bursts per second: larger means more fragmented speech
        "speech_segment_rate": seg_rate,
        "mean_speech_segment_sec": mean_seg
    }
    rows.append(record)

df_speech_rate = pd.DataFrame(rows)

# Persist the table, then show its first rows as the cell output.
out_csv = "/content/speech_rate_proxy_espontanea.csv"
df_speech_rate.to_csv(out_csv, index=False)
print("Saved:", out_csv)

df_speech_rate.head()


Saved: /content/speech_rate_proxy_espontanea.csv


Unnamed: 0,label,file,path,duration_sec,speech_time_sec,articulation_rate,speech_segment_rate,mean_speech_segment_sec
0,HC,HC_ESPONTANEA_0034.wav,/content/prosody_extracted/prosody audios/ESPO...,60.368,32.368,0.536178,0.811688,0.660571
1,HC,HC_ESPONTANEA_0036.wav,/content/prosody_extracted/prosody audios/ESPO...,30.032,23.952,0.797549,0.732552,1.088727
2,HC,HC_ESPONTANEA_0045.wav,/content/prosody_extracted/prosody audios/ESPO...,13.264,10.064,0.758745,0.603136,1.258
3,HC,HC_ESPONTANEA_0048.wav,/content/prosody_extracted/prosody audios/ESPO...,59.792,40.528,0.677816,0.786058,0.862298
4,HC,HC_ESPONTANEA_0049.wav,/content/prosody_extracted/prosody audios/ESPO...,49.488,23.584,0.47656,0.808277,0.5896


# References

[1] L. K. Bowen, S. D. K. et al., "Effects of Parkinson's disease on fundamental frequency variability in running speech," Journal of Voice, 2013. [Online]. Available: PubMed Central.

[2] J. K. Y. Ma, "Acoustic analysis of intonation in Parkinson's disease," in Proc. Interspeech, 2010. [Online]. Available: ISCA Archive.

[3] I. Lacruz, "Average pause ratio as an indicator of cognitive effort in post-editing," in Proc. AMTA Workshop on Post-Editing Technology and Practice, 2012. [Online]. Available: ACL Anthology.

[4] J. Bishop, "Articulation rate, phrase length, and lookahead in speech production," in Proc. Speech Prosody, 2018. [Online]. Available: ISCA Archive.

[5] L. Romito, "Fluency, articulation and speech rate as new parameters in the speaker recognition," 2005. [Online]. Available: Osservatorio sulla Linguistica Forense.

[6] V. B. dos Santos et al., "Does speech in patients with different Parkinson's disease subtypes decline over time?," Parkinsonism & Related Disorders, 2025.