# Data Preparation
'
Working on the Italian_Parkinsons_Voice_and_Speech dataset downloaded from here: https://huggingface.co/datasets/birgermoell/Italian_Parkinsons_Voice_and_Speech


In [59]:
seed_value = 1986
from speechbrain.utils.data_utils import get_all_files
import os
import pandas as pd
from sklearn.model_selection import train_test_split
import json
import torchaudio


## Full Dataset

|  | # wav files | # speakers |
| --- | --- | --- |
| Dataset | 831 | 61 |
| Train Data | 649 | 48 |
| Valid Data | 96 (downsampled to 64 for class balance) | 6 |
| Test Data | 86 | 7 |

### get audio files:

In [60]:

# Your code here
data_files = get_all_files('/home/ulaval.ca/maelr5/scratch/parkinsons', match_and=['.wav'])

print('data size= ', len(data_files))



data size=  831


In [61]:
type(data_files), data_files[0], data_files[500]

(list,
 '/home/ulaval.ca/maelr5/scratch/parkinsons/15 Young Healthy Control/Daniele R/B1LBULCAAS94M100120171057.wav',
 "/home/ulaval.ca/maelr5/scratch/parkinsons/28 People with Parkinson's disease/17-28/Nicola M/VE1NMIICNOO52M100220171138.wav")

In [62]:
data_files[0].split(os.sep), data_files[500].split(os.sep)

(['',
  'home',
  'ulaval.ca',
  'maelr5',
  'scratch',
  'parkinsons',
  '15 Young Healthy Control',
  'Daniele R',
  'B1LBULCAAS94M100120171057.wav'],
 ['',
  'home',
  'ulaval.ca',
  'maelr5',
  'scratch',
  'parkinsons',
  "28 People with Parkinson's disease",
  '17-28',
  'Nicola M',
  'VE1NMIICNOO52M100220171138.wav'])

### extract metadata

In [63]:
data = []

for wav_path in data_files:
    parts = wav_path.split(os.sep)

    # Assumes folder format: /home/ulaval.ca/maelr5/scratch/parkinsons/<label-folder>/<speaker>/<file.wav>
    speaker_id = parts[-2]    # e.g., "Davide S"
    filename = parts[-1]
    
    if "Healthy Control" in wav_path:
        label_folder = parts[-3]  # e.g: "15 Young Healthy Control"
    elif "with Parkinson's disease" in wav_path:
        label_folder = parts[-4] # e.g: "28 People with Parkinson's disease"

    # Determine label from folder name
    label = "HC" if "Healthy Control" in label_folder else "PD"

    data.append({
        "filename": filename,
        "full_path": wav_path,
        "speaker_id": speaker_id,
        "label": label
    })

df = pd.DataFrame(data)


In [64]:
print(len(df))
df.head(2)


831


Unnamed: 0,filename,full_path,speaker_id,label
0,B1LBULCAAS94M100120171057.wav,/home/ulaval.ca/maelr5/scratch/parkinsons/15 Y...,Daniele R,HC
1,B2LBULCAAS94M100120171057.wav,/home/ulaval.ca/maelr5/scratch/parkinsons/15 Y...,Daniele R,HC


In [65]:
df.tail(2)


Unnamed: 0,filename,full_path,speaker_id,label
829,VE2lbuairgo52M1606161815.wav,/home/ulaval.ca/maelr5/scratch/parkinsons/28 P...,Luigi B,PD
830,FB1lbuairgo52M1606161825.wav,/home/ulaval.ca/maelr5/scratch/parkinsons/28 P...,Luigi B,PD


### split data into train/ valid/ test sets **"by speaker"**:

80% Training, 10%Validation, 10% Test

Splitting **by speaker** means train and test must not include recordings from the same person, to get more reliable results and because splitting **by recordings** causes **data leakage**, and models just learn to recognize the person — not Parkinson's symptoms.[1]

[1] Iswarya Kannoth Veetil, Sowmya V., Juan Rafael Orozco-Arroyave, E.A. Gopalakrishnan,
Robust language independent voice data driven Parkinson’s disease detection,
Engineering Applications of Artificial Intelligence,
Volume 129,
2024,
107494,
ISSN 0952-1976,
https://doi.org/10.1016/j.engappai.2023.107494.
(https://www.sciencedirect.com/science/article/pii/S0952197623016780)


In [66]:
df['speaker_id'].unique()[0]


'Daniele R'

In [67]:
speakers = df['speaker_id'].unique()
train_speakers, eval_speakers = train_test_split(speakers, test_size=0.2, train_size=0.8, random_state=seed_value, shuffle=True)
valid_speakers, test_speakers = train_test_split(eval_speakers, test_size=0.5, train_size=0.5, random_state=seed_value, shuffle=True)

train_df = df[df['speaker_id'].isin(train_speakers)]

valid_df = df[df['speaker_id'].isin(valid_speakers)]

test_df = df[df['speaker_id'].isin(test_speakers)]

print('train wavfiles size= ', len(train_df))
print('valid wavfiles size= ', len(valid_df))
print('test wavfiles size= ', len(test_df))


train wavfiles size=  649
valid wavfiles size=  96
test wavfiles size=  86


In [68]:

print('data speakers size= ', len(speakers))
print('train speakers size= ', len(train_speakers))
print('valid speakers size= ', len(valid_speakers))
print('test speakers size= ', len(test_speakers))



data speakers size=  61
train speakers size=  48
valid speakers size=  6
test speakers size=  7


#### Note: some speakers have more recordings than others

### create json files

In [12]:
os.path.splitext('B1LBULCAAS94M100120171057.wav')[0]

'B1LBULCAAS94M100120171057'

In [13]:
def basic_df_to_json(df, json_path, shuffle=True, seed=42):
    if shuffle:
        df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    data = {}
    for _, row in df.iterrows():
        utt_id = os.path.splitext(row['filename'])[0]  # unique id
        # Getting info
        audioinfo = torchaudio.info(row['full_path'])
        # Compute the duration in seconds.
        # This is the number of samples divided by the sampling frequency
        duration = audioinfo.num_frames / audioinfo.sample_rate
        
        data[utt_id] = {
            "path": row['full_path'],
            "spk_id": row['speaker_id'],
            "length": duration,
            "detection": row['label']
        }
    with open(json_path, "w") as f:
        json.dump(data, f, indent=4)

# Write out the JSONs
basic_df_to_json(train_df, "train.json", shuffle=True, seed=seed_value)
basic_df_to_json(valid_df, "valid.json", shuffle=True, seed=seed_value)
basic_df_to_json(test_df, "test.json", shuffle=True, seed=seed_value)


### The json files are formatted in the following way:-

test.json:
```
{
  "B1LBULCAAS94M100120171053": {
    path:	"/home/ulaval.ca/maelr5/scratch/parkinsons/15 Young Healthy Control/Arianna P/B1LBULCAAS94M100120171053.wav",
    spk_id:	"Arianna P",
    length:	40.69875,
    detection:	"HC",
  },
  "VO2NPIICEOR42M020420171811": {
    path:	"/home/ulaval.ca/maelr5/scratch/parkinsons/22 Elderly Healthy Control/NICOLA P/VO2NPIICEOR42M020420171811.wav",
    spk_id:	"NICOLA P",
    length:	5.6536875,
    detection:	"HC",
  },
  "D1cdaopmoe67M2605161905": {
    path:	"/home/ulaval.ca/maelr5/scratch/parkinsons/28 People with Parkinson's disease/1-5/Domenico C/D1cdaopmoe67M2605161905.wav",
    spk_id:	"Domenico C",
    length:	5.000725623582767,
    detection:	"PD",
  },
....
```

In [49]:
# check class statistics for each set

def plot_class_balance(inputfile):
    # Load the JSON file
    with open(inputfile, "r") as f:
        data = json.load(f)
    
    # Extract labels (assumes key is "detection")
    labels = [entry["detection"] for entry in data.values()]
    
    # Count samples per class
    counts = Counter(labels)
    
    # Print nicely
    for label, count in counts.items():
        print(f"{label}: {count} samples")



In [14]:

print("train:-")
plot_class_balance("train.json")
print("valid:-")
plot_class_balance("valid.json")
print("test:-")
plot_class_balance("test.json")


train:-
PD: 325 samples
HC: 324 samples
valid:-
PD: 64 samples
HC: 32 samples
test:-
HC: 38 samples
PD: 48 samples


### Updated df_to_json() with class balancing

We need to class balance the valid set by Downsampling the larger class to match the smaller one.

In [57]:
def df_to_json(df, json_path, shuffle=True, balance_classes=False, seed=42):
    if balance_classes:
        # Split into PD and HC
        pd_df = df[df['label'] == 'PD']
        hc_df = df[df['label'] == 'HC']

        # Find the smaller class size
        min_size = min(len(pd_df), len(hc_df))

        # Downsample both classes
        pd_df = pd_df.sample(n=min_size, random_state=seed)
        hc_df = hc_df.sample(n=min_size, random_state=seed)

        # Combine and shuffle
        df = pd.concat([pd_df, hc_df], axis=0)
    
    if shuffle:
        df = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    data = {}
    for _, row in df.iterrows():
        utt_id = os.path.splitext(row['filename'])[0]  # unique id
        # Getting info
        audioinfo = torchaudio.info(row['full_path'])
        # Compute the duration in seconds.
        # This is the number of samples divided by the sampling frequency
        duration = audioinfo.num_frames / audioinfo.sample_rate
        
        data[utt_id] = {
            "path": row['full_path'],
            "spk_id": row['speaker_id'],
            "length": duration,
            "detection": row['label']
        }
    with open(json_path, "w") as f:
        json.dump(data, f, indent=4)
        

In [15]:

df_to_json(train_df, "train.json", shuffle=True, balance_classes=False, seed=seed_value)
df_to_json(valid_df, "valid.json", shuffle=True, balance_classes=True, seed=seed_value)
df_to_json(test_df, "test.json", shuffle=True, balance_classes=False, seed=seed_value)  # usually test is untouched


In [16]:

print("train:-")
plot_class_balance("train.json")
print("valid:-")
plot_class_balance("valid.json")
print("test:-")
plot_class_balance("test.json")


train:-
PD: 325 samples
HC: 324 samples
valid:-
PD: 32 samples
HC: 32 samples
test:-
HC: 38 samples
PD: 48 samples


#### sanity check : to show that there is not overlap betweeen train and test
(1) Compare by Audio Paths (Most Reliable)

In [52]:


def load_paths(json_path):
    with open(json_path, "r") as f:
        data = json.load(f)
    return set(ex["path"] for ex in data.values())

def load_speakers(json_path):
    with open(json_path, "r") as f:
        data = json.load(f)
    return set(ex["spk_id"] for ex in data.values())



In [17]:

train_paths = load_paths("train.json")
valid_paths = load_paths("valid.json")
test_paths = load_paths("test.json")

overlap = train_paths.intersection(test_paths)
print(f"\n Number of overlapping audio files between train and test: {len(overlap)}")
if overlap:
    print("Some overlapping files:")
    for p in list(overlap)[:10]:  # show first 10
        print("-", p)

overlap = train_paths.intersection(valid_paths)
print(f"\n Number of overlapping audio files between train and valid: {len(overlap)}")

overlap = valid_paths.intersection(test_paths)
print(f"\n Number of overlapping audio files between valid and test: {len(overlap)}")




 Number of overlapping audio files between train and test: 0

 Number of overlapping audio files between train and valid: 0

 Number of overlapping audio files between valid and test: 0


(2) Compare by Speaker

In [18]:

train_speakers = load_speakers("train.json")
valid_speakers = load_speakers("valid.json")
test_speakers = load_speakers("test.json")

overlap_speakers = train_speakers.intersection(test_speakers)
print(f"\n🎙️ Overlapping speakers between train and test: {len(overlap_speakers)}")

overlap_speakers = train_speakers.intersection(valid_speakers)
print(f"\n🎙️ Overlapping speakers between train and valid: {len(overlap_speakers)}")

overlap_speakers = valid_speakers.intersection(test_speakers)
print(f"\n🎙️ Overlapping speakers between valid and test: {len(overlap_speakers)}")

if overlap_speakers:
    print("Some shared speakers:")
    for s in list(overlap_speakers)[:10]:
        print("-", s)



🎙️ Overlapping speakers between train and test: 0

🎙️ Overlapping speakers between train and valid: 0

🎙️ Overlapping speakers between valid and test: 0


## Create small data for sanity check of the model:

The model should overfit this data after training for multiple epochs.

In [46]:
import os
import json
import torchaudio

def df_to_small_json(df, json_path, shuffle=True, balance_classes=False, seed=42, samples_per_class=None):
    if balance_classes:
        # Split into PD and HC
        pd_df = df[df['label'] == 'PD']
        hc_df = df[df['label'] == 'HC']

        # Optionally limit samples per class
        if samples_per_class:
            pd_df = pd_df.sample(n=min(samples_per_class, len(pd_df)), random_state=seed)
            hc_df = hc_df.sample(n=min(samples_per_class, len(hc_df)), random_state=seed)
        
        df = pd.concat([pd_df, hc_df], axis=0)

    if shuffle:
        df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    data = {}
    for _, row in df.iterrows():
        utt_id = os.path.splitext(os.path.basename(row['filename']))[0]
        audioinfo = torchaudio.info(row['full_path'])
        duration = audioinfo.num_frames / audioinfo.sample_rate

        data[utt_id] = {
            "path": row['full_path'],
            "spk_id": row['speaker_id'],
            "length": duration,
            "detection": row['label']
        }

    with open(json_path, "w") as f:
        json.dump(data, f, indent=4)

# Keep your full train/valid/test splits
# But only use a balanced subset when exporting to JSON

df_to_small_json(train_df, "train-check.json", shuffle=True, balance_classes=True, samples_per_class=10)
df_to_small_json(valid_df, "valid-check.json", shuffle=True, balance_classes=True, samples_per_class=2)



In [47]:

print("train:-")
plot_class_balance("train-check.json")
print("valid:-")
plot_class_balance("valid-check.json")
print("test:-")
plot_class_balance("test.json")


train:-
PD: 10 samples
HC: 10 samples
valid:-
PD: 2 samples
HC: 2 samples
test:-
HC: 38 samples
PD: 48 samples


## k-fold cross validation

In [71]:
len(df), len(train_df), len(valid_df), seed_value

(831, 649, 96, 1986)

In [73]:

from sklearn.model_selection import StratifiedKFold

# Save a DataFrame to JSON
def save_json(df, json_path):
    data = {}
    for _, row in df.iterrows():
        utt_id = os.path.splitext(row['filename'])[0]
        audioinfo = torchaudio.info(row['full_path'])
        duration = audioinfo.num_frames / audioinfo.sample_rate
        data[utt_id] = {
            "path": row['full_path'],
            "spk_id": row['speaker_id'],
            "length": duration,
            "detection": row['label']
        }
    with open(json_path, "w") as f:
        json.dump(data, f, indent=4)

def balance_classes(df, seed=42):
    pd_df = df[df['label'] == 'PD']
    hc_df = df[df['label'] == 'HC']
    min_size = min(len(pd_df), len(hc_df))
    
    pd_df = pd_df.sample(n=min_size, random_state=seed)
    hc_df = hc_df.sample(n=min_size, random_state=seed)
    
    return pd.concat([pd_df, hc_df]).sample(frac=1, random_state=seed).reset_index(drop=True)

# Perform K-Fold only on train_df (ignoring test_df)
def cross_val_on_train(train_df, k=5, seed=42, balance_train=True):
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df['label'])):
        fold_train = train_df.iloc[train_idx].reset_index(drop=True)
        fold_val = train_df.iloc[val_idx].reset_index(drop=True)
        if balance_train:
            fold_train = balance_classes(fold_train)

        save_json(fold_train, f"fold{fold}_train.json")
        save_json(fold_val, f"fold{fold}_valid.json")
        print(f"Saved Fold {fold}: Train and Val JSONs")


full_train_df = pd.concat([train_df, valid_df]).reset_index(drop=True)

# If you just want to use train_df only:
cross_val_on_train(full_train_df, k=3, seed=seed_value, balance_train=True)



Saved Fold 0: Train and Val JSONs
Saved Fold 1: Train and Val JSONs
Saved Fold 2: Train and Val JSONs


In [75]:
for fold in range(3):
    print(f"fold{fold}_train:-")
    plot_class_balance(f"fold{fold}_train.json")
    print(f"fold{fold}_valid:-")
    plot_class_balance(f"fold{fold}_valid.json")


fold0_train:-
HC: 237 samples
PD: 237 samples
fold0_val:-
HC: 119 samples
PD: 130 samples
fold1_train:-
HC: 237 samples
PD: 237 samples
fold1_val:-
HC: 119 samples
PD: 129 samples
fold2_train:-
HC: 238 samples
PD: 238 samples
fold2_val:-
HC: 118 samples
PD: 130 samples


In [76]:

for fold in range(3):
    train_paths = load_paths(f"train.json")
    valid_paths = load_paths(f"valid.json")
    test_paths = load_paths(f"test.json")
    
    overlap = train_paths.intersection(test_paths)
    print(f"\n Number of overlapping audio files between fold{fold}_train and test: {len(overlap)}")
    if overlap:
        print(f"Some overlapping fold{fold}_files:")
        for p in list(overlap)[:10]:  # show first 10
            print("-", p)
    
    overlap = train_paths.intersection(valid_paths)
    print(f"\n Number of overlapping audio files between fold{fold}_train and vfold{fold}_alid: {len(overlap)}")
    
    overlap = valid_paths.intersection(test_paths)
    print(f"\n Number of overlapping audio files between fold{fold}_valid and test: {len(overlap)}")
    
    
    train_speakers = load_speakers(f"fold{fold}_train.json")
    valid_speakers = load_speakers(f"fold{fold}_valid.json")
    test_speakers = load_speakers(f"test.json")
    
    overlap_speakers = train_speakers.intersection(test_speakers)
    print(f"\n🎙️ Overlapping speakers between fold{fold}_train and test: {len(overlap_speakers)}")
    
    overlap_speakers = train_speakers.intersection(valid_speakers)
    print(f"\n🎙️ Overlapping speakers between fold{fold}_train and fold{fold}_fold{fold}_valid: {len(overlap_speakers)}")
    
    overlap_speakers = valid_speakers.intersection(test_speakers)
    print(f"\n🎙️ Overlapping speakers between fold{fold}_valid and test: {len(overlap_speakers)}")
    
    if overlap_speakers:
        print(f"Some shared fold{fold}_speakers:")
        for s in list(overlap_speakers)[:10]:
            print("-", s)



 Number of overlapping audio files between fold0_train and test: 0

 Number of overlapping audio files between fold0_train and vfold0_alid: 0

 Number of overlapping audio files between fold0_valid and test: 0


FileNotFoundError: [Errno 2] No such file or directory: 'fold0_valid.json'

In [None]:
#**************************************

## Data statistics

### number of examples in each class 

to check the classes balance ratio

In [4]:
from speechbrain.utils.data_utils import get_all_files

young_healthy_files = get_all_files("/home/ulaval.ca/maelr5/scratch/parkinsons/15 Young Healthy Control", match_and=['.wav'])
elderly_healthy_files = get_all_files("/home/ulaval.ca/maelr5/scratch/parkinsons/22 Elderly Healthy Control", match_and=['.wav'])

print('15 young healthy data size= ', len(young_healthy_files))
print('22 elderly healthy data size= ', len(elderly_healthy_files))
print('37 candidates - Healthy data size= ', len(young_healthy_files) + len(elderly_healthy_files))


15 young healthy data size=  45
22 elderly healthy data size=  349
37 candidates - Healthy data size=  394


In [5]:
data_files = get_all_files("/home/ulaval.ca/maelr5/scratch/parkinsons/28 People with Parkinson's disease", match_and=['.wav'])

print("28 candidates - with Parkinson's disease data size= ", len(data_files))


28 candidates - with Parkinson's disease data size=  437


In [None]:

# 39 Healthy (1) in test
# 45 PD (0) in test

# decrease the data and use the short recordings of vowels

Sustained Vowels: Participants are asked to produce sustained phonations of vowels, such as 'a','e','o','i','u'. These recordings are particularly useful for analyzing fundamental frequency (F0) variations and other acoustic features that can indicate PD-related changes in voice production.


In [3]:
# Your code here
from speechbrain.utils.data_utils import get_all_files

# Your code here
data_files = get_all_files('/home/ulaval.ca/maelr5/scratch/parkinsons',
                           match_and=['.wav'],
                           match_or=['VA1','VA2','VE1','VE2','VI1','VI2','VO1','VO2','VU1','VU2'],
                          )

print('data size= ', len(data_files))

from sklearn.model_selection import train_test_split

train_files, test_files = train_test_split(data_files, test_size=0.2, train_size=0.8, random_state=seed_value, shuffle=True)
valid_files, test_files = train_test_split(test_files, test_size=0.5, train_size=0.5, random_state=seed_value, shuffle=True)

print('train size= ', len(train_files))
print('valid size= ', len(valid_files))
print('test size= ', len(test_files))


data size=  495
train size=  396
valid size=  49
test size=  50


In [4]:
import json
import torchaudio

def create_json(json_file, audiolist):
  json_dict = {}
  for audiofile in audiolist:

    # Getting info
    audioinfo = torchaudio.info(audiofile)

    # Compute the duration in seconds.
    # This is the number of samples divided by the sampling frequency
    duration = audioinfo.num_frames / audioinfo.sample_rate

    # Get spk Label by manipulating the audio path
    if "Healthy Control" in audiofile:
        detection_id = "Healthy Control"
    elif "with Parkinson's disease" in audiofile:
        detection_id = "with Parkinson's disease"

    # Get a unique utterance id
    uttid = audiofile.split('/')[-2] + '_' + audiofile.split('/')[-1][:-4]

    # Create entry for this utterance
    json_dict[uttid] = {
            "path": audiofile,
            "length": duration,
            "detection": detection_id,
    }

    # Writing the dictionary to the json file
    with open(json_file, mode="w") as json_f:
      json.dump(json_dict, json_f, indent=2)



create_json('train-vowels.json', train_files)
create_json('valid-vowels.json', valid_files)
create_json('test-vowels.json', test_files)



# decrease the data and only use the short recordings of vowel 'a'


In [6]:
# Your code here
from speechbrain.utils.data_utils import get_all_files

# Your code here
data_files = get_all_files('/home/ulaval.ca/maelr5/scratch/parkinsons',
                           match_and=['.wav'],
                           match_or=['VA1', 'VA2'],
                          )

print('data size= ', len(data_files))

from sklearn.model_selection import train_test_split

train_files, test_files = train_test_split(data_files, test_size=0.2, train_size=0.8, random_state=seed_value, shuffle=True)
valid_files, test_files = train_test_split(test_files, test_size=0.5, train_size=0.5, random_state=seed_value, shuffle=True)

print('train size= ', len(train_files))
print('valid size= ', len(valid_files))
print('test size= ', len(test_files))


data size=  99
train size=  79
valid size=  10
test size=  10


In [7]:
import json
import torchaudio

def create_json(json_file, audiolist):
  json_dict = {}
  for audiofile in audiolist:

    # Getting info
    audioinfo = torchaudio.info(audiofile)

    # Compute the duration in seconds.
    # This is the number of samples divided by the sampling frequency
    duration = audioinfo.num_frames / audioinfo.sample_rate

    # Get spk Label by manipulating the audio path
    if "Healthy Control" in audiofile:
        detection_id = "Healthy Control"
    elif "with Parkinson's disease" in audiofile:
        detection_id = "with Parkinson's disease"

    # Get a unique utterance id
    uttid = audiofile.split('/')[-2] + '_' + audiofile.split('/')[-1][:-4]

    # Create entry for this utterance
    json_dict[uttid] = {
            "path": audiofile,
            "length": duration,
            "detection": detection_id,
    }

    # Writing the dictionary to the json file
    with open(json_file, mode="w") as json_f:
      json.dump(json_dict, json_f, indent=2)



create_json('train-vowel-a.json', train_files)
create_json('valid-vowel-a.json', valid_files)
create_json('test-vowel-a.json', test_files)



# decrease the data and use the recordings of short sentences or phrases (not vowels)
Phrases: Short sentences or phrases are recorded to evaluate more complex speech patterns. 
These recordings help in assessing prosody, articulation, and other speech characteristics that may be affected by PD.


In [12]:
# Your code here
from speechbrain.utils.data_utils import get_all_files

# Your code here
data_files = get_all_files('/home/ulaval.ca/maelr5/scratch/parkinsons',
                           match_and=['.wav'],
                           exclude_or=['VA1','VA2','VE1','VE2','VI1','VI2','VO1','VO2','VU1','VU2'],
                          )

print('data size= ', len(data_files))

from sklearn.model_selection import train_test_split

train_files, test_files = train_test_split(data_files, test_size=0.2, train_size=0.8, random_state=seed_value, shuffle=True)
valid_files, test_files = train_test_split(test_files, test_size=0.5, train_size=0.5, random_state=seed_value, shuffle=True)

print('train size= ', len(train_files))
print('valid size= ', len(valid_files))
print('test size= ', len(test_files))


data size=  336
train size=  268
valid size=  34
test size=  34


In [13]:
import json
import torchaudio

def create_json(json_file, audiolist):
  json_dict = {}
  for audiofile in audiolist:

    # Getting info
    audioinfo = torchaudio.info(audiofile)

    # Compute the duration in seconds.
    # This is the number of samples divided by the sampling frequency
    duration = audioinfo.num_frames / audioinfo.sample_rate

    # Get spk Label by manipulating the audio path
    if "Healthy Control" in audiofile:
        detection_id = "Healthy Control"
    elif "with Parkinson's disease" in audiofile:
        detection_id = "with Parkinson's disease"

    # Get a unique utterance id
    uttid = audiofile.split('/')[-2] + '_' + audiofile.split('/')[-1][:-4]

    # Create entry for this utterance
    json_dict[uttid] = {
            "path": audiofile,
            "length": duration,
            "detection": detection_id,
    }

    # Writing the dictionary to the json file
    with open(json_file, mode="w") as json_f:
      json.dump(json_dict, json_f, indent=2)



create_json('train-phrases.json', train_files)
create_json('valid-phrases.json', valid_files)
create_json('test-phrases.json', test_files)

