# Generation of fold distribution for k-fold with EWA-DB dataset. 
- NOTE: This notebook is heavily inspired by the GITA_k_fold_generator.ipynb notebook created by Torbjørn Karl Svendsen and Maria Francesca for PC-GITA, adapted to work with the EWA-DB dataset.

In [2]:
# Add the parent directory to the module search path. This notebook lives in a
# different folder than the modules it imports (presumably the project-local
# `constants` and `EWA_wav_paths` modules -- confirm against the repo layout).
import sys
sys.path.append('../')

In [4]:
#generic libraries
import pandas
import numpy as np
import os

from constants import *
from EWA_wav_paths import ids_EWA_100

### Definitions

In [5]:
# Seed for the random generator used when drawing speakers into folds.
SEED = 42

# Number of folds to split the speakers into.
folds = 10

# Speaker IDs to be distributed over the folds.
ids = ids_EWA_100

# Output CSV file (leave as an empty string to skip writing a file).
outfile = 'kfold-groups-ewa.csv'

### Import metadata

In [6]:
# Location of the tab-separated speaker metadata inside the EWA-DB folder.
metadata_path = os.path.join(personal_path_to_EWA_DB, 'SPEAKERS.TSV')

# Read all metadata, expose the speaker code under the name "ID", keep only the
# speakers selected in `ids`, and renumber the rows from 0.
metadata = (
    pandas.read_csv(metadata_path, sep='\t')
    .rename(columns={"SPEAKER_CODE": "ID"})
    .loc[lambda frame: frame['ID'].isin(ids)]
    .reset_index(drop=True)
)

In [7]:
# Show the filtered metadata table as a visual sanity check.
display(metadata)

Unnamed: 0,ID,AGE,COFFEE,DIAGNOSIS,EDUCATION_GRATUATED_SCHOOL,EDUCATION_YEARS,FAMILY,INCLUSIVE_CRITERIA,LOW_QUALITY,MOCA,PUBLISH_AGREEMENT,RECORDED,SEX,WAIS,ANNOTATION
0,611xqj2u01,44,Yes,Healthy,University,17,no occurrence,True,False,27,True,2021-12-14T18:33:39+01:00,male,2,True
1,665udebf01,65,Yes,Healthy,University,18,AD,True,False,29,True,2021-07-21T15:28:02+02:00,female,5,True
2,665ur02801,65,Yes,Healthy,University,22,no occurrence,True,False,28,True,2021-07-21T14:14:36+02:00,female,2,True
3,b2yux3m001,66,No,Healthy,University,18,no occurrence,True,False,27,True,2021-12-21T11:30:46+01:00,male,2,True
4,c8ijy50001,56,Yes,Healthy,Secondary,14,no occurrence,True,False,27,True,2022-10-13T16:39:46+02:00,male,7,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,ysw975bp01,66,Yes,Healthy,University,19,no occurrence,True,False,27,True,2021-10-04T12:14:16+02:00,female,4,True
96,ysw9avix01,69,Yes,Healthy,University,24,PD,True,False,30,True,2022-04-25T15:13:59+02:00,female,1,True
97,ysw9dip001,66,Yes,Healthy,Secondary,12,no occurrence,True,False,26,True,2022-05-11T15:09:05+02:00,male,1,True
98,ysw9gmsr01,65,Yes,Healthy,University,18,no occurrence,True,False,27,True,2021-09-24T12:30:48+02:00,female,1,True


### Inspecting metadata amounts

In [8]:
# Count speakers per diagnosis and per sex by summing boolean masks.
is_parkinson = metadata["DIAGNOSIS"] == "Parkinson"
is_healthy = metadata["DIAGNOSIS"] == "Healthy"
is_male = metadata["SEX"] == "male"
is_female = metadata["SEX"] == "female"

num_pd = int(is_parkinson.sum())
num_hc = int(is_healthy.sum())
num_male = int(is_male.sum())
num_female = int(is_female.sum())

print(f"In total number of PD people: {num_pd} ")
print(f"In total number of HC people: {num_hc}.")
print(f"In total number of males people: {num_male}")
print(f"In total number of female people: {num_female}.")

In total number of PD people: 50 
In total number of HC people: 50.
In total number of males people: 50
In total number of female people: 50.


### Define the folds to be used for training and evaluation. Split in 10 subsets while keeping gender balance and healthy/patient balance in the subsets.

Prerequisites: the dataframe "metadata" imported from EWA-DB, containing the columns "ID" and "SEX" (speaker ID and gender) as well as a "DIAGNOSIS" column that states whether the speaker is a PD patient or belongs to the HC group. 

Procedure
1. Preprocessing, determine number of draws per group etc.
2. Split first into patients and healthy control group 
3. Then, split these groups by gender to create male and female patient groups and male and female control groups
4. Finally, draw from each group so each fold has equal number of patients and control group, and that gender balance is observed - i.e. equal representation in each fold, at most 1 in difference when the sub-groups are odd-numbered, equal when they are even-numbered. 


In [9]:
# 1. Preprocessing: determine how many speakers to draw per fold and subgroup.
numspkrs = len(metadata)
foldspkrs = numspkrs // folds
# NOTE: these checks used to be wrapped in try/except around a bare comparison,
# which never raises, so the warnings could never be printed.
if foldspkrs * folds != numspkrs:
    print('Impossible to create folds with equal number of speakers')
numsub = foldspkrs // 2  # speakers per diagnosis group (HC or PD) in each fold
if 2 * numsub != foldspkrs:
    print('The number of speakers in the folds is odd! To make them balanced it should be even')
# Split each diagnosis group between the two genders. When numsub is odd one
# gender gets one speaker more (numsub1) than the other (numsub2).
numsub2 = numsub // 2
numsub1 = numsub - numsub2
numdraws = [numsub1, numsub2]

# 2. Split the speakers into the healthy control group and the patient group.
hc = []
pd = []  # patient IDs (this notebook imports pandas as `pandas`, not as `pd`)
for spkr_id, diagnosis in zip(metadata['ID'], metadata['DIAGNOSIS']):
    if diagnosis == "Healthy":
        hc.append(spkr_id)
    elif diagnosis == "Parkinson":
        pd.append(spkr_id)
    else:
        print('The dataset contains people that is not in the Healthy or Parkinsons group. They should be in either of them for this code to work.')


# 3. Split each diagnosis group by gender.
def _split_by_sex(speaker_ids):
    """Return (males, females) for the given speaker IDs.

    IDs that do not match exactly one metadata row are reported and skipped.
    Every speaker whose SEX is not "female" is placed in the male list.
    """
    males = []
    females = []
    for spkr_id in speaker_ids:
        rows = metadata.index[metadata['ID'] == spkr_id].tolist()
        if len(rows) != 1:
            print('Error in dataset, non-unique speaker ID', spkr_id)
            continue
        if metadata.at[rows[0], 'SEX'] == "female":
            females.append(spkr_id)
        else:
            males.append(spkr_id)
    return males, females


hcm, hcf = _split_by_sex(hc)
pdm, pdf = _split_by_sex(pd)

# 4. Draw speakers from each subgroup into the folds.
# The dictionary spkrdict maps each speaker ID to the fold it belongs to.
rng = np.random.default_rng(seed=SEED)

# `flip` selects which gender receives the larger draw in the control group;
# the patient group starts with the opposite choice (`flip1`). Both alternate
# from fold to fold.
flip = rng.integers(low=0, high=2)
flip1 = 1 - flip
spkrdict = {}


def _draw_into_fold(pool, draws, fold):
    """Assign `draws` random speakers from `pool` to `fold`.

    Records the assignment in `spkrdict` and returns the remaining pool with
    its original order preserved.
    """
    chosen = rng.choice(pool, size=draws, replace=False)
    for spkr_id in chosen:
        spkrdict[spkr_id] = fold
    chosen_ids = set(chosen)
    return [spkr_id for spkr_id in pool if spkr_id not in chosen_ids]


# The order of the draws (HC male, HC female, PD male, PD female) is kept fixed
# so that a given SEED always produces the same folds.
for i in range(folds):
    hcm = _draw_into_fold(hcm, numdraws[flip % 2], i)
    hcf = _draw_into_fold(hcf, numdraws[(flip + 1) % 2], i)
    flip += 1

    pdm = _draw_into_fold(pdm, numdraws[flip1 % 2], i)
    pdf = _draw_into_fold(pdf, numdraws[(flip1 + 1) % 2], i)
    flip1 += 1

# Make it visible when some speakers could not be placed in any fold.
leftover = len(hcm) + len(hcf) + len(pdm) + len(pdf)
if leftover:
    print('Warning:', leftover, 'speaker(s) were not assigned to any fold')
    
    

### Transform the speakerdict information to a dataframe

In [10]:
# Build one record per assigned speaker: ID, sex ('F'/'M'), group ('HC'/'PD')
# and the fold number stored in spkrdict.
columns = ['ID', 'Sex', 'Group', 'Fold']
records = []
for ispkr, fold in spkrdict.items():
    idx = metadata.index[metadata['ID'] == ispkr].tolist()
    # The original try/except around a bare comparison could never fire; check
    # explicitly and skip speakers that do not match exactly one metadata row
    # (this also avoids an IndexError when no row matches).
    if len(idx) != 1:
        print('Error in dataset, non-unique speaker ID', ispkr)
        continue
    row = idx[0]

    # Anything that is not "Healthy" is labelled as a patient, and anything
    # that is not "female" is labelled as male.
    Group = 'HC' if metadata.at[row, 'DIAGNOSIS'] == "Healthy" else 'PD'
    Gender = 'F' if metadata.at[row, 'SEX'] == "female" else 'M'
    records.append([ispkr, Gender, Group, fold])

df = pandas.DataFrame(records, columns=columns)

### Check that the resulting folds are balanced

In [11]:
# Report, for every fold, the gender and diagnosis counts and its speaker IDs.
for ifold in range(folds):
    in_fold = df[df['Fold'] == ifold]
    n_females = len(in_fold[in_fold['Sex'] == 'F'])
    n_males = len(in_fold[in_fold['Sex'] == 'M'])
    n_control = len(in_fold[in_fold['Group'] == 'HC'])
    n_patients = len(in_fold[in_fold['Group'] == 'PD'])
    print('Fold', ifold)
    print('\t Females:', n_females, 'Males:', n_males, 'Patients:', n_patients, 'Control', n_control)
    print('\t', *in_fold['ID'])

Fold 0
	 Females: 5 Males: 5 Patients: 5 Control 5
	 wsnb28dk01 noahczv201 u1unospv01 tbc501f201 d1aj5bfq01 lj2v9a9r01 tbc5q3u101 tbc5u9nr01 u1unkmy101 tbc5zcus01
Fold 1
	 Females: 5 Males: 5 Patients: 5 Control 5
	 wsnbuexj01 c8ijy50001 kixjr33y01 kixjnqra01 kixj5g8601 u1unv52201 tbc5taig01 tbc5es9b01 tbc51rtj01 tbc5kai901
Fold 2
	 Females: 5 Males: 5 Patients: 5 Control 5
	 o2kg4ivh01 wsnbcqhb01 b2yux3m001 o2kgtwiz01 kixj7npk01 wsnbegjd01 u1unjvvw01 fgl9fz7501 wsnbov1e01 tbc5klt201
Fold 3
	 Females: 5 Males: 5 Patients: 5 Control 5
	 ysw9l33701 wsnb4hge01 o2kgafym01 kixjui2r01 kixjc4c801 o2kgzthq01 tbc5xvde01 wsnba93901 wsnbr5vb01 tbc59a3j01
Fold 4
	 Females: 5 Males: 5 Patients: 5 Control 5
	 wsnb6w7701 kixj56vy01 frvbu7hz01 frvb3fyq01 wsnbpesw01 tbc5qqyr01 kixj1huf01 tbc570ey01 tbc53fwu01 tbc5o2yq01
Fold 5
	 Females: 5 Males: 5 Patients: 5 Control 5
	 kixjew4x01 d1ajb2sv01 665ur02801 wsnb4dm901 o2kgpblm01 tbc5es0q01 tbc5y24h01 u1unoo4g01 tbc5dgjb01 wsnbcnuc01
Fold 6
	 Females: 5 Ma

Write resulting folds to file (if outfile is not empty string)

In [12]:
# Only write the fold table when an output file name has been configured.
if outfile:
    # Order the rows by fold and then by speaker ID before writing.
    df = df.sort_values(by=['Fold', 'ID'])
    df.to_csv(path_or_buf=outfile, index=False)