# Generation of fold memberships for k-fold experiments with PC-GITA

### Example code for two slightly different approaches. The first is slightly more generic, at least when it comes to the number of folds. Both produce the desired outcome, but they give different fold assignments because, e.g., they draw from the random generators in a different sequence.

- Note: This file was created by Torbjørn Karl Svendsen and Maria Francesca, but has some modifications so that both methods produce fold numbers in the same range.

In [1]:
# Due to file being in another folder: 
import sys
sys.path.append('../')

In [2]:
#generic libraries
import pandas
import numpy as np
import os
import re
import random



from constants import *

### Specify number of folds, the output csv file (leave empty for no file output) and the random seed.

In [3]:
# Experiment settings: number of folds and the seed used by both methods.
folds = 10
SEED = 42

# Destination CSV files for the generated folds: `outfile` receives the
# folds of the first method, `outfile_mf` those of the second method.
outfile = 'kfold-groups-tsv.csv'
outfile_mf = 'kfold-groups-mf.csv'

# Location of the source data - only the metadata spreadsheet is read.
# NOTE(review): personal_path_to_PC_GITA is presumably provided by the
# star import from `constants` - confirm.
gitaroot = personal_path_to_PC_GITA
metadata = pandas.read_excel(
    os.path.join(gitaroot, 'PCGITA_metadata.xlsx')
)


### Read the PC-GITA metadata spreadsheet into a dataframe, add a column with speaker ID (without the 'AVPEPUDE' prefix)

In [4]:
# Speaker ID = recording name with every occurrence of 'AVPEPUDE' removed.
# Built with a comprehension so that no loop variable named `id` is left
# behind shadowing the builtin for the rest of the session.
idfull = [
    name.replace('AVPEPUDE', '')
    for name in metadata['RECODING ORIGINAL NAME']
]
# Insert as the first column; the final True is `allow_duplicates`.
metadata.insert(0, 'ID', idfull, True)
metadata.head()

Unnamed: 0,ID,RECODING ORIGINAL NAME,UPDRS,UPDRS-speech,H/Y,SEX,AGE,time after diagnosis
0,A0001,AVPEPUDEA0001,28.0,1.0,2.0,M,64,3.0
1,A0002,AVPEPUDEA0002,19.0,0.0,1.0,F,72,2.5
2,A0003,AVPEPUDEA0003,52.0,2.0,3.0,F,75,3.0
3,A0005,AVPEPUDEA0005,32.0,1.0,2.0,M,65,12.0
4,A0006,AVPEPUDEA0006,28.0,1.0,2.0,F,66,4.0


### Define the folds to be used for training and evaluation. Split in 10 subsets, and observe gender balance and healthy/patient balance in the subsets.

Prerequisites: Dataframe "metadata" imported from PC-GITA containing columns "ID" and "SEX" to describe speaker ID and gender. The group membership (PD/HC) is derived from the speaker ID, which is AVPEPUDEAnnnn for the patients, AVPEPUDEACnnnn for the control group.

Procedure
1. Preprocessing, determine number of draws per group etc.
2. Split first into patients and healthy control group 
3. Then, split these groups by gender to create male and female patient groups and male and female control groups
4. Finally, draw from each group so that each fold has an equal number of patients and controls, and so that gender balance is observed - i.e. equal representation in each fold: at most 1 in difference when the sub-groups are odd-numbered, equal when they are even-numbered. 


In [5]:
# 1. Preprocessing: determine how many speakers to draw per fold and subgroup.

# One independent empty list per fold (`[[]] * folds` would repeat a single
# shared list object).
sets = [[] for _ in range(folds)]

numspkrs = len(metadata)
foldspkrs = numspkrs // folds
# A comparison never raises, so the condition has to be tested explicitly
# for the warning to be reachable.
if foldspkrs * folds != numspkrs:
    print('Impossible to create folds with equal number of speakers')

# Half of each fold is control speakers, the other half patients.
numsub = foldspkrs // 2
if 2 * numsub != foldspkrs:
    print('The number of speakers in the folds is odd! To make them balanced it should be even')

# Split each half by sex: numsub1 is the larger share (one more than numsub2
# when numsub is odd), numsub2 the smaller share.
numsub2 = numsub // 2
numsub1 = numsub - numsub2
numdraws = [numsub1, numsub2]
 
# 2. a) split into patient and healthy control groups, then b) create subgroups by gender
# a) Control speakers have IDs starting with "AC" followed by four word
#    characters; every other speaker is treated as a patient.
#    (`pd` is the patient list here, not a pandas alias.)
hc = []
pd = []
for spkr in metadata['ID']:
    (hc if re.match(r"AC\w{4}", spkr) else pd).append(spkr)

# b) Split both groups by sex. A speaker whose ID does not match exactly one
#    metadata row is reported and left out of the subgroups.
hcm, hcf = [], []
pdm, pdf = [], []
for members, females, males in ((hc, hcf, hcm), (pd, pdf, pdm)):
    for spkr in members:
        rows = metadata.index[metadata['ID'] == spkr].tolist()
        if len(rows) != 1:
            print('Error in dataset, non-unique speaker ID', spkr)
            continue
        row = rows[0]
        target = females if metadata['SEX'][row] == "F" else males
        target.append(metadata['ID'][row])

# 3. Draw the speakers of every fold from the four subgroups.
# The dictionary spkrdict maps each speaker ID to the fold it belongs to.
rng = np.random.default_rng(seed=SEED)

# A random 0/1 decides which sex gets the larger share (numdraws[0]) among the
# control speakers in fold 0; the patients start with the opposite parity.
# Both parities alternate from fold to fold.
flip = rng.integers(low=0, high=2)
flip1 = 1 - flip
spkrdict = {}


def _draw_for_fold(pool, count, fold):
    """Assign `count` random speakers from `pool` to `fold`.

    Records the assignment in spkrdict and returns the speakers that
    remain in the pool.
    """
    picked = rng.choice(pool, size=count, replace=False)
    for spkr in picked:
        spkrdict[spkr] = fold
    return [spkr for spkr in pool if spkr not in picked]


# The order of the draws (control males, control females, patient males,
# patient females) determines the random sequence and must not change.
for i in range(folds):
    hcm = _draw_for_fold(hcm, numdraws[flip % 2], i)
    hcf = _draw_for_fold(hcf, numdraws[(flip + 1) % 2], i)
    flip += 1

    pdm = _draw_for_fold(pdm, numdraws[flip1 % 2], i)
    pdf = _draw_for_fold(pdf, numdraws[(flip1 + 1) % 2], i)
    flip1 += 1
    
    

### Transform the spkrdict information to a dataframe

In [6]:
# Build one row per assigned speaker: ID, sex, group (HC/PD) and fold.
# The first entry of `o` holds the column names.
o = [['ID', 'Sex', 'Group', 'Fold']]
for ispkr, fold in spkrdict.items():
    idx = metadata.index[metadata['ID'] == ispkr].tolist()
    # A comparison never raises, so the uniqueness check must be an explicit
    # test for the message to be reachable.
    if len(idx) != 1:
        print('Error in dataset, non-unique speaker ID', ispkr)
        if not idx:
            # No metadata row at all: nothing to look up, skip the speaker.
            continue

    # With duplicated IDs the first matching row is used.
    idx = idx[0]
    # Control speaker IDs contain 'AC'; everything else is a patient.
    Group = 'HC' if 'AC' in ispkr else 'PD'
    o.append([ispkr, metadata.at[idx, 'SEX'], Group, fold])

df = pandas.DataFrame(o[1:], columns=o[0])

### Check that the resulting folds are balanced

In [7]:
# Print, for every fold, its sex and group composition and its speaker IDs.
for ifold in range(folds):
    fold_df = df[df['Fold'] == ifold]
    sex = fold_df['Sex']
    grp = fold_df['Group']

    females = fold_df[sex == 'F']
    males = fold_df[sex == 'M']
    control = fold_df[grp == 'HC']
    patients = fold_df[grp == 'PD']
    speakers = fold_df['ID']

    print('Fold', ifold)
    print(
        '\t Females:', len(females),
        'Males:', len(males),
        'Patients:', len(patients),
        'Control', len(control),
    )
    print('\t', *speakers)

Fold 0
	 Females: 5 Males: 5 Patients: 5 Control 5
	 AC0045 AC0037 AC0043 AC0024 AC0006 A0013 A0037 A0042 A0047 A0045
Fold 1
	 Females: 5 Males: 5 Patients: 5 Control 5
	 AC0049 AC0021 AC0016 AC0014 AC0010 A0053 A0039 A0027 A0008 A0015
Fold 2
	 Females: 5 Males: 5 Patients: 5 Control 5
	 AC0039 AC0048 AC0005 AC0023 AC0012 A0058 A0046 A0003 A0054 A0020
Fold 3
	 Females: 5 Males: 5 Patients: 5 Control 5
	 AC0052 AC0046 AC0018 AC0017 AC0013 A0016 A0041 A0057 A0059 A0011
Fold 4
	 Females: 5 Males: 5 Patients: 5 Control 5
	 AC0047 AC0033 AC0031 AC0007 AC0030 A0038 A0005 A0010 A0009 A0029
Fold 5
	 Females: 5 Males: 5 Patients: 5 Control 5
	 AC0034 AC0022 AC0004 AC0028 AC0020 A0025 A0043 A0049 A0014 A0050
Fold 6
	 Females: 5 Males: 5 Patients: 5 Control 5
	 AC0050 AC0041 AC0026 AC0053 AC0001 A0034 A0056 A0021 A0035 A0007
Fold 7
	 Females: 5 Males: 5 Patients: 5 Control 5
	 AC0035 AC0040 AC0054 AC0011 AC0008 A0024 A0032 A0023 A0002 A0051
Fold 8
	 Females: 5 Males: 5 Patients: 5 Control 5
	 AC0

## Maria Francesca's code (slightly modified to use the data collected in this script)

The k-fold selection

In [8]:
random.seed(SEED)

# Explicit copies: the 'Fold' column is overwritten below, and assigning into
# a filtered slice of `df` is chained assignment (SettingWithCopyWarning).
patient_df = df[df['Group'] == 'PD'].copy()
control_df = df[df['Group'] == 'HC'].copy()

# Number of groups (folds) the speakers are split into
GROUPS_NUMBER = folds


def _assign_folds(group_df, even_draws, odd_draws, verbose=False):
    """Overwrite group_df['Fold'] in place with randomly drawn fold numbers.

    For each fold 0 .. GROUPS_NUMBER-1, first male and then female speaker
    IDs are drawn with random.sample and removed from their pool, so no
    speaker is drawn twice.

    even_draws / odd_draws: (number of males, number of females) to draw
        in even-numbered / odd-numbered folds.
    verbose: print 'Group <n>' at the start of every fold.
    """
    # pools[0] holds the remaining male IDs, pools[1] the remaining female IDs
    pools = [
        group_df.loc[group_df['Sex'] == sex, 'ID'].unique().tolist()
        for sex in ('M', 'F')
    ]
    for group in range(GROUPS_NUMBER):  # group numbers start at 0
        if verbose:
            print('Group', group)
        draws = even_draws if group % 2 == 0 else odd_draws
        for pos, k in enumerate(draws):
            picked = random.sample(pools[pos], k=k)
            group_df.loc[group_df['ID'].isin(picked), 'Fold'] = group
            # drop the sampled speakers before the next cycle
            pools[pos] = [spkr for spkr in pools[pos] if spkr not in picked]


# The draw sizes are hard-coded: with folds = 10 each pool must hold
# 5*2 + 5*3 = 25 speakers.
# PD (Parkinsonian people): 2 males + 3 females in even folds, 3 + 2 in odd.
_assign_folds(patient_df, even_draws=(2, 3), odd_draws=(3, 2), verbose=True)
# HC: the mirrored pattern, so every fold ends up with 5 males and 5 females.
_assign_folds(control_df, even_draws=(3, 2), odd_draws=(2, 3))

df_sg = pandas.concat([patient_df, control_df])

Group 0
Group 1
Group 2
Group 3
Group 4
Group 5
Group 6
Group 7
Group 8
Group 9


Check assignments

In [9]:
# Print, for every fold of the second method, its sex and group composition
# and its speaker IDs. Fold numbers start at 0.
for ifold in range(GROUPS_NUMBER):
    fold_df = df_sg[df_sg['Fold'] == ifold]
    sex = fold_df['Sex']
    grp = fold_df['Group']

    females = fold_df[sex == 'F']
    males = fold_df[sex == 'M']
    control = fold_df[grp == 'HC']
    patients = fold_df[grp == 'PD']
    speakers = fold_df['ID']

    print('Fold', ifold)
    print(
        '\t Females:', len(females),
        'Males:', len(males),
        'Patients:', len(patients),
        'Control', len(control),
    )
    print('\t', *speakers)

Fold 0
	 Females: 5 Males: 5 Patients: 5 Control 5
	 A0042 A0039 A0059 A0017 A0052 AC0043 AC0013 AC0031 AC0026 AC0029
Fold 1
	 Females: 5 Males: 5 Patients: 5 Control 5
	 A0027 A0058 A0041 A0051 A0048 AC0012 AC0034 AC0008 AC0027 AC0015
Fold 2
	 Females: 5 Males: 5 Patients: 5 Control 5
	 A0047 A0045 A0053 A0021 A0055 AC0037 AC0049 AC0016 AC0052 AC0030
Fold 3
	 Females: 5 Males: 5 Patients: 5 Control 5
	 A0008 A0046 A0005 A0025 A0006 AC0048 AC0033 AC0004 AC0001 AC0011
Fold 4
	 Females: 5 Males: 5 Patients: 5 Control 5
	 A0057 A0056 A0035 A0007 A0026 AC0014 AC0018 AC0047 AC0040 AC0044
Fold 5
	 Females: 5 Males: 5 Patients: 5 Control 5
	 A0038 A0029 A0050 A0034 A0032 AC0005 AC0017 AC0022 AC0019 AC0025
Fold 6
	 Females: 5 Males: 5 Patients: 5 Control 5
	 A0013 A0054 A0010 A0009 A0049 AC0021 AC0010 AC0039 AC0046 AC0053
Fold 7
	 Females: 5 Males: 5 Patients: 5 Control 5
	 A0020 A0016 A0023 A0031 A0030 AC0023 AC0007 AC0020 AC0041 AC0035
Fold 8
	 Females: 5 Males: 5 Patients: 5 Control 5
	 A00

Write resulting folds to file (if outfile is not empty string)

In [10]:
# Write both fold assignments to CSV, sorted by fold and speaker ID.
# An empty `outfile` disables all output; an empty `outfile_mf` skips only
# the second file instead of handing an empty path to to_csv.
if outfile:
    df = df.sort_values(['Fold', 'ID'])
    df_sg = df_sg.sort_values(['Fold', 'ID'])
    df.to_csv(outfile, index=False)
    if outfile_mf:
        df_sg.to_csv(outfile_mf, index=False)