# Project Kojak - Weeks 09 - 12

## Defining groups of speakers for training and testing
(Jupyter Notebook 2 of 4)

### This notebook.

This notebook focuses on making random groupings of the various speakers in the TIMIT corpus. The TIMIT corpus has two subdivisions of data called TRAIN and TEST. I used the TRAIN data to create the 100 "household" groups of 6 people each, though this data set also has a training subset (7 sentences of the 10 spoken) and test subset (3 sentences). I reserved the TEST data to create groups of unfamiliar (out-of-group) speakers.

In [1]:
import numpy as np
import glob
import os

from itertools import compress

### First, set the groundwork for creating the 100 speaker groups ###

In [2]:
# Define the distribution of speakers in terms of gender and #
# dialect region of the U.S., for every "household" smart speaker. #
# Dictionary format is (# males, # females, # regions) : repetitions.
SPKR_CNT = 6
nrepeat = 10
batch_distr = {(3,3,1):nrepeat, (3,3,3):nrepeat, (3,3,7):nrepeat,
               (0,6,1):nrepeat, (0,6,3):nrepeat, (0,6,7):nrepeat,
               (6,0,1):nrepeat, (6,0,3):nrepeat, (6,0,7):nrepeat,
               (3,3,'dr8'):2, (0,6,'dr8'):4, (6,0,'dr8'):4
              }

In [3]:
# The function creates random groups of speakers following certain rules. #
# Total # of male and female in input arguments must equal the length
# of 'dr_list', the dialect regions from which the speakers are to be
# chosen. The region list should already be randomized (if desired),
# as the gender list will not be (males precede females).
def get_speakers(nmales, nfemales, nreg, dr_list, wavbase='data/TRAIN'):
    g_list = ['m']*nmales + ['f']*nfemales
    if len(g_list) != len(dr_list):
        raise ValueError('M/F and region lists do not have equal lengths.')
    
    cwd = os.getcwd()   # store current directory
    
    spkr_list = []      # temporarily change to region directory
    for i in range(len(dr_list)):
        dr_next = dr_list[i].upper()
        g_next = g_list[i].upper() # g = gender = "M" or "F"
        os.chdir('/'.join([cwd,wavbase,dr_next]))
                        # randomly pick speaker from directory
        spkr_matches = glob.glob(g_next + '*')
        idx = np.random.randint(len(spkr_matches))
        spkr = spkr_matches[idx]
        while spkr in spkr_list:   # repeat if speaker already in group
            idx = np.random.randint(len(spkr_matches))
            spkr = spkr_matches[idx]
        spkr_list.append(spkr)
        
    os.chdir(cwd)      # at end, revert back to original directory
    
    return spkr_list

In [4]:
# Get audio files from TIMIT database, given dialect region + speaker #
# Argument 'split' gives the percentage of files (of 10) to use in the
# training set; this must be at least 0.8; the rest go to the test set.
# The train/test allocation is semi-random and executed using indexes
# into the list of file names rather than the file names themselves.
def get_audiofiles(dr, spkr, split=0.7, wavbase='data/TRAIN'):
    wavdir = '/'.join([wavbase,dr,spkr]).upper()
    wavlist = glob.glob(wavdir + '/*.WAV')
    nwav = len(wavlist)
    
    train_idx, test_idx = [], []
    all_idx = range(nwav)
    
    # At least one 'SA' and one 'SI' file must be in the test set #
    sa_idx = [bool(glob.re.search('/SA',wav)) for wav in wavlist]
    sa_idx = np.random.permutation(list(compress(all_idx,sa_idx)))
    si_idx = [bool(glob.re.search('/SI',wav)) for wav in wavlist]
    si_idx = np.random.permutation(list(compress(all_idx,si_idx)))
    
    test_idx.append(sa_idx[0])
    test_idx.append(si_idx[0])
    
    # And at least one 'SA' in the training set #
    train_idx.append(sa_idx[1])
    
    # All others distribute to achieve the train/test split ratio #
    unused_idx = [i for i in all_idx if not (i in test_idx or i in train_idx)]
    unused_idx = np.random.permutation(unused_idx)
    ntest = int(max(np.floor(nwav*(1.-split)) - len(test_idx), 0))
    
    if ntest:  # 'ntest' = additional # wav files to allocate to test set
        test_idx.extend(unused_idx[:ntest])
    train_idx.extend(unused_idx[ntest:])
    
    # Finally, assign wav files based on indexes into the list #
    train_wavlist = list(np.array(wavlist)[train_idx])
    test_wavlist = list(np.array(wavlist)[test_idx])

    return train_wavlist, test_wavlist

### Create the speaker groups ###

In [7]:
# Loop through intructions in 'batch_distr' to get random speakers #
# Save the names of the speakers and regions, the .wav file names, and
# other info. Define the distribution of speakers in terms of gender and
# dialect region of the U.S., for every "household" smart speaker.
# Dictionary format is (# males, # females, # regions) : repetitions.
SPKR_CNT = 6
REGION_NAMES = ('dr1','dr2','dr3','dr4','dr5','dr6','dr7')
nrepeat = 10
batch_distr = {(3,3,1):nrepeat, (3,3,3):nrepeat, (3,3,7):nrepeat,
               (0,6,1):nrepeat, (0,6,3):nrepeat, (0,6,7):nrepeat,
               (6,0,1):nrepeat, (6,0,3):nrepeat, (6,0,7):nrepeat,
               (3,3,'dr8'):2, (0,6,'dr8'):4, (6,0,'dr8'):4
              }

batch_info = {}
cnt = 0
for code in batch_distr:
    for i in range(batch_distr[code]):
        cnt += 1
        if type(code[2]) is str:   # for specific region, just repeat it
            region_list = [code[2]]*SPKR_CNT
        else:   # otherwise, create random lists of size SPKR_CNT
            region_subset = np.random.permutation(REGION_NAMES)
            region_subset = region_subset[0:code[2]]
            idx = np.random.randint(0, len(region_subset), SPKR_CNT)
            region_list = [region_subset[i] for i in idx]
        
        speaker_list = get_speakers(*code, region_list,
                                    wavbase='data/TRAIN')
        train_files, test_files = [], []
        for j in range(len(speaker_list)):
            reg = region_list[j]
            spk = speaker_list[j]
            train, test = get_audiofiles(reg, spk,
                                         split=0.7, wavbase='data/TRAIN')
            train_files.append(train)
            test_files.append(test)

        nregion = len(np.unique(region_list))
        diversity = 10 - (abs(code[1]-code[0]) + 0.1*(6-nregion))

        batch_info[cnt] = {'device':cnt, 'code':code,
            'regions':region_list, 'speakers':speaker_list,
            'train':train_files, 'test':test_files,
            'nregion':nregion, 'diversity':diversity
            }

In [8]:
# Show an example of one group, or "household", of 6 members #
batch_info[22]

{'device': 22,
 'code': (3, 3, 7),
 'regions': ['dr1', 'dr6', 'dr2', 'dr5', 'dr2', 'dr5'],
 'speakers': ['MDAC0', 'MKLN0', 'MEFG0', 'FBJL0', 'FSKL0', 'FLOD0'],
 'train': [['DATA/TRAIN/DR1/MDAC0/SA1.WAV',
   'DATA/TRAIN/DR1/MDAC0/SX451.WAV',
   'DATA/TRAIN/DR1/MDAC0/SX181.WAV',
   'DATA/TRAIN/DR1/MDAC0/SX91.WAV',
   'DATA/TRAIN/DR1/MDAC0/SX361.WAV',
   'DATA/TRAIN/DR1/MDAC0/SI1261.WAV',
   'DATA/TRAIN/DR1/MDAC0/SX271.WAV'],
  ['DATA/TRAIN/DR6/MKLN0/SA1.WAV',
   'DATA/TRAIN/DR6/MKLN0/SX428.WAV',
   'DATA/TRAIN/DR6/MKLN0/SX68.WAV',
   'DATA/TRAIN/DR6/MKLN0/SI968.WAV',
   'DATA/TRAIN/DR6/MKLN0/SX338.WAV',
   'DATA/TRAIN/DR6/MKLN0/SX248.WAV',
   'DATA/TRAIN/DR6/MKLN0/SI1598.WAV'],
  ['DATA/TRAIN/DR2/MEFG0/SA1.WAV',
   'DATA/TRAIN/DR2/MEFG0/SX375.WAV',
   'DATA/TRAIN/DR2/MEFG0/SX285.WAV',
   'DATA/TRAIN/DR2/MEFG0/SX105.WAV',
   'DATA/TRAIN/DR2/MEFG0/SX15.WAV',
   'DATA/TRAIN/DR2/MEFG0/SX195.WAV',
   'DATA/TRAIN/DR2/MEFG0/SI598.WAV'],
  ['DATA/TRAIN/DR5/FBJL0/SA1.WAV',
   'DATA/TRAIN/DR5/FBJL

In [9]:
# Save the batch file #
import pickle

with open('data/batch_training02.pkl', 'wb') as picklefile:
    pickle.dump(batch_info, picklefile)

### Create a different batch set for speakers who aren't part of a group ###

In [97]:
# In this case, just collect ALL of the speaker files in the "TEST" subdirectory #
batch_info2 = []   # a list this time, rather than a dictionary
subdir_list = glob.glob('data/TEST/*')

for subdir in subdir_list:
    wavlist = [wav for spkrdir in glob.glob(subdir+'/*')  \
               for wav in glob.glob(spkrdir+'/*.WAV')]
    batch_info2.extend(wavlist)

In [100]:
# Look at a few file examples #
batch_info2[101:103]

['data/TEST/DR4/FADG0/SX289.WAV', 'data/TEST/DR4/FADG0/SX199.WAV']

In [99]:
# Save the batch file #
import pickle

with open('data/batch_test02.pkl', 'wb') as picklefile:
    pickle.dump(batch_info2, picklefile)