This notebook contains the code for generating the training and testing datasets used by Kramer et al. (2022).

[link to paper](https://doi.org/10.3389/fdgth.2022.847555)


In [1]:
import numpy as np
import scipy.io
import shutil
import os
from scipy.io.wavfile import write
import csv

from parameters_kramer import parameters
from data_storage_utils import create_directory, normalizer, generate_store_kramer_data

# Call the imported `parameters` function once and keep its result. This rebinds
# the name `parameters` from the function to the returned object, which the
# cells below index with string keys (e.g. parameters['path_cinc11']).
parameters = parameters()

In [2]:
# Directories that hold the annotation / recording-name text files for each split.
path_training_data = parameters["annotation_file_train"]
path_testing_data = parameters["annotation_file_test"]

# Value of the 'path_cinc11' setting; handed to generate_store_kramer_data.
path_cinc11 = parameters["path_cinc11"]

# Destinations handed to generate_store_kramer_data for the two splits.
path_save_training = parameters["train_data"]
path_save_testing = parameters["test_data"]

In [3]:
# Settings taken from the shared parameter object.
data_length = parameters["data_length"]
wav_data_range = parameters["wav_data_range"]

# Number of datapoints we want per label.
amount_data = 3870

In [4]:
# Number of datapoints collected so far, one counter per label (two labels).
label_count = [0] * 2

In [5]:
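# Debugging only: remove an existing dataset by deleting its label folders.
# NOTE: `path_save` is not defined in this notebook; substitute
# `path_save_training` or `path_save_testing` before uncommenting.
# shutil.rmtree(path_save+"\\0")
# shutil.rmtree(path_save+"\\1")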

Normalization function used:

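$\text{data}_{\text{normalized}} = \frac{\text{data}-r_{\text{min}}}{r_{\text{max}}-r_{\text{min}}}\times (t_{\text{max}}-t_{\text{min}}) + t_{\text{min}}$

where $[r_{\text{min}}, r_{\text{max}}]$ is the range of the input data and $[t_{\text{min}}, t_{\text{max}}]$ is the target range.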

Read the annotations of the data into the `annotations` array and the corresponding recording numbers into the `recording_numbers` array. The `generate_store_kramer_data` function finds the location where the data is stored; the name of each data file is its recording number. The data is then saved as a `.wav` file in a folder named according to the annotation number, e.g. `\\Code\\kramer_data\\0\\1002603_10.wav`.

# Save training data

In [6]:
# Annotation table: one row per line of the annotation file, keeping only the
# first character of every tab-separated field.
# os.path.join replaces the hard-coded "\\" separator so the path also resolves
# on non-Windows systems.
annotation_file = os.path.join(path_training_data, "Annotation Training Set Group I.txt")
with open(annotation_file) as f:
    annotations = [[field[0] for field in row.split('\t')] for row in f]

# Only the first line of the names file is read: it is split on tabs, and
# anything after a newline inside a field (i.e. the line terminator) is dropped.
names_file = os.path.join(path_training_data, "Training 50 Group I recording names.txt")
with open(names_file) as f:
    recording_numbers = [name.split('\n')[0] for name in f.readline().split('\t')]

# NOTE: label_count is passed in and rebound to the returned value, so
# re-running this cell without resetting label_count continues from the
# counts left by the previous run.
label_count = generate_store_kramer_data(annotations, recording_numbers, wav_data_range, label_count, path_cinc11, path_save_training, amount_data)

In [7]:
# print(recording_numbers)
# print(annotations)

In [8]:
# Annotation table: one row per line of the annotation file, keeping only the
# first character of every tab-separated field.
# os.path.join replaces the hard-coded "\\" separator so the path also resolves
# on non-Windows systems.
annotation_file = os.path.join(path_training_data, "Annotation Training Set Group II.txt")
with open(annotation_file) as f:
    annotations = [[field[0] for field in row.split('\t')] for row in f]

# Only the first line of the names file is read: it is split on tabs, and
# anything after a newline inside a field (i.e. the line terminator) is dropped.
names_file = os.path.join(path_training_data, "Training 50 Group II recording names.txt")
with open(names_file) as f:
    recording_numbers = [name.split('\n')[0] for name in f.readline().split('\t')]

# NOTE: label_count is passed in and rebound to the returned value, so
# re-running this cell without resetting label_count continues from the
# counts left by the previous run.
label_count = generate_store_kramer_data(annotations, recording_numbers, wav_data_range, label_count, path_cinc11, path_save_training, amount_data)

# Save testing data

In [9]:
# Annotation table: one row per line of the annotation file, keeping only the
# first character of every tab-separated field.
# os.path.join replaces the hard-coded "\\" separator so the path also resolves
# on non-Windows systems.
annotation_file = os.path.join(path_testing_data, "Annotation Testing Set Group I.txt")
with open(annotation_file) as f:
    annotations = [[field[0] for field in row.split('\t')] for row in f]

# Unlike the training name files, every line of this file is read: one
# recording name per line, with the line terminator dropped.
names_file = os.path.join(path_testing_data, "Testing 175 Group I recording names.txt")
with open(names_file) as f:
    recording_numbers = [row.split('\n')[0] for row in f]

# NOTE: label_count still holds the counts returned by the earlier cells and
# is rebound to the returned value here; re-running this cell without
# resetting it continues from the existing counts.
label_count = generate_store_kramer_data(annotations, recording_numbers, wav_data_range, label_count, path_cinc11, path_save_testing, amount_data)

In [10]:
# Annotation table: one row per line of the annotation file, keeping only the
# first character of every tab-separated field.
# os.path.join replaces the hard-coded "\\" separator so the path also resolves
# on non-Windows systems.
annotation_file = os.path.join(path_testing_data, "Annotation Testing Set Group II.txt")
with open(annotation_file) as f:
    annotations = [[field[0] for field in row.split('\t')] for row in f]

# Unlike the training name files, every line of this file is read: one
# recording name per line, with the line terminator dropped.
names_file = os.path.join(path_testing_data, "Testing 175 Group II recording names.txt")
with open(names_file) as f:
    recording_numbers = [row.split('\n')[0] for row in f]

# NOTE: label_count still holds the counts returned by the earlier cells and
# is rebound to the returned value here; re-running this cell without
# resetting it continues from the existing counts.
label_count = generate_store_kramer_data(annotations, recording_numbers, wav_data_range, label_count, path_cinc11, path_save_testing, amount_data)

In [11]:
# Show the per-label counters as returned by the most recent
# generate_store_kramer_data call.
print(label_count)

[1741, 3659]


In [12]:
# shutil.rmtree(path_save+"\\rest")