In [5]:
import tgt
import pandas as pd
import numpy as np
import os
import librosa
from tqdm.notebook import tqdm

In [10]:
# set paths
# Root folder that is walked recursively for the source .wav recordings.
audio_path = '/laughter/DUEL/de/audio'
# Root folder that is walked recursively for the .TextGrid annotations.
# Keep the trailing slash so that paths built by appending to this string
# stay valid.
annotation_path = '/laughter/DUEL/de/transcriptions_annotations/'
# Output folder for the spectrograms and the train/val/test arrays.
# Keep the trailing slash so that paths built by appending to this string
# stay valid.
save_path = '/laughter/DUEL/datasets/de/'

# Create Spectrograms 

In [11]:
def save_spectrogram(filepath, save_path, start, stop, y, sr):
    """Convert a slice of audio to a mel spectrogram and save it.

    # Arguments
        filepath: wav audio filepath. Its basename up to the first '.'
            names both the output sub-folder and the output file.
        save_path: root output directory; one sub-folder per recording
            is created below it when missing.
        start: start time in seconds.
        stop: stop time in seconds.
        y: audio time series.
        sr: sample rate.

    # Outputs
        saves a numpy file of the mel spectrogram array with
        dimensions (n_mels, t) as
        <save_path>/<recording>/<recording>_<start>to<stop>_spectro.npy
    """
    S = librosa.feature.melspectrogram(y=y[sr * start:(sr * stop)],
                                       sr=sr, n_mels=64, fmax=sr / 2)
    rp = os.path.basename(filepath).split('.')[0]
    out_dir = os.path.join(save_path, rp)
    # exist_ok avoids the check-then-create race and makes re-runs safe.
    os.makedirs(out_dir, exist_ok=True)
    out_file = os.path.join(
        out_dir, rp + '_' + str(start) + 'to' + str(stop) + '_spectro')
    # np.save appends the '.npy' extension itself.
    np.save(out_file, S)

In [None]:
# Recursively collect the path of every .wav recording below audio_path.
wavs = [
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk(audio_path)
    for fname in fnames
    if fname.endswith(".wav")
]

In [18]:
# Length of each spectrogram window, in seconds.
window_size = 6
# Step between the start times of consecutive windows, in seconds.
slide = 6

In [None]:
# Cut every recording into fixed-length windows and save one mel
# spectrogram per window.
for filepath in tqdm(wavs, desc='load audio'):
    y, sr = librosa.load(filepath, mono=True)
    length = len(y) // sr  # whole seconds of audio available
    # range() excludes its stop value, so the stop must be one past the
    # last start at which a full window still fits. Stopping at
    # `length - window_size` silently dropped the final complete window
    # of every recording.
    last_start = length - window_size
    for i in tqdm(range(0, last_start + 1, slide), desc='save_spectro', leave=False):
        save_spectrogram(filepath, save_path, i, i + window_size, y, sr)

# Convert TextGrid files to CSV

In [20]:
def convert_tg_file_to_csv(file, annotation_path):
    """Filter TextGrid annotation file for laughter and convert to csv.

    # Arguments
        file: TextGrid file.
        annotation_path: folder containing annotations (not used by the
            function; kept so existing calls keep working).

    # Saves
        csv file which is a filtered TextGrid file with only
        tiers whose name contains 'laugh'. It is written next to `file`
        as '<basename>_Laugh.txt', encoded as UTF-8.
    """
    tg = tgt.io.read_textgrid(file, include_empty_intervals=True)
    # Collect the names first, then delete: tiers are removed while looping.
    unwanted_tiers = [name for name in tg.get_tier_names() if 'laugh' not in name]
    for tier in unwanted_tiers:
        tg.delete_tier(tier)
    csv = tgt.io.export_to_table(tg, separator=',')
    save_name = os.path.basename(file).split('.')[0] + '_Laugh.txt'
    # os.path.join also handles a bare file name (empty dirname), where
    # plain concatenation would produce '/<name>' in the filesystem root.
    save_file = os.path.join(os.path.dirname(file), save_name)
    # Explicit UTF-8: the transcriptions contain non-ASCII characters and
    # pandas.read_csv decodes as UTF-8 by default, so the platform's
    # default encoding must not be relied on.
    with open(save_file, 'w', encoding='utf-8') as output:
        output.write(csv)

In [21]:
def convert_tg_file_to_part_csv(file, annotation_path):
    """Filter TextGrid annotation file for parts and convert to csv.
    To be used to determine when roleplay starts, this is needed
    as there is audio outside of the roleplay, which has not been
    annotated.

    # Arguments
        file: TextGrid file.
        annotation_path: folder containing annotations (not used by the
            function; kept so existing calls keep working).

    # Saves
        csv file which is a filtered TextGrid file with only
        tiers whose name contains 'Part' (case-sensitive). It is written
        next to `file` as '<basename>_Parts.txt', encoded as UTF-8.
    """
    tg = tgt.io.read_textgrid(file, include_empty_intervals=True)
    # Collect the names first, then delete: tiers are removed while looping.
    unwanted_tiers = [name for name in tg.get_tier_names() if 'Part' not in name]
    for tier in unwanted_tiers:
        tg.delete_tier(tier)
    csv = tgt.io.export_to_table(tg, separator=',')
    save_name = os.path.basename(file).split('.')[0] + '_Parts.txt'
    # os.path.join also handles a bare file name (empty dirname), where
    # plain concatenation would produce '/<name>' in the filesystem root.
    save_file = os.path.join(os.path.dirname(file), save_name)
    # Explicit UTF-8 so the file reads back identically with
    # pandas.read_csv (UTF-8 by default) on every platform.
    with open(save_file, 'w', encoding='utf-8') as output:
        output.write(csv)

In [22]:
# Recursively collect the path of every .TextGrid file below annotation_path.
TextGrid_files = [
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk(annotation_path)
    for fname in fnames
    if fname.endswith(".TextGrid")
]

In [None]:
# For every TextGrid file, write both the laughter-tier csv and the
# Part-tier csv into the folder that contains the TextGrid file.
for file in tqdm(TextGrid_files, desc='tg to csv'):
    convert_tg_file_to_csv(file, annotation_path)
    convert_tg_file_to_part_csv(file, annotation_path)

# Create Dataset (combine: id, spectrogram, label)

In [24]:
def find_label_start_end(spectro_file, annotation_path):
    """Find the label path, start and end time relating to spectrogram.

    # Arguments
        spectro_file: spectrogram filepath, named
            '<recording>_<start>to<end>_spectro.npy' and stored in a
            folder named after the recording.
        annotation_path: directory of annotations (with or without a
            trailing path separator).

    # Returns
        label_path: label filepath relating to the spectrogram.
        start_time: start time relating to the spectrogram.
        end_time: end time relating to the spectrogram.
        roleplay_path: roleplay parts information filepath relating to the spectrogram.
    """
    base_file = os.path.basename(spectro_file)
    window = base_file.split('_')[1].split('to')
    start_time = int(window[0])
    end_time = int(window[1])

    # The spectrogram's parent folder carries the recording name, which is
    # also the name of the annotation sub-folder.
    recording = os.path.basename(os.path.dirname(spectro_file))
    label_dir = os.path.join(annotation_path, recording)

    # List once, sorted, so the chosen file does not depend on the
    # arbitrary order os.listdir returns.
    entries = sorted(os.listdir(label_dir))
    label_files = [f for f in entries if f.endswith("Laugh.txt")]
    roleplay_files = [f for f in entries if f.endswith("Parts.txt")]

    label_path = os.path.join(label_dir, label_files[0])
    roleplay_path = os.path.join(label_dir, roleplay_files[0])
    return label_path, start_time, end_time, roleplay_path

In [25]:
def filter_csv(start_time, end_time, label_path):
    """Filters csv file for start and end time, returns as dataframe.

    # Arguments
        start_time: start time relating to spectrogram.
        end_time: end time relating to spectrogram.
        label_path: filepath of label.

    # Returns
        dataframe filtered to contain 'laugh' in the text
        and filtered for specified start_time and end_time.
        Rows whose text contains 'Offset' or is missing are removed.
        When start_time in the csv is before specified start_time,
        this record will be included but start_time in the csv will be set
        to specified start_time. Same for end_time.
        The original row index is preserved.

        For example:

        start_time    end_time     text
        905.765658    909.731864   <laughter> jaha läuft </laughter>

        if start_time was 907 and end_time was 909, filter_csv would set this row to:

        start_time    end_time     text
        907.0         909.0        <laughter> jaha läuft </laughter>
    """
    df = pd.read_csv(label_path)
    text = df['text']
    keep = (
        text.str.contains('laugh', na=False)       # missing text counts as no laughter
        & ~text.str.contains('Offset', na=False)   # Remove offsets
        & (df['start_time'] <= end_time)
        & (df['end_time'] >= start_time)
    )
    # Work on an explicit copy so the clipping below never writes through
    # a filtered view of the frame that was read.
    df = df[keep].copy()
    df['end_time'] = df['end_time'].clip(upper=end_time)
    df['start_time'] = df['start_time'].clip(lower=start_time)
    return df

In [26]:
def create_id(spectro_file):
    """Build the identifier of a spectrogram from its file name.

    # Arguments
        spectro_file: filepath for spectrogram.

    # Returns
        the first two underscore-separated fields of the file's
        basename, joined by an underscore.

    # Example
        'audio/r7/r7_270to276_spectro.npy' gives 'r7_270to276'.
    """
    pieces = os.path.basename(spectro_file).split('_')
    recording, window = pieces[0], pieces[1]
    return '_'.join((recording, window))

In [27]:
def start_end_in_timesteps(df, start_time, timesteps_per_second):
    """Convert start and end time from seconds to timesteps.
    Reformatting times to be relative to the start of the window.
    Removing tier_name, tier_type and text columns.

    # Arguments
        df: dataframe in format from output of function filter_csv.
        start_time: start time relating to spectrogram.
        timesteps_per_second: timesteps_per_second = timesteps / window_size.

    # Returns
        new dataframe with only start_time and end_time, expressed in
        timesteps: start_time is rounded down and end_time is rounded up.
        The input dataframe is left untouched.
    """
    # `columns=` instead of a positional axis: pandas 2.0 removed
    # DataFrame.drop(labels, 1).
    df = df.drop(columns=['tier_name', 'tier_type', 'text'])
    df['start_time'] = np.floor((df['start_time'] - start_time) * timesteps_per_second)
    df['end_time'] = np.ceil((df['end_time'] - start_time) * timesteps_per_second)
    return df

In [28]:
def create_label_matrix(df, timesteps=None):
    """Convert label annotations into a matrix.

    # Arguments
        df: dataframe in format from output of start_end_in_timesteps.
        timesteps: length of the label vector. When omitted, the
            notebook-level variable `timesteps` is used, which is what
            this function always relied on.

    # Returns
        vector of length (timesteps) which has values of 0 or 1.
        1 representing laughter, 0 representing no laughter.

    # Raises
        NameError: if timesteps is omitted and no notebook-level
            `timesteps` exists.

    # Example:
        [1, 0, 0, 1, 0, 0 ....] represents laughter in timesteps 0 and 3
    """
    if timesteps is None:
        notebook_scope = globals()
        if 'timesteps' not in notebook_scope:
            raise NameError("name 'timesteps' is not defined")
        timesteps = notebook_scope['timesteps']
    label = np.zeros(timesteps)
    for start, end in zip(df['start_time'], df['end_time']):
        # end is exclusive; slices past the end of the vector are clipped.
        label[int(start):int(end)] = 1
    return label

In [29]:
def create_id_spectro_label(file_id, spectro_path, label):
    """Combine id, spectrogram and label.

    # Arguments
        file_id: file id created from function create_id.
        spectro_path: filepath for spectrogram.
        label: label created from function create_label_matrix.

    # Returns
        numpy object array of shape (3,) containing:
        id
        related spectrogram
        related label
    """
    np_spectro_file = np.load(spectro_path)
    # The three items have different shapes, so fill an object array
    # explicitly: np.asarray on such a ragged list raises ValueError on
    # NumPy >= 1.24 (it was a deprecation warning before that).
    np_combined = np.empty(3, dtype=object)
    np_combined[0] = file_id
    np_combined[1] = np_spectro_file
    np_combined[2] = label
    return np_combined

In [30]:
def roleplay_flag(roleplay_path, start_time=None, end_time=None):
    """States whether a spectrogram window falls within the roleplay.
    This is decided by the 'Parts.txt' file which has roleplay start
    and end times.

    # Arguments
        roleplay_path: filepath for Parts file which has times for annotated roleplays.
        start_time: start time of the spectrogram window. When omitted,
            the notebook-level variable `start_time` is used, which is
            what this function always relied on.
        end_time: end time of the spectrogram window. When omitted, the
            notebook-level variable `end_time` is used.

    # Returns
        True if the window [start_time, end_time] overlaps (or touches)
        at least one annotated roleplay part. Else returns False.

    # Raises
        NameError: if a time is omitted and the matching notebook-level
            variable does not exist.
    """
    notebook_scope = globals()
    if start_time is None:
        if 'start_time' not in notebook_scope:
            raise NameError("name 'start_time' is not defined")
        start_time = notebook_scope['start_time']
    if end_time is None:
        if 'end_time' not in notebook_scope:
            raise NameError("name 'end_time' is not defined")
        end_time = notebook_scope['end_time']

    df = pd.read_csv(roleplay_path)
    # Interval overlap test, evaluated for all parts at once: the window
    # starts no later than the part ends and ends no earlier than the
    # part starts.
    overlaps = (df['end_time'] >= start_time) & (df['start_time'] <= end_time)
    return bool(overlaps.any())

In [33]:
# Recursively collect the path of every saved spectrogram below save_path.
spectros = [
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk(save_path)
    for fname in fnames
    if fname.endswith("spectro.npy")
]

In [None]:
dataset = []
# The first spectrogram is used to read off the number of timesteps;
# this assumes every saved spectrogram has the same shape — TODO confirm.
spectro_eg = np.load(spectros[0])
# NOTE: create_label_matrix reads `timesteps` from the notebook's global
# namespace, so this name must not be changed.
timesteps = spectro_eg.shape[1] 
timesteps_per_second = timesteps / window_size

for spectro_path in tqdm(spectros, desc='create dataset'):
    # NOTE: roleplay_flag reads `start_time` and `end_time` from the
    # notebook's global namespace, so these names must not be changed and
    # must be assigned before roleplay_flag is called.
    label_path, start_time, end_time, roleplay_path = find_label_start_end(spectro_path, annotation_path)
    # Windows for which roleplay_flag returns False are skipped entirely.
    if roleplay_flag(roleplay_path):
        df = filter_csv(start_time, end_time, label_path)
        df = start_end_in_timesteps(df, start_time, timesteps_per_second)
        df_label = create_label_matrix(df)
        file_id = create_id(spectro_path)
        np_combined = create_id_spectro_label(file_id, spectro_path, df_label)
        dataset.append(np_combined)

# Save train, val and test files

In [22]:
# Turn the list of [id, spectrogram, label] records into a NumPy array.
dataset = np.asarray(dataset)

In [23]:
# Every record whose id mentions none of r17, r18 or r19 goes to the
# train/validation pool.
train_val = [
    record for record in dataset
    if not any(tag in record[0] for tag in ('r17', 'r18', 'r19'))
]

In [24]:
# Every record whose id mentions r17, r18 or r19 goes to the test set.
test = [
    record for record in dataset
    if any(tag in record[0] for tag in ('r17', 'r18', 'r19'))
]

In [25]:
train_val = np.asarray(train_val)
test = np.asarray(test)

# Fixed seed so that re-running the notebook reproduces the same split.
rng = np.random.default_rng(42)
rng.shuffle(train_val)

# The validation size is 10% of the whole dataset (test records
# included), capped at the number of records available in train_val.
val_split = min(int(len(dataset) * 0.1), len(train_val))
# Slice with an explicit index rather than a negative one: with
# val_split == 0, `train_val[:-0]` is empty and `train_val[-0:]` is
# everything, which would put all records into the validation set.
split_at = len(train_val) - val_split
train = train_val[:split_at]
val = train_val[split_at:]

rng.shuffle(test)

In [31]:
# The '6_6_64' in the file names presumably encodes window_size, slide
# and n_mels — keep it in sync if those settings change.
# os.path.join keeps the output inside save_path even when save_path has
# no trailing slash (plain concatenation would write next to the folder).
np.save(os.path.join(save_path, 'de_train_6_6_64_ds'), train)
np.save(os.path.join(save_path, 'de_val_6_6_64_ds'), val)
np.save(os.path.join(save_path, 'de_test_6_6_64_ds'), test)