In [None]:
import numpy as np
import pandas as pd
import os
import librosa
from tqdm.notebook import tqdm
import networkx as nx

In [2]:
# set paths
load_labels_path = '/laughter/DUEL/en/MS_word_laughter_timings/data/alignments'
save_labels_path = '/laughter/DUEL/en/labels'
load_audio_path = '/laughter/DUEL/en/switchboard1/swb1'
save_spectros_path = '/laughter/DUEL/en/spectros'
save_dataset_path = '/laughter/DUEL/datasets/en'

# Create spectrograms and labels

In [3]:
def filter_text_give_save_path(text_file, save_labels_path):
    """Creates a filtered dataframe and accompanying save path.

    # Arguments
        text_file: .text file.

    # Returns
        df4: dataframe with rows labelled with laughter.
        save_path: save path for df4.
    """
    df = pd.read_csv(text_file, sep="\t", names=["id", "a", "start", "end", "x", "label1", "label2"])
    df1 = df[df['label1'].str.contains("laugh")]
    df2 = df[df['label2'].str.contains("laugh")] 
    df3 = pd.concat([df1,df2]).drop_duplicates().reset_index(drop=True)
    df4 = df3.drop(columns=['a', 'x'])
    save_file = os.path.basename(text_file).split('-')[0] + '_Laugh.txt'
    save_path = save_labels_path + '/' + save_file
    return df4, save_path

In [4]:
def combine_AB(a_path, save_labels_path, b_laughter_paths):
    """Creates combined dataframe of A and B labels.
    
    # Arguments
        a_path: filepath to A label file.
    
    # Returns
        ab_df: dataframe of combined A and B labels.
        ab_save: save path.
    """
    a_id = os.path.basename(a).split('A')[0]
    b = [x for x in b_laughter_paths if a_id in x][0]
    a_df = pd.read_csv(a, sep="\t")
    b_df = pd.read_csv(b, sep="\t")
    ab_df = pd.concat([a_df, b_df]).reset_index(drop=True).drop(columns=['Unnamed: 0'])
    save_file = os.path.basename(a_path).split('A')[0] + '_Comb_Laugh.txt'
    ab_save = save_labels_path + '/' + save_file
    return ab_df, ab_save

In [5]:
def create_spectrogram(load_path, save_path, start, stop, y, sr, mels):
    """Creates a mel spectrogram and accompanying save path.

    # Arguments
        load_path: wav audio filepath.
        save_path: directory to save spectrograms.
        start: start time in seconds.
        stop: stop time in seconds.
        y: audio time series.
        sr: sample rate.
        
    # Returns
        S: mel spectrogram array with dimensions (n_mels, t).
        S_save_path: path to save spetrogram.
    """
    S = librosa.feature.melspectrogram(y=y[sr * start:(sr * stop)],
                                       sr=sr, n_mels=mels, fmax=sr / 2)
    c_id = os.path.basename(load_path).split('.')[0]
    if not os.path.isdir(save_path + '/' + c_id):
        os.makedirs(save_path + '/' + c_id)
    S_save_path = save_path + '/' + c_id + '/' + c_id + '_' + str(start) + \
                  'to' + str(stop) + '_spectro'
    return S, S_save_path

In [6]:
# spectrogram settings
timesteps= 259
window_size = 6
timesteps_per_second = timesteps / window_size
slide = 6
mels = 64
sample_rate = 22_050

In [None]:
label_files = [os.path.join(root, name)
              for root, dirs, files in os.walk(load_labels_path)
              for name in files
              if name.endswith((".text"))]

# filter for laughter and save label files
for f in tqdm(label_files, desc='labels'):
    df, save_path = filter_text_give_save_path(f, save_labels_path)
    df.to_csv(save_path, sep='\t')

In [None]:
laughter_paths = [os.path.join(root, name)
            for root, dirs, files in os.walk(save_labels_path)
            for name in files
            if name.endswith((".txt"))]

a_laughter_paths = [os.path.join(root, name)
            for root, dirs, files in os.walk(save_labels_path)
            for name in files
            if name.endswith(("A_Laugh.txt"))]

b_laughter_paths = [os.path.join(root, name)
            for root, dirs, files in os.walk(save_labels_path)
            for name in files
            if name.endswith(("B_Laugh.txt"))]

In [None]:
# combine A and B laughter files
for a in tqdm(a_laughter_paths, desc='combing labels'):
    ab_df, ab_save = combine_AB(a, save_labels_path, b_laughter_paths)
    ab_df.to_csv(ab_save, sep='\t')

In [None]:
label_paths = [os.path.join(root, name)
            for root, dirs, files in os.walk(save_labels_path)
            for name in files
            if name.endswith(("Comb_Laugh.txt"))]

wavs = [os.path.join(root, name)
            for root, dirs, files in os.walk(load_audio_path)
            for name in files
            if name.endswith((".wav"))]

labels_paths_in_audio_format = [os.path.basename(l).split('_')[0] for l in label_paths]
labels_paths_in_audio_format = [(l[0:2] + '0' + l[2:]) for l in labels_paths_in_audio_format]

In [None]:
# create set of wavs which have a related label file
wav_label_match = []
for l in labels_paths_in_audio_format:
    for w in wavs:
        if l in w:
            wav_label_match.append(w)
wav_label_match = list(set(wav_label_match))

In [None]:
# create and save spectrograms
for w in tqdm(wav_label_match, desc='total'):
    y, sr = librosa.load(w, sr=sample_rate, mono=True)
    length = int(len(y) / sr)
    remainder = length % window_size
    for i in tqdm(range(0, length - remainder - window_size, slide),
                  desc='current_wav', leave=False):
        S, S_save_path = create_spectrogram(w, save_spectros_path,
                                            i, i + window_size,
                                            y, sr, mels)
        np.save(S_save_path, S)

# Separate out test set

In [6]:
# list of conversation numbers in metadata format
convs = [int(os.path.basename(p).split('.')[0][3:]) for p in wav_label_match]

In [11]:
metadata_csv = 'swda-metadata.csv'
df = pd.read_csv(metadata_csv)
# filter dataframe to include only conversations with a wav file and a label
#df = df[df['conversation_no'].isin(convs)]

In [12]:
list(df.columns.values)

['conversation_no',
 'talk_day',
 'length',
 'topic_description',
 'prompt',
 'from_caller_sex',
 'from_caller_education',
 'from_caller_birth_year',
 'from_caller_dialect_area',
 'to_caller_sex',
 'to_caller_education',
 'to_caller_birth_year',
 'to_caller_dialect_area']

In [13]:
# there are no ids, so need to create by combining other identifiers
df['from_id'] = df['from_caller_sex'].astype(str) + "," \
                + df['from_caller_birth_year'].astype(str) + "," \
                + df['from_caller_education'].astype(str) + "," \
                + df['from_caller_dialect_area'].astype(str)
df['to_id'] = df['to_caller_sex'].astype(str) + "," \
                + df['to_caller_birth_year'].astype(str) + "," \
                + df['to_caller_education'].astype(str) + "," \
                + df['to_caller_dialect_area'].astype(str)

In [14]:
df = df.drop(columns=['talk_day', 'length',
                      'topic_description', 'prompt',
                      'from_caller_sex', 'from_caller_education',
                      'from_caller_birth_year', 'from_caller_dialect_area',
                      'to_caller_sex', 'to_caller_education',
                      'to_caller_birth_year', 'to_caller_dialect_area'])

In [None]:
df

In [14]:
# create graph 
edges = df.drop(columns=['conversation_no'])
G = nx.convert_matrix.from_pandas_edgelist(edges, source='from_id', target='to_id')

In [15]:
# create list of conversations with lowest degree centrality nodes to form test set
central_dict = nx.degree_centrality(G)
ids = [k for k in central_dict.keys() if central_dict[k] < 0.01]
f_id_df = df[df['from_id'].isin(ids)]
t_id_df = df[df['to_id'].isin(ids)]
test_set = pd.concat([f_id_df,t_id_df]).drop_duplicates().reset_index(drop=True)

In [16]:
test_set

Unnamed: 0,conversation_no,from_id,to_id
0,2121,"MALE,1937,2,NEW ENGLAND","MALE,1958,3,NORTHERN"
1,2131,"MALE,1933,2,SOUTH MIDLAND","FEMALE,1963,2,SOUTH MIDLAND"
2,2151,"MALE,1932,1,NEW ENGLAND","FEMALE,1963,2,SOUTH MIDLAND"
3,2229,"MALE,1956,2,NORTHERN","FEMALE,1957,2,MIXED"
4,2429,"FEMALE,1970,2,SOUTHERN","FEMALE,1959,2,MIXED"
...,...,...,...
187,4905,"FEMALE,1960,2,WESTERN","MALE,1958,3,SOUTHERN"
188,4908,"MALE,1937,2,NORTHERN","MALE,1945,3,NORTH MIDLAND"
189,4917,"FEMALE,1960,2,WESTERN","MALE,1962,2,NORTHERN"
190,4927,"FEMALE,1960,2,WESTERN","MALE,1946,1,WESTERN"


In [108]:
# create list of all conversation nos in test set
test_set_list = [str(e) for e in test_set['conversation_no'].tolist()]

# Create Dataset (combine, id, spectrogram, label)

In [1]:
def find_label_start_end(spectro_file, save_labels_path):
    """Find the label path, start and end time relating to spectrogram.
    
    # Arguments
        spectro_file: spectrogram filepath.
        save_labels_path: directory of labels.
        
    # Returns
        label_path: label filepath relating to the spectrogram.
        start_time: start time relating to the spectrogram.
        end_time: end time relating to the spectrogram.
    """
    base_file = os.path.basename(spectro_file)
    time = base_file.split('_')[1]
    start_time = int(time.split('to')[0])
    end_time = int(time.split('to')[1])

    spec_id = os.path.basename(spectro_file.split('_')[0])[3:]
    label_dir = os.path.dirname(spectro_file)
    label_files = [f for f in os.listdir(save_labels_path) 
                   if f.endswith((spec_id + "_Comb_Laugh.txt"))]
    label_path = save_labels_path + '/' + label_files[0]
    return label_path, start_time, end_time

In [82]:
def filter_csv(start_time, end_time, label_path):
    """Filters csv file for start and end time, returns as dataframe.
    
    # Arguments
        start_time: start time relating to spectrogram.
        end_time: end time relating to spectrogram.
        label_path: filepath of label.
        
    # Returns
        dataframe filtered to contain 'laugh' in the text
        and filtered for specified start_time and end_time.
        When start_time in the csv is before specified start_time,
        this record will be included but start_time in the csv will be set
        to specified start_time. Same for end_time.
        
    # Example
        start         end          label1
        905.765658    909.731864   [laughter]

        if start_time was 907 and end_time was 909, this row would be set to:

        start         end          label1
        907.0         909.0        [laughter]
    """    
    df = pd.read_csv(label_path, sep='\t', index_col=0)
    df = df[df['start'] <= end_time]
    df = df[df['end'] >= start_time]
    df.loc[df.end > end_time, 'end'] = end_time
    df.loc[df.start < start_time, 'start'] = start_time
    return df

In [86]:
def create_id(spectro_file):
    """Create identifier for spectrogram.
    
    # Arguments
        spectro_file: filepath for spectrogram.
        
    # Returns
        id for file.
        For example input of spectro_file of '.../sw03148/sw03148_264to270_spectro.npy'
        would return 'sw031_270to276'.
    """
    base_name = os.path.basename(spectro_file)
    r = base_name.split('_')[0]
    r2 = base_name.split('_')[1]
    file_id = r + '_' + r2
    file_id
    return file_id

In [89]:
def start_end_in_timesteps(df, start_time, timesteps_per_second):
    """Convert start and end time from seconds to timesteps.
    Remove tier_name, tier_type and text columns.
    Reformat times to start from 0 and end at window_size.
    
    # Arguments
        df: dataframe in format from output of function filter_csv.
        start_time: start time relating to spectrogram.
        timesteps_per_second: timesteps_per_second = timesteps / window_size.
        
    # Returns
        dataframe after modifications.
    """
    df = df.drop(['id', 'label1', 'label2'], 1)
    df['start'] = df['start'] - start_time
    df['start'] = (df['start'] * timesteps_per_second).apply(np.floor)
    df['end'] = df['end'] - start_time
    df['end'] = (df['end'] * timesteps_per_second).apply(np.ceil)
    return df

In [94]:
def create_label_matrix(df):
    """Convert label annotations into a matrix.
    
    # Arguments
        df: dataframe in format from output of start_end_in_timesteps.
        
    # Returns
        vector of length (timesteps) which has values of 0 or 1.
        1 representing laughter, 0 representing no laughter.
    
    # Example
        [1, 0, 0, 1, 0, 0 ....] represents laughter in timesteps 0 and 3
    """
    label = np.zeros(timesteps)
    update_list = []
    for index, row in df.iterrows():
        update_list.append([row['start'], row['end']])
    for l in update_list:
        start = int(l[0])
        end = int(l[1])
        label[start:end] = 1
    return label

In [96]:
def create_id_spectro_label(file_id, spectro_path, label):
    """Combine id, spectrogram and label.
    
    # Arguments
        file_id: file id created from function create_id.
        spectro_path: filepath for spectrogram.
        label: label created from function create_label_matrix.
        
    # Returns
        numpy array containing 3 elements:
        id
        related spectrogram
        related label
    """
    np_spectro_file = np.load(spectro_path)
    combined = [file_id, np_spectro_file, label]
    np_combined = np.asarray(combined)
    return np_combined

In [99]:
spectros = [os.path.join(root, name)
            for root, dirs, files in os.walk(save_spectros_path)
            for name in files
            if name.endswith(("spectro.npy"))]

# keep only spectrograms that have a label
spectros_keep = []
# find conversations that match with wavs
conv_wav_match = [os.path.basename(x).split('.')[0][3:] for x in wav_label_match]
for s in spectros:
    conv_s = os.path.dirname(s).split('/')[-1][3:]
    if conv_s in conv_wav_match:
        spectros_keep.append(s)

In [None]:
dataset = []

for spectro_path in tqdm(spectros_keep, desc='create dataset'):
    label_path, start_time, end_time = find_label_start_end(spectro_path, save_labels_path)
    df = filter_csv(start_time, end_time, label_path)
    df = start_end_in_timesteps(df, start_time, timesteps_per_second)
    df_label = create_label_matrix(df)
    file_id = create_id(spectro_path)
    np_combined = create_id_spectro_label(file_id, spectro_path, df_label)
    dataset.append(np_combined)

# Check laughter rate and save train, val, test files

In [104]:
def increase_laugh_rate(dataset, percentage_laughs):
    """Increase the laughter rate in the dataset.
    
    # Arguments
        dataset: dataset to have laughter percentage increased.
        percentage_laughs: desired percentage for dataset of
                           examples containing a laugh. As integer.
    
    # Returns
        dataset_inc_laughs: dataset with increased rate of examples 
                            containing a laugh.
    """
    laugh = [example for example in dataset if 1 in example[2]]
    laugh_count = len(laugh)
    no_laugh = [example for example in dataset if 1 not in example[2]]
    no_laugh_count = len(no_laugh)
    
    delete_from_no_laugh = len(dataset) - laugh_count * 100 / percentage_laughs
    delete_from_no_laugh = int(delete_from_no_laugh)
    
    no_laugh = no_laugh[0:-delete_from_no_laugh]
    
    dataset_inc_laughs = np.vstack((laugh, no_laugh))
    return dataset_inc_laughs

In [105]:
def laughter_check(dataset):
    """Check the percentage of clips that contain laughter in the dataset.
    
    # Arguments
        dataset: dataset to be checked.
    
    # Returns
        percentage of examples containing laughter.
    """
    laughs = len([e for e in dataset[:, 2] if 1 in e])
    total = len(dataset[:, 2])
    laughter_p = laughs / total
    return laughter_p

In [106]:
dataset = np.asarray(dataset)

In [116]:
train_val = [e for e in dataset 
             if e[0].split('_')[0][3:] not in test_set_list]

In [120]:
test = [e for e in dataset 
        if e[0].split('_')[0][3:] in test_set_list]

In [124]:
train_val = np.asarray(train_val)
np.random.shuffle(train_val)
val_split = int(len(dataset) * 0.1)
train = train_val[:-val_split]
val = train_val[-val_split:]

test = np.asarray(test)
np.random.shuffle(test)
print(laughter_check(train))
print(laughter_check(val))
print(laughter_check(test))

0.15669655497876356
0.16479772888573457
0.12847288752634603


In [None]:
train = increase_laugh_rate(train, 33)
print(laughter_check(train))

In [130]:
np.save(save_dataset_path + 'en_train_6_6_64_ds', train)
np.save(save_dataset_path + 'en_val_6_6_64_ds', val)
np.save(save_dataset_path + 'en_test_6_6_64_ds', test)