In [1]:
import audioop
import itertools
import pathlib
import random
import subprocess
import tempfile
import wave

import keras.models
import numpy as np
from scipy import optimize
from scipy import stats
import tgt
import webrtcvad

import data_loading
import floor_control
import settings
import skantze_train
import utils

Using TensorFlow backend.


# Data loading

In [2]:
data = list(data_loading.generator(settings.ANNOTATIONS_DIR, settings.AUDIO_DIR))

# Data preparation

## Annotated floor tier from utterances

In [3]:
def _generate_unsorted_utterance_intervals(tg, tier_names):
    for i, tier in enumerate(tier_names):
        for interval in tg.get_tier_by_name(tier).intervals:
            yield tgt.core.Interval(
                interval.start_time,
                interval.end_time,
                str(i),
            )


def _generate_floor_intervals(tg, tier_names):
    gen = _generate_unsorted_utterance_intervals(tg, tier_names)
    intervals = iter(sorted(gen, key=lambda x: x.start_time))
    cur = next(intervals)
    while True:
        try:
            nex = next(intervals)
        except StopIteration:
            yield cur
            break
        # Current and next one are same speaker -> merge
        if cur.text == nex.text:
            cur = tgt.core.Interval(cur.start_time, nex.end_time, cur.text)
        # Current ends before next one starts -> output current
        elif cur.end_time <= nex.start_time:
            yield cur
            cur = nex
        # Next is completely within current -> ignore it
        elif nex.start_time >= cur.start_time and nex.end_time <= cur.end_time:
            pass
        # Otherwise it's a partial overlap
        else:
            yield tgt.core.Interval(cur.start_time, nex.start_time, cur.text)
            cur = tgt.core.Interval(cur.end_time, nex.end_time, nex.text)
            
            
def calculate_floor_tier_from_annotations(tg, tier_names):
    floor_tier = tgt.core.IntervalTier(name='floor')
    floor_tier.add_intervals(_generate_floor_intervals(tg, tier_names))
    return floor_tier

## Backchannels tier from annotations

In [4]:
def is_backchannel(interval):
    words = [w.lower() for w in interval.text.split()]
    backchannel_words = all(w in settings.BACKCHANNEL_WORDS for w in words)
    short_enough = interval.end_time - interval.start_time < settings.MAX_BACKCHANNEL_DURATION
    return backchannel_words and short_enough


def _generate_backchannel_points(tg, tier_names):
    for i, tier_name in enumerate(tier_names):
        for interval in tg.get_tier_by_name(tier_name).intervals:
            if is_backchannel(interval):
                yield tgt.core.Point(time=interval.start_time, text=str(i))


def calculate_backchannels_tier_from_annotations(tg, tier_names):
    backchannels_tier = tgt.core.PointTier(name='backchannels')
    backchannels_tier.add_points(_generate_backchannel_points(tg, tier_names))
    return backchannels_tier

## Floor control detection (FCD)

In [5]:
def generate_detection_per_frame_from_wav(
    filepath,
    buffer_duration,
    swap_stereo,
    detector_class,
    detector_params,
):
    with wave.open(str(filepath)) as f:
        sample_rate = f.getframerate()
        sample_width = f.getsampwidth()
        buffer_size = int(sample_rate * buffer_duration)

        detector_params['sample_rate'] = sample_rate
        detector_params['sample_width'] = sample_width
        detector_params['buffer_duration'] = buffer_duration

        detector = detector_class(**detector_params)

        while True:
            fragment = f.readframes(buffer_size)
            if len(fragment) != buffer_size * sample_width * f.getnchannels():
                break
            l = audioop.tomono(fragment, sample_width, 1, 0)
            r = audioop.tomono(fragment, sample_width, 0, 1)
            if swap_stereo:
                l, r = r, l
            yield detector.process([l, r])

In [6]:
def generate_intervals_from_frames(frames, frame_duration, shift=0):
    timed_frames = zip(itertools.count(step=frame_duration), frames)
    changes = utils.dedup(timed_frames, key=lambda x: x[1])
    for cur, nex in utils.pairwise(changes):
        yield tgt.core.Interval(start_time=cur[0] + shift, end_time=nex[0] + shift, text=str(cur[1]))

In [7]:
def create_fcd_tier(session, cutoff_freq=0.35, hysteresis=0.1):
    tg = session['textgrid']
    audio_filepath = settings.AUDIO_DIR / session['name'] / f'{session["name"]}.wav'
    frames_gen = generate_detection_per_frame_from_wav(
        audio_filepath,
        settings.BUFFER_DURATION,
        session['swapped_stereo'],
        floor_control.FloorControlDetector,
        {'cutoff_freq': cutoff_freq, 'hysteresis': hysteresis},
    )
    tier = tgt.core.IntervalTier(name='fcd')
    tier.add_intervals(generate_intervals_from_frames(frames_gen, settings.BUFFER_DURATION))
    return tier

## Random model

In [8]:
def _generate_random_floor_intervals(average_floor_duration):
    floor_holder = random.randint(0, 1)
    previous_timestamp = 0
    while True:
        samples = np.random.exponential(average_floor_duration, 100)
        timestamps = samples.cumsum() + previous_timestamp
        for timestamp in timestamps:
            yield tgt.core.Interval(
                start_time=previous_timestamp,
                end_time=timestamp,
                text=str(floor_holder)
            )
            floor_holder = (floor_holder * -1) + 1
            previous_timestamp = timestamp


def calculate_random_floor_tier(average_floor_duration, textgrid_duration):
    gen = _generate_random_floor_intervals(average_floor_duration)
    tier = tgt.core.IntervalTier(name='random')
    tier.add_intervals(itertools.takewhile(lambda i: i.end_time < textgrid_duration, gen))
    return tier

## VAD model

In [9]:
class VadDetector:
    def __init__(
        self,
        sample_rate,
        vad_mode,
        **_,
    ):
        self._vad = webrtcvad.Vad(vad_mode)
        self._sample_rate = sample_rate
        self._current_floor_holder = None
        
    def process(self, fragments):
        vad_vals = [self._vad.is_speech(fragment, self._sample_rate) for fragment in fragments]
        # Change floor holder when only one is vocalising
        if sum(vad_vals) == 1:
            self._current_floor_holder = vad_vals.index(True)
        return self._current_floor_holder

In [10]:
def upsample(source, target):
    subprocess.run([
        'ffmpeg',
        '-y',  # Overwrite, it will always exist because a temp is created
        '-i', source,
        '-ar', '48000',
        target,
    ])

## Update TextGrids with calculated tiers

### Annotated floor

In [11]:
for session in data:
    tg = session['textgrid']
    tg.add_tier(calculate_floor_tier_from_annotations(tg, settings.ANNOTATIONS_TIERS))

Percentage of annotated floor without competition for floor control. I.e. defined annotated floor.

In [12]:
def defined_floor_time_ratio(textgrid, start_time, end_time):
    floor_tier = textgrid.get_tier_by_name('floor')
    floor_intervals = floor_tier.get_annotations_between_timepoints(start_time, end_time)
    defined_time = sum([i.end_time - i.start_time for i in floor_intervals])
    return defined_time / (end_time - start_time)

In [13]:
vals = [
    defined_floor_time_ratio(session['textgrid'], part['start_time'], part['end_time'])
    for session in data
    for part in session['parts']
]
print(np.mean(vals), np.var(vals))

0.8348158639511156 0.003661832575892673


### Backchannels

In [14]:
for session in data:
    tg = session['textgrid']
    tg.add_tier(calculate_backchannels_tier_from_annotations(tg, settings.ANNOTATIONS_TIERS))

### FCD model

In [15]:
%%time

for session in data:
    session['textgrid'].add_tier(create_fcd_tier(session))

CPU times: user 1min 2s, sys: 2.68 s, total: 1min 5s
Wall time: 1min 6s


### Random model

In [16]:
average_floor_duration = np.mean(
    [
        i.end_time - i.start_time
        for session in data
        for i in session['textgrid'].get_tier_by_name('floor')
    ]
)
average_floor_duration

3.2634707847988693

In [17]:
for session in data:
    tg = session['textgrid']
    tg.add_tier(calculate_random_floor_tier(average_floor_duration, tg.end_time))

### LSTM model (Skantze)

In [18]:
model_0 = keras.models.load_model('model_0.h5')
model_1 = keras.models.load_model('model_1.h5')

W1122 18:30:51.384323 140330374854464 deprecation_wrapper.py:119] From /home/nagasaki45/code/floor-control/experiments/env/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W1122 18:30:51.397258 140330374854464 deprecation_wrapper.py:119] From /home/nagasaki45/code/floor-control/experiments/env/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1122 18:30:51.400092 140330374854464 deprecation_wrapper.py:119] From /home/nagasaki45/code/floor-control/experiments/env/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W1122 18:30:51.638414 140330374854464 deprecation_wrapper.py:119] From /home/nagasaki45/code/floor-control/experiments/env/lib/python3.7/site-packages/keras/back

In [19]:
%%time

for session in data:
    tg = session['textgrid']
    intervals = []
    for part in session['parts']:
        X = np.load(skantze_train.DATA_DIR / f'X-{session["name"]}-{part["name"]}.npy')

        batch_generator = skantze_train.BatchGenerator(
            X,
            np.zeros(len(X)),
            skantze_train.SEQUENCE_LENGTH,
            skantze_train.PREDICTION_LENGTH,
            skantze_train.BATCH_SIZE,
        )
        model_0_predictions = model_0.predict_generator(batch_generator)
        model_1_predictions = model_1.predict_generator(batch_generator)
        predictions = np.vstack([model_0_predictions[:, 0], model_1_predictions[:, 0]])
        floor_holder = predictions.T.argmax(axis=1)

        intervals += list(
            generate_intervals_from_frames(
                floor_holder,
                frame_duration=1 / skantze_train.FRAME_RATE,
                shift=part['start_time'] + (skantze_train.SEQUENCE_LENGTH + 1) / skantze_train.FRAME_RATE,
            )
        )

    tier = tgt.core.IntervalTier(name='lstm')
    tier.add_intervals(intervals)
    tg.add_tier(tier)

CPU times: user 2min 4s, sys: 8.67 s, total: 2min 12s
Wall time: 48.1 s


### VAD model

In [20]:
%%time

for session in data:
    tg = session['textgrid']
    audio_filepath = settings.AUDIO_DIR / session['name'] / f'{session["name"]}.wav'
    with tempfile.NamedTemporaryFile(suffix='.wav') as tf:
        upsample(str(audio_filepath.resolve()), tf.name)
        frames_gen = generate_detection_per_frame_from_wav(
            tf.name,
            settings.BUFFER_DURATION,
            session['swapped_stereo'],
            VadDetector,
            {'vad_mode': 3},
        )
        tier = tgt.core.IntervalTier(name='vad')
        tier.add_intervals(generate_intervals_from_frames(frames_gen, settings.BUFFER_DURATION))
        tg.add_tier(tier)

CPU times: user 1min 22s, sys: 2.63 s, total: 1min 24s
Wall time: 1min 58s


# Export textgrid for manual observation

In [21]:
tmp = pathlib.Path('tmp')
tmp.mkdir(exist_ok=True)

for session in data:
    with open(tmp / f'{session["name"]}.textgrid', 'w') as f:
        f.write(tgt.io.export_to_long_textgrid(session['textgrid']))

# Experiments utils

In [22]:
def stat_application_generator(data, stat_func, candidate=None, *, create_tier=None):
    '''
    Applies the `stat_func` to each part in the test set.
    Requires either `candidate` or `create_tier` function. Not both.
    If a `candidate` is supplied, use the tier with that name.
    If `create_tier` function is supplied, pass the session to it
    to generate a tier.
    '''
    assert candidate or create_tier
    assert not (candidate and create_tier)
    for session in data:
        tg = session['textgrid']
        for part in session['parts']:
            if candidate:
                candidate_tier = tg.get_tier_by_name(candidate)
            else:
                candidate_tier = create_tier(session)
            yield stat_func(tg, candidate_tier, part['start_time'], part['end_time'])
            
            
def test_set(data):
    return _subset(data, test_set=True)


def train_set(data):
    return _subset(data, train_set=True)


def _subset(data, test_set=False, train_set=False):
    new_data = []
    i = 0
    for session in data:
        new_session = session.copy()
        new_session['parts'] = []
        for part in session['parts']:
            if (test_set and (i % 4 == 0)) or (train_set and (i % 4 != 0)):
                new_session['parts'].append(part)
            i += 1
        if new_session['parts']:
            new_data.append(new_session)
    return new_data

# Performance measures

### Accuracy

In [23]:
def accuracy(tg, candidate_tier, start, end):
    total = 0
    hits = 0
    target_tier = tg.get_tier_by_name('floor')
    for time in np.arange(start, end, 0.1):
        target = target_tier.get_annotations_by_time(time)
        candidate = candidate_tier.get_annotations_by_time(time)
        if target and candidate:
            if candidate[0].text == target[0].text:
                hits += 1
            total += 1
    return hits / total

In [24]:
%%time

for candidate in settings.COMPARABLE_TIERS:
    print(candidate, 'accuracy')
    vals = list(stat_application_generator(test_set(data), accuracy, candidate))
    print('mean:', np.mean(vals), 'std:', np.std(vals))

fcd accuracy
mean: 0.8653953441990642 std: 0.025179023623290376
random accuracy
mean: 0.5132850958739504 std: 0.0360503081587073
vad accuracy
mean: 0.7019965078029801 std: 0.06316885551288656
lstm accuracy
mean: 0.8882538028620065 std: 0.023900469465904153
CPU times: user 36.9 s, sys: 0 ns, total: 36.9 s
Wall time: 36.9 s


### Backchannels classification

In [25]:
def backchannels_correctly_categorised(tg, candidate_tier, start, end):
    total = 0
    hits = 0
    bc_tier = tg.get_tier_by_name('backchannels')
    for bc in bc_tier.get_annotations_between_timepoints(start, end):
        floor_at_bc = candidate_tier.get_annotations_by_time(bc.time)
        if floor_at_bc:
            if floor_at_bc[0].text != bc.text:
                hits += 1
            total += 1
    return hits / total

In [26]:
for candidate in settings.COMPARABLE_TIERS:
    print(candidate, 'backchannels correctly categorised')
    vals = list(stat_application_generator(test_set(data), backchannels_correctly_categorised, candidate))
    print('mean:', np.mean(vals), 'std:', np.std(vals))

fcd backchannels correctly categorised
mean: 0.7922724314742816 std: 0.10568349102940862
random backchannels correctly categorised
mean: 0.3701232837246644 std: 0.17207857584268407
vad backchannels correctly categorised
mean: 0.6561874390236582 std: 0.149850841247541
lstm backchannels correctly categorised
mean: 0.7555062202648837 std: 0.11290367233063701


### Stability

In [27]:
def floor_holder_changes(tg, candidate_tier, start_time, end_time):
    gen = (i.text for i in candidate_tier.get_annotations_between_timepoints(start_time, end_time))
    items = utils.dedup(gen)
    return len(list(items)) - 1  # number of changes is number of values minus 1


def stability(tg, candidate_tier, start_time, end_time):
    annotated_floor = tg.get_tier_by_name('floor')
    annotated_floor_changes = floor_holder_changes(tg, annotated_floor, start_time, end_time)
    candidate_floor_changes = floor_holder_changes(tg, candidate_tier, start_time, end_time)
    return annotated_floor_changes / candidate_floor_changes

In [28]:
for candidate in settings.COMPARABLE_TIERS:
    print(candidate, 'stability')
    vals = list(stat_application_generator(test_set(data), stability, candidate))
    print('mean:', np.mean(vals), 'std:', np.std(vals))

fcd stability
mean: 0.9287108417871082 std: 0.21944462860850766
random stability
mean: 0.8948645666159241 std: 0.3256787991338075
vad stability
mean: 0.21358132690750747 std: 0.07577958923744672
lstm stability
mean: 0.11742626060041336 std: 0.04443382176501092


### Lag

In [29]:
def lag(tg, candidate_tier, start_time, end_time):
    lags = []
    visited_target_intervals = set()
    target_tier = tg.get_tier_by_name('floor')
    for candidate_interval in candidate_tier.intervals:
        target_intervals = target_tier.get_annotations_by_time(candidate_interval.start_time)
        if target_intervals:
            target_interval = target_intervals[0]
            if (
                target_interval.text == candidate_interval.text and
                target_interval not in visited_target_intervals
            ):
                lags.append(candidate_interval.start_time - target_interval.start_time)
            visited_target_intervals.add(target_interval)
    return np.mean(lags)

In [30]:
for candidate in settings.COMPARABLE_TIERS:
    print(candidate, 'lag')
    vals = list(stat_application_generator(test_set(data), lag, candidate))
    print('mean:', np.mean(vals), 'std:', np.std(vals))

fcd lag
mean: 0.41077566962461864 std: 0.04000523491024262
random lag
mean: 1.8120902064579862 std: 0.32682680185294954
vad lag
mean: 0.4767208722773748 std: 0.09355363374067974
lstm lag
mean: 0.29757926099112136 std: 0.41298518105018533


# Optimising parameters for accuracy

In [31]:
def get_negative_accuracy_from_model(params, data):

    def create_fcd_tier_partial(session):
        cutoff_freq, hysteresis = params
        return create_fcd_tier(session, cutoff_freq, hysteresis)

    generator = stat_application_generator(train_set(data), accuracy, create_tier=create_fcd_tier_partial)
    return -np.mean(list(generator))

In [32]:
%%time

res = optimize.minimize(
     get_negative_accuracy_from_model,
     [0.35, 0.1],
     args=(train_set(data), ),
     method='Nelder-Mead',
     options={'disp': True},
)

Optimization terminated successfully.
         Current function value: -0.894242
         Iterations: 28
         Function evaluations: 66
CPU times: user 1h 29min 50s, sys: 2min 35s, total: 1h 32min 26s
Wall time: 1h 34min 53s


In [33]:
res

 final_simplex: (array([[ 1.66455078, -0.05666016],
       [ 1.66455078, -0.05669922],
       [ 1.66461914, -0.05668945]]), array([-0.89424184, -0.89424184, -0.89423307]))
           fun: -0.8942418423799594
       message: 'Optimization terminated successfully.'
          nfev: 66
           nit: 28
        status: 0
       success: True
             x: array([ 1.66455078, -0.05666016])

In [34]:
for session in data:
    optimized_fcd_tier = create_fcd_tier(session, *res.x)
    optimized_fcd_tier.name = 'optimized_fcd'
    session['textgrid'].add_tier(optimized_fcd_tier)

In [35]:
for func in [accuracy, backchannels_correctly_categorised, stability, lag]:
    print(func.__name__)
    vals = list(stat_application_generator(test_set(data), func, 'optimized_fcd'))
    print('mean:', np.mean(vals), 'std:', np.std(vals))

accuracy
mean: 0.8614435366224688 std: 0.04081918949324621
backchannels_correctly_categorised
mean: 0.5849182344379213 std: 0.25076737876469085
stability
mean: 0.2914369260583304 std: 0.08822166518997065
lag
mean: 0.08571070287788207 std: 0.00817481340315245


Is the accuracy significantly better than the hand-picked values (evaluated on the test-set)?

In [36]:
non_optimized_vals = list(stat_application_generator(test_set(data), accuracy, 'fcd'))
optimized_vals = list(stat_application_generator(test_set(data), accuracy, 'optimized_fcd'))
stats.ttest_rel(non_optimized_vals, optimized_vals)

Ttest_relResult(statistic=0.24778800660262884, pvalue=0.8114098018075939)

Doesn't seem like the optimization improves on the test-set. Let's check the train-set, to make sure it does something reasonable.

In [37]:
non_optimized_vals = list(stat_application_generator(train_set(data), accuracy, 'fcd'))
optimized_vals = list(stat_application_generator(train_set(data), accuracy, 'optimized_fcd'))
print('Non-optimized')
print('mean:', np.mean(non_optimized_vals), 'std:', np.std(non_optimized_vals))
print('Optimized')
print('mean:', np.mean(optimized_vals), 'std:', np.std(optimized_vals))
stats.ttest_rel(non_optimized_vals, optimized_vals)

Non-optimized
mean: 0.870213230147201 std: 0.029191274312062226
Optimized
mean: 0.8892287830008905 std: 0.019463745720531816


Ttest_relResult(statistic=-2.978451411880582, pvalue=0.007165853993480536)