In [1]:
import audioop
import itertools
import pathlib
import random
import subprocess
import tempfile
import wave

import keras.models
import numpy as np
import tgt
import webrtcvad

import data_loading
import floor_control
import settings
import skantze_train
import utils

Using TensorFlow backend.


# Data loading

In [2]:
data = list(data_loading.generator(settings.ANNOTATIONS_DIR, settings.AUDIO_DIR))

# Data preparation

## Floor tier from annotations (target / annotated floor)

In [3]:
def _generate_unsorted_utterance_intervals(tg, tier_names):
    for i, tier in enumerate(tier_names):
        for interval in tg.get_tier_by_name(tier).intervals:
            yield tgt.core.Interval(
                interval.start_time,
                interval.end_time,
                str(i),
            )


def _generate_floor_intervals(tg, tier_names):
    gen = _generate_unsorted_utterance_intervals(tg, tier_names)
    intervals = iter(sorted(gen, key=lambda x: x.start_time))
    cur = next(intervals)
    while True:
        try:
            nex = next(intervals)
        except StopIteration:
            yield cur
            break
        # Current and next one are same speaker -> merge
        if cur.text == nex.text:
            cur = tgt.core.Interval(cur.start_time, nex.end_time, cur.text)
        # Current ends before next one starts -> output current
        elif cur.end_time <= nex.start_time:
            yield cur
            cur = nex
        # Next is completely within current -> ignore it
        elif nex.start_time >= cur.start_time and nex.end_time <= cur.end_time:
            pass
        # Otherwise it's a partial overlap
        else:
            yield tgt.core.Interval(cur.start_time, nex.start_time, cur.text)
            cur = tgt.core.Interval(cur.end_time, nex.end_time, nex.text)
            
            
def calculate_floor_tier_from_annotations(tg, tier_names):
    floor_tier = tgt.core.IntervalTier(name='target')
    floor_tier.add_intervals(_generate_floor_intervals(tg, tier_names))
    return floor_tier

## Backchannels tier from annotations

In [4]:
def is_backchannel(interval):
    words = [w.lower() for w in interval.text.split()]
    backchannel_words = all(w in settings.BACKCHANNEL_WORDS for w in words)
    short_enough = interval.end_time - interval.start_time < settings.MAX_BACKCHANNEL_DURATION
    return backchannel_words and short_enough


def _generate_backchannel_points(tg, tier_names):
    for i, tier_name in enumerate(tier_names):
        for interval in tg.get_tier_by_name(tier_name).intervals:
            if is_backchannel(interval):
                yield tgt.core.Point(time=interval.start_time, text=str(i))


def calculate_backchannels_tier_from_annotations(tg, tier_names):
    backchannels_tier = tgt.core.PointTier(name='backchannels')
    backchannels_tier.add_points(_generate_backchannel_points(tg, tier_names))
    return backchannels_tier

## Floor tier from audio (detected / FC)

In [5]:
def generate_detection_per_frame_from_wav(
    filepath,
    buffer_duration,
    swap_stereo,
    detector_class,
    detector_params,
):
    with wave.open(str(filepath)) as f:
        sample_rate = f.getframerate()
        sample_width = f.getsampwidth()
        buffer_size = int(sample_rate * buffer_duration)

        detector_params['sample_rate'] = sample_rate
        detector_params['sample_width'] = sample_width
        detector_params['buffer_size'] = buffer_size

        detector = detector_class(**detector_params)

        while True:
            fragment = f.readframes(buffer_size)
            if len(fragment) != buffer_size * sample_width * f.getnchannels():
                break
            l = audioop.tomono(fragment, sample_width, 1, 0)
            r = audioop.tomono(fragment, sample_width, 0, 1)
            if swap_stereo:
                l, r = r, l
            yield detector.process([l, r])

In [6]:
def generate_intervals_from_frames(frames, frame_duration, shift=0):
    timed_frames = zip(itertools.count(step=frame_duration), frames)
    changes = utils.dedup(timed_frames, key=lambda x: x[1])
    for cur, nex in utils.pairwise(changes):
        yield tgt.core.Interval(start_time=cur[0] + shift, end_time=nex[0] + shift, text=str(cur[1]))

## Random floor tier

In [7]:
def _generate_random_floor_intervals(average_floor_duration):
    floor_holder = random.randint(0, 1)
    previous_timestamp = 0
    while True:
        samples = np.random.exponential(average_floor_duration, 100)
        timestamps = samples.cumsum() + previous_timestamp
        for timestamp in timestamps:
            yield tgt.core.Interval(
                start_time=previous_timestamp,
                end_time=timestamp,
                text=str(floor_holder)
            )
            floor_holder = (floor_holder * -1) + 1
            previous_timestamp = timestamp


def calculate_random_floor_tier(average_floor_duration, textgrid_duration):
    gen = _generate_random_floor_intervals(average_floor_duration)
    tier = tgt.core.IntervalTier(name='random')
    tier.add_intervals(itertools.takewhile(lambda i: i.end_time < textgrid_duration, gen))
    return tier

## VAD floor tier

In [8]:
class VadDetector:
    def __init__(
        self,
        sample_rate,
        vad_mode,
        **_,
    ):
        self._vad = webrtcvad.Vad(vad_mode)
        self._sample_rate = sample_rate
        self._current_floor_holder = None
        
    def process(self, fragments):
        vad_vals = [self._vad.is_speech(fragment, self._sample_rate) for fragment in fragments]
        # Change floor holder when only one is vocalising
        if sum(vad_vals) == 1:
            self._current_floor_holder = vad_vals.index(True)
        return self._current_floor_holder

In [9]:
def upsample(source, target):
    subprocess.run([
        'ffmpeg',
        '-y',  # Overwrite, it will always exist because a temp is created
        '-i', source,
        '-ar', '48000',
        target,
    ])

## Update TextGrids with calculated tiers

### Annotated floor

In [10]:
for session in data:
    tg = session['textgrid']
    tg.add_tier(calculate_floor_tier_from_annotations(tg, settings.ANNOTATIONS_TIERS))

### Annotated backchannels

In [11]:
for session in data:
    tg = session['textgrid']
    tg.add_tier(calculate_backchannels_tier_from_annotations(tg, settings.ANNOTATIONS_TIERS))

### FC

In [12]:
%%time

for session in data:
    tg = session['textgrid']
    audio_filepath = settings.AUDIO_DIR / session['name'] / f'{session["name"]}.wav'
    frames_gen = generate_detection_per_frame_from_wav(
        audio_filepath,
        settings.BUFFER_DURATION,
        session['swapped_stereo'],
        floor_control.RmsFilterDetector,
        {'cutoff_freq': 0.35, 'hysteresis': 0.1},
    )
    tier = tgt.core.IntervalTier(name='detected')
    tier.add_intervals(generate_intervals_from_frames(frames_gen, settings.BUFFER_DURATION))
    tg.add_tier(tier)

CPU times: user 58.2 s, sys: 3.43 s, total: 1min 1s
Wall time: 1min 2s


### Random floor

In [13]:
average_floor_duration = np.mean(
    [
        i.end_time - i.start_time
        for session in data
        for i in session['textgrid'].get_tier_by_name('target')
    ]
)
average_floor_duration

3.2634707847988693

In [14]:
for session in data:
    tg = session['textgrid']
    tg.add_tier(calculate_random_floor_tier(average_floor_duration, tg.end_time))

### Skantze

In [15]:
model_0 = keras.models.load_model('model_0.h5')
model_1 = keras.models.load_model('model_1.h5')

W0828 17:12:31.292941 140257205532288 deprecation_wrapper.py:119] From /home/nagasaki45/code/floor-control/experiments/env/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0828 17:12:31.315837 140257205532288 deprecation_wrapper.py:119] From /home/nagasaki45/code/floor-control/experiments/env/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0828 17:12:31.321515 140257205532288 deprecation_wrapper.py:119] From /home/nagasaki45/code/floor-control/experiments/env/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0828 17:12:31.562888 140257205532288 deprecation_wrapper.py:119] From /home/nagasaki45/code/floor-control/experiments/env/lib/python3.7/site-packages/keras/back

In [16]:
%%time

for session in data:
    tg = session['textgrid']
    intervals = []
    for part in session['parts']:
        X = np.load(skantze_train.DATA_DIR / f'X-{session["name"]}-{part["name"]}.npy')

        batch_generator = skantze_train.BatchGenerator(
            X,
            np.zeros(len(X)),
            skantze_train.SEQUENCE_LENGTH,
            skantze_train.PREDICTION_LENGTH,
            skantze_train.BATCH_SIZE,
        )
        model_0_predictions = model_0.predict_generator(batch_generator)
        model_1_predictions = model_1.predict_generator(batch_generator)
        predictions = np.vstack([model_0_predictions[:, 0], model_1_predictions[:, 0]])
        floor_holder = predictions.T.argmax(axis=1)

        intervals += list(
            generate_intervals_from_frames(
                floor_holder,
                frame_duration=0.05,
                shift=part['start_time'],
            )
        )

    tier = tgt.core.IntervalTier(name='skantze')
    tier.add_intervals(intervals)
    tg.add_tier(tier)

CPU times: user 2min 41s, sys: 10.4 s, total: 2min 52s
Wall time: 1min 8s


### VAD

In [17]:
%%time

for session in data:
    tg = session['textgrid']
    audio_filepath = settings.AUDIO_DIR / session['name'] / f'{session["name"]}.wav'
    with tempfile.NamedTemporaryFile(suffix='.wav') as tf:
        upsample(str(audio_filepath.resolve()), tf.name)
        frames_gen = generate_detection_per_frame_from_wav(
            tf.name,
            settings.BUFFER_DURATION,
            session['swapped_stereo'],
            VadDetector,
            {'vad_mode': 3},
        )
        tier = tgt.core.IntervalTier(name='vad')
        tier.add_intervals(generate_intervals_from_frames(frames_gen, settings.BUFFER_DURATION))
        tg.add_tier(tier)

CPU times: user 1min 28s, sys: 3.47 s, total: 1min 31s
Wall time: 2min


# Export textgrid for manual observation

In [18]:
for session in data:
    with open(pathlib.Path('tmp') / f'{session["name"]}.textgrid', 'w') as f:
        f.write(tgt.io.export_to_long_textgrid(session['textgrid']))

# Experiments utils

In [19]:
def stat_application_generator(data, stat_func, candidate):
    i = 0
    for session in data:
        tg = session['textgrid']
        for part in session['parts']:
            if (i % 4 == 0):  # Only test-set
                yield stat_func(tg, tg.get_tier_by_name(candidate), part['start_time'], part['end_time'])
            i += 1

# Experiment 1 - agreement

In [20]:
def agreement(tg, candidate_tier, start, end):
    total = 0
    hits = 0
    target_tier = tg.get_tier_by_name('target')
    for time in np.arange(start, end, 0.1):
        target = target_tier.get_annotations_by_time(time)
        if target:
            detected = candidate_tier.get_annotations_by_time(time)
            if detected and detected[0].text == target[0].text:
                hits += 1
            total += 1
    return hits / total

In [21]:
%%time

for candidate in settings.COMPARABLE_TIERS:
    print(candidate, 'agreement')
    vals = list(stat_application_generator(data, agreement, candidate))
    print('mean:', np.mean(vals), 'std:', np.std(vals))

detected agreement
mean: 0.8644762165956857 std: 0.024724106725840835
random agreement
mean: 0.5048602002907447 std: 0.02263357847568976
vad agreement
mean: 0.7019965078029801 std: 0.06316885551288656
skantze agreement
mean: 0.5331396491162868 std: 0.06345740365096404
CPU times: user 28.3 s, sys: 0 ns, total: 28.3 s
Wall time: 28.4 s


# Experiment 2 - backchannels

In [22]:
def backchannels_correctly_categorised(tg, candidate_tier, start, end):
    total = 0
    hits = 0
    bc_tier = tg.get_tier_by_name('backchannels')
    for bc in bc_tier.get_annotations_between_timepoints(start, end):
        floor_at_bc = candidate_tier.get_annotations_by_time(bc.time)
        if floor_at_bc:
            if floor_at_bc[0].text != bc.text:
                hits += 1
            total += 1
    return hits / total

In [23]:
for candidate in settings.COMPARABLE_TIERS:
    print(candidate, 'backchannels correctly categorised')
    vals = list(stat_application_generator(data, backchannels_correctly_categorised, candidate))
    print('mean:', np.mean(vals), 'std:', np.std(vals))

detected backchannels correctly categorised
mean: 0.7922724314742816 std: 0.10568349102940862
random backchannels correctly categorised
mean: 0.5935503804781557 std: 0.18215760065878636
vad backchannels correctly categorised
mean: 0.6561874390236582 std: 0.149850841247541
skantze backchannels correctly categorised
mean: 0.634582046549928 std: 0.17943228040203382


# Experiment 3 - stability

In [24]:
def floor_holder_changes(tg, candidate_tier, start_time, end_time):
    gen = (i.text for i in candidate_tier.get_annotations_between_timepoints(start_time, end_time))
    items = utils.dedup(gen)
    return len(list(items)) - 1  # number of changes is number of values minus 1


def stability(tg, candidate_tier, start_time, end_time):
    annotated_floor = tg.get_tier_by_name('target')
    annotated_floor_changes = floor_holder_changes(tg, annotated_floor, start_time, end_time)
    candidate_floor_changes = floor_holder_changes(tg, candidate_tier, start_time, end_time)
    return annotated_floor_changes / candidate_floor_changes

In [25]:
for candidate in settings.COMPARABLE_TIERS:
    print(candidate, 'stability')
    vals = list(stat_application_generator(data, stability, candidate))
    print('mean:', np.mean(vals), 'std:', np.std(vals))

detected stability
mean: 0.9287108417871082 std: 0.21944462860850766
random stability
mean: 0.8968191785368049 std: 0.3281678098472433
vad stability
mean: 0.21358132690750747 std: 0.07577958923744672
skantze stability
mean: 0.15937797604730697 std: 0.0379461612731586


# Experiment 4 - lag

In [26]:
def lag(tg, candidate_tier, start_time, end_time):
    lags = []
    visited_target_intervals = set()
    target_tier = tg.get_tier_by_name('target')
    for candidate_interval in candidate_tier.intervals:
        target_intervals = target_tier.get_annotations_by_time(candidate_interval.start_time)
        if target_intervals:
            target_interval = target_intervals[0]
            if (
                target_interval.text == candidate_interval.text and
                target_interval not in visited_target_intervals
            ):
                lags.append(candidate_interval.start_time - target_interval.start_time)
            visited_target_intervals.add(target_interval)
    return np.mean(lags)

In [27]:
for candidate in settings.COMPARABLE_TIERS:
    print(candidate, 'lag')
    vals = list(stat_application_generator(data, lag, candidate))
    print('mean:', np.mean(vals), 'std:', np.std(vals))

detected lag
mean: 0.41077566962461864 std: 0.04000523491024262
random lag
mean: 1.791989110725945 std: 0.4134349961795376
vad lag
mean: 0.4767208722773748 std: 0.09355363374067974
skantze lag
mean: 1.2944193514103752 std: 0.9609737180907929
