In [1]:
import audioop
import itertools
import pathlib
import random
import subprocess
import tempfile
import wave

import keras.models
import numpy as np
import tgt
import webrtcvad

import floor_control
import skantze_train
import utils

Using TensorFlow backend.


In [2]:
# --- Corpus locations -------------------------------------------------------
# Root of the DUEL corpus checkout, resolved relative to the user's home.
DUEL_DIR = pathlib.Path('~/DUEL').expanduser()
# Only the 'de' sub-corpus is used by this notebook.
SUB_DUEL_DIR = DUEL_DIR / 'de'
# One sub-directory per session, each holding an r*.TextGrid file (see load_data).
ANNOTATIONS_DIR =  SUB_DUEL_DIR / 'transcriptions_annotations'
# One sub-directory per session, each holding <session>.wav.
AUDIO_DIR = SUB_DUEL_DIR / 'audio'

# --- Analysis parameters ----------------------------------------------------
# Length of one audio buffer handed to a detector; multiplied by the sample
# rate to get the buffer size in frames, and reused as the frame duration when
# turning detector output into intervals.
BUFFER_DURATION = 0.02
# An utterance counts as a backchannel only if every (lower-cased) word is in this set...
BACKCHANNEL_WORDS = {'ja', 'okay', 'ohm', 'mhm', 'genau'}
# ...and its duration is strictly below this threshold (same unit as the TextGrid times).
MAX_BACKCHANNEL_DURATION = 0.5
# Utterance tiers, one per speaker; a tier's position in this list becomes the
# speaker label ('0' / '1') used in all computed tiers.
ANNOTATIONS_TIERS = ['A-utts', 'B-utts']
# Sessions whose left/right audio channels are swapped before detection.
SWAPPED_STEREO = {'r12', 'r13', 'r16'}
# Names of the computed floor tiers that the experiments compare against 'target'.
COMPARABLE_TIERS = ['detected', 'random', 'vad', 'skantze']

# Data loading

In [3]:
def load_data(annotations_dir):
    tgs = {}
    for session_dir in annotations_dir.glob('r*'):
        filepath = next(session_dir.glob('r*.TextGrid'))
        tgs[session_dir.name] = tgt.io.read_textgrid(filepath)
    return tgs

In [4]:
# Session name -> TextGrid. Later cells add computed tiers to these objects in place.
textgrids = load_data(ANNOTATIONS_DIR)

# Data preparation

## Floor tier from annotations (target / annotated floor)

In [5]:
def _generate_unsorted_utterance_intervals(tg, tier_names):
    """Yield every utterance interval of the given tiers, relabelled by speaker.

    Each yielded interval keeps the original start/end times, but its text is
    replaced by the position of its tier in ``tier_names`` (as a string).
    Intervals come out tier by tier, so the overall stream is not sorted by time.
    """
    for speaker_index, tier_name in enumerate(tier_names):
        speaker_label = str(speaker_index)
        for utterance in tg.get_tier_by_name(tier_name).intervals:
            yield tgt.core.Interval(
                utterance.start_time,
                utterance.end_time,
                speaker_label,
            )


def _generate_floor_intervals(tg, tier_names):
    """Yield floor-holder intervals derived from the speakers' utterance tiers.

    All utterances are sorted by start time and folded into a stream of
    intervals whose text is the speaker label of the current floor holder:

    * consecutive utterances of the same speaker are merged (gaps included);
    * an utterance of the other speaker that lies completely inside the
      current interval is ignored;
    * on a partial overlap the current interval is cut at the start of the
      next utterance and the next speaker's interval starts where the current
      utterance ended.

    Yields nothing when the tiers contain no utterances.
    """
    gen = _generate_unsorted_utterance_intervals(tg, tier_names)
    intervals = iter(sorted(gen, key=lambda x: x.start_time))
    # A bare next() here would surface as RuntimeError on empty input
    # (StopIteration may not escape a generator body), so use a default.
    cur = next(intervals, None)
    if cur is None:
        return
    for nex in intervals:
        # Current and next one are same speaker -> merge
        if cur.text == nex.text:
            cur = tgt.core.Interval(cur.start_time, nex.end_time, cur.text)
        # Current ends before next one starts -> output current
        elif cur.end_time <= nex.start_time:
            yield cur
            cur = nex
        # Next is completely within current -> ignore it
        elif nex.start_time >= cur.start_time and nex.end_time <= cur.end_time:
            pass
        # Otherwise it's a partial overlap
        else:
            yield tgt.core.Interval(cur.start_time, nex.start_time, cur.text)
            # NOTE(review): the overlapping stretch [nex.start_time, cur.end_time]
            # ends up in neither interval - confirm this gap is intended.
            cur = tgt.core.Interval(cur.end_time, nex.end_time, nex.text)
    # Input exhausted: flush the interval still being built.
    yield cur
            
            
def calculate_floor_tier_from_annotations(tg, tier_names):
    """Build the 'target' interval tier: the floor holder according to the annotations.

    The tier is filled with whatever ``_generate_floor_intervals`` produces for
    the utterance tiers named in ``tier_names``.
    """
    target_tier = tgt.core.IntervalTier(name='target')
    floor_intervals = _generate_floor_intervals(tg, tier_names)
    target_tier.add_intervals(floor_intervals)
    return target_tier

## Backchannels tier from annotations

In [6]:
def is_backchannel(interval):
    """Tell whether an utterance interval is a backchannel.

    An interval qualifies when it contains at least one word, every word
    (compared case-insensitively) is in ``BACKCHANNEL_WORDS``, and its duration
    is strictly shorter than ``MAX_BACKCHANNEL_DURATION``.

    Returns:
        bool
    """
    words = [w.lower() for w in interval.text.split()]
    # all() over an empty list is True, so blank/whitespace-only text
    # must be rejected explicitly.
    if not words:
        return False
    if interval.end_time - interval.start_time >= MAX_BACKCHANNEL_DURATION:
        return False
    return all(w in BACKCHANNEL_WORDS for w in words)


def _generate_backchannel_points(tg, tier_names):
    """Yield a point at the start of every backchannel utterance.

    The point's text is the position of the utterance's tier in ``tier_names``
    (as a string), i.e. the label of the speaker who produced the backchannel.
    """
    for speaker_index, name in enumerate(tier_names):
        utterances = tg.get_tier_by_name(name).intervals
        for utterance in filter(is_backchannel, utterances):
            yield tgt.core.Point(time=utterance.start_time, text=str(speaker_index))


def calculate_backchannels_tier_from_annotations(tg, tier_names):
    """Build the 'backchannels' point tier from the annotated utterance tiers.

    Each point marks the start of one backchannel and is labelled with the
    index of the speaker who uttered it.
    """
    bc_tier = tgt.core.PointTier(name='backchannels')
    bc_points = _generate_backchannel_points(tg, tier_names)
    bc_tier.add_points(bc_points)
    return bc_tier

## Floor tier from audio (detected / FC)

In [7]:
def generate_detection_per_frame_from_wav(
    filepath,
    buffer_duration,
    swap_stereo,
    detector_class,
    detector_params,
):
    """Feed a WAV file to a detector buffer by buffer and yield its decisions.

    Args:
        filepath: path of the WAV file (anything ``str()`` turns into a path).
        buffer_duration: buffer length; multiplied by the file's frame rate to
            get the number of audio frames per buffer.
        swap_stereo: if true, the two extracted channels are exchanged before
            being handed to the detector.
        detector_class: callable invoked with ``detector_params`` plus
            ``sample_rate``, ``sample_width`` and ``buffer_size`` keyword
            arguments; the result must expose ``process([left, right])``.
        detector_params: extra keyword arguments for ``detector_class``.
            The dict itself is not modified.

    Yields:
        Whatever ``detector.process`` returns, once per complete buffer.
        A trailing incomplete buffer is dropped.

    Raises:
        ValueError: if the buffer would contain no audio frames.
    """
    with wave.open(str(filepath)) as f:
        sample_rate = f.getframerate()
        sample_width = f.getsampwidth()
        buffer_size = int(sample_rate * buffer_duration)
        if buffer_size <= 0:
            # readframes(0) always returns b'' and the loop below would never end.
            raise ValueError(
                f'buffer_duration {buffer_duration!r} is shorter than one audio frame'
            )
        bytes_per_buffer = buffer_size * sample_width * f.getnchannels()

        # Merge into a fresh dict so the caller's detector_params stays untouched.
        detector = detector_class(**{
            **detector_params,
            'sample_rate': sample_rate,
            'sample_width': sample_width,
            'buffer_size': buffer_size,
        })

        while True:
            fragment = f.readframes(buffer_size)
            # A short read means the end of the file was reached.
            if len(fragment) != bytes_per_buffer:
                break
            # tomono with factors (1, 0) / (0, 1) extracts one stereo channel each.
            l = audioop.tomono(fragment, sample_width, 1, 0)
            r = audioop.tomono(fragment, sample_width, 0, 1)
            if swap_stereo:
                l, r = r, l
            yield detector.process([l, r])

In [8]:
def generate_intervals_from_frames(frames, frame_duration, shift=0):
    """Turn a sequence of per-frame labels into labelled intervals.

    Frame ``i`` is stamped with time ``i * frame_duration``. The stamped frames
    are passed through ``utils.dedup`` (keyed on the label) and consecutive
    survivors are paired with ``utils.pairwise``; each pair becomes an interval
    from the first stamp to the second, labelled ``str()`` of the first label.
    ``shift`` is added to both ends of every interval.

    NOTE(review): since an interval is only emitted for a *pair* of surviving
    frames, the final run of frames appears to produce no interval - confirm
    against utils.dedup / utils.pairwise.
    """
    # Multiply instead of accumulating frame_duration: repeated float addition
    # (itertools.count(step=...)) lets rounding error build up over long files.
    timed_frames = (
        (index * frame_duration, frame) for index, frame in enumerate(frames)
    )
    changes = utils.dedup(timed_frames, key=lambda x: x[1])
    for cur, nex in utils.pairwise(changes):
        yield tgt.core.Interval(start_time=cur[0] + shift, end_time=nex[0] + shift, text=str(cur[1]))
    
    
def frames_to_tier(frames, buffer_duration, tier_name):
    """Wrap per-frame labels into a new interval tier called ``tier_name``.

    ``buffer_duration`` is the time covered by one frame; the intervals are
    produced by ``generate_intervals_from_frames`` with no time shift.
    """
    result = tgt.core.IntervalTier(name=tier_name)
    result.add_intervals(generate_intervals_from_frames(frames, buffer_duration))
    return result

## Random floor tier

In [9]:
def _generate_random_floor_intervals(average_floor_duration):
    """Endlessly yield back-to-back intervals with alternating floor holder.

    The first holder (0 or 1) is picked with ``random.randint``; interval
    lengths are drawn from ``np.random.exponential`` with mean
    ``average_floor_duration``, 100 at a time. The first interval starts at 0
    and each following one starts where the previous ended.
    """
    holder = random.randint(0, 1)
    start = 0
    while True:
        durations = np.random.exponential(average_floor_duration, 100)
        # Running sum of the durations, offset by where the last batch stopped.
        for end in durations.cumsum() + start:
            yield tgt.core.Interval(start_time=start, end_time=end, text=str(holder))
            holder = 1 - holder  # flip 0 <-> 1
            start = end


def calculate_random_floor_tier(average_floor_duration, textgrid_duration):
    """Build the 'random' baseline tier.

    Random alternating floor intervals are taken for as long as they end
    strictly before ``textgrid_duration``; the first interval that does not
    is discarded together with everything after it.
    """
    def ends_in_time(interval):
        return interval.end_time < textgrid_duration

    random_tier = tgt.core.IntervalTier(name='random')
    candidates = _generate_random_floor_intervals(average_floor_duration)
    random_tier.add_intervals(itertools.takewhile(ends_in_time, candidates))
    return random_tier

## VAD floor tier

In [10]:
class VadDetector:
    """Floor-holder detector based on per-channel voice activity detection.

    The floor holder is the index of the channel that was most recently the
    *only* one classified as speech; it is ``None`` until that first happens.
    """

    def __init__(
        self,
        sample_rate,
        vad_mode,
        **_,
    ):
        """Create the detector.

        ``vad_mode`` is forwarded to ``webrtcvad.Vad``; ``sample_rate`` is kept
        for the ``is_speech`` calls. Any other keyword argument is ignored.
        """
        self._sample_rate = sample_rate
        self._current_floor_holder = None
        self._vad = webrtcvad.Vad(vad_mode)

    def process(self, fragments):
        """Classify one buffer per channel and return the current floor holder."""
        speaking = [
            channel
            for channel, fragment in enumerate(fragments)
            if self._vad.is_speech(fragment, self._sample_rate)
        ]
        # Hand the floor over only when exactly one channel is vocalising
        if len(speaking) == 1:
            self._current_floor_holder = speaking[0]
        return self._current_floor_holder

In [11]:
def upsample(source, target, sample_rate=48000):
    """Resample an audio file with ffmpeg.

    Args:
        source: path of the input file.
        target: path of the output file; overwritten if it exists.
        sample_rate: output sample rate passed to ffmpeg's ``-ar`` option.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits with a non-zero status.
            Without this the caller would go on to read a missing or
            incomplete ``target``.
        FileNotFoundError: if the ``ffmpeg`` executable cannot be found.
    """
    subprocess.run(
        [
            'ffmpeg',
            '-y',  # Overwrite, it will always exist because a temp is created
            '-i', str(source),
            '-ar', str(sample_rate),
            str(target),
        ],
        check=True,
    )

## Update TextGrids with calculated tiers

### Annotated floor

In [12]:
# Add the annotation-derived floor tier ('target') to every session's TextGrid.
# This mutates the TextGrid objects in place.
for tg in textgrids.values():
    tg.add_tier(calculate_floor_tier_from_annotations(tg, ANNOTATIONS_TIERS))

### Annotated backchannels

In [13]:
# Add the 'backchannels' point tier to every session's TextGrid (in place).
for tg in textgrids.values():
    tg.add_tier(calculate_backchannels_tier_from_annotations(tg, ANNOTATIONS_TIERS))

### FC

In [14]:
%%time

# Run floor_control.RmsFilterDetector over each session's recording and store
# the per-buffer decisions as the 'detected' tier.
for session, tg in textgrids.items():
    audio_filepath = AUDIO_DIR / session / f'{session}.wav'
    frames_gen = generate_detection_per_frame_from_wav(
        audio_filepath,
        BUFFER_DURATION,
        session in SWAPPED_STEREO,  # swap channels for the sessions listed there
        floor_control.RmsFilterDetector,
        {'cutoff_freq': 0.35, 'hysteresis': 0.1},
    )
    # The generator is materialised here, so the whole file is processed
    # before the tier is built.
    tg.add_tier(frames_to_tier(list(frames_gen), BUFFER_DURATION, 'detected'))

CPU times: user 55 s, sys: 3.26 s, total: 58.3 s
Wall time: 58.8 s


### Random floor

In [15]:
# Mean length of an annotated floor interval, pooled over the 'target' tiers of
# all sessions. Used as the mean of the random baseline's interval lengths.
average_floor_duration = np.mean(
    [
        i.end_time - i.start_time
        for tg in textgrids.values()
        for i in tg.get_tier_by_name('target')
    ]
)
average_floor_duration

3.2634707847988684

In [16]:
# Add the 'random' baseline tier to every session, spanning up to the TextGrid's end.
# NOTE(review): no seed is set for `random` / `np.random` anywhere in this
# notebook, so this tier (and the 'random' results below) change on every run.
for tg in textgrids.values():
    tg.add_tier(calculate_random_floor_tier(average_floor_duration, tg.end_time))

### Skantze

In [17]:
# Load the precomputed feature array from the working directory;
# np.nan_to_num replaces NaNs with 0 and infinities with large finite values.
# NOTE(review): presumably produced by the skantze_train pipeline - confirm provenance.
X = np.nan_to_num(np.load('X.npy'))

In [18]:
# Load the two trained Keras models from the working directory. Their outputs
# are compared row by row below, and the index of the larger one (0 or 1) is
# used as the floor-holder label.
model_0 = keras.models.load_model('model_0.h5')
model_1 = keras.models.load_model('model_1.h5')

W0806 10:56:55.347461 140551529911936 deprecation_wrapper.py:119] From /home/nagasaki45/code/floor-control/experiments/env/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0806 10:56:55.358261 140551529911936 deprecation_wrapper.py:119] From /home/nagasaki45/code/floor-control/experiments/env/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0806 10:56:55.360572 140551529911936 deprecation_wrapper.py:119] From /home/nagasaki45/code/floor-control/experiments/env/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0806 10:56:55.514609 140551529911936 deprecation_wrapper.py:119] From /home/nagasaki45/code/floor-control/experiments/env/lib/python3.7/site-packages/keras/back

In [19]:
%%time

# Running offset into X: the rows of X are consumed part by part, with sessions
# visited in sorted name order and parts in 'Part'-tier order.
# NOTE(review): this assumes X was built in exactly that order - confirm.
idx = 0
for _, textgrid in sorted(textgrids.items()):
    intervals = []
    for part in textgrid.get_tier_by_name('Part').intervals:
        start_time = part.start_time
        end_time = part.end_time
        # NOTE(review): int() truncates the duration to whole seconds *before*
        # multiplying by 20 - confirm this matches how X was generated.
        n_frames = int(end_time - start_time) * 20
        part_X = X[idx:idx + n_frames]
        batch_generator = skantze_train.BatchGenerator(
            part_X,
            y=np.zeros(len(X)),  # Unused but still needed
            sequence_length=skantze_train.SEQUENCE_LENGTH,
            batch_size=skantze_train.BATCH_SIZE,
        )
        # The same generator object is handed to both models.
        model_0_predictions = model_0.predict_generator(batch_generator)
        model_1_predictions = model_1.predict_generator(batch_generator)
        # Put the two models' outputs side by side and take, per row, the index
        # of the larger value as the floor holder.
        # NOTE(review): assumes each model emits a single column - confirm.
        predictions = np.hstack([model_0_predictions, model_1_predictions])
        floor_holder = predictions.argmax(axis=1)
        # frame_duration=0.05 corresponds to the 20 frames per second used for
        # n_frames above; shift moves the intervals to the part's position.
        intervals += list(
            generate_intervals_from_frames(
                floor_holder,
                frame_duration=0.05,
                shift=start_time
            )
        )
        idx += n_frames
    tier = tgt.core.IntervalTier(name='skantze')
    tier.add_intervals(intervals)
    textgrid.add_tier(tier)

CPU times: user 2min 25s, sys: 9.17 s, total: 2min 34s
Wall time: 56.9 s


### VAD

In [20]:
%%time

# Resample each session's recording to a temporary WAV file, run VadDetector on
# it and store the per-buffer decisions as the 'vad' tier.
# NOTE(review): presumably resampled because webrtcvad only accepts certain
# sample rates - confirm.
for session, tg in textgrids.items():
    audio_filepath = AUDIO_DIR / session / f'{session}.wav'
    # The temp file is removed when the with-block exits, so the generator
    # must be fully consumed (list(...)) inside it.
    with tempfile.NamedTemporaryFile(suffix='.wav') as tf:
        upsample(str(audio_filepath.resolve()), tf.name)
        frames_gen = generate_detection_per_frame_from_wav(
            tf.name,
            BUFFER_DURATION,
            session in SWAPPED_STEREO,
            VadDetector,
            {'vad_mode': 3},
        )
        tg.add_tier(frames_to_tier(list(frames_gen), BUFFER_DURATION, 'vad'))

CPU times: user 1min 23s, sys: 3.04 s, total: 1min 26s
Wall time: 1min 51s


# Export textgrid for manual observation

In [21]:
# Write every enriched TextGrid (original plus computed tiers) to tmp/<session>.textgrid.
export_dir = pathlib.Path('tmp')
# Create the folder on demand so the cell also works on a fresh checkout.
export_dir.mkdir(parents=True, exist_ok=True)
for session, textgrid in textgrids.items():
    # Explicit encoding: the platform default may be unable to encode the
    # non-ASCII characters in the transcriptions.
    with open(export_dir / f'{session}.textgrid', 'w', encoding='utf-8') as f:
        f.write(tgt.io.export_to_long_textgrid(textgrid))

# Experiment 1 - agreement

In [22]:
def agreement(target_tier, candidate_tier, step=0.1):
    """Fraction of sampled time points at which both tiers name the same floor holder.

    Time is sampled at ``0, step, 2 * step, ...`` up to and including
    ``target_tier.end_time``. A sample is counted only if the target tier has
    an annotation there; it is a hit when the candidate tier also has one and
    the first annotation of each carries the same text.

    Args:
        target_tier: reference tier (needs ``end_time`` and
            ``get_annotations_by_time``).
        candidate_tier: tier under evaluation (needs ``get_annotations_by_time``).
        step: sampling period; must be positive.

    Returns:
        hits / counted samples, as a float.

    Raises:
        ValueError: if ``step`` is not positive.
        ZeroDivisionError: if no sample falls inside a target annotation.
    """
    if step <= 0:
        raise ValueError('step must be positive')
    total = 0
    hits = 0
    for index in itertools.count():
        # index * step instead of a running sum: accumulating a float step
        # lets rounding error drift the sampling grid.
        time = index * step
        if time > target_tier.end_time:
            break
        target = target_tier.get_annotations_by_time(time)
        if not target:
            continue
        detected = candidate_tier.get_annotations_by_time(time)
        if detected and detected[0].text == target[0].text:
            hits += 1
        total += 1
    return hits / total

In [23]:
%%time

# For each computed tier: agreement with 'target' per session, then mean and
# standard deviation across sessions.
for candidate in COMPARABLE_TIERS:
    print(candidate, 'agreement')
    vals = [
        agreement(t.get_tier_by_name('target'), t.get_tier_by_name(candidate))
        for t in textgrids.values()
    ]
    print('mean:', np.mean(vals), 'std:', np.std(vals))

detected agreement
mean: 0.8438686346523299 std: 0.0390078461939721
random agreement
mean: 0.5054952034088319 std: 0.019324800066512802
vad agreement
mean: 0.7504700209529584 std: 0.049604179842404686
skantze agreement
mean: 0.5828504362735213 std: 0.10529746007058853
CPU times: user 2min 46s, sys: 0 ns, total: 2min 46s
Wall time: 2min 47s


# Experiment 2 - backchannels

In [24]:
def backchannels_correctly_categorised(bc_tier, candidate_tier):
    """Share of backchannels uttered while the candidate tier gives the floor to someone else.

    For every point in ``bc_tier`` the candidate tier is queried at the point's
    time. Points with no candidate annotation are skipped; the rest count as
    correct when the first annotation's text differs from the point's text
    (i.e. the backchanneller is not considered the floor holder).

    Raises ZeroDivisionError when no point has a candidate annotation.
    """
    lookups = (
        (point, candidate_tier.get_annotations_by_time(point.time))
        for point in bc_tier.points
    )
    verdicts = [
        found[0].text != point.text
        for point, found in lookups
        if found
    ]
    return sum(verdicts) / len(verdicts)

In [25]:
# For each computed tier: per-session share of backchannels at which that tier
# assigns the floor to the other speaker, then mean and standard deviation
# across sessions.
for candidate in COMPARABLE_TIERS:
    print(candidate, 'backchannels correctly categorised')
    vals = [
        backchannels_correctly_categorised(t.get_tier_by_name('backchannels'), t.get_tier_by_name(candidate))
        for t in textgrids.values()
    ]
    print('mean:', np.mean(vals), 'std:', np.std(vals))

detected backchannels correctly categorised
mean: 0.8069979228017964 std: 0.08324994883826912
random backchannels correctly categorised
mean: 0.49876321450211963 std: 0.07112312731291603
vad backchannels correctly categorised
mean: 0.6064839967832115 std: 0.07092072476182118
skantze backchannels correctly categorised
mean: 0.5985783627270752 std: 0.1278209531369245


# Experiment 3 - stability

In [26]:
def floor_holder_changes(tg, tier_name):
    """Count floor-holder changes in a tier, summed over the TextGrid's parts.

    For each interval of the 'Part' tier, the labels of the ``tier_name``
    annotations between the part's start and end are passed through
    ``utils.dedup``; the part contributes the number of resulting items minus
    one (n values -> n - 1 changes).
    """
    floor_tier = tg.get_tier_by_name(tier_name)
    total_changes = 0
    for part in tg.get_tier_by_name('Part').intervals:
        annotations = floor_tier.get_annotations_between_timepoints(
            part.start_time,
            part.end_time,
        )
        holders = utils.dedup(annotation.text for annotation in annotations)
        n_holders = sum(1 for _ in holders)
        total_changes += n_holders - 1
    return total_changes

In [27]:
# Stability = floor changes in 'target' divided by floor changes in the
# candidate tier, per session; then mean and standard deviation across sessions.
# A candidate tier with zero changes would raise ZeroDivisionError here.
for candidate in COMPARABLE_TIERS:
    print(candidate, 'stability')
    vals = [
        floor_holder_changes(t, 'target') / floor_holder_changes(t, candidate)
        for t in textgrids.values()
    ]
    print('mean:', np.mean(vals), 'std:', np.std(vals))

detected stability
mean: 0.9551751041194774 std: 0.16217969695668932
random stability
mean: 0.8918649510323112 std: 0.26009393384302526
vad stability
mean: 0.23594985439160923 std: 0.07873030211875032
skantze stability
mean: 0.24002204809093844 std: 0.054741415982941966


# Experiment 4 - lag

In [28]:
def lag(target_tier, candidate_tier):
    """Mean delay between a target interval's start and the candidate picking it up.

    Candidate intervals are visited in order. For each one, the target
    annotation at the candidate's start time is looked up (the first, if there
    are several). Only the first candidate that lands in a given target
    interval is considered, and only if both carry the same text; its delay is
    the candidate start minus the target start.

    Returns ``np.mean`` of the collected delays.
    """
    seen_targets = set()
    delays = []
    for candidate in candidate_tier.intervals:
        found = target_tier.get_annotations_by_time(candidate.start_time)
        if not found:
            continue
        target = found[0]
        first_visit = target not in seen_targets
        # Mark as visited whether or not the labels agree.
        seen_targets.add(target)
        if first_visit and target.text == candidate.text:
            delays.append(candidate.start_time - target.start_time)
    return np.mean(delays)

In [29]:
# For each computed tier: mean lag relative to 'target' per session, then mean
# and standard deviation across sessions.
for candidate in COMPARABLE_TIERS:
    print(candidate, 'lag')
    vals = [
        lag(t.get_tier_by_name('target'), t.get_tier_by_name(candidate))
        for t in textgrids.values()
    ]
    print('mean:', np.mean(vals), 'std:', np.std(vals))

detected lag
mean: 0.4141007091403314 std: 0.0336827035557385
random lag
mean: 1.600404302903715 std: 0.3662854005786699
vad lag
mean: 0.4645374361449508 std: 0.10385029644714708
skantze lag
mean: 1.895935163747057 std: 2.733595731527794
