In [1]:
"""
    Cleans and formats the dialog transcriptions for the Map Task dataset, as the text
    is mostly aligned. Dialog interruptions are marked as double dashes, ie: 'lake fa--',
    letters between quotes indicate shape, ie: "u"-shaped turn or 'zee' shape.
    
    The text already comes lowercased, so the only option is replacing the shape figures
    by a token, if desired.
    
    [CONTENTS]
        - Preprocessing
        - Splitting into train/dev/test
        - Data peeking
        - Label occurrence statistics
        - Checking <uncodable> class
"""

import re
import csv
import random

from os import path, getcwd
from collections import defaultdict

DATASET_FILE = './maptask/all_transcripts.txt'
TRAIN_PATH   = './clean/maptask_train.tsv'
DEV_PATH     = './clean/maptask_dev.tsv'
TEST_PATH    = './clean/maptask_test.tsv'

In [2]:
def make_conversation():
    return {
        'name': None,
        'utterances': list(),
        'turns': 0
    }

def get_conv_name(conv_header):
    text = conv_header[0]
    return re.findall(':\s(.+?)\;', text)[0]

class Preprocessor:
    """Simply replaces shape-figures if replace_shapes is true, otherwise does nothing."""
    
    def preprocess(self, text):        
        return re.sub("--|\"", '', text).strip()


In [3]:
file = open(DATASET_FILE, 'r')
uncodable_samples = list()
reader = csv.reader(file, delimiter='\t')

conversations = list()
conv = make_conversation()

pre = Preprocessor()

# Reads the pooled file, splitting it into conversation dics for later
# further splitting into train/test/dev
first = next(reader)
name = get_conv_name(first)

conv['name'] = name
counter = 1

for line in reader:
    # reached the header of the next conversation
    if len(line) == 1:
        conv['turns'] = len(conv['utterances'])
        
        conversations.append(conv)
        conv = make_conversation()

        name = get_conv_name(line)
        conv['name'] = name

        counter = 1

    else:
        speaker, label, text = line
        
        text = pre.preprocess(text)
        idd = '{}_{}'.format(name, counter)
        
        if label == 'uncodable':
            uncodable_samples.append((idd, speaker, label, text))
            continue
        
        conv['utterances'].append((idd, speaker, label, text))
        counter += 1

# Brief summary
n_turns = sum([conv['turns'] for conv in conversations])
n_conversations = len(conversations)
print('Parsed {} dialogs with {} utterances.\n'.format(n_conversations, n_turns))


Parsed 127 dialogs with 26567 utterances.



In [4]:
# Splitting into train/dev/test

train_set = conversations[0:89]
dev_set = conversations[90:109]
test_set = conversations[109:]

print('Split summary:')
print(' - Train: {}   {:.4}%'.format(len(train_set), len(train_set) / n_conversations * 100))
print(' - Dev:   {}   {:.4}%'.format(len(dev_set), len(dev_set) / n_conversations * 100))
print(' - Test:  {}   {:.4}%'.format(len(test_set), len(test_set) / n_conversations * 100))

print('\n')
print(' - Train: [{}].\n'.format(', '.join([node['name'] for node in train_set])))
print(' - Dev: [{}].\n'.format(', '.join([node['name'] for node in dev_set])))
print(' - Test: [{}].\n'.format(', '.join([node['name'] for node in test_set])))

Split summary:
 - Train: 89   70.08%
 - Dev:   19   14.96%
 - Test:  18   14.17%


 - Train: [q1ec1, q1ec2, q1ec3, q1ec4, q1ec5, q1ec6, q1ec7, q1ec8, q1nc1, q1nc2, q1nc3, q1nc4, q1nc5, q1nc6, q1nc7, q1nc8, q2ec1, q2ec2, q2ec3, q2ec4, q2ec5, q2ec6, q2ec7, q2ec8, q2nc1, q2nc2, q2nc3, q2nc4, q2nc5, q2nc6, q2nc7, q2nc8, q3ec1, q3ec2, q3ec3, q3ec4, q3ec5, q3ec6, q3ec7, q3ec8, q3nc1, q3nc2, q3nc3, q3nc4, q3nc5, q3nc6, q3nc7, q3nc8, q4ec1, q4ec2, q4ec3, q4ec4, q4ec5, q4ec6, q4ec7, q4ec8, q4nc1, q4nc2, q4nc3, q4nc4, q4nc5, q4nc6, q4nc7, q4nc8, q5ec1, q5ec2, q5ec3, q5ec4, q5ec5, q5ec6, q5ec7, q5ec8, q5nc1, q5nc2, q5nc3, q5nc4, q5nc5, q5nc6, q5nc7, q5nc8, q6ec1, q6ec2, q6ec3, q6ec4, q6ec5, q6ec6, q6ec7, q6ec8, q6nc1].

 - Dev: [q6nc3, q6nc4, q6nc5, q6nc6, q6nc7, q6nc8, q7ec1, q7ec2, q7ec3, q7ec4, q7ec5, q7ec6, q7ec7, q7ec8, q7nc1, q7nc2, q7nc3, q7nc4, q7nc5].

 - Test: [q7nc6, q7nc7, q7nc8, q8ec1, q8ec2, q8ec3, q8ec4, q8ec5, q8ec6, q8ec7, q8ec8, q8nc1, q8nc2, q8nc3, q8nc4, q8nc5, q8nc6, q8nc7].


In [5]:
# persisting data

def persist_split(filename, conversations):
    """Stores a given split as a tsv file."""
    clean_file = open(filename, 'w')
    clean_writer = csv.writer(clean_file, delimiter='\t')
    
    clean_writer.writerow(['id', 'speaker', 'label', 'clean'])
    for conv in conversations:
        for utt in conv['utterances']:
            clean_writer.writerow(utt)
                         
    clean_file.close()


persist_split(TRAIN_PATH, train_set)
persist_split(DEV_PATH, dev_set)
persist_split(TEST_PATH, test_set)

In [6]:
# Data peeking

AMOUNT_PEEKS = 30

conversation_no = random.sample(range(n_conversations), AMOUNT_PEEKS)
utterance_no    = random.sample(range(50), AMOUNT_PEEKS)

print('{:12.12}\t{:90.90}'.format('-label-', '-original text (90 chars)-'))
for cn, un in zip(conversation_no, utterance_no):
    sample = conversations[cn]['utterances'][un]
    print('{:12.12}\t{:90.90}\t'.format(sample[2], sample[3]))

-label-     	-original text (90 chars)-                                                                
reply_w     	well                                                                                      	
ready       	right                                                                                     	
instruct    	now go right                                                                              	
check       	cross the rope bridge                                                                     	
explain     	ah this is the same one i got with lynn we spent ages on it                               	
reply_n     	oh                                                                                        	
reply_y     	yeah got an apache camp                                                                   	
acknowledge 	uh-huh                                                                                    	
ready       	okay                                       

In [7]:
# Label occurrence statistics

ftable = defaultdict(int)
for conversation in conversations:
    for utt in conversation['utterances']:
        ftable[utt[2]] += 1
        
ftable = sorted(ftable.items(), key=lambda x: x[1], reverse=True)
print('-label-\t     -frequency- ')
for label, freq in ftable:
    print('{:14.12} {}'.format(label, freq))

-label-	     -frequency- 
acknowledge    5556
instruct       4234
reply_y        3199
explain        2152
check          2119
ready          2051
align          1774
query_yn       1735
clarify        1190
reply_w        913
reply_n        875
query_w        769
