In [1]:
import os
import pympi
import re

import random
random.seed(20)

import warnings
warnings.filterwarnings("ignore")

In [2]:
directory = 'EAF'

### Displaying Datasets and Helper Functions

Helper functions that are used in multiple sections of processing.

In [3]:
def display(dataset_label, dataset, n=5):

    print(f'Dataset: {dataset_label}\n')

    for i in range(0, n):
        parts = dataset[i].split('\t')
        sentence = parts[0].strip()
        sequence = parts[1].strip() 
        print(f'Sign Glosses:   {sequence}')
        print(f'English Text:   {sentence}\n')

def join(glosses):
    return " ".join(glosses)

def split_line(line):
    parts = line.split('\t')
    return parts[0].strip(), parts[1].strip()

def count_words(sentence):
    return sentence.count(" ") + 1

### Parsing Data from the XML ELAN Files

The settings included are the directory that contains the EAF files and the type of data that should be parsed.

In [4]:
def create_dataset(directory, data_type):

    dataset = []

    for filename in os.listdir(directory):
        if '.eaf' in filename:
            parse_file(directory, filename, dataset, data_type)

    random.shuffle(dataset)

    return dataset

def parse_file(directory, filename, dataset, data_type):
    eaf = pympi.Elan.Eaf(f'./{directory}/{filename}')

    sentences, sequences = parse_data(eaf, data_type)

    if eaf.tiers.get(data_type) is None:
        return

    for i in range(0, len(sentences)):
        if len(sentences[i]) > 40 or len(sequences[i]) > 30:
            continue
        if abs(len(sentences[i]) - len(sequences[i])) > 30:
            continue 
        dataset.append(f'{sentences[i]}\t{join(sequences[i])}\n')

def parse_data(eaf, data_type):
    sentences = filter_tier(eaf, data_type)

    left_gloss = filter_tier(eaf, 'LH-IDgloss')
    right_gloss = filter_tier(eaf, 'RH-IDgloss')

    glosses = parse_glosses(sentences, left_gloss, right_gloss)

    return filter_sentences(sentences), filter_glosses(glosses)


def parse_glosses(sentences, left_gloss, right_gloss):
    lhp = 0
    rhp = 0

    glosses = []
    for sentence in sentences:
        gloss_group = []
        while lhp < len(left_gloss) and left_gloss[lhp][1] <= sentence[1] + 2:
            gloss_group.append(left_gloss[lhp])
            lhp += 1
        while rhp < len(right_gloss) and right_gloss[rhp][1] <= sentence[1] + 2:
            gloss_group.append(right_gloss[rhp])
            rhp += 1
        glosses.append(gloss_group)
    return glosses
    
def filter_sentences(sentences):
    filtered_sentences = []

    for sentence in sentences:
        filtered_sentences.append(sentence[2])

    return filtered_sentences

def filter_glosses(glosses):
    filtered_glosses = []

    for gloss_group in glosses:
        gloss_group.sort()
        filtered_gloss_group = []
        for i in range(0, len(gloss_group)):
            if i == 0 or gloss_group[i][2] != gloss_group[i - 1][2]:
                filtered_gloss_group.append(gloss_group[i][2])
        filtered_glosses.append(filtered_gloss_group)

    return filtered_glosses

def filter_tier(eaf, data_type):
    segment_dict = eaf.tiers[data_type][0]
    
    filtered_segments = []
    
    for key in segment_dict:
        segment = segment_dict[key]
        if segment[2] != '':
            filtered_segments.append((int(segment[0][2:]), int(segment[1][2:]), segment[2]))
        
    return filtered_segments

### Filtering and Splitting Sign Decorators for Gloss Sequences and Sentences

Removing the sign decorators or splitting them into their own tokens. Sentences will also be processed to be more manageable.

In [5]:
def filter_dataset(dataset, remove=True):

    filtered_dataset = []

    for line in dataset:
        sentence, sequence = split_line(line)

        sentence = clean_sentence(sentence)

        sequence = clean_sequence(sequence, remove)

        if remove:
            sequence = clean_sequence_removal(sequence)

        if len(sentence) == 0 or len(sequence) == 0:
            continue

        if count_words(sentence) > 25 or count_words(sequence) > 25:
            continue

        if abs(count_words(sentence) - count_words(sequence)) > 7:
            continue
    
        filtered_dataset.append(f'{sentence}\t{sequence}\n')

    return filtered_dataset

def clean_sentence(sentence):

    sentence = sentence.replace("'s", " is")
    sentence = sentence.replace("n't", " not")
    sentence = sentence.replace("'d", " had")
    sentence = sentence.replace("'ll", " will")
    sentence = sentence.replace("'m", " am")
    sentence = sentence.replace("'ve", " have")
    sentence = sentence.replace("'re", " are")
    sentence = sentence.replace("ahh", "")
    sentence = sentence.replace("umm", "")
    sentence = sentence.replace("?", " ?")

    cleaned_sentence = ""
    
    for char in sentence:
        if char.isalpha() or char == "'" or char == "?":
            cleaned_sentence += char
        else:
            cleaned_sentence += " "
    
    cleaned_sentence = cleaned_sentence.lower()
    cleaned_sentence = re.sub(' +', ' ', cleaned_sentence)
    
    return cleaned_sentence.strip()

def clean_sequence(sequence, remove):
    sequence = sequence.replace("FALSE-START", "")
    sequence = sequence.replace("FALSE START", "")
    sequence = sequence.replace("?", " ?")

    glosses = sequence.strip().replace("-"," ").replace("):"," ").split(" ")
    cleaned_glosses = []
        
    for gloss in glosses:
        if gloss[:3] == "PT:" or gloss[:5] == "LOOK(" or gloss[:7] == "PTBUOY:":
            continue
        
        gloss = get_gloss(gloss, remove)
        
        if gloss in ["THE", "IN", "WELL", "", "AHH", "UMM", "FSL", "PTBUOY", "HMM", "ERR", "PHOOEY", "INDETERMINATE", "INDECIPHERABLE"]:
            continue

        if gloss[-1].isdigit():
            gloss = gloss[:-1]

        if len(gloss) < 2:
            continue
                        
        cleaned_glosses.append(gloss)

    unduped_glosses = []
    
    for i in range(0, len(cleaned_glosses)):
        if (i == 0 or cleaned_glosses[i] != cleaned_glosses[i - 1]) and (i < 2 or cleaned_glosses[i] != cleaned_glosses[i - 2]):
            unduped_glosses.append(cleaned_glosses[i].replace("-", " "))
        
    return (" ".join(unduped_glosses)).upper()

def clean_sequence_removal(sequence):
    
    glosses = sequence.split(" ")

    filtered_glosses = []

    for gloss in glosses:
        if gloss in ['THE', 'DS', 'DSM', 'DSL']:
            continue
        filtered_glosses.append(gloss)
    
    return (" ".join(filtered_glosses)).upper()
            

def get_gloss(gloss, remove):
    
    removed_decorator_start = ""
    removed_decorator_end = ""

    if gloss[:8] == "FBUOY:DS":
        split = gloss.split(':')
        if len(split) > 2:
            gloss = gloss.split(':')[2]
            removed_decorator_start = "FBUOY "
    elif gloss[:6] == "FBUOY:":
        split = gloss.split(':')
        if len(split) > 1:
            gloss = gloss.split(':')[1]
            removed_decorator_start = "FBUOY "
    elif gloss[:6] == "FUBOY:":
        split = gloss.split(':')
        if len(split) > 1:
            gloss = gloss.split(':')[1]
            removed_decorator_start = "FBUOY "
    elif gloss[:6] == "FBOUY:":
        split = gloss.split(':')
        if len(split) > 1:
            gloss = gloss.split(':')[1]
            removed_decorator_start = "FBUOY "
    elif gloss[:6] == "GICA):":
        split = gloss.split(':')
        if len(split) > 1:
            gloss = gloss.split(':')[1]
    elif gloss[:6] == "TBUOY:":
        split = gloss.split(':')
        if len(split) > 1:
            gloss = gloss.split(':')[1]
            removed_decorator_start = "FBUOY "
    elif gloss[:3] == "FS:":
        gloss = gloss[3:]
        removed_decorator_start = "FS "
    elif gloss[:3] == "FB:":
        gloss = gloss[3:]
        removed_decorator_start = "FB "
    elif gloss[:2] == "DS":
        split = gloss.split(':')
        if len(split) > 1:
            gloss = gloss.split(':')[1]
            removed_decorator_start = "DS "
    elif gloss[:2] == "G(":
        split = gloss.split(':')
        if len(split) > 1:
            gloss = gloss.split(':')[1]
    elif gloss[:3] == "CA:":
        split = gloss.split(':')
        if len(split) > 1:
            gloss = gloss.split(':')[1]
    elif gloss[:3] == "GA:":
        split = gloss.split(':')
        if len(split) > 1:
            gloss = gloss.split(':')[1]
    elif gloss[:2] == "G:":
        split = gloss.split(':')
        if len(split) > 1:
            gloss = gloss.split(':')[1]
    elif gloss[:2] == "M:":
        split = gloss.split(':')
        if len(split) > 1:
            gloss = gloss.split(':')[1]
    elif gloss[:7] == "FINISH.":
        gloss = "FINISH"

    index = gloss.find("-2H")
    if index != -1:
        gloss = gloss[:index]
        removed_decorator_start = " 2H"
    index = gloss.find("-1H")
    if index != -1:
        gloss = gloss[:index]
        removed_decorator_start = " 1H"
    index = gloss.find("2-H")
    if index != -1:
        gloss = gloss[:index]
        removed_decorator_start = " 2H"
    index = gloss.find("1-H")
    if index != -1:
        gloss = gloss[:index]
        removed_decorator_start = " 1H"

    index = gloss.find("(")
    if index != -1:
        gloss = gloss[:index]
    
    if remove:
        return gloss
    return f'{removed_decorator_start}{gloss}{removed_decorator_end}'

### Generalisation of Glosses with Synonyms

Different sign glosses have been used for signs that represent the same thing. This is an error with the dataset which has been attempted to be corrected.

In [6]:
def get_synonyms():
    synonym_filename = 'synonyms.txt'

    file = open(synonym_filename, 'r')
    lines = file.readlines()

    synonyms = {}

    i = 0
    while i < len(lines):
        gloss = lines[i].strip()

        i += 1

        alternatives = lines[i].strip()
        alternatives = alternatives[2:len(alternatives) - 2]
        alternatives = alternatives.split("', '")

        synonyms[gloss] = alternatives

        i += 1

    return synonyms

def count_frequencies(dataset):

    gloss_dict = {}

    for line in dataset:
        parts = line.split("\t")
            
        for gloss in parts[1].split(" "):
            gloss = gloss.strip('.,!:;"\n').upper()
            if gloss in gloss_dict:
                gloss_dict[gloss] += 1
            else:
                gloss_dict[gloss] = 1

    return gloss_dict

def clean_sentence_duplicates(sentence):
    sentence = sentence.replace("watch ", "look ")
    sentence = sentence.replace(" watch", " look")

    sentence = sentence.replace("turtle ", "tortoise ")
    sentence = sentence.replace(" turtle", " tortoise")

    sentence = sentence.replace("hare ", "rabbit ")
    sentence = sentence.replace(" hare", " rabbit")
    return sentence

def clean_sequence_duplicates(sequence):
    sequence = sequence.replace("TORTOISE TORTOISE", "TORTOISE")
    sequence = sequence.replace("RABBIT RABBIT", "RABBIT")
    return sequence

def generalise_synonyms(dataset):

    synonyms = get_synonyms()

    gloss_dict = count_frequencies(dataset)

    converted_dataset = []

    for line in dataset:
        sentence, sequence = split_line(line)

        converted_sequence = []

        for gloss in sequence.split(" "):
            if gloss == "INDETERMINATE":
                continue
        
            most_common_freq = gloss_dict[gloss]
            most_common = gloss

            if gloss in synonyms:
                for alt in synonyms[gloss]:
                    if alt.upper() in gloss_dict and gloss_dict[alt.upper()] > most_common_freq:
                        most_common_freq = gloss_dict[alt.upper()]
                        most_common = alt.upper()
            
            converted_sequence.append(most_common)

        sentence = clean_sentence_duplicates(sentence)
        sequence = clean_sequence_duplicates(sequence)
        
        converted_dataset.append(f'{sentence}\t{join(converted_sequence)}')
    
    return converted_dataset

In [7]:

    
dataset = []
generalise_synonyms(dataset)

[]

### Free Dataset Parsing and Filtering

In [8]:
free_dataset = create_dataset(directory, 'FreeTransl')
display('Raw Free Translation', free_dataset)

Dataset: Raw Free Translation

Sign Glosses:   G:ROLLS DOG1 G:OH
English Text:   The dog was a bit stunned.

Sign Glosses:   WITH1 DOG1 DS(1):THE-DEER-RUNS DS(1):ANIMAL-RUNS G(HOLD-PAUSE):UMM
English Text:   The dog ran along beside them.

Sign Glosses:   CA:TURTLE SHOW1-2H
English Text:   "Ahh...see, see".

Sign Glosses:   PASS FINISH.GOOD-2H G(CA):
English Text:   "Has he already passed me?"

Sign Glosses:   STORY BOY PT:LOC
English Text:   This story is about the boy.



In [9]:
removed_free_dataset = filter_dataset(free_dataset, remove=True)
display('Raw Free Translation', removed_free_dataset)

Dataset: Raw Free Translation

Sign Glosses:   ROLLS DOG OH
English Text:   the dog was a bit stunned

Sign Glosses:   WITH DOG DEER RUNS ANIMAL RUNS PAUSE
English Text:   the dog ran along beside them

Sign Glosses:   TURTLE SHOW 2H
English Text:   ahh see see

Sign Glosses:   PASS FINISH 2H
English Text:   has he already passed me ?

Sign Glosses:   STORY BOY
English Text:   this story is about the boy



In [17]:
retained_free_dataset = filter_dataset(free_dataset, remove=False)
display('Raw Free Translation', retained_free_dataset)

Dataset: Raw Free Translation

Sign Glosses:   ROLLS DOG OH
English Text:   the dog was a bit stunned

Sign Glosses:   WITH DOG DS DEER RUNS DS ANIMAL RUNS PAUSE
English Text:   the dog ran along beside them

Sign Glosses:   TURTLE SHOW 2H
English Text:   ahh see see

Sign Glosses:   PASS FINISH 2H
English Text:   has he already passed me ?

Sign Glosses:   STORY BOY
English Text:   this story is about the boy



In [11]:
random.seed(10)

### Literal Dataset Parsing and Filtering

In [12]:
lit_dataset = create_dataset(directory, 'LitTransl')
display('Raw Literal Translation', lit_dataset)

Dataset: Raw Literal Translation

Sign Glosses:   RIGHT THINK
English Text:   (He) thought

Sign Glosses:   LOOK SMELL1 PUSH TREE2 PUSH COINCIDENCE DSM(BC):SPHERICAL-BEEHIVE-FALLS DSG(BENT5):VERTICAL-TREE-TRUNK? SPILL
English Text:   Then, (the hive) fell breaking.

Sign Glosses:   DS(B):RABBIT-RUNNING
English Text:   (He) scampered.

Sign Glosses:   BIRD PT:LOC FLY1 PT: G(CA): WALK-1H
English Text:   (He) walked.

Sign Glosses:   G(CA):HUMAN-RESTS-HIS-HAND-ON-THE-BRANCH LOOK WHERE-1H FROG G(CA):HUMAN-RESTS-HIS-HAND-ON-THE-BRANCH SHOCK SURPRISED PT:PRO3SG MOVE
English Text:   It moved.



In [13]:
removed_lit_dataset = filter_dataset(lit_dataset, remove=True)
display('Removed Literal Translation', removed_lit_dataset)

Dataset: Removed Literal Translation

Sign Glosses:   RIGHT THINK
English Text:   he thought

Sign Glosses:   RABBIT RUNNING
English Text:   he scampered

Sign Glosses:   BIRD FLY WALK 1H
English Text:   he walked

Sign Glosses:   DSS SPHERICAL JAR ANIMAL WALKS BACKWARD
English Text:   the frog walks back

Sign Glosses:   FETCH TEN BOWLING BALL
English Text:   he gets a ten pin bowling ball



In [14]:
generalised_lit_dataset = generalise_synonyms(removed_lit_dataset)
display('Generalised Literal Translation', generalised_lit_dataset)

Dataset: Generalised Literal Translation

Sign Glosses:   RIGHT THINK
English Text:   he thought

Sign Glosses:   RABBIT RUN
English Text:   he scampered

Sign Glosses:   BIRD FLY WALK 1H
English Text:   he walked

Sign Glosses:   DSS SPHERICAL JAR ANIMAL WALKS BACKWARD
English Text:   the frog walks back

Sign Glosses:   TAKE TEN BOWLING BALL
English Text:   he gets a ten pin bowling ball

