In [None]:
import pympi
import os
import math

In [None]:
def get_token(token):
    """Strip annotation prefixes and suffixes from a gloss token.

    The cleanup runs in three stages:

    1. Prefix removal. A ``"FBUOY:DS"`` prefix keeps the third
       colon-separated field; the buoy-style prefixes (including the
       misspelt variants found in the data) keep the second field. These
       checks run one after another, so each sees the result of the previous
       one. Afterwards at most one of the remaining prefixes is handled:
       ``"FS:"``/``"FB:"`` are cut off, the others keep the second
       colon-separated field when there is one.
    2. Handedness markers: everything from the first ``"-2H"``, ``"-1H"``,
       ``"2-H"`` or ``"1-H"`` onward is dropped (checked in that order).
    3. Everything from the first ``"("`` onward is dropped.

    Args:
        token: The raw annotation string.

    Returns:
        The cleaned token, which may be an empty string.
    """

    def pick_field(text, position):
        # Return the colon-separated field at ``position``; leave the text
        # untouched when it has too few fields.
        fields = text.split(':')
        return fields[position] if len(fields) > position else text

    # Stage 1a: sequential checks, each applied to the current token.
    if token.startswith("FBUOY:DS"):
        token = pick_field(token, 2)
    for buoy_prefix in ("FBUOY:", "FUBOY:", "FBOUY:", "GICA):"):
        if token.startswith(buoy_prefix):
            token = pick_field(token, 1)

    # Stage 1b: only the first matching rule is applied.
    if token.startswith("TBUOY:"):
        token = pick_field(token, 1)
    elif token.startswith(("FS:", "FB:")):
        token = token[3:]
    elif token.startswith(("DS", "G(", "CA:", "GA:", "G:", "M:")):
        token = pick_field(token, 1)

    # Stages 2 and 3: cut the token at each marker in turn. ``partition``
    # returns the whole string as its first item when the marker is absent.
    for marker in ("-2H", "-1H", "2-H", "1-H", "("):
        token = token.partition(marker)[0]

    return token

In [None]:
def create_dataset(directory, dataset_name):
    """Build the dataset file from the ELAN files found in a directory.

    Every entry of ``directory`` whose name contains ``'.eaf'`` is handed to
    ``append_dataset``, which appends its lines to the output file.

    Args:
        directory: Folder to scan for input files.
        dataset_name: Path of the output file; it is created or truncated.
    """
    # The context manager closes (and flushes) the output even when one of
    # the input files makes append_dataset raise.
    with open(dataset_name, 'w') as data_file:
        for filename in os.listdir(directory):
            # NOTE(review): this substring test also accepts names such as
            # 'x.eaf.bak'; confirm whether only a '.eaf' suffix is intended.
            if '.eaf' in filename:
                append_dataset(directory, filename, data_file)

def append_dataset(directory, eaf_file, data_file):
    """Append the sentence/gloss pairs of one ELAN file to the dataset.

    Files without a translation tier (see ``has_translation``) are skipped
    silently. For the others the file name is printed, then one line
    ``"<sentence>\\t<tokens>\\n"`` is written per sentence. Pairs whose token
    string is more than 25 characters longer than the sentence are printed
    instead of being written.

    Args:
        directory: Folder containing the file.
        eaf_file: File name inside ``directory``.
        data_file: Writable text file object receiving the lines.
    """
    # os.path.join keeps absolute directories working; the former
    # './{directory}/{eaf_file}' form only resolved relative ones correctly.
    eaf_object = pympi.Elan.Eaf(os.path.join(directory, eaf_file))

    if not has_translation(eaf_object):
        return

    print(eaf_file)

    sentences, token_groups = parse_eaf(eaf_object)
    token_lines = filter_tokens(token_groups)

    for sentence, token_line in zip(sentences, token_lines):
        # Both operands are strings here, so this compares character counts.
        if len(token_line) - len(sentence) > 25:
            print(f'{sentence}\t{token_line}\n')
            continue
        data_file.write(f'{sentence}\t{token_line}\n')

def filter_tokens(tokens):
    """Clean every token sequence and join it into one string.

    For each sequence, tokens starting with ``"PT:"``, ``"LOOK("`` or
    ``"PTBUOY:"`` are dropped. The rest are normalised with ``get_token``;
    filler results (``"WELL"``, ``""``, ``"AHH"``, ``"UMM"``, ``"FSL"``,
    ``"PTBUOY"``) are dropped, one trailing digit is removed, and anything
    shorter than two characters is discarded. A token is then removed when
    it equals either of the two tokens preceding it in the cleaned sequence,
    and the survivors are joined with ``join_tokens``.

    Args:
        tokens: Iterable of token sequences (each a sequence of strings).

    Returns:
        A list with one joined string per input sequence.
    """
    ignored_prefixes = ("PT:", "LOOK(", "PTBUOY:")
    ignored_glosses = ("WELL", "", "AHH", "UMM", "FSL", "PTBUOY")

    cleaned_sequences = []
    for raw_sequence in tokens:
        cleaned = []
        for raw_token in raw_sequence:
            if raw_token.startswith(ignored_prefixes):
                continue

            gloss = get_token(raw_token)
            if gloss in ignored_glosses:
                continue

            # The empty string was rejected above, so indexing is safe.
            if gloss[-1].isdigit():
                gloss = gloss[:-1]

            if len(gloss) >= 2:
                cleaned.append(gloss)
        cleaned_sequences.append(cleaned)

    joined_sequences = []
    for cleaned in cleaned_sequences:
        # Repeats are detected against the cleaned sequence itself, not
        # against the tokens kept so far.
        kept = [
            gloss
            for position, gloss in enumerate(cleaned)
            if not (position >= 1 and gloss == cleaned[position - 1])
            and not (position >= 2 and gloss == cleaned[position - 2])
        ]
        joined_sequences.append(join_tokens(kept))

    return joined_sequences

def has_translation(eaf_object):
    """Return True when ``eaf_object.tiers`` has a non-None 'FreeTransl' entry."""
    translation_tier = eaf_object.tiers.get('FreeTransl')
    if translation_tier is None:
        return False
    return True

def parse_eaf(eaf_object):
    """Pair each translated sentence with the glosses that belong to it.

    Segments are read with ``filter_tier`` from the 'FreeTransl',
    'LH-IDgloss' and 'RH-IDgloss' tiers. Walking the sentences in order,
    each one takes the not-yet-consumed left- and right-hand segments whose
    end value is at most the sentence's end value plus 2. Each group is then
    sorted and consecutive segments carrying the same text are collapsed.

    Args:
        eaf_object: Object accepted by ``filter_tier``.

    Returns:
        A ``(sentences, glosses)`` pair: the list of sentence texts and a
        parallel list holding one list of gloss texts per sentence.
    """
    sentence_segments = filter_tier(eaf_object, 'FreeTransl')
    left_hand = filter_tier(eaf_object, 'LH-IDgloss')
    right_hand = filter_tier(eaf_object, 'RH-IDgloss')

    def consume(segments, cursor, cutoff, into):
        # Move ``cursor`` forward over the segments ending at or before
        # ``cutoff``, collecting them, and return the new position.
        while cursor < len(segments) and segments[cursor][1] <= cutoff:
            into.append(segments[cursor])
            cursor += 1
        return cursor

    left_cursor = right_cursor = 0
    grouped_segments = []
    for sentence_segment in sentence_segments:
        cutoff = sentence_segment[1] + 2
        group = []
        left_cursor = consume(left_hand, left_cursor, cutoff, group)
        right_cursor = consume(right_hand, right_cursor, cutoff, group)
        grouped_segments.append(group)

    gloss_groups = []
    for group in grouped_segments:
        ordered = sorted(group)
        # Keep a gloss only when it differs from its sorted predecessor.
        gloss_groups.append([
            segment[2]
            for position, segment in enumerate(ordered)
            if position == 0 or segment[2] != ordered[position - 1][2]
        ])

    sentences = [sentence_segment[2] for sentence_segment in sentence_segments]

    return sentences, gloss_groups

def filter_tier(eaf_object, tier_name):
    """Return the non-empty segments of one tier as ``(start, end, text)``.

    The segments are the values of ``eaf_object.tiers[tier_name][0]``.
    Segments whose text (third field) is the empty string are skipped. The
    first two fields have their leading two characters removed and the rest
    parsed as ``int`` (presumably identifiers such as ``"ts12"`` -- confirm
    against the input files).

    Args:
        eaf_object: Object exposing a ``tiers`` mapping.
        tier_name: Key of the tier to read.

    Returns:
        A list of ``(int, int, str)`` tuples in the mapping's iteration order.
    """
    annotations = eaf_object.tiers[tier_name][0]
    return [
        (int(annotation[0][2:]), int(annotation[1][2:]), annotation[2])
        for annotation in annotations.values()
        if annotation[2] != ''
    ]
                        
def join_tokens(tokens):
    """Concatenate the given strings, separated by single spaces."""
    separator = " "
    return separator.join(tokens)

In [None]:
# Folder scanned for input files; passed to create_dataset below.
directory = 'EAF'
# Path of the dataset file that create_dataset writes.
dataset_name = 'elan_dataset.txt'

In [None]:
# Build the dataset: scan `directory` and write the result to `dataset_name`.
create_dataset(directory, dataset_name)

In [None]:
# Ad-hoc inspection: load a single file and extract the non-empty segments of
# its 'RH-IDgloss' tier.
# NOTE(review): despite its name, `filtered_sentences` holds that tier's
# segments here, not the 'FreeTransl' sentences.
eaf_object = pympi.Elan.Eaf(f'./{directory}/AAPB1c2b.eaf')
filtered_sentences = filter_tier(eaf_object, 'RH-IDgloss')

In [None]:
# Bare expression: presumably displays the extracted segments as the
# notebook cell's output.
filtered_sentences