# Convert DanPASS TextGrids to a useful format

Note: There is probably an easier way to extract text from TextGrid files. If you know of one, do let me know.

In [None]:
import textgrid as tg
from os.path import join, abspath
import re
import glob
import pandas as pd
import ntpath

In [None]:
# Root directory of the corpus; the empty string makes the paths below
# relative to the current working directory.
data_path = ""

# Output directory as an absolute path. abspath() already returns an absolute
# path, so joining it onto data_path again would not change the result.
save_path = abspath(join(data_path, "../../preprocessed/danpass/"))

In [None]:
# One dialogue file, used further down to try the extraction on a single TextGrid.
test_grid_path = join(data_path, "Corpus_2014_dialogues/d_001_1_f_non-v.TextGrid")

In [None]:
# glob gives no guarantee about the order of its results (it depends on the
# file system), so the lists are sorted to keep the row order of the combined
# tables reproducible between runs and machines.
dialogue_paths = sorted(glob.glob(join(data_path, "Corpus_2014_dialogues/*.TextGrid")))
monologue_paths = sorted(glob.glob(join(data_path, "Corpus_2014_monologues/*.TextGrid")))

In [None]:
def path_leaf(path):
    """
    Extracts the filename from a path string

    ntpath is used for the split, so both forward and backward slashes are
    treated as separators regardless of the host operating system. When the
    path ends with a separator, the last component before it is returned
    instead of an empty string.

    """
    head, tail = ntpath.split(path)
    # tail is empty for paths such as "a/b/c/"; fall back to the last
    # component of head in that case.
    return tail or ntpath.basename(head)

In [None]:
def extract_text_from_textgrid(path, conv_type = "dialogue"):
    """
    Extracts the tokens of the "ortografi" tier of a TextGrid file

    Each interval mark of the tier has the characters ",", "+" and "=" removed
    and is then split on "_"; empty pieces are dropped and every remaining
    piece becomes one row.

    Parameters:
        path: path of the TextGrid file to read.
        conv_type: label stored unchanged in the "Type" column of every row.

    Returns:
        A pandas DataFrame with one row per token and the columns
        "Interval" (0-based position of the interval within the tier),
        "Token", "File" (file name part of path), "Type", "Start Time" and
        "End Time" (minTime / maxTime of the interval as floats).

    Raises:
        IndexError: if the file contains no tier named "ortografi".
    """
    tgrid = tg.TextGrid()
    tgrid.read(f = path)
    orto_tiers = tgrid.getList("ortografi")

    # Indexing an empty list would only report "list index out of range";
    # keep the exception type but say which file is affected.
    if not orto_tiers:
        raise IndexError('No tier named "ortografi" found in {}'.format(path))

    if len(orto_tiers) > 1:
        print("Length or ortografi was larger than one. Extracting first element and ignoring rest.")

    orto = orto_tiers[0]

    words = []  # [interval index, token] pairs
    times = []  # [interval index, start, end], one entry per interval
    for interval_idx, interval in enumerate(orto):
        times.append([interval_idx, float(interval.minTime), float(interval.maxTime)])
        cleaned_mark = re.sub('[,+=]', '', interval.mark)
        for token in cleaned_mark.split("_"):
            if token != "":
                words.append([interval_idx, token])

    speech_turn_df = pd.DataFrame.from_records(words, columns=['Interval', 'Token'])
    speech_turn_df["File"] = path_leaf(path)
    speech_turn_df["Type"] = conv_type

    times_df = pd.DataFrame.from_records(times, columns=['Interval', 'Start Time', 'End Time'])

    # Left merge on the interval index: every token row receives the times of
    # its interval; intervals that produced no token do not appear.
    result_df = pd.merge(speech_turn_df, times_df, on = 'Interval', how = 'left')

    return result_df


In [None]:
# Preview the first 20 token rows extracted from the single test file.
extract_text_from_textgrid(test_grid_path, conv_type="dialogue").head(n=20)

In [None]:
# Extract every dialogue file and stack the per-file tables into one frame.
all_dialogues = pd.concat(
    objs=[
        extract_text_from_textgrid(grid_path, conv_type="dialogue")
        for grid_path in dialogue_paths
    ]
)

In [None]:
# Extract every monologue file and stack the per-file tables into one frame.
all_monologues = pd.concat(
    objs=[
        extract_text_from_textgrid(grid_path, conv_type="monologue")
        for grid_path in monologue_paths
    ]
)

In [None]:
# Show the first five rows of the combined dialogue table.
all_dialogues.head(n=5)

In [None]:
# Display the combined monologue table.
all_monologues

In [None]:
# Write both token tables as CSV files into save_path.
all_dialogues.to_csv(path_or_buf=join(save_path, "dialogue_tokens.csv"))
all_monologues.to_csv(path_or_buf=join(save_path, "monologue_tokens.csv"))