### Quick Description

Clean the CORAAL transcripts into a more readable plain-text format, following the transcription conventions described in [CORAAL's User Guide](http://lingtools.uoregon.edu/coraal/userguide/CORAALUserGuide_current.pdf).

In [None]:
import os
import glob
import yaml
import re
import pandas as pd

from tqdm import tqdm

In [None]:
# Load the project's path configuration (a mapping from stage name to directory).
# The file is opened in a context manager so the handle is closed right after
# parsing instead of being left open for the lifetime of the kernel.
with open("/home/guilherme/Desktop/dissertation/config/filepaths.yaml") as config_file:
    filepaths = yaml.load(config_file, Loader=yaml.FullLoader)

---
### Data Cleansing

In [None]:
# Collect the bare file names (no directory part) of every entry in the
# preprocessed directory. os.path.basename is used instead of splitting on "/"
# so the result is correct regardless of the platform's path separator.
# NOTE(review): the names are listed from "02_coraal_preprocessed" but the
# processing loop reads them from "01_coraal_raw" -- confirm this is intended,
# since nothing would be processed when the preprocessed directory is empty.
filenames = [
    os.path.basename(path)
    for path in glob.glob(os.path.join(filepaths["02_coraal_preprocessed"], "*"))
]

#### Build the entire text

In [None]:
# Ordered (pattern, replacement) pairs applied to the joined transcript.
# Order matters: the specific /.../ markers must be rewritten before the
# generic slash removal, and punctuation is stripped last. Patterns are raw
# strings (no invalid-escape warnings) and compiled once, outside the loop.
_TRANSCRIPT_SUBSTITUTIONS = [
    # Remove line-level notes, e.g. (laughing)
    (re.compile(r"\(.*?\)"), ""),
    # Remove non-linguistic sounds, e.g. `<cough>`
    (re.compile(r"<.*?>"), ""),
    # Introduce the overlapping text to the regular text (drop the brackets)
    (re.compile(r"[\[\]]"), ""),
    # Replace redaction markers with real tokens. The address pattern accepts
    # both "ADDRESS" (the spelling in CORAAL's user guide) and "ADRESS" (the
    # spelling this notebook originally matched).
    (re.compile(r"/RD-ADD?RESS-[0-9]+/"), "3rd Ave"),
    (re.compile(r"/RD-NAME-[0-9]+/"), "Mary"),
    (re.compile(r"/RD-WORK-[0-9]+/"), "Google"),
    (re.compile(r"/RD-PLACE-[0-9]+/"), "Starbucks"),
    (re.compile(r"/RD-SCHOOL-[0-9]+/"), "Mission San José High School"),
    # Remove non-recognizable text
    (re.compile(r"/inaudible/"), ""),
    (re.compile(r"/unintelligible/"), ""),
    # Remove unknown terms, e.g. /??/
    (re.compile(r"/\?+/"), ""),
    # Introduce the remaining slash-delimited text as regular text
    (re.compile(r"/+"), ""),
    # Remove all punctuation
    (re.compile(r"[?!;\-.,]+"), ""),
]


def _clean_transcript(text):
    """Return *text* with the transcription markup removed or replaced.

    Applies every substitution in ``_TRANSCRIPT_SUBSTITUTIONS`` in order.
    Whitespace left behind by removed spans is not collapsed.
    """
    for pattern, replacement in _TRANSCRIPT_SUBSTITUTIONS:
        text = pattern.sub(replacement, text)
    return text


for filename in tqdm(filenames):
    df = pd.read_csv(os.path.join(filepaths["01_coraal_raw"], filename), sep="\t")

    # Get only the interviewed person's sentences: rows whose speaker id
    # contains "_int_" are dropped. Casting to str keeps a missing speaker
    # value from raising; such rows are kept.
    is_interviewer = df.Spkr.astype(str).str.contains("_int_", regex=False)
    utterances = df.loc[~is_interviewer, "Content"]

    # Skip empty (NaN) cells so the join below cannot fail on non-string values.
    s = _clean_transcript(" ".join(utterances.dropna().astype(str)))

    # One-row, one-column frame holding the whole cleaned transcript.
    df = pd.DataFrame([s], columns=["text"])

    df.to_csv(os.path.join(filepaths["02_coraal_preprocessed"], filename), index=False)