# Preparing data for TensorFlow modeling

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import re
from scipy.stats import iqr
import utipy as ut

In [None]:
# Toggle: when False, rows whose Diagnosis is "Asperger" are dropped from the data
INCLUDE_ASD = False

In [None]:
# Directory prefix shared by the input and output CSV paths
dpath = ""

# Load the transcripts and number the rows so each observation gets a sequential ID
data = pd.read_csv(dpath+"All-Diagnoses-Adults-DK-Triangles.csv")
data["Observation ID"] = list(range(len(data)))

# Load the ID lookup table, keep only the key columns plus the ID itself,
# and rename the two columns whose spelling differs from the names used on `data`
unique_ids = pd.read_csv(dpath+"unique_IDs.csv")[
    ["File","Sub.File","Study","Diagnosis","Subject","unique_ID"]
].rename(columns={"Sub.File": "Sub File", "unique_ID": "Unique ID"})

In [None]:
# Drop the "Asperger" rows unless INCLUDE_ASD asks to keep them
data = data if INCLUDE_ASD else data[data["Diagnosis"] != "Asperger"]

In [None]:
# Preview the first rows of the transcript data
data.head()

In [None]:
# Preview the first rows of the ID lookup table
unique_ids.head()

In [None]:
# Attach the "Unique ID" column by matching on the shared key columns.
# A left join keeps every row of `data`, whether or not a match exists.
data = data.merge(
    unique_ids,
    how='left',
    on=["File","Sub File","Study","Diagnosis","Subject"],
)

In [None]:
# Subset the rows and columns relevant to the TensorFlow pipeline:
# only rows with Recovered == 0, and only the four columns used downstream.
# The explicit .copy() makes tfdata an independent frame, so columns that are
# overwritten/added on it later do not trigger pandas' SettingWithCopyWarning.
tfdata = data.loc[
    data.Recovered == 0,
    ["Unique ID", "Diagnosis", "Observation ID", "Transcript"],
].copy()

In [None]:
# Preview the first rows of the pipeline subset
tfdata.head()

In [None]:
#transcripts = tfdata["Transcript"]
# transcripts.to_csv(dpath + "transcripts_only.csv")

In [None]:
# Pick one raw transcript to eyeball before cleaning
# NOTE(review): [2] is an index-label lookup on a row-filtered frame, not a
# position — verify that label survives the filters above
test_string = tfdata["Transcript"][2]
test_string

In [None]:
def replace_slash(s):
    """Replace space-delimited slashes in `s` with space-delimited periods.

    A slash that sits next to a period (' . / ' or ' / . ') is merged into
    that period; every remaining ' / ' becomes ' . '. Slashes without
    surrounding spaces are left untouched.
    """
    # Order matters: the period-adjacent forms are handled before the bare slash
    for slash_form in (' . / ', ' / . ', ' / '):
        s = s.replace(slash_form, ' . ')
    return s

In [None]:
# Try replace_slash on one raw transcript
# NOTE(review): [2] is an index-label lookup on a row-filtered frame, not a
# position — verify that label survives the filters above
replace_slash(tfdata["Transcript"][2])

In [None]:
def remove_hyphen(s):
    """Remove hyphens that are followed by a space; keep hyphens inside words.

    A free-standing ' - ' collapses to a single space, and a hyphen at the
    end of a token ('- ') is replaced by a space. A hyphen directly followed
    by a non-space character (e.g. 'a-b') is preserved.
    """
    free_standing_removed = s.replace(' - ', ' ')
    trailing_removed = free_standing_removed.replace('- ', ' ')
    return trailing_removed
# Try remove_hyphen on a transcript plus a word with in-word hyphens (which must survive)
# NOTE(review): [103] is an index-label lookup on a row-filtered frame, not a
# position — verify that label survives the filters above
remove_hyphen(tfdata["Transcript"][103]+ " computer-mekanisk-agtigt.")

In [None]:
def enforce_special_danish_chars(s):
    """Convert the ASCII digraphs aa/ae/oe in `s` to the Danish letters å/æ/ø.

    Both the all-lowercase and the capitalised form ('Aa', 'Ae', 'Oe') of
    each digraph are converted. The replacement is purely textual and is
    applied everywhere the digraph occurs.
    """
    # Applied in this exact order, one full pass per digraph
    digraph_to_letter = (
        ('aa', 'å'),
        ('Aa', 'Å'),
        ('ae', 'æ'),
        ('Ae', 'Æ'),
        ('oe', 'ø'),
        ('Oe', 'Ø'),
    )
    for digraph, letter in digraph_to_letter:
        s = s.replace(digraph, letter)
    return s
# Try enforce_special_danish_chars on a transcript prefixed with a capitalised digraph
# NOTE(review): [201] is an index-label lookup on a row-filtered frame, not a
# position — verify that label survives the filters above
enforce_special_danish_chars("Oeh, "+tfdata["Transcript"][201])

In [None]:
def remove_apostrophe(s): # This happens automatically in keep_allowed_characters()
    """Strip quote-like characters from `s`.

    Removes the apostrophe, the backtick, the acute accent and the double
    quote; all other characters are left as they are.
    """
    for quote_char in ('\'', '`', '´', '\"'):
        s = s.replace(quote_char, '')
    return s
# Try remove_apostrophe on a sentence containing all four quote-like characters
remove_apostrophe("han 'havde' ti ting med `sig´ da han \"fik\" nok ")

In [None]:
def decrease_spaces(s):
    """Collapse every run of two or more whitespace characters into one space.

    A single whitespace character (of any kind) is left unchanged; leading
    and trailing whitespace is not stripped, only shortened.
    """
    whitespace_run = re.compile(r'\s\s+')
    return whitespace_run.sub(' ', s)
# Try decrease_spaces on a sentence with leading, repeated and trailing spaces
decrease_spaces("  , han havde     men så fik hun han ")

In [None]:
def space_punctuation(s):
    """Surround commas, periods, question marks and exclamation marks with spaces.

    Each of the four marks is padded with a space on both sides, after which
    the whitespace runs this creates are collapsed by decrease_spaces().
    """
    padded_marks = (
        (',', ' , '),
        ('.', ' . '),
        ('?', ' ? '),
        ('!', ' ! '),
    )
    for mark, padded in padded_marks:
        s = s.replace(mark, padded)
    return decrease_spaces(s)
# Try space_punctuation on a transcript plus text with unspaced and repeated punctuation
# NOTE(review): [199] is an index-label lookup on a row-filtered frame, not a
# position — verify that label survives the filters above
space_punctuation(tfdata["Transcript"][199] + "  ,men hvem? Ja, hvem kan! Nej det... må være nok!")

In [None]:
def remove_parantheses(s):
    """Remove parenthesised text, which was inserted by the transcriber.

    Double-parenthesised spans '((...))' are removed first, then single
    '(...)' spans. Stray ' )' and '( ' leftovers are replaced by a space,
    and the resulting whitespace runs are collapsed by decrease_spaces().
    """
    # The double form must go first so its outer parentheses are not left behind
    for annotation_pattern in (r'\(\([^)]*\)\)', r'\([^)]*\)'):
        s = re.sub(annotation_pattern, '', s)

    # Unmatched parentheses that still have a space on their open side
    for stray in (' )', '( '):
        s = s.replace(stray, ' ')

    return decrease_spaces(s)
# Try remove_parantheses on single, empty, double and unmatched parentheses
remove_parantheses("han (hende) var så() træls ((spiser)) men hun )")

In [None]:
def keep_allowed_characters(s):
    """Delete every character of `s` that is not on the allow-list.

    Allowed are the ASCII letters, the Danish letters æ/ø/å (both cases),
    the punctuation marks . , - ! ? and the space character. Everything
    else (digits, quotes, parentheses, other accented letters, newlines,
    ...) is removed.
    """
    # Raw string: in a normal literal '\-' is an invalid escape sequence, which
    # newer Python versions warn about. The pattern itself is unchanged.
    return re.sub(r'[^A-Za-zæøåÆØÅ.,\-!? ]', '', s)
# Try keep_allowed_characters on a transcript plus text with quotes and a disallowed letter
# NOTE(review): [199] is an index-label lookup on a row-filtered frame, not a
# position — verify that label survives the filters above
keep_allowed_characters(tfdata["Transcript"][199] + "  ,men 'hvem?'' `Ja´, hÄ´un-kønnet hvæm kan! Nej det... må være nok!")

In [None]:
# Show the same input string uncleaned, for comparison with the cleaned outputs
# NOTE(review): [199] is an index-label lookup on a row-filtered frame, not a
# position — verify that label survives the filters above
tfdata["Transcript"][199] + "  ,men 'hvem?'' `Ja´, hÄ´un-kønnet hvæm kan! Nej det... må være nok!"

In [None]:
def clean_text(s):
    """Run the full transcript-cleaning pipeline on `s` and return the result.

    The steps are applied in a fixed order: slashes to periods, hyphen
    removal, parenthesis removal, Danish digraph conversion, allow-list
    character filtering and finally spacing of punctuation.
    """
    cleaning_steps = (
        replace_slash,
        remove_hyphen,
        remove_parantheses,
        enforce_special_danish_chars,
        keep_allowed_characters,
        space_punctuation,
    )
    for step in cleaning_steps:
        s = step(s)
    return s

In [None]:
# Try the complete clean_text pipeline on the same example string
# NOTE(review): [199] is an index-label lookup on a row-filtered frame, not a
# position — verify that label survives the filters above
clean_text(tfdata["Transcript"][199] + "  ,men 'hvem?'' `Ja´, hÄ´un-kønnet hvæm kan! Nej det... må være nok!")

In [None]:
# Clean every transcript; each cell is passed through str() before cleaning
clean_transcripts = list(map(lambda raw: clean_text(str(raw)), tfdata["Transcript"]))

In [None]:
# clean_transcripts[2000:2100]

In [None]:
# Overwrite the raw transcripts with their cleaned versions
tfdata["Transcript"] = clean_transcripts

In [None]:
# Length of each transcript, in characters
tfdata["Num Chars"] = list(map(len, tfdata["Transcript"]))

In [None]:
# Renumber the rows 0..n-1 in place; the old index is discarded (drop=True)
tfdata.reset_index(inplace=True, drop=True)

In [None]:
# Display the cleaned table
tfdata

In [None]:
def describe_num_chars(nums):
    """Summarise a collection of character counts.

    Returns a dict with the keys "median", "mean", "std", "iqr", "min" and
    "max" (in that order). Median, mean and standard deviation come from
    NumPy, the interquartile range from scipy.stats.iqr, and min/max from
    the Python builtins.
    """
    summary = {}
    summary["median"] = np.median(nums)
    summary["mean"] = np.mean(nums)
    summary["std"] = np.std(nums)
    summary["iqr"] = iqr(nums)
    summary["min"] = min(nums)
    summary["max"] = max(nums)
    return summary

In [None]:
# Length statistics for all cleaned transcripts
describe_num_chars(tfdata["Num Chars"])

In [None]:
# Keep only transcripts longer than 40 characters (40 or fewer are dropped)
tfdataFiltered = tfdata.loc[tfdata['Num Chars'] > 40]

In [None]:
# Length statistics after removing the short transcripts
describe_num_chars(tfdataFiltered["Num Chars"])

In [None]:
def split_transcript(transcript, obs_id, size=210, stride=40):
    """Split a transcript into overlapping character windows.

    Parameters
    ----------
    transcript : str
        The text to split.
    obs_id
        Identifier that is attached to every returned split.
    size : int
        Window length in characters.
    stride : int
        Passed to ``ut.window`` as ``gap``.

    Returns
    -------
    list
        A list of ``[obs_id, text]`` pairs. A transcript shorter than
        `size` is returned whole as a single pair. Otherwise each window
        loses its last space-separated token, and every window but the
        first also loses its first token and is prefixed with ``"... "``
        (presumably because those edge tokens may be cut mid-word).
    """
    # Short transcripts need no windowing
    if len(transcript) < size:
        return [[obs_id, transcript]]

    # One array element per character, as input for the windowing helper
    transcript_chars = np.asarray(list(transcript))

    # Only the first element of ut.window's return value is used
    # NOTE(review): assumed to be the sequence of windows — confirm against utipy
    naive_splits = ut.window(transcript_chars, size=size, gap=stride,
                             rolling=True, discard_shorts=False)[0]

    def join_to_sentence(chars, is_first=False):
        """Join window characters to text and trim the edge tokens."""
        tokens = "".join(list(chars)).split(" ")
        if is_first:
            # The first window starts at the transcript start: keep its first token
            return " ".join(tokens[:-1])
        return "... " + " ".join(tokens[1:-1])

    first_transcript = join_to_sentence(naive_splits[0], is_first=True)
    rest = [join_to_sentence(s, is_first=False) for s in naive_splits[1:]]

    # Plain list concatenation keeps the texts as ordinary Python strings and
    # avoids combining a string list with a possibly empty list inside NumPy
    return [[obs_id, text] for text in [first_transcript] + rest]
    

In [None]:
# Small synthetic example with a tiny window (size=10, stride=3) to eyeball the splitting
test_string = "fheh sdf h jehj fdsjh  sjhs aasdh  askjhd asdhj asd"
split_transcript(test_string, 3, 10, 3)

In [None]:
# Inspect one specific observation (ID 56) and try splitting it
problem_row = tfdataFiltered[tfdataFiltered["Observation ID"] == 56]
problem_row
# NOTE(review): problem_row[...] are Series, not scalars, so split_transcript
# takes len() of the Series (its row count) rather than of the transcript
# text — confirm this is intended
split_transcript(problem_row["Transcript"], problem_row["Observation ID"])

In [None]:
# Split every filtered transcript; yields one list of [Observation ID, text] pairs per transcript
transcript_splits = [
    split_transcript(text, observation_id, size=210, stride=40)
    for observation_id, text in zip(
        tfdataFiltered["Observation ID"], tfdataFiltered["Transcript"]
    )
]

In [None]:
def flatten(l):
    """Flatten one level of nesting: return the items of every sub-iterable of `l` in order."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [None]:
# Merge the per-transcript lists into one flat list of [Observation ID, text] pairs
transcript_splits = flatten(transcript_splits)

In [None]:
# Total number of splits across all transcripts
len(transcript_splits)

In [None]:
# Peek at the first three splits
transcript_splits[:3]

In [None]:
# Turn the [Observation ID, text] pairs into a two-column DataFrame
transcript_splits_df = pd.DataFrame.from_records(transcript_splits, columns=["Observation ID", "Transcript Split"])

In [None]:
# Length of each split, in characters
transcript_splits_df["Num Chars Split"] = list(
    map(len, transcript_splits_df["Transcript Split"])
)

In [None]:
# Display the table of splits
transcript_splits_df

In [None]:
# Length of the longest transcript split, in characters
max(transcript_splits_df['Num Chars Split'])

In [None]:
# Keep only splits longer than 40 characters (40 or fewer are dropped)
transcript_splits_df_filtered = transcript_splits_df.loc[
    transcript_splits_df['Num Chars Split'] > 40
]

In [None]:
# Number of splits that remain after the length filter
len(transcript_splits_df_filtered)

In [None]:
# Attach the splits to their parent transcript rows via "Observation ID".
# Left join: every filtered transcript row is kept and repeated once per matching split.
tfdata_final = tfdataFiltered.merge(
    transcript_splits_df_filtered,
    how='left',
    on=["Observation ID"],
)

In [None]:
# Give every row of the final table a sequential ID (0..n-1)
tfdata_final["Split ID"] = list(range(len(tfdata_final)))

In [None]:
# Display the final table
tfdata_final

In [None]:
# Write the final table to disk
# NOTE(review): to_csv also writes the DataFrame index by default — confirm
# the extra unnamed column is wanted by the consumer of this file
tfdata_final.to_csv(dpath + "preprocessed_for_tf.csv")