In [108]:
import os
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification
from copy import deepcopy

In [44]:
def load_data():
    COLUMBIA_PATH = "../data/ColumbiaGamesCorpus/"
    social_variables = pd.read_csv(
        os.path.join(COLUMBIA_PATH,"mturk/MTurkData.csv")
    )
    words = {}
    turns = {}
    for n_sess in range(1,13):
        words[n_sess] = {}
        turns[n_sess] = {}
        df_tasks = pd.read_csv(
            os.path.join(COLUMBIA_PATH,f"data/session_{n_sess:02}/s{n_sess:02}.objects.1.tasks"),
            sep=" ",
            header=None,
            names=["start_time","end_time","labels"]
        )
        df_tasks = df_tasks[df_tasks["labels"].str.contains("Images")].reset_index(drop=True)
        df_tasks.index += 1
        for n_task in range(1,15):
            words[n_sess][n_task] = {}
            turns[n_sess][n_task] = {}
            start_time = df_tasks.loc[n_task,"start_time"]
            end_time = df_tasks.loc[n_task,"end_time"]
            for speaker in ["A","B"]:
                w = pd.read_csv(
                    os.path.join(COLUMBIA_PATH,f"data/session_{n_sess:02}/s{n_sess:02}.objects.1.{speaker}.words"),
                    sep=" ",
                    header=None,
                    names=["start_time","end_time",f"word_{speaker}"]
                )
                w = w[(w["start_time"] >= start_time) & (w["end_time"] <= end_time)].reset_index(drop=True)
                words[n_sess][n_task][speaker] = w.loc[:,["start_time","end_time",f"word_{speaker}"]]
                
                t = pd.read_csv(
                    os.path.join(COLUMBIA_PATH,f"data/session_{n_sess:02}/s{n_sess:02}.objects.1.{speaker}.turns"),
                    sep=" ",
                    header=None,
                    names=["start_time","end_time",f"turn_{speaker}"]
                )
                t = t[(t["start_time"] >= start_time) & (t["end_time"] <= end_time)].reset_index(drop=True)
                turns[n_sess][n_task][speaker] = t.loc[:,["start_time","end_time",f"turn_{speaker}"]]
                
    social_variables = social_variables.set_index("hitid")
    return words, turns, social_variables


def print_aligned(df,width=80):
    n_batches = int(np.ceil(len(df) / width))
    
    def align_words(words):
        lens = [len(w) for w in words]
        max_len_word = np.argmax(lens)
        aligned_words = [w + " " * (len(words[max_len_word])-len(w)) for w in words]
        return aligned_words
    
    aligned_columns = align_words(df.columns)
    for i in range(n_batches):
        aligned_rows = [(column, []) for column in aligned_columns]
        for j in range(i*width,(i+1)*width):
            if j >= len(df):
                break
            aligned_words = align_words(df.loc[j,:].values)
            for r, w in zip(aligned_rows,aligned_words):
                r[1].append(w)
        for c, row in aligned_rows:
            print(f"{c}: {' '.join(row)}")
        print()

def merge_turns(words):
    dfs = {}
    for n_sess in range(1,13):
        dfs[n_sess] = {}
        for n_task in range(1,15):
            df_A = words[n_sess][n_task]["A"].rename(columns={"word_A": "word"})
            df_A["speaker"] = "A"
            df_B = words[n_sess][n_task]["B"].rename(columns={"word_B": "word"})
            df_B["speaker"] = "B"
            df = pd.concat([df_A,df_B],ignore_index=True,axis=0)
            df = df.sort_values(by=["start_time","end_time"],ascending=True)
            df = df.reset_index(drop=True)
            utts = []
            utt = []
            is_talking = "None"
            for i, row in df.iterrows():
                if row["word"] != "#" and is_talking == "None":
                    utt.append(row["word"])
                    is_talking = row["speaker"]
                elif row["word"] != "#" and is_talking != "None":
                    if is_talking == row["speaker"]:
                        utt.append(row["word"])
                    else:
                        utts.append({"utt": utt, "speaker": is_talking})
                        utt = [row["word"]]
                    is_talking = row["speaker"]
                elif len(utt) > 0:
                    utts.append({"utt": utt, "speaker": is_talking})
                    utt = []
                    is_talking = "None"
                else:
                    pass
            dfs[n_sess][n_task] = {}
            dfs[n_sess][n_task]["df"] = pd.DataFrame.from_records(utts)
            dfs[n_sess][n_task]["df_orig"] = df
            dfs[n_sess][n_task]["sv"] = social_variables.loc[f"g{n_sess}t{n_task}",:]
    return dfs
    
words, turns, social_variables = load_data()
dfs = merge_turns(words)

In [45]:
for _, row in dfs[2][4]["df"].iterrows():
    print(row["utt"],row["speaker"])

['okay'] A
['so', 'the', 'nail'] A
['is', 'gonna', 'go', 'directly', 'on', 'top', 'of', 'the', 'lawnmower'] A
['and'] A
['if', 'you', 'look', 'at', 'the', 'nail', "there's"] A
['a', 'line'] A
['n-', 'kind', 'of', 'see', 'the', "nail's", 'pointed', 'and', 'then', "there's", 'a', 'line', 'that', 'kinda', 'cuts', 'across'] A
['mmhm'] B
['um'] A
['mmhm'] B
['that', 'looks', 'like', 'it', 'gets', 'lined', 'up', 'with', 'the', 'front', 'edge', 'of', 'the', 'lawnmower'] A
['oh', 'okay', 'I', 'see', 'what', "you're", 'saying'] B
['and'] A
['so'] B
["it's", 'probably', 'like', 'half', 'an', 'inch', 'maybe'] B
['no', 'no', 'no'] B
['mm'] A
['that', 'that', 'line', 'tha-'] B
['mmhm'] A
['near', 'the', 'point', 'of', 'the', 'nail', 'that', 'lines', 'up', 'with', 'the', 'front', 'edge', 'of', 'the', 'lawnmower'] B
['yeah'] A
['like', 'r-'] A
['like', 'right', 'on', 'top', 'of', 'it'] A
['maybe', 'slightly', 'in', 'front', 'of', 'it'] A
['just', 'slight', 'slight', 'slight'] A
['and', 'then', 'the',

In [114]:
print_aligned(dfs[1][2]["df_orig"].loc[:,["word","speaker"]],width=20)

word   : okay so uh the lawnmower # is right in between of # the two lions and the nail # so
speaker: A    A  A  A   A         A A  A     A  A       A  A A   A   A     A   A   A    A A 

word   : right in the # the middle of them there # to the left of the nail mmhm
speaker: A     A  A   A A   A      A  A    A     A A  A   A    A  A   A    B   



In [48]:
for n_sess in range(1,13):
    for n_task in range(1,15):
        s = sum(dfs[n_sess][n_task]["df"]["utt"].apply(lambda x: len(x) == 0))
        if s > 0:
            print(s)

In [113]:
def create_input(dfs,n_sess,n_task):
    speaker = dfs[n_sess][n_task]["df"].loc[0,"speaker"]
    turns = []
    turn = []
    for i, utt in dfs[n_sess][n_task]["df"].iterrows():
        if utt.speaker != speaker:
            turns.append(" ".join(turn))
            turn = utt.utt
        else:
            turn.extend(utt.utt)
        speaker = utt.speaker
    turns = " <TURN> ".join(turns)
    return turns

inp = create_input(dfs,n_sess=1,n_task=4)
inp

''

In [61]:
tokenizer = AutoTokenizer.from_pretrained("pzelasko/longformer-swda-nolower")
model = AutoModelForTokenClassification.from_pretrained("pzelasko/longformer-swda-nolower")

Downloading:   0%|          | 0.00/369 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/280 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.48k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/593M [00:00<?, ?B/s]

Some weights of the model checkpoint at pzelasko/longformer-swda-nolower were not used when initializing LongformerForTokenClassification: ['longformer.embeddings.position_ids']
- This IS expected if you are initializing LongformerForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [83]:
tokenizer

PreTrainedTokenizerFast(name_or_path='pzelasko/longformer-swda-nolower', vocab_size=50265, model_max_len=4096, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False), 'additional_special_tokens': ['<TURN>']})

In [95]:
SWDA_DIALOG_ACT_TO_TAG = {
    "Statement-non-opinion": "sd",
    "Acknowledge-Backchannel": "b",
    "Statement-opinion": "sv",
    "Agree-Accept": "aa",
    "Abandoned-or-Turn-Exit": "%",
    "Appreciation": "ba",
    "Yes-No-Question": "qy",
    "Non-verbal": "x",
    "Yes-answers": "ny",
    "Conventional-closing": "fc",
    "Uninterpretable": "%",
    "Wh-Question": "qw",
    "No-answers": "nn",
    "Response-Acknowledgement": "bk",
    "Hedge": "h",
    "Declarative-Yes-No-Question": "qy^d",
    # Replace: "Other": "fo_o_fw_by_bc" with the following as it appears like that in SWDA
    "Other": 'fo_o_fw_"_by_bc',
    "Backchannel-in-question-form": "bh",
    "Quotation": "^q",
    "Summarize/reformulate": "bf",
    "Affirmative-non-yes-answers": "na",
    "Action-directive": "ad",
    "Collaborative-Completion": "^2",
    "Repeat-phrase": "b^m",
    "Open-Question": "qo",
    "Rhetorical-Questions": "qh",
    "Hold-before-answer-agreement": "^h",
    "Reject": "ar",
    "Negative-non-no-answers": "ng",
    "Signal-non-understanding": "br",
    "Other-answers": "no",
    "Conventional-opening": "fp",
    "Or-Clause": "qrr",
    "Dispreferred-answers": "arp_nd",
    "3rd-party-talk": "t3",
    "Offers-Options-Commits": "oo_co_cc",
    "Self-talk": "t1",
    "Downplayer": "bd",
    "Maybe-Accept-part": "aap_am",
    "Tag-Question": "^g",
    "Declarative-Wh-Question": "qw^d",
    "Apology": "fa",
    "Thanking": "ft",
    "+": "+"
}

def predict_da(dfs,n_sess,n_task,id2label):
    inp = create_input(dfs,n_sess,n_task)
    words = tokenizer.tokenize(inp)
    words.insert(0,"<s>")
    words.insert(-1,"</s>")
    
    encoded_input = tokenizer(inp,return_tensors="pt")
    logits = model(**encoded_input).logits
    predictions = logits.argmax(dim=-1).cpu().detach().numpy()
    predictions_labels = [id2label[i] for i in predictions[0]]

    print_aligned(pd.DataFrame({"words": words, "da": predictions_labels}),width=10)
    
    
words, turns, social_variables = load_data()
dfs = merge_turns(words)

n_sess, n_task = 2, 4
id2label = {int(i): SWDA_DIALOG_ACT_TO_TAG[label] if label in SWDA_DIALOG_ACT_TO_TAG else label for i, label in model.config.id2label.items()}
# id2label = {int(i): label for i, label in model.config.id2label.items()}
predict_da(dfs,n_sess,n_task,id2label)

words: <s> ok              ay              Ġso Ġthe Ġnail Ġis Ġgonna Ġgo Ġdirectly
da   : sd  fo_o_fw_"_by_bc fo_o_fw_"_by_bc I-  I-   I-    I-  I-     I-  I-       

words: Ġon Ġtop Ġof Ġthe Ġlawn m  ower Ġand Ġif Ġyou
da   : I-  I-   I-  I-   sd    sd sd   sd   I-  I-  

words: Ġlook Ġat Ġthe Ġnail Ġthere 's Ġa Ġline Ġn - 
da   : I-    I-  I-   I-    I-     I- I- I-    I- sd

words: Ġkind Ġof Ġsee Ġthe Ġnail 's Ġpointed Ġand Ġthen Ġthere
da   : I-    I-  I-   I-   I-    I- sd       sd   I-    I-    

words: 's Ġa Ġline Ġthat Ġkinda Ġcuts Ġacross Ġ  <TURN> Ġmm
da   : I- I- I-    I-    I-     I-    I-      sd O      I- 

words: hm Ġ <TURN> Ġum Ġ <TURN> Ġmm hm Ġ <TURN>
da   : I- b O      %   % O      I-  b  b O     

words: Ġthat Ġlooks Ġlike Ġit Ġgets Ġlined Ġup Ġwith Ġthe Ġfront
da   : I-    I-     I-    I-  I-    I-     I-  I-    I-   I-    

words: Ġedge Ġof Ġthe Ġlawn m  ower Ġ  <TURN> Ġoh Ġokay
da   : I-    I-  I-   sd    sd sd   sd O      I-  b    

words: ĠI Ġsee Ġwhat Ġyou 're 

In [106]:
def predict_das(id2label):
    words, turns, social_variables = load_data()
    dfs = merge_turns(words)
    
    inps = []
    for n_sess in range(1,2):
        for n_task in range(1,3):
            inp = create_input(dfs,n_sess,n_task)
            print(inp)
            inps.append(inp)
    print(inps)
    encoded_input = tokenizer(inps,return_tensors="pt")
    logits = model(**encoded_input).logits
    predictions = logits.argmax(dim=-1).cpu().detach().numpy()
    preds = []
    i = 0
    for n_sess in range(1,2):
        for n_task in range(1,3):
            preds.append({
                "session": n_sess,
                "task": n_task,
                "da": " ".join([id2label[ii] for ii in predictions[i]]),
                "sv": social_variables[n_sess][n_task]
            })
            i += 1
    return pd.DataFrame.from_records(preds)

preds = predict_das(id2label)
preds

okay so the mime um is directly above the owl and to the left I mean t- excuse me to the right of the ear so right in the <TURN> and it's parallel with the ear <TURN> exactly

["okay so the mime um is directly above the owl and to the left I mean t- excuse me to the right of the ear so right in the <TURN> and it's parallel with the ear <TURN> exactly", '']


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).