In [18]:
# Load data and Library

import os
import json
import numpy as np
from transformers import AutoTokenizer
import matplotlib.pyplot as plt
import torch

# Paths of the three DailyDialog JSON files under data/data_fold/data_0;
# index 0 is the train file, 1 the valid file and 2 the test file.
datas = [
    'data/data_fold/data_0/dailydialog_train.json',
    'data/data_fold/data_0/dailydialog_valid.json',
    'data/data_fold/data_0/dailydialog_test.json',
]

# Pretrained model name handed to AutoTokenizer.from_pretrained.
encoder_name = 'bert-base-cased'

In [10]:
# Load the first entry of `datas` (the train file) and build the notebook-level
# tokenizer used by the inspection cells below.
datafile = datas[0]
# `with` closes the handle even if json.load raises; JSON is read as UTF-8
# explicitly instead of relying on the platform default encoding.
with open(datafile, encoding="utf-8") as f:
    data = json.load(f)
tokenizer_ = AutoTokenizer.from_pretrained(encoder_name)

In [16]:
# Register the speaker and emotion markers as additional special tokens on the
# notebook-level tokenizer. load_utterance_with_context builds its own tokenizer
# via AutoTokenizer.from_pretrained, so this call does not affect that one.
tokenizer_.add_special_tokens({'additional_special_tokens': ['<speaker1>', '<speaker2>','<emotion>']})  # add the speaker tokens and the emotion token

0

In [112]:
def load_utterance_with_context(data_file, device, max_seq_len, encoder_name):
    """Tokenize every utterance of every dialogue together with its preceding context.

    Each utterance is encoded as a sentence pair: the utterance itself is the
    first segment, and the preceding utterances of the same dialogue (most
    recent first, joined with ``"[SEP]"``) are the second segment. Dialogues
    with fewer turns than the longest one are filled up with encodings of the
    empty string so that all dialogues can be stacked into one tensor.

    Args:
        data_file: Path of a JSON file mapping a dialogue id to a list whose
            first element is the list of turns; every turn is a dict with an
            ``"utterance"`` string.
        device: Unused. The tensors are returned without being moved; the
            original author disabled the ``.to(device)`` transfer because of a
            "CUDA error: initialization error". Kept so existing callers work.
        max_seq_len: Length passed to the tokenizer as ``max_length`` (with
            ``padding='max_length'`` and ``truncation=True``); also the word
            budget used when choosing how much context to keep.
        encoder_name: Name handed to ``AutoTokenizer.from_pretrained``.

    Returns:
        ``((input_ids, attention_mask, token_type_ids), max_doc_len, max_seq_len)``.
        Each tensor stacks, per dialogue, ``max_doc_len`` encoded rows, where
        ``max_doc_len`` is the largest number of turns found in any dialogue.
        ``max_seq_len`` is returned unchanged.
    """
    def make_context(utterance_list, start_t, end_t, max_seq_len):
        """Return the longest run of utterances before ``end_t`` that fits the budget.

        Starting at ``start_t``, the oldest utterance is dropped until the
        whitespace-separated word count of the joined context plus that of
        ``utterance_list[end_t]`` is at most ``max_seq_len``. Returns ``""``
        when nothing fits.
        """
        # NOTE(review): the budget is counted in whitespace-separated words, not
        # tokenizer tokens, and "[SEP]" is joined without surrounding spaces (so
        # the two words around a separator count as one). The tokenizer's own
        # truncation can therefore still cut the pair - confirm this is intended.
        target_words = len(utterance_list[end_t].split())
        for first_t in range(start_t, end_t + 1):
            context = "[SEP]".join(utterance_list[first_t:end_t][::-1])
            if len(context.split()) + target_words <= max_seq_len:
                return context
        return ""

    # `with` guarantees the handle is closed even if json.load raises.
    with open(data_file, encoding="utf-8") as f:
        data = json.load(f)

    tokenizer_ = AutoTokenizer.from_pretrained(encoder_name)
    encode_kwargs = dict(padding='max_length', max_length=max_seq_len, truncation=True, return_tensors="pt")

    max_doc_len = 0
    doc_utterance = []
    for content in data.values():
        turns = content[0]
        max_doc_len = max(len(turns), max_doc_len)
        single_utterances = [turn_data["utterance"] for turn_data in turns]

        doc_utterance.append([
            tokenizer_(
                single_utterances[end_t],
                make_context(utterance_list=single_utterances, start_t=0, end_t=end_t, max_seq_len=max_seq_len),
                **encode_kwargs
            )
            for end_t in range(len(single_utterances))
        ])

    # The filler encoding is identical for every dialogue, so build it once.
    padding_sequence = tokenizer_('', **encode_kwargs)

    fields = ('input_ids', 'attention_mask', 'token_type_ids')
    stacked = {field: [] for field in fields}
    for utterance_t in doc_utterance:
        # Fill short dialogues up to max_doc_len rows.
        padded = utterance_t + [padding_sequence] * (max_doc_len - len(utterance_t))
        for field in fields:
            stacked[field].append(torch.vstack([encoded[field] for encoded in padded]))

    out_utterance_input_ids, out_utterance_attention_mask, out_utterance_token_type_ids = (
        torch.stack(stacked[field]) for field in fields
    )

    # The tensors are deliberately not moved to `device` (see the docstring).
    return (out_utterance_input_ids, out_utterance_attention_mask, out_utterance_token_type_ids), max_doc_len, max_seq_len


In [113]:
# Sequence length passed to load_utterance_with_context: used there as the
# tokenizer's max_length and as the word budget when selecting context.
max_seq_len = 75

In [115]:
# Encode the file at `datafile`; 'cpu' is passed as the device argument.
# The call returns ((input_ids, attention_mask, token_type_ids), max_doc_len, max_seq_len).
preprocessed_utterance, max_doc_len, max_seq_len = load_utterance_with_context(datafile, 'cpu', max_seq_len, encoder_name)
# Decode the input_ids ([0]) of dialogue 0, utterance index 4 back into text.
tokenizer_.decode(preprocessed_utterance[0][0][4])

"[CLS] Isn't that a scary movie? [SEP] Ah, my girlfriend wanted to see that movie. I have to take her later so I don't want to watch it ahead of time. How about The Cube? [SEP] How about Legally Blonde. [SEP] Sounds like a good plan. What do you want to see? [SEP] Hey, you wanna see [SEP]"

In [118]:
# Decode the input_ids of dialogue 0, utterance index 1 (utterance plus its context).
tokenizer_.decode(preprocessed_utterance[0][0][1])

'[CLS] Sounds like a good plan. What do you want to see? [SEP] Hey, you wanna see a movie tomorrow? [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [97]:
# Show the token_type_ids ([2]) of dialogue 0, utterance index 5.
preprocessed_utterance[2][0][5]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1])

In [107]:
# Token type id at position 17 of the same row (dialogue 0, utterance index 5).
preprocessed_utterance[2][0][5][17]

tensor(1)

In [109]:
# Decode the single input id at position 17 of dialogue 0, utterance index 5.
tokenizer_.decode(preprocessed_utterance[0][0][5][17])

'Hey'

In [60]:
# Encode a sentence, drop the first id of the encoding, and decode the rest.
# NOTE(review): `sep문장` is not defined in any cell visible here, so this cell
# depends on leftover kernel state - define it explicitly or remove the cell.
tokenizer_.decode((tokenizer_.encode(sep문장)[1:]))

"[CLS] Ah, my girlfriend wanted to see that movie. I have to take her later so I don't want to watch it ahead of time. How about The Cube? [SEP] Hey, you wanna see a movie tomorrow? [SEP] Sounds like a good plan. What do you want to see? [SEP] How about Legally Blonde. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [SEP]"

In [62]:
# Show the raw input_ids of dialogue 0, utterance index 3.
preprocessed_utterance[0][0][3]

tensor([  101,  7066,   117,  1139,  6124,  1458,  1106,  1267,  1115,  2523,
          119,   146,  1138,  1106,  1321,  1123,  1224,  1177,   146,  1274,
          112,   189,  1328,  1106,  2824,  1122,  3075,  1104,  1159,   119,
         1731,  1164,  1109,   140, 15209,   136,   102,  4403,   117,  1128,
        16445,  1267,   170,  2523,  4911,   136, 10560,  1176,   170,  1363,
         2197,   119,  1327,  1202,  1128,  1328,  1106,  1267,   136,  1731,
         1164, 10800,  1193,   139,  4934,  2007,   119,   102,     0,     0,
            0,     0,     0,     0,     0])

In [69]:
# Look up the id produced for '[PAD]': take index 1 of the encoding
# (index 0 is skipped - presumably the leading [CLS] token; confirm).
tokenizer_.encode('[PAD]')[1]

0

In [72]:
# Decode a single vocabulary id (15209 occurs in the input_ids printed above).
tokenizer_.decode(15209)

'##ube'