In [6]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset # 데이터로더

from kogpt2_transformers import get_kogpt2_tokenizer
from kobert_transformers import get_tokenizer

In [None]:
class WellnessAutoRegressiveDataset(Dataset):
    """Question/answer pairs encoded as fixed-length token-id lists.

    Each non-blank line of ``file_path`` is expected to hold a question and an
    answer separated by four spaces. Every pair is encoded as
    ``<bos> question <eos> <bos> answer <eos>`` and right-padded with the
    tokenizer's pad id so that every sample holds exactly ``n_ctx`` ids.

    Args:
        file_path: UTF-8 text file with one ``question<4 spaces>answer`` pair
            per line.
        n_ctx: Number of token ids in every sample. Pairs that encode to more
            than ``n_ctx`` ids are truncated and closed with the eos id.
        tokenizer: Optional tokenizer exposing ``encode(text)`` plus the
            ``bos_token_id``, ``eos_token_id`` and ``pad_token_id`` attributes.
            When omitted, ``get_kogpt2_tokenizer()`` is used.

    Raises:
        IndexError: If a non-blank line does not contain the four-space
            separator.
    """

    def __init__(self,
                 file_path = "../data/wellness_dialog_for_autoregressive_seungwoo.txt",
                 n_ctx = 1024,
                 tokenizer = None
                 ):
        self.file_path = file_path
        self.data = []     # one list of n_ctx token ids per dialog pair
        self.tokenizer = tokenizer if tokenizer is not None else get_kogpt2_tokenizer()

        # Wrapped in single-element lists so they can be concatenated directly.
        # The notebook's recorded run showed these as [0], [1] and [3].
        bos_token_id = [self.tokenizer.bos_token_id]
        eos_token_id = [self.tokenizer.eos_token_id]
        pad_token_id = [self.tokenizer.pad_token_id]

        # `with` guarantees the file is closed even if encoding a line fails.
        with open(self.file_path, 'r', encoding='utf-8') as file:
            for line in file:
                if not line.strip():
                    # Blank lines carry no pair; skip them instead of crashing.
                    continue

                # datas looks like ['<question text>', '<answer text>\n']
                datas = line.split("    ")
                question = datas[0]
                # Strip only the newline: slicing off the last character would
                # eat real text when the final line has no trailing newline.
                answer = datas[1].rstrip("\n")

                index_of_words = (
                    bos_token_id + self.tokenizer.encode(question) + eos_token_id
                    + bos_token_id + self.tokenizer.encode(answer) + eos_token_id
                )
                # Recorded example of an unpadded result:
                #   [0, 47529, 47674, 1, 0, 49251, 48326, 1]

                if len(index_of_words) > n_ctx:
                    # Keep every sample at n_ctx ids and still end on eos.
                    index_of_words = index_of_words[:n_ctx - 1] + eos_token_id

                pad_token_len = n_ctx - len(index_of_words)
                index_of_words += pad_token_id * pad_token_len

                self.data.append(index_of_words)

    def __len__(self):
        """Return the number of encoded dialog pairs."""
        return len(self.data)

    def __getitem__(self,index):
        """Return the padded token-id list stored at ``index``."""
        item = self.data[index]
        return item
    
########################################################################################
class WellnessTextClassificationDataset(Dataset):
    """Labelled utterances encoded as fixed-length tensors for classification.

    Each non-blank line of ``file_path`` is expected to hold an utterance and
    an integer label separated by four spaces. Every sample is a dict with
    ``input_ids``, ``token_type_ids`` and ``attention_mask`` tensors of length
    ``max_seq_len`` plus a scalar ``labels`` tensor, all moved to ``device``.

    Args:
        file_path: UTF-8 text file with one ``text<4 spaces>label`` pair per line.
        num_label: Not read by this class; kept so existing callers that pass
            it keep working.
        device: Device every tensor is moved to.
        max_seq_len: Length of the three sequence tensors. Longer encodings
            are truncated while keeping their final token.
        tokenizer: Optional object exposing ``encode(text)``. When omitted,
            ``get_tokenizer()`` is used.

    Raises:
        IndexError: If a non-blank line does not contain the four-space
            separator.
        ValueError: If the label field is not an integer.
    """

    def __init__(self,
                 file_path = "../data/wellness_dialog_for_text_classification_seungwoo.txt",
                 num_label = 359,
                 device = 'cpu',
                 max_seq_len = 512,  # KoBERT max_length
                 tokenizer = None
                 ):
        self.file_path = file_path
        self.device = device
        self.data = []
        self.tokenizer = tokenizer if tokenizer is not None else get_tokenizer()

        # `with` guarantees the file is closed even if a line fails to parse.
        with open(self.file_path, 'r', encoding='utf-8') as file:
            for line in file:
                if not line.strip():
                    # Blank lines carry no sample; skip them instead of crashing.
                    continue

                # datas looks like ['<utterance text>', '<label>\n']
                datas = line.split("    ")

                # Copy so the tokenizer's returned list is never mutated.
                index_of_words = list(self.tokenizer.encode(datas[0]))
                if len(index_of_words) > max_seq_len:
                    # Keep the last id when truncating; the recorded output ends
                    # each encoding with id 3, presumably [SEP] - TODO confirm.
                    index_of_words = index_of_words[:max_seq_len - 1] + index_of_words[-1:]

                token_type_ids = [0] * len(index_of_words)
                attention_mask = [1] * len(index_of_words)

                # Padding Length
                padding_length = max_seq_len - len(index_of_words)

                # Zero Padding
                # NOTE(review): padding uses id 0; confirm it matches the
                # tokenizer's pad id (attention_mask already zeroes these slots).
                index_of_words += [0] * padding_length
                token_type_ids += [0] * padding_length
                attention_mask += [0] * padding_length

                # Label: strip() copes with a final line lacking a newline.
                label = int(datas[1].strip())

                data = {
                      'input_ids': torch.tensor(index_of_words).to(self.device),
                      'token_type_ids': torch.tensor(token_type_ids).to(self.device),
                      'attention_mask': torch.tensor(attention_mask).to(self.device),
                      'labels': torch.tensor(label).to(self.device)
                     }

                self.data.append(data)

        # Notes recorded from a notebook run with the default arguments:
        #   - input_ids, token_type_ids and attention_mask all have length 512
        #   - labels is just the label value
        #   - len(self.data) was 5231
        #   - first sample: input_ids = tensor([2, 1370, 5859, ..., 54, 3, 0, 0, ...]),
        #     attention_mask = 1 over the real tokens and 0 over padding,
        #     labels = tensor(0)

    def __len__(self):
        """Return the number of encoded samples."""
        return len(self.data)

    def __getitem__(self,index):
        """Return the dict of tensors stored at ``index``."""
        item = self.data[index]
        return item
    
    
    
if __name__ == "__main__":
    # Smoke run: build both datasets from their default files first,
    # then show each object's repr on its own line.
    dataset = WellnessAutoRegressiveDataset()
    dataset2 = WellnessTextClassificationDataset()
    print(dataset, dataset2, sep="\n")

In [22]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset # 데이터로더
from kogpt2_transformers import get_kogpt2_tokenizer
from kobert_transformers import get_tokenizer


# Step-by-step version of the classification preprocessing, run at cell level
# so the intermediate values can be inspected. Fills `data` with one dict of
# tensors per non-blank line of `file_path`.
file_path = "../data/wellness_dialog_for_text_classification_seungwoo.txt"
num_label = 359
device = 'cpu'
max_seq_len = 512  # KoBERT max_length
tokenizer = None
###############################
data = []
tokenizer = tokenizer if tokenizer is not None else get_tokenizer()

# `with` guarantees the file is closed even if a line fails to parse.
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            # Blank lines carry no sample; skip them instead of crashing.
            continue

        # datas looks like ['<utterance text>', '<label>\n']
        datas = line.split("    ")

        # Copy so the tokenizer's returned list is never mutated.
        index_of_words = list(tokenizer.encode(datas[0]))
        if len(index_of_words) > max_seq_len:
            # Keep the last id when truncating; the recorded output ends each
            # encoding with id 3, presumably [SEP] - TODO confirm.
            index_of_words = index_of_words[:max_seq_len - 1] + index_of_words[-1:]

        token_type_ids = [0] * len(index_of_words)
        attention_mask = [1] * len(index_of_words)

        # Padding Length
        padding_length = max_seq_len - len(index_of_words)

        # Zero Padding
        index_of_words += [0] * padding_length
        token_type_ids += [0] * padding_length
        attention_mask += [0] * padding_length

        # Label: strip() copes with a final line lacking a newline.
        label = int(datas[1].strip())

        data_dict = {
              'input_ids': torch.tensor(index_of_words).to(device),
              'token_type_ids': torch.tensor(token_type_ids).to(device),
              'attention_mask': torch.tensor(attention_mask).to(device),
              'labels': torch.tensor(label).to(device)
             }

        data.append(data_dict)

# Notes recorded from a run over the default file:
#   - input_ids, token_type_ids and attention_mask all have length 512
#   - data_dict['labels'] is just the label value
#   - len(data) : 5231
#
# {'input_ids': tensor([   2, 1370, 5859, 3647, 6037, 5561,  517, 6744, 7086, 5850, 1434, 1917,
#          5812, 3135, 5876,   54,    3,    0,    0, ... 0, 0])
# 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
#          0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0])
# 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
#          0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0])
# 'labels': tensor(0)}
        
#     def __len__(self):
#         return len(self.data)
#     def __getitem__(self,index):
#         item = self.data[index]
#         return item

In [31]:
# Number of sample dicts collected in `data` by the preprocessing cell above.
len(data)

5231