In [20]:
from transformers import ElectraTokenizerFast, ElectraForMaskedLM, AdamW
import random
import logging
import json
from torch.utils.data import DataLoader, TensorDataset, Dataset
import torch
import os
import torch.nn as nn
from torch.nn.parallel import DataParallel
import torch.distributed as dist
from tqdm import tqdm

In [21]:
torch.cuda.empty_cache()

In [3]:
with open("/home/nlpgpu9/ellt/eojin/EA/nikluge-ea-2023-train_수정_중복제거.jsonl", 'r') as file:
    data = []
    for line in file:
        data.append(json.loads(line))

In [4]:
with open("/home/nlpgpu9/ellt/eojin/EA/nikluge-ea-2023-dev_수정.jsonl", 'r') as file:
    dev_data = []
    for line in file:
        dev_data.append(json.loads(line))

In [5]:
tokenizer = ElectraTokenizerFast.from_pretrained("monologg/koelectra-base-v3-discriminator")
model = ElectraForMaskedLM.from_pretrained("monologg/koelectra-base-v3-discriminator")

Some weights of ElectraForMaskedLM were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['generator_predictions.dense.weight', 'generator_predictions.dense.bias', 'generator_predictions.LayerNorm.bias', 'generator_lm_head.bias', 'generator_predictions.LayerNorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def mask_text_with_tokenizer(text, target_begin, target_end, tokenizer, mask_prob=0.15):
    # 전체 문장 토큰화
    encoding = tokenizer.encode_plus(text, add_special_tokens=True)
    tokens = encoding['input_ids']
    tokens = tokenizer.convert_ids_to_tokens(tokens)

    # 문장에서 타겟 찾기
    target_tokens = []
    if target_begin is not None and target_end is not None:
        target_text = text[target_begin:target_end]
        target_tokens = tokenizer.tokenize(target_text)

    masked_tokens = []
    skip_count = 0
    
    # 전체 15% 마스킹(반올림). 적어도 한 개는 마스킹
    num_to_mask = max(1, round(len(tokens) * mask_prob))
    
    # 타겟 아닌 것만 마스킹 예약
    available_indices = [i for i in range(1, len(tokens) - 1)
                         if tokens[i:i+len(target_tokens)] != target_tokens]
    
    # 마스킹할 토큰 랜덤 선택
    indices_to_mask = random.sample(available_indices, min(num_to_mask, len(available_indices)))
    
    for i, token in enumerate(tokens):
        if skip_count > 0:
            skip_count -= 1
            continue

        if tokens[i:i+len(target_tokens)] == target_tokens:
            masked_tokens.extend(target_tokens)
            skip_count = len(target_tokens) - 1
            continue

        if i in indices_to_mask:
            masked_tokens.append('[MASK]')
        else:
            masked_tokens.append(token)
    
    # 다시 문장으로 바꾸기
    detokenized_text = tokenizer.convert_tokens_to_string(masked_tokens)  # [CLS]랑 [SEP]은 제외하려면 (masked_tokens[1:-1]
    
    return detokenized_text



sample_text = "하... 근데 준프샤 너무 고소각임..."
sample_target_begin = 8
sample_target_end = 11
print(mask_text_with_tokenizer(sample_text, sample_target_begin, sample_target_end, tokenizer))

[CLS] 하 [MASK].. 근데 준프샤 너무 고소각임.. [MASK] [SEP]


In [7]:
def preprocess_dataset(data_list, tokenizer, mask_prob=0.15):
    preprocessed_data = []
    
    for data_point in data_list:
        text = data_point['input']['form']
        target_begin = data_point['input']['target']['begin']
        target_end = data_point['input']['target']['end']     

        masked_text = mask_text_with_tokenizer(text, target_begin, target_end, tokenizer, mask_prob)
        
        preprocessed_data.append({
            'id': data_point['id'],
            'masked_text': masked_text,
            'labels': data_point['output']
        })
        
    return preprocessed_data

In [8]:
preprocessed_data = preprocess_dataset(data, tokenizer)
preprocessed_data[:5]

[{'id': 'nikluge-2023-ea-train-000001',
  'masked_text': '[CLS] 하... 근데 준프샤 너무 [MASK]각임... [SEP]',
  'labels': {'joy': 'True',
   'anticipation': 'False',
   'trust': 'False',
   'surprise': 'False',
   'disgust': 'False',
   'fear': 'False',
   'anger': 'False',
   'sadness': 'False'}},
 {'id': 'nikluge-2023-ea-train-000002',
  'masked_text': '[CLS] 2기였나 지은 [MASK] [MASK]랑 4기 메거진 [MASK] 지금도 읽는데 [SEP]',
  'labels': {'joy': 'True',
   'anticipation': 'False',
   'trust': 'False',
   'surprise': 'False',
   'disgust': 'False',
   'fear': 'False',
   'anger': 'False',
   'sadness': 'False'}},
 {'id': 'nikluge-2023-ea-train-000003',
  'masked_text': '[CLS] 흐아아아아악 흐아아아아 [MASK] [MASK]악악 [UNK] 손차이가 절케 난다 [MASK] 알고 [MASK] [MASK]엇지만 놀랍다 [MASK]아아악악 [SEP]',
  'labels': {'joy': 'False',
   'anticipation': 'False',
   'trust': 'False',
   'surprise': 'True',
   'disgust': 'False',
   'fear': 'False',
   'anger': 'False',
   'sadness': 'False'}},
 {'id': 'nikluge-2023-ea-train-000004',
  'masked_text

In [9]:
dev_preprocessed_data = preprocess_dataset(dev_data, tokenizer)
dev_preprocessed_data[:5]

[{'id': 'nikluge-2023-ea-dev-000001',
  'masked_text': '[CLS] 하,,, [MASK] 내일 옥상 [MASK] [MASK] 하기 전에 표 구하길 기도해주세요 [SEP]',
  'labels': {'joy': 'False',
   'anticipation': 'True',
   'trust': 'False',
   'surprise': 'False',
   'disgust': 'False',
   'fear': 'False',
   'anger': 'False',
   'sadness': 'False'}},
 {'id': 'nikluge-2023-ea-dev-000002',
  'masked_text': '[CLS] 밴드 사운드 진짜 너무너무 좋다.... [MASK]인 [MASK] 진짜 [SEP]',
  'labels': {'joy': 'True',
   'anticipation': 'False',
   'trust': 'False',
   'surprise': 'False',
   'disgust': 'False',
   'fear': 'False',
   'anger': 'False',
   'sadness': 'False'}},
 {'id': 'nikluge-2023-ea-dev-000003',
  'masked_text': '[CLS] 칸 태리 너무 풋풋 [MASK] [MASK] 귀여워.. 근데 허리가 한줌 [MASK]셔 & [MASK]s & [SEP]',
  'labels': {'joy': 'True',
   'anticipation': 'False',
   'trust': 'False',
   'surprise': 'False',
   'disgust': 'False',
   'fear': 'False',
   'anger': 'False',
   'sadness': 'False'}},
 {'id': 'nikluge-2023-ea-dev-000004',
  'masked_text': '[CLS] 미스터초밥왕

In [10]:
def prepend_labels_to_text(preprocessed_data, label_mapping):
    prepended_data = []
    
    for data_point in preprocessed_data:
        labels = data_point['labels']
        masked_text = data_point['masked_text']
        
        # 라벨 찾기
        prepend_str = ' '.join([f'{label_mapping[label]}' for label, value in labels.items() if value == 'True'])
        
        # [CLS] 뒤, 문장 앞에 라벨 붙이기
        if masked_text.startswith("[CLS]"):
            new_text = f"[CLS] {prepend_str} {masked_text[6:]}" if prepend_str else masked_text
        else:
            new_text = f"{prepend_str} {masked_text}" if prepend_str else masked_text

        # 데이터셋 만들기
        prepended_data.append({
            'id': data_point['id'],
            'prepended_text': new_text,
            'labels': labels
        })
        
    return prepended_data

# 라벨은 한국어로
label_mapping = {
    'joy': '기쁨',
    'anticipation': '기대',
    'trust': '믿음',
    'surprise': '놀람',
    'disgust': '혐오',
    'fear': '공포',
    'anger': '분노',
    'sadness': '슬픔'
}

In [11]:
prepended_data = prepend_labels_to_text(preprocessed_data, label_mapping)
print(prepended_data[:5])

[{'id': 'nikluge-2023-ea-train-000001', 'prepended_text': '[CLS] 기쁨 하... 근데 준프샤 너무 [MASK]각임... [SEP]', 'labels': {'joy': 'True', 'anticipation': 'False', 'trust': 'False', 'surprise': 'False', 'disgust': 'False', 'fear': 'False', 'anger': 'False', 'sadness': 'False'}}, {'id': 'nikluge-2023-ea-train-000002', 'prepended_text': '[CLS] 기쁨 2기였나 지은 [MASK] [MASK]랑 4기 메거진 [MASK] 지금도 읽는데 [SEP]', 'labels': {'joy': 'True', 'anticipation': 'False', 'trust': 'False', 'surprise': 'False', 'disgust': 'False', 'fear': 'False', 'anger': 'False', 'sadness': 'False'}}, {'id': 'nikluge-2023-ea-train-000003', 'prepended_text': '[CLS] 놀람 흐아아아아악 흐아아아아 [MASK] [MASK]악악 [UNK] 손차이가 절케 난다 [MASK] 알고 [MASK] [MASK]엇지만 놀랍다 [MASK]아아악악 [SEP]', 'labels': {'joy': 'False', 'anticipation': 'False', 'trust': 'False', 'surprise': 'True', 'disgust': 'False', 'fear': 'False', 'anger': 'False', 'sadness': 'False'}}, {'id': 'nikluge-2023-ea-train-000004', 'prepended_text': '기대 ', 'labels': {'joy': 'False', 'anticipation': 'True'

In [12]:
dev_prepended_data = prepend_labels_to_text(dev_preprocessed_data, label_mapping)
print(dev_prepended_data[:5])

[{'id': 'nikluge-2023-ea-dev-000001', 'prepended_text': '[CLS] 기대 하,,, [MASK] 내일 옥상 [MASK] [MASK] 하기 전에 표 구하길 기도해주세요 [SEP]', 'labels': {'joy': 'False', 'anticipation': 'True', 'trust': 'False', 'surprise': 'False', 'disgust': 'False', 'fear': 'False', 'anger': 'False', 'sadness': 'False'}}, {'id': 'nikluge-2023-ea-dev-000002', 'prepended_text': '[CLS] 기쁨 밴드 사운드 진짜 너무너무 좋다.... [MASK]인 [MASK] 진짜 [SEP]', 'labels': {'joy': 'True', 'anticipation': 'False', 'trust': 'False', 'surprise': 'False', 'disgust': 'False', 'fear': 'False', 'anger': 'False', 'sadness': 'False'}}, {'id': 'nikluge-2023-ea-dev-000003', 'prepended_text': '[CLS] 기쁨 칸 태리 너무 풋풋 [MASK] [MASK] 귀여워.. 근데 허리가 한줌 [MASK]셔 & [MASK]s & [SEP]', 'labels': {'joy': 'True', 'anticipation': 'False', 'trust': 'False', 'surprise': 'False', 'disgust': 'False', 'fear': 'False', 'anger': 'False', 'sadness': 'False'}}, {'id': 'nikluge-2023-ea-dev-000004', 'prepended_text': '[CLS] 기대 미스터초밥왕이 햄스터 [MASK]씨가 [MASK]어 오는 기분 [MASK]다는 더 [MASK]지 싶음.. [SE

In [13]:
class EmotionDataset(Dataset):
    def __init__(self, prepended_data, original_data, tokenizer):
        self.prepended_data = prepended_data
        self.original_data = original_data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.prepended_data)

    def __getitem__(self, idx):
        prepended_item = self.prepended_data[idx]
        original_item = self.original_data[idx]
        
        prepended_text = prepended_item['prepended_text']
        original_text = original_item['input']['form']

        # 입력: prepend 데이터 / 출력: 원래 문장
        prepended_encoding = self.tokenizer(prepended_text, truncation=True, padding='max_length', max_length=512, return_tensors='pt')
        original_encoding = self.tokenizer(original_text, truncation=True, padding='max_length', max_length=512, return_tensors='pt')

        # input IDs와 attention masks
        prepended_input_ids = prepended_encoding['input_ids'].squeeze(0)
        prepended_attention_mask = prepended_encoding['attention_mask'].squeeze(0)
        original_input_ids = original_encoding['input_ids'].squeeze(0)

        return {
            'input_ids': torch.tensor(prepended_input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(prepended_attention_mask, dtype=torch.long),
            'labels': torch.tensor(original_input_ids, dtype=torch.long)
        }

In [14]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
model = nn.DataParallel(model)

In [15]:
train_dataset = EmotionDataset(prepended_data, data, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

In [16]:
dev_dataset = EmotionDataset(dev_prepended_data, dev_data, tokenizer)
dev_dataloader = DataLoader(dev_dataset, batch_size=16, shuffle=False)

In [22]:
optimizer = AdamW(model.parameters(), lr=4e-5)

In [18]:
save_dir = "/home/nlpgpu9/ellt/eojin/EA/증강/ELECTRA_DA"
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

In [23]:
for epoch in range(10):
    model.train()
    total_train_loss = 0
    num_train_batches = 0

    train_loop = tqdm(train_dataloader, leave=True)
    
    for batch in train_dataloader:
        inputs = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss.sum()
        loss.backward()
        
        optimizer.step()
        optimizer.zero_grad()
        
        total_train_loss += loss.item()
        num_train_batches += 1

        train_loop.set_description(f"Epoch {epoch+1}")
        train_loop.set_postfix(loss=loss.item())
    
    
    avg_train_loss = total_train_loss / num_train_batches
    print(f"Epoch {epoch+1}, Average Training Loss: {avg_train_loss}")

    # Evaluation
    model.eval()
    total_dev_loss = 0
    num_dev_batches = 0

    eval_loop = tqdm(dev_dataloader, leave=True)
    
    with torch.no_grad():
        for batch in dev_dataloader:
            inputs = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(inputs, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            
            total_dev_loss += loss.item()
            num_dev_batches += 1

            eval_loop.set_description(f"Epoch {epoch+1} Evaluation")
            eval_loop.set_postfix(loss=loss.item())
    
    avg_dev_loss = total_dev_loss / num_dev_batches
    print(f"Epoch {epoch+1}, Average Dev Loss: {avg_dev_loss}")

    model_save_path = os.path.join(save_dir, f"model_epoch_{epoch+1}")
    model.save_pretrained(model_save_path)
    print(f"Saved model for epoch {epoch+1} at {model_save_path}")

  'input_ids': torch.tensor(prepended_input_ids, dtype=torch.long),
  'attention_mask': torch.tensor(prepended_attention_mask, dtype=torch.long),
  'labels': torch.tensor(original_input_ids, dtype=torch.long)
Epoch 1:   0%|          | 0/2370 [09:18<?, ?it/s, loss=0.741]