In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

# Load the parallel corpus and keep only the first 100 rows.
# Column 0 holds the Korean text and column 1 the English text.
data = pd.read_csv('data.csv')
src_texts, tgt_texts = (data.iloc[:100, col].tolist() for col in (0, 1))

# Initialize the GPT-2 tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# The tokenizer is given a padding token here by reusing its end-of-sequence
# token, so that padded encoding can be requested later.
tokenizer.pad_token = tokenizer.eos_token



In [2]:
class TranslationDataset(Dataset):
    """Dataset of tokenized "<source> <SEP> <target>" sentence pairs.

    Every (source, target) pair is concatenated into a single string with
    the literal separator ' <SEP> ', then encoded by ``tokenizer`` with
    padding and truncation to ``max_length``.

    Args:
        tokenizer: Object exposing ``encode_plus``; it must return a mapping
            with 'input_ids' and 'attention_mask' entries.
        src_texts: Iterable of source-language strings.
        tgt_texts: Iterable of target-language strings, paired positionally
            with ``src_texts`` (extra items on either side are dropped by
            ``zip``).
        max_length: Length every example is padded/truncated to (default 256).
    """

    def __init__(self, tokenizer, src_texts, tgt_texts, max_length=256):
        self.tokenizer = tokenizer

        # Use padding='max_length' rather than the older pad_to_max_length=True.
        encodings = [
            tokenizer.encode_plus(
                source + ' <SEP> ' + target,
                max_length=max_length,
                padding='max_length',
                truncation=True,
                return_tensors="pt",
            )
            for source, target in zip(src_texts, tgt_texts)
        ]
        self.inputs = [encoding['input_ids'] for encoding in encodings]
        self.attn_masks = [encoding['attention_mask'] for encoding in encodings]

    def __len__(self):
        """Return the number of encoded sentence pairs."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` for example ``idx``.

        The leading dimension is squeezed away (presumably the batch
        dimension of size 1 produced by ``return_tensors="pt"``).
        """
        token_ids = self.inputs[idx]
        attention = self.attn_masks[idx]
        return token_ids.squeeze(0), attention.squeeze(0)


In [3]:


# Build the dataset of tokenized source/target pairs.
dataset = TranslationDataset(
    tokenizer=tokenizer,
    src_texts=src_texts,
    tgt_texts=tgt_texts,
)
# Wrap it in a data loader that yields shuffled batches of 32 examples.
data_loader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)


In [ ]:
# Load the pretrained GPT-2 language model.
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Prefer Apple's MPS backend (the device this notebook originally targeted),
# but fall back to CUDA and then CPU so the cell also runs on other machines
# instead of crashing on a hard-coded 'mps'.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
model = model.to(device)

# Optimizer. transformers.AdamW is deprecated in favor of torch.optim.AdamW;
# eps and weight_decay are set explicitly to mirror the transformers defaults
# (torch's own defaults are eps=1e-8, weight_decay=0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

# Fine-tuning loop.
model.train()
for epoch in range(4):  # The number of epochs can be adjusted as needed.
    for inputs, masks in data_loader:
        inputs = inputs.to(device)
        masks = masks.to(device)

        # Every sequence is padded to a fixed length, so using the raw inputs
        # as labels lets padding dominate the loss. Score a position only if
        # the token before it is real: this keeps all real tokens plus the
        # first padding token (pad == EOS, so the model still learns where to
        # stop) and ignores the rest via the -100 ignore index.
        # NOTE(review): assumes right-side padding (the tokenizer default) -
        # confirm if padding_side is ever changed.
        loss_mask = torch.cat([torch.ones_like(masks[:, :1]), masks[:, :-1]], dim=1)
        labels = inputs.masked_fill(loss_mask == 0, -100)

        outputs = model(inputs, attention_mask=masks, labels=labels)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch+1} Loss {loss.item()}")


In [None]:
# Switch the model to evaluation mode before generating.
model.eval()
test_sentence = "이 문장을 영어로 번역해주세요."

# Training examples were formatted as "<source> <SEP> <target>", so the prompt
# must end with the same separator for the model to continue with a
# translation. The trailing space is omitted on purpose: GPT-2's tokenizer
# attaches a leading space to the following word, so the model is expected to
# generate " <first target word>" itself.
prompt = test_sentence + ' <SEP>'

# Encode on whatever device the model lives on rather than a hard-coded one.
encoding = tokenizer(prompt, return_tensors='pt').to(model.device)
encoded_input = encoding['input_ids']
prompt_length = encoded_input.shape[1]

# max_new_tokens bounds only the generated continuation; max_length would also
# count the prompt tokens, which can use up most of a 50-token budget.
# attention_mask and pad_token_id are passed explicitly because the pad token
# is the EOS token and cannot be inferred reliably by generate().
output_sequences = model.generate(
    input_ids=encoded_input,
    attention_mask=encoding['attention_mask'],
    max_new_tokens=50,
    pad_token_id=tokenizer.eos_token_id,
)

# Decode only the newly generated tokens so the prompt is not echoed back.
generated_tokens = output_sequences[0][prompt_length:]
decoded_translation = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
print(decoded_translation)


In [ ]:
from ..model import BERTLM, BERT


class BERTTrainer :
    
    def __init__(self, bert : BERT, vocab_size : int, train_dataloader : DataLoader, test_dataloader : DataLoader = None,
                 lr : float = 1e-4, betas = (0.9, 0.999), weight_decay : float = 0.01, warmup_steps = 10000,
                 with_cuda : bool = True, cuda_devices = None, log_freq : int = 10) :