In [3]:
import torch
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# 데이터 로드 및 전처리
data = pd.read_csv('data.csv')
tgt_texts = data.iloc[:100, 1].tolist()  # 영어 텍스트
src_texts = data.iloc[:100, 0].tolist()  # 한국어 텍스트
data.head()


Unnamed: 0,'Bible Coloring'은 성경의 아름다운 이야기를 체험 할 수 있는 컬러링 앱입니다.,Bible Coloring' is a coloring application that allows you to experience beautiful stories in the Bible.
0,씨티은행에서 일하세요?,Do you work at a City bank?
1,푸리토의 베스트셀러는 해외에서 입소문만으로 4차 완판을 기록하였다.,"PURITO's bestseller, which recorded 4th rough ..."
2,11장에서는 예수님이 이번엔 나사로를 무덤에서 불러내어 죽은 자 가운데서 살리셨습니다.,In Chapter 11 Jesus called Lazarus from the to...
3,"6.5, 7, 8 사이즈가 몇 개나 더 재입고 될지 제게 알려주시면 감사하겠습니다.",I would feel grateful to know how many stocks ...
4,F/W 겐조타이거 키즈와 그리고 이번에 주문한 키즈 중 부족한 수량에 대한 환불입니다.,"18fw Kenzo Tiger Kids, and refund for lacking ..."


In [4]:
from transformers import GPT2Tokenizer


tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Padding token id
pad_token_id = tokenizer.eos_token_id

def pad_sequences(sequences, max_length, pad_token_id):
    padded_sequences = []
    for sequence in sequences:
        # Truncate the sequence if necessary
        sequence = sequence[:max_length]
        # Pad the sequence to the maximum length
        padded_sequence = sequence + [pad_token_id] * (max_length - len(sequence))
        padded_sequences.append(padded_sequence)
    return padded_sequences


# Apply padding to source and target sequences
src_encodings = tokenizer(src_texts, truncation=True, padding=False, max_length=512, return_tensors='pt')
tgt_encodings = tokenizer(tgt_texts, truncation=True, padding=False, max_length=512, return_tensors='pt')

max_length = 512  # Or a suitable value

src_encodings = tokenizer(src_texts, truncation=True, padding=False, max_length=512)
tgt_encodings = tokenizer(tgt_texts, truncation=True, padding=False, max_length=512)

# Flatten the input IDs
src_encodings['input_ids'] = [item for sublist in src_encodings['input_ids'] for item in sublist]
tgt_encodings['input_ids'] = [item for sublist in tgt_encodings['input_ids'] for item in sublist]

# Apply padding to flattened source and target sequences
max_length = 512  # Or determine dynamically the max length from your data
src_encodings['input_ids'] = pad_sequences(src_encodings['input_ids'], max_length, pad_token_id)
tgt_encodings['input_ids'] = pad_sequences(tgt_encodings['input_ids'], max_length, pad_token_id)


ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [None]:

from torch.utils.data import TensorDataset, DataLoader

# 데이터셋 및 데이터로더 생성
dataset = TensorDataset(src_encodings['input_ids'], tgt_encodings['input_ids'])
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)


In [None]:

# 모델 로드 및 설정
model = GPT2LMHeadModel.from_pretrained('gpt2')
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
model.to(device)


In [None]:

from transformers import AdamW

# 옵티마이저 및 학습 진행
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
num_epochs = 1

In [None]:


for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in dataloader:
        input_ids, labels = batch[0].to(device), batch[1].to(device)

        outputs = model(input_ids, labels=labels)
        loss = outputs.loss

        total_loss += loss.item()

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(dataloader)}')


In [ ]:

# 번역 예시
src_text = "안녕하세요."
input_ids = tokenizer.encode("안녕하세요.", return_tensors='pt').to(device)
output_ids = model.generate(
    input_ids,
    max_length=50,
    num_beams=5,  # Experiment with this value
    early_stopping=True
)
translation = tokenizer.decode(output_ids[0], skip_special_tokens=True)
print(f"Translation: {translation}")