In [1]:
import pandas as pd
from transformers import BartForConditionalGeneration, BartTokenizer

In [2]:
# 데이터 로드 및 전처리
data = pd.read_csv('data.csv')
tgt_texts = data.iloc[:100, 1].tolist()  # 영어 텍스트
src_texts = data.iloc[:100, 0].tolist()  # 한국어 텍스트
data.head()

Unnamed: 0,'Bible Coloring'은 성경의 아름다운 이야기를 체험 할 수 있는 컬러링 앱입니다.,Bible Coloring' is a coloring application that allows you to experience beautiful stories in the Bible.
0,씨티은행에서 일하세요?,Do you work at a City bank?
1,푸리토의 베스트셀러는 해외에서 입소문만으로 4차 완판을 기록하였다.,"PURITO's bestseller, which recorded 4th rough ..."
2,11장에서는 예수님이 이번엔 나사로를 무덤에서 불러내어 죽은 자 가운데서 살리셨습니다.,In Chapter 11 Jesus called Lazarus from the to...
3,"6.5, 7, 8 사이즈가 몇 개나 더 재입고 될지 제게 알려주시면 감사하겠습니다.",I would feel grateful to know how many stocks ...
4,F/W 겐조타이거 키즈와 그리고 이번에 주문한 키즈 중 부족한 수량에 대한 환불입니다.,"18fw Kenzo Tiger Kids, and refund for lacking ..."


In [3]:
data.shape, len(src_texts), len(tgt_texts)

((199999, 2), 100, 100)

In [4]:
device = 'cpu'

In [5]:
# 토크나이저 정의
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

# 데이터 인코딩
src_encodings = tokenizer(src_texts, truncation=True, padding=True, max_length=256)
tgt_encodings = tokenizer(tgt_texts, truncation=True, padding=True, max_length=256)

In [6]:
import torch

In [7]:
# 데이터 로더 정의
from torch.utils.data import Dataset, DataLoader

class TranslationDataset(Dataset):
    def __init__(self, src_encodings, tgt_encodings):
        self.src_encodings = src_encodings
        self.tgt_encodings = tgt_encodings

    def __len__(self):
        return len(self.src_encodings['input_ids'])

    def __getitem__(self, idx):
        item = {
            'input_ids': torch.tensor(self.src_encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.src_encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.tgt_encodings['input_ids'][idx]),
            'decoder_input_ids': torch.tensor(self.tgt_encodings['input_ids'][idx])
        }
        return item

In [8]:
dataset = TranslationDataset(src_encodings, tgt_encodings)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, pin_memory=False)

# 모델 초기화 및 학습
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): La

In [9]:
from transformers import AdamW
optimizer = AdamW(model.parameters(), lr=1e-5, no_deprecation_warning=True)
from tqdm import tqdm

In [10]:
from tqdm import tqdm

# 학습 진행
total_loss = 0.0
total_correct = 0
total_tokens = 0

model.train()

# 최적의 모델을 저장하기 위한 변수 초기화
best_loss = float('inf')
best_model_path = None

# 학습 진행
for epoch in range(10):
    # tqdm을 사용하여 데이터 로더 감싸기
    with tqdm(dataloader, desc=f"Epoch {epoch + 1}", unit="batch") as loop:
        for batch_idx, batch in enumerate(loop):
            # 배치를 GPU 메모리로 이동
            batch = {k: v.to(device) for k, v in batch.items()}

            # 모델에 배치 입력
            outputs = model(**batch)
            loss = outputs.loss

            # 손실 누적
            total_loss += loss.item()

            # 정확도 계산
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            correct = (predictions == batch['labels']).sum().item()
            total_correct += correct
            total_tokens += batch['labels'].ne(0).sum().item()  # 패딩 토큰은 제외

            # 역전파 및 가중치 업데이트
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # 현재 손실이 최고의 손실보다 작으면 모델 저장
            if loss.item() < best_loss:
                best_loss = loss.item()
                best_model_path = f"bart_translation_best_model_1000row_maxlen256.pth"
                torch.save(model.state_dict(), best_model_path)

            # tqdm 업데이트
            loop.set_postfix(loss=loss.item(), accuracy=correct / total_tokens)

# 학습이 완료된 후 최적 모델 경로 출력
print(f"최적 모델이 저장된 경로: {best_model_path}")

Epoch 1: 100%|██████████| 4/4 [01:16<00:00, 19.16s/batch, accuracy=0.00132, loss=12.1]
Epoch 2: 100%|██████████| 4/4 [01:09<00:00, 17.27s/batch, accuracy=0.000789, loss=10.9]
Epoch 3: 100%|██████████| 4/4 [01:07<00:00, 16.86s/batch, accuracy=0.000439, loss=11.5]
Epoch 4: 100%|██████████| 4/4 [01:12<00:00, 18.12s/batch, accuracy=0.000592, loss=10.4]
Epoch 5:   0%|          | 0/4 [00:18<?, ?batch/s]


KeyboardInterrupt: 

In [14]:
# 모델 평가
model.eval()

# 샘플 텍스트 정의
sample_text = "안녕하세요."
encoding = tokenizer(sample_text, return_tensors='pt').to(device)

# 추론을 위해 모델에 입력
output = model.generate(**encoding, max_new_tokens=100)

# 결과 디코딩 및 출력
translation = tokenizer.decode(output[0], skip_special_tokens=True)
print(translation)

안녕하세요.


In [12]:
# 모델 평가
model.eval()

# 샘플 텍스트 정의
sample_text = "This is a sample English sentence."
encoding = tokenizer(sample_text, return_tensors='pt').to(device)

# 추론을 위해 모델에 입력
output = model.generate(**encoding, max_new_tokens=100)

# 결과 디코딩 및 출력
translation = tokenizer.decode(output[0], skip_special_tokens=True)
print(translation)

This is a sample English sentence.


In [13]:
import unicodedata

translation = unicodedata.normalize('NFKD', translation).encode('ascii', 'ignore').decode('ascii')
print(translation)

This is a sample English sentence.


In [None]:
print(translation.encode('ascii', errors='ignore').decode())

In [None]:
import re

translation = re.sub(r'[^\x20-\x7E]', '', translation)
print(translation)

In [15]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BartForConditionalGeneration, BartTokenizer, AdamW
from tqdm import tqdm

# 데이터 로드
data = pd.read_csv('data.csv')
src_texts = data.iloc[:100, 0].tolist()  # 한국어 텍스트
tgt_texts = data.iloc[:100, 1].tolist()  # 영어 텍스트

# 토크나이저 및 모델 초기화
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
device = torch.device('cpu')  # or 'cuda'
model.to(device)

# 데이터 인코딩
src_encodings = tokenizer(src_texts, padding=True, truncation=True, max_length=256, return_tensors="pt")
tgt_encodings = tokenizer(tgt_texts, padding=True, truncation=True, max_length=256, return_tensors="pt")

# 데이터셋 클래스 정의
class TranslationDataset(Dataset):
    def __init__(self, src_encodings, tgt_encodings):
        self.src_encodings = src_encodings
        self.tgt_encodings = tgt_encodings

    def __len__(self):
        return len(self.src_encodings['input_ids'])

    def __getitem__(self, idx):
        src_item = {key: val[idx] for key, val in self.src_encodings.items()}
        tgt_item = {key: val[idx] for key, val in self.tgt_encodings.items()}
        src_item['labels'] = tgt_item['input_ids']
        return src_item

# 데이터 로더 정의
dataset = TranslationDataset(src_encodings, tgt_encodings)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# 옵티마이저 설정
optimizer = AdamW(model.parameters(), lr=1e-5)

# 학습 과정
model.train()
best_loss = float('inf')
for epoch in range(3):  # 3 에포크
    loop = tqdm(dataloader, leave=True)
    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())
        if loss.item() < best_loss:
            best_loss = loss.item()
            torch.save(model.state_dict(), "model_best.pth")

# 모델 가중치 불러오기
model.load_state_dict(torch.load("model_best.pth", map_location=device))
model.eval()

# 샘플 텍스트로 모델 평가
sample_texts = ["안녕하세요.", "This is a sample English sentence."]
translations = []

for text in sample_texts:
    encoded_input = tokenizer(text, return_tensors='pt').to(device)
    output = model.generate(**encoded_input, max_length=50, num_beams=5)
    translation = tokenizer.decode(output[0], skip_special_tokens=True)
    translations.append(translation)

for original, translated in zip(sample_texts, translations):
    print(f'Original: {original} - Translated: {translated}')


Epoch 0: 100%|██████████| 50/50 [04:39<00:00,  5.60s/it, loss=6.18]
Epoch 1: 100%|██████████| 50/50 [04:18<00:00,  5.18s/it, loss=6.31]
Epoch 2: 100%|██████████| 50/50 [02:50<00:00,  3.40s/it, loss=6.49]


Original: 안녕하세요. - Translated: 
Original: This is a sample English sentence. - Translated: This is a sample English sentence.


In [45]:

model.load_state_dict(torch.load("model_best.pth", map_location=device))
model.eval()

# 샘플 텍스트로 모델 평가
sample_texts = ["나는 학생입니다"]
translations = []

for text in sample_texts:
    encoded_input = tokenizer(text, return_tensors='pt').to(device)
    output = model.generate(**encoded_input, max_length=50, num_beams=5)
    translation = tokenizer.decode(output[0], skip_special_tokens=True)
    translations.append(translation)

for original, translated in zip(sample_texts, translations):
    print(f'Original: {original} - Translated: {translated}')


Original: 나는 학생입니다 - Translated: 
