In [42]:
import pandas as pd
from transformers import BartForConditionalGeneration, BartTokenizer

In [43]:
# 데이터 로드 및 전처리
data = pd.read_csv('data.csv')
tgt_texts = data.iloc[:10000, 0].tolist()  # 영어 텍스트
src_texts = data.iloc[:10000, 1].tolist()  # 한국어 텍스트
data.head()

Unnamed: 0,'Bible Coloring'은 성경의 아름다운 이야기를 체험 할 수 있는 컬러링 앱입니다.,Bible Coloring' is a coloring application that allows you to experience beautiful stories in the Bible.
0,씨티은행에서 일하세요?,Do you work at a City bank?
1,푸리토의 베스트셀러는 해외에서 입소문만으로 4차 완판을 기록하였다.,"PURITO's bestseller, which recorded 4th rough ..."
2,11장에서는 예수님이 이번엔 나사로를 무덤에서 불러내어 죽은 자 가운데서 살리셨습니다.,In Chapter 11 Jesus called Lazarus from the to...
3,"6.5, 7, 8 사이즈가 몇 개나 더 재입고 될지 제게 알려주시면 감사하겠습니다.",I would feel grateful to know how many stocks ...
4,F/W 겐조타이거 키즈와 그리고 이번에 주문한 키즈 중 부족한 수량에 대한 환불입니다.,"18fw Kenzo Tiger Kids, and refund for lacking ..."


In [44]:
data.shape, len(src_texts), len(tgt_texts)

((199999, 2), 10000, 10000)

In [45]:
device = 'cpu'

In [57]:
# 토크나이저 정의
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

# 데이터 인코딩
src_encodings = tokenizer(src_texts, truncation=True, padding=True, max_length=512)
tgt_encodings = tokenizer(tgt_texts, truncation=True, padding=True, max_length=512)

In [47]:
import torch

In [48]:
# 데이터 로더 정의
from torch.utils.data import Dataset, DataLoader

class TranslationDataset(Dataset):
    def __init__(self, src_encodings, tgt_encodings):
        self.src_encodings = src_encodings
        self.tgt_encodings = tgt_encodings

    def __len__(self):
        return len(self.src_encodings['input_ids'])

    def __getitem__(self, idx):
        item = {
            'input_ids': torch.tensor(self.src_encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.src_encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.tgt_encodings['input_ids'][idx]),
            'decoder_input_ids': torch.tensor(self.tgt_encodings['input_ids'][idx])
        }
        return item

In [49]:
dataset = TranslationDataset(src_encodings, tgt_encodings)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True, pin_memory=False)

# 모델 초기화 및 학습
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): La

In [50]:
from transformers import AdamW
optimizer = AdamW(model.parameters(), lr=1e-5, no_deprecation_warning=True)
from tqdm import tqdm

In [52]:
from tqdm import tqdm

# 학습 진행
total_loss = 0.0
total_correct = 0
total_tokens = 0

model.train()

# 최적의 모델을 저장하기 위한 변수 초기화
best_loss = float('inf')
best_model_path = None

# 학습 진행
for epoch in range(5):
    # tqdm을 사용하여 데이터 로더 감싸기
    with tqdm(dataloader, desc=f"Epoch {epoch + 1}", unit="batch") as loop:
        for batch_idx, batch in enumerate(loop):
            # 배치를 GPU 메모리로 이동
            batch = {k: v.to(device) for k, v in batch.items()}

            # 모델에 배치 입력
            outputs = model(**batch)
            loss = outputs.loss

            # 손실 누적
            total_loss += loss.item()

            # 정확도 계산
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            correct = (predictions == batch['labels']).sum().item()
            total_correct += correct
            total_tokens += batch['labels'].ne(0).sum().item()  # 패딩 토큰은 제외

            # 역전파 및 가중치 업데이트
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # 현재 손실이 최고의 손실보다 작으면 모델 저장
            if loss.item() < best_loss:
                best_loss = loss.item()
                best_model_path = f"bart_translation_best_model.pth"
                torch.save(model.state_dict(), best_model_path)

            # tqdm 업데이트
            loop.set_postfix(loss=loss.item(), accuracy=correct / total_tokens)

# 학습이 완료된 후 최적 모델 경로 출력
print(f"최적 모델이 저장된 경로: {best_model_path}")

Epoch 1: 100%|██████████| 1250/1250 [2:51:29<00:00,  8.23s/batch, accuracy=0.000804, loss=0.0124] 
Epoch 2: 100%|██████████| 1250/1250 [2:28:19<00:00,  7.12s/batch, accuracy=0.000402, loss=0.00622] 
Epoch 3: 100%|██████████| 1250/1250 [3:37:36<00:00, 10.44s/batch, accuracy=0.000268, loss=0.00251]    
Epoch 4: 100%|██████████| 1250/1250 [2:15:42<00:00,  6.51s/batch, accuracy=0.000201, loss=0.000498] 
Epoch 5: 100%|██████████| 1250/1250 [2:14:41<00:00,  6.46s/batch, accuracy=0.000161, loss=0.000352] 

최적 모델이 저장된 경로: bart_translation_best_model.pth





In [54]:
# 모델 평가
model.eval()

# 샘플 텍스트 정의
sample_text = "This is a sample English sentence."
encoding = tokenizer(sample_text, return_tensors='pt').to(device)

# 추론을 위해 모델에 입력
output = model.generate(**encoding, max_new_tokens=100)

# 결과 디코딩 및 출력
translation = tokenizer.decode(output[0], skip_special_tokens=True)
print(translation)

������ � � � � � ���� � � � ��������������������������������������������������������������������������'' ''


In [1]:
# 모델 평가
model.eval()

# 샘플 텍스트 정의
sample_text = "안녕하세요"
encoding = tokenizer(sample_text, return_tensors='pt').to(device)

# 추론을 위해 모델에 입력
output = model.generate(**encoding, max_new_tokens=100)

# 결과 디코딩 및 출력
translation = tokenizer.decode(output[0], skip_special_tokens=True)
print(translation)

NameError: name 'model' is not defined

In [59]:
import unicodedata

translation = unicodedata.normalize('NFKD', translation).encode('ascii', 'ignore').decode('ascii')
print(translation)

          '' ''


In [60]:
print(translation.encode('ascii', errors='ignore').decode())

          '' ''


In [61]:
import re

translation = re.sub(r'[^\x20-\x7E]', '', translation)
print(translation)

          '' ''


In [3]:
import torch
from transformers import PreTrainedTokenizer, PreTrainedModel

# Load tokenizer and model (Assuming you're using Hugging Face Transformers library)
tokenizer: PreTrainedTokenizer = None  # Initialize your tokenizer here (e.g., BertTokenizer)
model: PreTrainedModel = None  # Initialize your model here (e.g., BertForSequenceClassification)

# Load the best saved model
best_model_path = "bart_translation_best_model.pth"
model.load_state_dict(torch.load(best_model_path, map_location=device))
model.to(device)
model.eval()

# Define a sample text to translate
sample_text = "This is a sample English sentence."
# Tokenize the text for the model
# 'return_tensors' creates multidimensional arrays (tensors) from the encoded text
# 'pt' indicates that the generated tensors will be PyTorch tensors
encoding = tokenizer(sample_text, return_tensors='pth').to(device)

# Generate translation using the model
# 'max_new_tokens' specifies the maximum length of the output sequence
output = model.generate(**encoding, max_new_tokens=100)

# Decode the generated tokens to text
# 'skip_special_tokens' when set to True, removes all special tokens from the output
translation = tokenizer.decode(output[0], skip_special_tokens=True)

# Print the translation
print(translation)


AttributeError: 'NoneType' object has no attribute 'load_state_dict'