In [1]:
import numpy as np
import pandas as pd
import pymysql
import pymysql.cursors as cursors
import torch
import multiprocessing

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)
from datasets import load_dataset, load_metric


In [2]:
# !pip install datasets


In [3]:
# 데이터 베이스 연결하기
conn = pymysql.connect(
    host="1.251.203.204",
    user="root",
    password="kdt5",
    db="Team4",
    charset="utf8",
    port=33065,
)

curs = conn.cursor()

# 검색 명령어 사용
sql = "SELECT en.text as en, ru.text as ru FROM language_en en join language_ru ru on en.id = ru.id;"
curs.execute(sql)
result = curs.fetchall()
print("현재 테이블의 데이터수는 총 {}개 입니다.".format(len(result)))
endeDF = pd.read_sql(sql, conn)

# 데이터베이스 연결 종료
conn.close()
endeDF.head()


현재 테이블의 데이터수는 총 31102개 입니다.


  endeDF = pd.read_sql(sql, conn)


Unnamed: 0,en,ru
0,In the beginning God created the heaven and th...,В начале сотворил Бог небо и землю.
1,"And the earth was without form, and void; and ...","Земля же была безвидна и пуста, и тьма над без..."
2,"And God said, Let there be light: and there wa...",И сказал Бог: да будет свет. И сталсвет.
3,"And God saw the light, that it was good: and G...","И увидел Бог свет, что он хорош, и отделил Бог..."
4,"And God called the light Day, and the darkness...","И назвал Бог свет днем, а тьму ночью. И был ве..."


In [5]:
num_train = 30000
num_valid = 1000
num_test = 1102

bible_trainDF = endeDF.iloc[:num_train]
bible_validDF = endeDF.iloc[num_train : num_train + num_valid]
bible_testDF = endeDF.iloc[num_train + num_valid :]

bible_trainDF.to_csv("./data/train.tsv", sep="\t", index=False)
bible_validDF.to_csv("./data/valid.tsv", sep="\t", index=False)
bible_testDF.to_csv("./data/test.tsv", sep="\t", index=False)

data_files = {
    "train": "./data/train.tsv",
    "valid": "./data/valid.tsv",
    "test": "./data/test.tsv",
}

dataset = load_dataset("csv", data_files=data_files, delimiter="\t")


Generating train split: 0 examples [00:00, ? examples/s]

Generating valid split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [6]:
dataset


DatasetDict({
    train: Dataset({
        features: ['en', 'ru'],
        num_rows: 30000
    })
    valid: Dataset({
        features: ['en', 'ru'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['en', 'ru'],
        num_rows: 102
    })
})

In [7]:
print(dataset["train"][:3]["en"])
print(dataset["train"][:3]["ru"])


['In the beginning God created the heaven and the earth.', 'And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.', 'And God said, Let there be light: and there was light.']
['В начале сотворил Бог небо и землю.', 'Земля же была безвидна и пуста, и тьма над бездною, и Дух Божий носился над водою.', 'И сказал Бог: да будет свет. И сталсвет.']


In [8]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device


'cuda'

In [9]:
model_ckpt = "Helsinki-NLP/opus-mt-en-ru"
max_token_length = 128


In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/803k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.60M [00:00<?, ?B/s]

In [11]:
def convert_examples_to_features(tokenizer, max_token_length, examples):
    model_inputs = tokenizer(
        examples["en"],
        text_target=examples["ru"],
        max_length=max_token_length,
        truncation=True,
    )

    return model_inputs


In [12]:
NUM_CPU = multiprocessing.cpu_count()
NUM_CPU


20

In [13]:
from functools import partial

tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
partial_tokenize_function = partial(
    convert_examples_to_features, tokenizer, max_token_length
)
tokenized_datasets = dataset.map(
    partial_tokenize_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
    num_proc=NUM_CPU,
)


Map (num_proc=20):   0%|          | 0/30000 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/1000 [00:00<?, ? examples/s]

Map (num_proc=20):   0%|          | 0/102 [00:00<?, ? examples/s]

In [14]:
tokenized_datasets


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 30000
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 102
    })
})

In [15]:
print("원 데이터    :", dataset["train"][0]["en"])
print("처리 후 데이터:", tokenized_datasets["train"][0]["input_ids"])
print(
    "토큰화       :",
    tokenizer.convert_ids_to_tokens(tokenized_datasets["train"][0]["input_ids"]),
)

print("\n")
print("원 데이터    :", dataset["train"][0]["ru"])
print(
    "처리 후 데이터:",
    tokenizer.convert_ids_to_tokens(tokenized_datasets["train"][0]["labels"]),
)
print("토큰화       :", tokenized_datasets["train"][0]["labels"])


원 데이터    : In the beginning God created the heaven and the earth.
처리 후 데이터: [90, 4, 3729, 805, 2966, 4, 13254, 8, 4, 3447, 3, 0]
토큰화       : ['▁In', '▁the', '▁beginning', '▁God', '▁created', '▁the', '▁heaven', '▁and', '▁the', '▁earth', '.', '</s>']


원 데이터    : В начале сотворил Бог небо и землю.
처리 후 데이터: ['▁В', '▁начале', '▁сотворил', '▁Бог', '▁небо', '▁и', '▁землю', '.', '</s>']
토큰화       : [49, 4790, 13400, 4220, 16849, 7, 7436, 3, 0]


In [16]:
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt).to(device)


pytorch_model.bin:   0%|          | 0.00/307M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [17]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


In [18]:
# 콜레이터를 돌리면 알아서 패딩하고 쉬프트 시킨다.
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])


In [19]:
# !pip install sacrebleu


In [20]:
import evaluate

metric = evaluate.load("sacrebleu")


In [21]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds

    if isinstance(preds, tuple):
        preds = preds[0]

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    return result


In [26]:
training_args = Seq2SeqTrainingArguments(
    output_dir="chkpt",
    learning_rate=0.0005,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    save_steps=500,
    save_total_limit=2,
    evaluation_strategy="epoch",
    logging_strategy="no",
    predict_with_generate=True,
    fp16=False,
    gradient_accumulation_steps=2,
    report_to="none",  # Wandb 로그 끄기
)


In [27]:
import transformers

transformers.__version__


'4.40.0'

In [28]:
trainer = Seq2SeqTrainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [29]:
trainer.train()


  0%|          | 0/9370 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 2.3269896507263184, 'eval_bleu': 17.35810195039809, 'eval_runtime': 37.4772, 'eval_samples_per_second': 26.683, 'eval_steps_per_second': 1.681, 'epoch': 1.0}


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 2.251943349838257, 'eval_bleu': 18.622530558145883, 'eval_runtime': 36.9537, 'eval_samples_per_second': 27.061, 'eval_steps_per_second': 1.705, 'epoch': 2.0}


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 2.209239959716797, 'eval_bleu': 19.348418823421994, 'eval_runtime': 36.696, 'eval_samples_per_second': 27.251, 'eval_steps_per_second': 1.717, 'epoch': 3.0}


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 2.218851327896118, 'eval_bleu': 19.647974928497593, 'eval_runtime': 35.885, 'eval_samples_per_second': 27.867, 'eval_steps_per_second': 1.756, 'epoch': 4.0}


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 2.2853922843933105, 'eval_bleu': 19.85504492661564, 'eval_runtime': 36.4773, 'eval_samples_per_second': 27.414, 'eval_steps_per_second': 1.727, 'epoch': 5.0}


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 2.355679988861084, 'eval_bleu': 20.130858225293125, 'eval_runtime': 37.6463, 'eval_samples_per_second': 26.563, 'eval_steps_per_second': 1.673, 'epoch': 6.0}


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 2.44789981842041, 'eval_bleu': 20.286306121146406, 'eval_runtime': 37.7584, 'eval_samples_per_second': 26.484, 'eval_steps_per_second': 1.669, 'epoch': 7.0}


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 2.548858880996704, 'eval_bleu': 20.293590848456287, 'eval_runtime': 38.2129, 'eval_samples_per_second': 26.169, 'eval_steps_per_second': 1.649, 'epoch': 8.0}


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 2.6213648319244385, 'eval_bleu': 20.38367062475198, 'eval_runtime': 38.7869, 'eval_samples_per_second': 25.782, 'eval_steps_per_second': 1.624, 'epoch': 9.0}


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 2.6558945178985596, 'eval_bleu': 21.06103300369076, 'eval_runtime': 37.1734, 'eval_samples_per_second': 26.901, 'eval_steps_per_second': 1.695, 'epoch': 9.99}
{'train_runtime': 1375.1252, 'train_samples_per_second': 218.162, 'train_steps_per_second': 6.814, 'train_loss': 0.9484577191168623, 'epoch': 9.99}


TrainOutput(global_step=9370, training_loss=0.9484577191168623, metrics={'train_runtime': 1375.1252, 'train_samples_per_second': 218.162, 'train_steps_per_second': 6.814, 'total_flos': 4901843399344128.0, 'train_loss': 0.9484577191168623, 'epoch': 9.994666666666667})

In [30]:
trainer.save_model("./result")


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[62517]], 'forced_eos_token_id': 0}


In [31]:
model_dir = "./result"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)

model.cpu()


MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(62518, 512, padding_idx=62517)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(62518, 512, padding_idx=62517)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLU()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,), eps=1e-05

In [32]:
input_text = [
    "In the beginning God created the heaven and the earth.",
    "And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.",
    "I'm hungry.",
]


In [33]:
inputs = tokenizer(input_text, return_tensors="pt", padding=True)




In [34]:
frenchs = model.generate(
    **inputs,
    max_length=max_token_length,
    num_beams=5,
)

frenchs.shape


torch.Size([3, 31])

In [35]:
[
    tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(french))
    for french in frenchs
]


['<pad> В начале сотворил Бог небо и землю.</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>',
 '<pad> Земля же была безвидна и пуста, и тьма над бездною. И Дух Божий носился над водою.</s>',
 '<pad> алчу с голодным.</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>']

In [36]:
pred_text = tokenizer.batch_decode(frenchs, skip_special_tokens=True)[0]
pred_text


['В начале сотворил Бог небо и землю.',
 'Земля же была безвидна и пуста, и тьма над бездною. И Дух Божий носился над водою.',
 'алчу с голодным.']