In [None]:
# Mount Google Drive at /gdrive; the CSV files and model checkpoints used below live there.
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

# Hugging Face Transformers supplies the tokenizer and BART classes imported in the next cell.
!pip install transformers

# Mecab download
# KoNLPy's Mecab wrapper is installed via the install script from the KoNLPy repository.
! apt-get install -y openjdk-8-jdk python3-dev
! pip install konlpy "tweepy<4.0.0"
! /bin/bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

In [None]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

from tqdm import tqdm

from transformers import AutoTokenizer, PreTrainedTokenizerFast
from transformers import BartTokenizerFast, BartTokenizer, BartForConditionalGeneration
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

import torch
from torch.optim import Adam, AdamW
from torch.utils.data import DataLoader,TensorDataset,RandomSampler

from konlpy.tag import Mecab

import gc

In [None]:
# Fix every random seed (Python, NumPy, PyTorch CPU and CUDA) for reproducibility.
seed_val = 42
for _seed_fn in (random.seed,
                 np.random.seed,
                 torch.manual_seed,
                 torch.cuda.manual_seed_all):
    _seed_fn(seed_val)

In [None]:
# Read the four CSV splits from the mounted Drive folder.
test2_data = pd.read_csv('/gdrive/MyDrive/goorm/03.translator/datas/test2.csv')
test1_data = pd.read_csv('/gdrive/MyDrive/goorm/03.translator/datas/test.csv')
dev_data = pd.read_csv('/gdrive/MyDrive/goorm/03.translator/datas/dev.csv')
train_data = pd.read_csv('/gdrive/MyDrive/goorm/03.translator/datas/train.csv')

# Report the number of rows in each split (labels are printed in Korean).
for _label, _frame in (
    ("학습 데이터 개수", train_data),
    ("검증 데이터 개수", dev_data),
    ("첫 번째 평가 데이터 개수", test1_data),
    ("두 번째 평가 데이터 개수", test2_data),
):
    print(f"{_label}: {len(_frame)}")

학습 데이터 개수: 150000
검증 데이터 개수: 10000
첫 번째 평가 데이터 개수: 10000
두 번째 평가 데이터 개수: 10000


In [None]:
def add_padding_data(inputs, length=None, pad_value=None):
    """Right-pad or truncate a token-id sequence to a fixed length.

    Args:
        inputs: Sequence of token ids.
        length: Target length. When omitted, the module-level ``max_len``
            (assigned by ``data_tokenize``) is used.
        pad_value: Value appended to short sequences. When omitted, the
            module-level ``pad_index`` (assigned by ``data_tokenize``) is used.

    Returns:
        A new list with exactly ``length`` elements: the first ``length``
        items of ``inputs`` followed by as many ``pad_value`` entries as needed.
    """
    target_len = max_len if length is None else length
    fill = pad_index if pad_value is None else pad_value

    # Plain list arithmetic keeps the element type of the input. Going through
    # np.concatenate returned numpy scalars for padded rows only, and turned an
    # empty input into a float array.
    tokens = list(inputs)[:target_len]
    return tokens + [fill] * (target_len - len(tokens))

def add_ignored_data(inputs, length=None, ignore_value=None):
    """Right-pad or truncate a label sequence, filling with the ignore value.

    Args:
        inputs: Sequence of label token ids.
        length: Target length. When omitted, the module-level ``max_len``
            (assigned by ``data_tokenize``) is used.
        ignore_value: Value appended to short sequences. When omitted, the
            module-level ``ignore_index`` (assigned by ``data_tokenize``) is used.

    Returns:
        A new list with exactly ``length`` elements: the first ``length``
        items of ``inputs`` followed by as many ``ignore_value`` entries as needed.
    """
    target_len = max_len if length is None else length
    fill = ignore_index if ignore_value is None else ignore_value

    # Plain list arithmetic keeps the element type of the input. Going through
    # np.concatenate returned numpy scalars for padded rows only, and turned an
    # empty input into a float array.
    labels = list(inputs)[:target_len]
    return labels + [fill] * (target_len - len(labels))

def add_mask_data(input, pad_value=None):
    """Build an attention mask: 1 for real tokens, 0 for padding.

    Args:
        input: Tensor (or sequence convertible to a tensor) of token ids.
        pad_value: Padding id. When omitted, the module-level ``pad_index``
            (assigned by ``data_tokenize``) is used.

    Returns:
        A (nested) list of ints with the same shape as ``input``.
    """
    fill = pad_index if pad_value is None else pad_value

    # A single comparison replaces the previous two-pass torch.where rewrite.
    # That version first wrote 1 for every real token and then zeroed every
    # value equal to the pad id, so a pad id of 1 produced an all-zero mask.
    return torch.as_tensor(input).ne(fill).long().tolist()

In [None]:
def data_tokenize(df, tokenizer, batch_size=64):
    """Tokenize an en/ko parallel frame and wrap it in a shuffled DataLoader.

    Side effect: assigns the module-level ``pad_index``, ``ignore_index`` and
    ``max_len`` that ``add_padding_data`` / ``add_ignored_data`` read.

    Args:
        df: DataFrame with ``en`` (source) and ``ko`` (target) text columns.
        tokenizer: Callable returning ``{'input_ids': [...]}`` for a list of
            strings and exposing ``pad_token_id``.
        batch_size: Number of examples per batch.

    Returns:
        A ``DataLoader`` over a ``TensorDataset`` whose tensors are, in order:
        input_ids, attention_mask, decoder_input_ids, decoder attention mask,
        labels. Every tensor is padded or truncated to ``max_len`` columns.
    """
    global pad_index, ignore_index, max_len
    pad_index = tokenizer.pad_token_id
    ignore_index = -100

    source_input = tokenizer(df['en'].tolist())['input_ids']
    target_input = tokenizer(df['ko'].tolist())['input_ids']

    # One shared width: the longest tokenized sentence on either side.
    max_len = max(max(len(ids) for ids in source_input),
                  max(len(ids) for ids in target_input))

    input_ids = torch.tensor([add_padding_data(ids) for ids in source_input])
    attention_mask = (input_ids != pad_index).long()

    # Decoder input is the target shifted right by a leading start token (id 0).
    dec_input_ids = torch.tensor([add_padding_data([0] + ids) for ids in target_input])
    # The decoder mask must follow the decoder inputs; it used to be computed
    # from the source ids, which masked the wrong positions.
    dec_attention_mask = (dec_input_ids != pad_index).long()

    # Padded label positions carry ignore_index (-100).
    label_ids = torch.tensor([add_ignored_data(ids) for ids in target_input])

    dataset = TensorDataset(input_ids, attention_mask, dec_input_ids, dec_attention_mask, label_ids)
    dataloader = DataLoader(dataset, batch_size=batch_size, num_workers=2, shuffle=True)

    return dataloader

In [None]:
# Evaluation metric: BLEU score

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, ngrams, brevity_penalty
# Shared Mecab tagger; corpuswise_bleu calls its `morphs` method to split sentences into tokens.
mecab_tagger = Mecab()

def bleu_upto(reference, hypothesis, n_gram):
    """Return ``calc_bleu_ngram`` scores for every order from 1 to ``n_gram``.

    The result is a list whose element ``k`` (0-based) is the score computed
    with n-gram order ``k + 1``.
    """
    return [calc_bleu_ngram(reference, hypothesis, order)
            for order in range(1, n_gram + 1)]


def corpuswise_bleu(predicts, gts, n_gram=4):
    """Tokenize predictions and references with Mecab, then score them.

    Args:
        predicts: Iterable of predicted sentences (strings).
        gts: Iterable of ground-truth sentences (strings).
        n_gram: Highest n-gram order forwarded to ``bleu_upto``.

    Returns:
        Whatever ``bleu_upto`` returns for the tokenized references and
        predictions.
    """
    tokenized_predicts = [list(mecab_tagger.morphs(sentence)) for sentence in predicts]
    tokenized_gts = [list(mecab_tagger.morphs(sentence)) for sentence in gts]

    return bleu_upto(tokenized_gts, tokenized_predicts, n_gram)


def calc_bleu_ngram(reference, hypothesis, n_gram):
    """Average smoothed sentence-level BLEU over paired token lists.

    Args:
        reference: List of tokenized reference sentences.
        hypothesis: List of tokenized hypothesis sentences, paired with
            ``reference`` by position.
        n_gram: Highest n-gram order; orders 1..n_gram get equal weight
            ``1 / n_gram``.

    Returns:
        The sum of the per-sentence scores divided by ``len(reference)``, or
        ``0.0`` when ``reference`` is empty (previously a ZeroDivisionError).
    """
    if len(reference) == 0:
        return 0.0

    weights = (1 / n_gram,) * n_gram
    smoothing = SmoothingFunction().method1

    total = sum(
        (sentence_bleu([refer], hypo, weights, smoothing)
         for refer, hypo in zip(reference, hypothesis)),
        0.0,
    )

    return total / len(reference)

In [None]:
# Function that produces validation scores or test-set predictions
def evaluation(df, model, tokenizer, batch_size=128, length=20, beams=1, method='valid'):
    """Translate ``df['en']`` with ``model`` and score or package the result.

    Args:
        df: DataFrame with an ``en`` column. ``ko`` is also read when
            ``method='valid'`` and ``sid`` when ``method='test'``.
        model: Model whose ``generate`` method produces the output ids.
        tokenizer: Tokenizer used to encode the sources and decode the outputs.
        batch_size: Number of source sentences generated per step.
        length: Passed to ``generate`` as ``max_length``.
        beams: Passed to ``generate`` as ``num_beams``.
        method: ``'valid'`` returns the harmonic mean of the BLEU scores from
            ``corpuswise_bleu`` (also printed); ``'test'`` returns a DataFrame
            with ``sid`` and ``predicts`` columns; any other value returns the
            list of predicted strings.

    Returns:
        See ``method``.
    """
    gc.collect()
    torch.cuda.empty_cache()

    src_input = torch.tensor(tokenizer(df['en'].tolist(), padding=True)['input_ids'])
    # Mark padded positions explicitly so generation does not attend to them.
    src_mask = (src_input != tokenizer.pad_token_id).long()

    pred_dataloader = DataLoader(TensorDataset(src_input, src_mask), batch_size=batch_size)

    target_device = model.device

    # Generation must run with dropout disabled. This function is also called
    # from inside the training loop, so remember the mode and restore it.
    was_training = model.training
    model.eval()

    decoded = []
    try:
        with torch.no_grad():
            for batch_ids, batch_mask in tqdm(pred_dataloader):
                generated = model.generate(batch_ids.to(target_device),
                                           attention_mask=batch_mask.to(target_device),
                                           do_sample=False, num_beams=beams,
                                           num_return_sequences=1, max_length=length)
                # Decode batch by batch: batches can come back with different
                # sequence lengths, which made a single torch.cat over all
                # outputs fail.
                decoded.extend(tokenizer.decode(seq, skip_special_tokens=True)
                               for seq in generated)
    finally:
        model.train(was_training)

    predictions = []
    for text in decoded:
        # Keep text up to and including the first '.', or, when there is no
        # '.', up to and including the first '?'.
        for mark in ('.', '?'):
            if mark in text:
                text = text[:text.index(mark) + 1]
                break
        predictions.append(text)

    if method == 'valid':
        answers = df['ko'].tolist()
        bleu_scores = corpuswise_bleu(predictions, answers)
        # The harmonic mean is undefined when a score is 0; report 0.0 rather
        # than raising ZeroDivisionError.
        if all(score > 0 for score in bleu_scores):
            HM = len(bleu_scores) / sum(1 / score for score in bleu_scores)
        else:
            HM = 0.0
        print('harmonic mean : ', round(HM, 5))

        predictions = HM

    elif method == 'test':
        new_df = pd.DataFrame()
        new_df['sid'] = df['sid']
        new_df['predicts'] = predictions
        predictions = new_df

    return predictions

In [None]:
# Load the tokenizer published under the 'gogamza/kobart-base-v1' checkpoint name.
tokenizer = PreTrainedTokenizerFast.from_pretrained('gogamza/kobart-base-v1')

Downloading:   0%|          | 0.00/666k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/111 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

In [None]:
# Build the padded, batched DataLoaders for the training and validation frames.
# The training frame loaded from train.csv is named `train_data`; the bare
# name `train` is never defined and raised a NameError here.
train_dataloader = data_tokenize(train_data, tokenizer, batch_size=128)
dev_dataloader = data_tokenize(dev_data, tokenizer, batch_size=128)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pretrained 'gogamza/kobart-base-v1' weights and move the model to that device.
model = BartForConditionalGeneration.from_pretrained('gogamza/kobart-base-v1').to(device)

Downloading:   0%|          | 0.00/473M [00:00<?, ?B/s]

In [None]:
gc.collect()
torch.cuda.empty_cache()

epochs = 10

optimizer = AdamW(model.parameters(),
                lr = 1e-4, # learning rate
                eps = 1e-8)

# Total training steps: batches per epoch * number of epochs
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0,
                                            num_training_steps = total_steps)


avg_loss = []   # mean training loss per epoch
HM_score = []   # validation harmonic-mean BLEU per epoch

for epoch in range(1, epochs+1):
    # =====================================
    #               Training
    # =====================================
    model.train()

    train_loss = 0.0

    for batchs in tqdm(train_dataloader):
        # Tensor order matches the TensorDataset built in data_tokenize.
        batch = tuple(b.to(device) for b in batchs)
        src_ids, src_mask, tgt_ids, tgt_mask, tgt_label = batch

        optimizer.zero_grad()

        output = model(input_ids = src_ids,
                       attention_mask = src_mask,
                       decoder_input_ids = tgt_ids,
                       decoder_attention_mask = tgt_mask,
                       labels = tgt_label)

        loss = output['loss']

        loss.backward()

        # Clip the global gradient norm to 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

        scheduler.step()

        # Accumulate a plain float. Adding the loss tensor itself kept every
        # step's autograd history referenced for the whole epoch.
        train_loss += loss.item()

    total_loss = train_loss / len(train_dataloader)
    print(f'epoch - {epoch} Train_Loss : ', total_loss)
    avg_loss.append(total_loss)

    model.save_pretrained(f'/gdrive/My Drive/goorm/03.translator/models/gogamza_nmt_lr_1e-4_03m_{epoch}')

    # =====================================
    #              Validation
    # =====================================
    # Generate with dropout disabled; model.train() at the top of the loop
    # re-enables it for the next epoch.
    model.eval()
    HM = evaluation(dev_data, model, tokenizer, method='valid')
    HM_score.append(HM)

# Generate the sample (submission) data

In [None]:
# Reload the tokenizer and the fine-tuned checkpoint stored at `path` for inference.
path = '/gdrive/My Drive/goorm/03.translator/models/gogamza_nmt_lr_1e-4_03m_5'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = PreTrainedTokenizerFast.from_pretrained('gogamza/kobart-base-v1')
model = BartForConditionalGeneration.from_pretrained(path).to(device)

In [None]:
# Suffix shared by both output CSV files.
file_name = 'sumission'

# First test set: beam search with 4 beams, max_length 35.
test1_sample = evaluation(test1_data, model, tokenizer, method='test', beams=4, length=35)
test1_sample.to_csv(
    f'/gdrive/MyDrive/goorm/03.translator/samples/test1_{file_name}.csv',
    encoding='utf-8',
    index=False,
)

# Second test set: beam search with 4 beams, max_length 34.
test2_sample = evaluation(test2_data, model, tokenizer, method='test', beams=4, length=34)
test2_sample.to_csv(
    f'/gdrive/MyDrive/goorm/03.translator/samples/test2_{file_name}.csv',
    encoding='utf-8',
    index=False,
)

100%|██████████| 79/79 [02:22<00:00,  1.81s/it]
100%|██████████| 79/79 [02:39<00:00,  2.01s/it]
