# KorGPT2 Lyric Fine-Tuning Tutorial
https://github.com/MrBananaHuman/KorGPT2Tutorial

In [1]:
!pip -q install tqdm==4.46.0
!pip -q install tokenizers==0.7.0
!pip -q install torch==1.5.0
!pip -q install transformers==2.11.0
!pip -q install gdown

[33mYou are using pip version 19.0.3, however version 20.2b1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 19.0.3, however version 20.2b1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 19.0.3, however version 20.2b1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 19.0.3, however version 20.2b1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[33mYou are using pip version 19.0.3, however version 20.2b1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


## Download the pre-trained model
* https://drive.google.com/drive/folders/124Uux07pym2YaCeQKQWNhzhLNeIlLm7r?usp=sharing 

In [2]:
!mkdir -p KorGPT-2SampleModel
!gdown -O ./KorGPT-2SampleModel/pytorch_model.bin --id 1kX_dB05dkLRgxJkqoHidrT2OFYHGYWPF

Downloading...
From: https://drive.google.com/uc?id=1kX_dB05dkLRgxJkqoHidrT2OFYHGYWPF
To: /Users/hunkim/work/KorGPT2Tutorial/KorGPT-2SampleModel/pytorch_model.bin
516MB [00:09, 55.3MB/s]


In [3]:
# tokenizer code
!pygmentize new_tokenizer.py

[34mfrom[39;49;00m [04m[36mtokenizers[39;49;00m[04m[36m.[39;49;00m[04m[36mimplementations[39;49;00m [34mimport[39;49;00m SentencePieceBPETokenizer
[34mfrom[39;49;00m [04m[36mtokenizers[39;49;00m[04m[36m.[39;49;00m[04m[36mprocessors[39;49;00m [34mimport[39;49;00m BertProcessing

[34mfrom[39;49;00m [04m[36mtransformers[39;49;00m[04m[36m.[39;49;00m[04m[36mtokenization_utils[39;49;00m [34mimport[39;49;00m PreTrainedTokenizer, PreTrainedTokenizerFast

[34mimport[39;49;00m [04m[36mjson[39;49;00m

[34mclass[39;49;00m [04m[32mMyTokenizer[39;49;00m():

    [34mdef[39;49;00m [32m__init__[39;49;00m([36mself[39;49;00m, vocab_file_path, merge_file_path):
        [36mself[39;49;00m.tokenizer = SentencePieceBPETokenizer(vocab_file_path, merge_file_path)
        [36mself[39;49;00m.unknown_token = [36mself[39;49;00m.tokenizer.token_to_id([33m"[39;49;00m[33m<unk>[39;49;00m[33m"[39;49;00m)
        [36mself[39;49;00m._pad_token = [33m"

## Loading the model

In [4]:
from transformers import GPT2LMHeadModel, GPT2Config, AdamW
from new_tokenizer import MyTokenizer
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

vocab_file_path = './tokenizer/vocab.json'
merge_file_path = './tokenizer/merges.txt'
model_dir = './KorGPT-2SampleModel/pytorch_model.bin'

tokenizer = MyTokenizer(vocab_file_path, merge_file_path)
config = GPT2Config(vocab_size=52000)
model = GPT2LMHeadModel(config)
  
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model.load_state_dict(torch.load(model_dir, map_location=device), strict=False)
model.to(device).eval()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(52000, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): Laye

In [5]:
ATTR_TO_SPECIAL_TOKEN = ['<song>', '</song>']

def add_special_tokens_(model, tokenizer):
    orig_num_tokens = tokenizer.get_vocab_size()
    tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
    num_added_tokens = len(ATTR_TO_SPECIAL_TOKEN)
    model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens + 1)

add_special_tokens_(model, tokenizer)
b_song = tokenizer.convert_tokens_to_ids('<song>')
e_song = tokenizer.convert_tokens_to_ids('</song>')

## Line by line dataset

In [6]:
class LyricDataSet(Dataset):
    def __init__(self, file_path):
        self.data = []
        self.file_path = file_path
        
    def split_songs(self, lines):
        songs = []
        single_song = []
        for line in lines:
            line = line.strip()
            if line == '':
                if len(single_song) > 5:
                    songs.append(single_song)
                single_song = []
            else:
                single_song.append(line)
        return songs
    
    def load_data(self):
        lyric_file = open(self.file_path, 'r', encoding='utf-8')
        lyric_lines = lyric_file.readlines()
        lyric_file.close()
        
        song_list = self.split_songs(lyric_lines)
        for song in song_list:
            song_data = ['<song>']
            for line in song:
                tokenized_line = ['<s>'] + tokenizer.tokenize(line) + ['</s>']
                if len(song_data) + len(tokenized_line) < 1024:
                    song_data += tokenized_line
                else:
                    break
            song_data += ['</song>']
            padded_song_data = song_data + ['<pad>'] * (1024 - len(song_data))
            self.data.append(torch.tensor(tokenizer.convert_tokens_to_ids(padded_song_data)).unsqueeze(0))
                
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        item = self.data[index]
        return item


        

lyric_file_path = 'lyric_data/preprocessed_data.txt'
lyric_data = LyricDataSet(lyric_file_path)
lyric_data.load_data()
lyric_data_loader = DataLoader(lyric_data, batch_size=4, shuffle=True)

## Find tuning/training

In [7]:
optimizer = AdamW(model.parameters(), lr=1e-4, correct_bias=True)

epochs = 5
count = 0

avg_loss = (0.0, 0.0)
for epoch in range(epochs):
	for data in lyric_data_loader:
		optimizer.zero_grad()
		data = data.transpose(1,0)
		data = data.to(device)
		model = model.to(device)

		outputs = model(data, labels=data)
		loss, logits = outputs[:2]
		loss = loss.to(device)
		loss.backward()
		avg_loss = (avg_loss[0] * 0.99 + loss, avg_loss[1] * 0.99 + 1.0)
		optimizer.step()
		count+=1

	print('epoch no.{0} train no.{1}  loss = {2:.5f} avg_loss = {3:.5f}' . format(epoch, count, loss, avg_loss[0] / avg_loss[1]))
	


epoch no.0 train no.1  loss = 7.05945 avg_loss = 7.05945
epoch no.1 train no.2  loss = 4.58444 avg_loss = 5.81573
epoch no.2 train no.3  loss = 1.98967 avg_loss = 4.52753
epoch no.3 train no.4  loss = 1.82610 avg_loss = 3.84196
epoch no.4 train no.5  loss = 1.74568 avg_loss = 3.41424


In [8]:
# Save the mode 
from os import path

torch.save(model.state_dict(), 
    path.join(path.dirname(model_dir), 'my_lyric_model.bin'))

In [9]:
bos = tokenizer.convert_tokens_to_ids('<s>')
eos = tokenizer.convert_tokens_to_ids('</s>')
pad = tokenizer.convert_tokens_to_ids('<pad>')
unk = tokenizer.convert_tokens_to_ids('<unk>')

def add_special_tokens_(model, tokenizer):
    orig_num_tokens = tokenizer.get_vocab_size()
    tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
    num_added_tokens = len(ATTR_TO_SPECIAL_TOKEN)
    model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens + 1)

add_special_tokens_(model, tokenizer)
b_song = tokenizer.convert_tokens_to_ids('<song>')
e_song = tokenizer.convert_tokens_to_ids('</song>')

def encoding(text):
    tokens = ['<song>', '<s>'] + tokenizer.tokenize(text)
    return torch.tensor(tokenizer.convert_tokens_to_ids(tokens)).unsqueeze(0)

def decoding(ids):
    return tokenizer.convert_ids_to_tokens(ids[0])

input_ids = encoding('하늘을 날아')

sample_outputs = model.generate(
    input_ids,
    do_sample=True, 
    max_length=1024, 
    top_k=50, 
    top_p=0.95, 
    eos_token_id=e_song,
    early_stopping=True,
    bad_words_ids=[[unk]]
)
print(decoding(sample_outputs.tolist()))

Setting `pad_token_id` to 52001 (first `eos_token_id`) to generate sequence
<song><s> 하늘을 날아야 한다고 말한다.</s><s> 그리고 얼마 후 어느 날, 갑자기 </s><s> </s><s> 시몽, 마블의 첫 등장에 이어서 등장한 카드.</s><s> </s><s> 첫 등장은 4화부터 등장.</s><s> 시몽을 위해 시몽에게 도움을 요청할 때 시몽의 적인 효과를 사용해 시몽을 소환하는 것이 주된 포인트였다.</s><s> 시몽의 등장은  시몽의 등장이 아니라며 시몽의 등장을 위해 시몽에게 연락을 받고 시몽의 등장에 들어간다.</s><s> 시몽의 등장은 4화에서 시몽이 재등장한다.</s><s> 시몽은 재등장.</s><s> 시몽의 등장은 6화에서 시몽이 재등장한다.</s><s> 시몽이 재등장하면서 시몽을 재등장시킨다.</s><s> 시몽이 재등장하면서 시몽의 등장은 4화에서 시몽의 등장은 5화에서 시몽이 재등장한다.</s><s> 시몽의 등장은 6화에서 시몽에게 첫 등장이 확정되었다.</s><s> 시몽은 재등장했으나 시몽의 등장이 3화에서 시몽에게 첫 등장을 하게 되자 시몽의 등장이 확정되고 시몽이 재등장한다.</s><s> 시몽이 재등장하자 시몽은 재등장하지만 시몽에게 등장을 하지 않는다.</s><s> 시몽이 재등장한다.</s><s> 시몽은 재등장하지만 시몽이 재등장해 시몽이 재등장한다.</s><s> 시몽은 재등장한다.</s><s> 시몽에게 등장이지만 시몽의 등장은 5화에서 시몽을 재등장한다.</s><s> 시몽에게 등장은 12화에서 시몽에게 재등장을 하면서 시몽의 등장은 9화에서 시몽에게 등장은 12화, 시몽이 재등장한다.</s><s> 시몽이 재등장해 시몽이 재등장해 시몽을 재등장.</s><s> 시몽을 재등장하고 시몽을 재등장한다.</s><s> 시몽을 재등장해 시몽의 등장은 6화에서 시몽에게 처음으로 등장은 23화에서 시몽이 재등장해 시몽이 재등장해 시몽의 등장은 7화에서 시몽이 재등장해 시몽이 재등장해

## Decoding using the saved model

In [10]:
!mkdir -p KorGPT-2SampleModel
!gdown -O ./KorGPT-2SampleModel/lyric_model.bin --id 1nopu647K2KwnMAc97CNL2owPKA4GsF22

Downloading...
From: https://drive.google.com/uc?id=1nopu647K2KwnMAc97CNL2owPKA4GsF22
To: /Users/hunkim/work/KorGPT2Tutorial/KorGPT-2SampleModel/lyric_model.bin
516MB [00:16, 31.9MB/s]


In [11]:
from transformers import GPT2LMHeadModel, GPT2Config
import torch

ATTR_TO_SPECIAL_TOKEN = ['<song>', '</song>']

vocab_file_path = './tokenizer/vocab.json'
merge_file_path = './tokenizer/merges.txt'
model_dir = './KorGPT-2SampleModel/lyric_model.bin'

tokenizer = MyTokenizer(vocab_file_path, merge_file_path)
bos = tokenizer.convert_tokens_to_ids('<s>')
eos = tokenizer.convert_tokens_to_ids('</s>')
pad = tokenizer.convert_tokens_to_ids('<pad>')
unk = tokenizer.convert_tokens_to_ids('<unk>')

config = GPT2Config(vocab_size=52003, resid_pdrop=0, embd_pdrop=0, attn_pdrop=0, summary_first_dropout=0)
model = GPT2LMHeadModel(config)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.load_state_dict(torch.load(model_dir, map_location=device), strict=False)
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(52003, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0, inplace=False)
    (h): ModuleList(
      (0): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0, inplace=False)
          (resid_dropout): Dropout(p=0, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0, inplace=False)
        )
      )
      (1): Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0, inplace=False)
          (resid_dropout): Dropout(p=0, inplace=False)
        )
        (ln_2): LayerNorm((768,)

In [12]:


def add_special_tokens_(model, tokenizer):
    orig_num_tokens = tokenizer.get_vocab_size()
    tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
    num_added_tokens = len(ATTR_TO_SPECIAL_TOKEN)
    model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens + 1)

add_special_tokens_(model, tokenizer)
b_song = tokenizer.convert_tokens_to_ids('<song>')
e_song = tokenizer.convert_tokens_to_ids('</song>')

def encoding(text):
    tokens = ['<song>', '<s>'] + tokenizer.tokenize(text)
    return torch.tensor(tokenizer.convert_tokens_to_ids(tokens)).unsqueeze(0)

def decoding(ids):
    return tokenizer.convert_ids_to_tokens(ids[0])

input_ids = encoding('우리는 오늘')

sample_outputs = model.generate(
    input_ids,
    do_sample=True, 
    max_length=1024, 
    top_k=50, 
    top_p=0.95, 
    eos_token_id=e_song,
    early_stopping=True,
    bad_words_ids=[[unk]]
)
print(decoding(sample_outputs.tolist()))



Setting `pad_token_id` to 52001 (first `eos_token_id`) to generate sequence
<song><s> 우리는 오늘 밤도 오늘 밤</s><s> 함께 걷던 이 곳</s><s> 우리 둘 사이엔 아직 많은 날들이</s><s> 우리 둘의 추억을 함께했던 추억은 사라져 버렸어</s><s> 우리의 사랑은 우리의 우정</s><s> 우리 둘이 함께했던 기억엔</s><s> 우리의 추억이 남아 있어요</s><s> 우리의 추억을 함께했던 시간엔</s><s> 우리의 추억이 남아 있어요</s><s> 우리 둘은 함께한 기억이 남아 있어요</s><s> 우리의 추억을 함께했던 시간엔</s><s> 우리의 추억이 남아 있어요</s><s> 우리의 추억을 함께했던 시간엔</s><s> 우리의 추억이 남아 있어요</s><s> 우리의 추억을 함께했던 시간엔</s><s> 우리의 추억이 남아 있어요</s></song>
