In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input mount and echo its full path.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/vietnamese-poem-dataset/poems_dataset.csv


In [2]:
import os

# Set TOKENIZERS_PARALLELISM to "false" for this process before any tokenizer is created.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [3]:
# !pip install --upgrade pip
!pip install -q transformers huggingface_hub wandb
!apt-get install git-lfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 81 not upgraded.


In [4]:
from huggingface_hub import notebook_login
# Authenticate with the Hugging Face Hub; later cells push the model and tokenizer there.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Prepare dataset

In [5]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

# Location of the raw poem corpus inside the read-only Kaggle input mount.
POEMS_CSV_PATH = '/kaggle/input/vietnamese-poem-dataset/poems_dataset.csv'

# Read the whole corpus into a DataFrame; the 'content' column is consumed further below.
data = pd.read_csv(POEMS_CSV_PATH)

def build_text_files(data_df, dest_path):
    """Flatten the 'content' column of ``data_df`` into one text file.

    Each entry is converted to ``str``, stripped, and every whitespace
    character is replaced by a single space. Entries are then written back to
    back, each followed by two spaces.

    Args:
        data_df: DataFrame with a 'content' column.
        dest_path: Path of the text file to create (overwritten if present).

    Notes:
        * ``\\s`` also matches newlines, so line breaks inside an entry are
          not preserved in the output file.
        * Missing values are written as the literal string produced by
          ``str()`` (e.g. ``'nan'``), exactly as before.
    """
    cleaned_entries = (
        re.sub(r"\s", " ", str(content).strip())
        for content in data_df['content']
    )
    # Explicit UTF-8 keeps the Vietnamese text independent of the platform's
    # default encoding, and the context manager guarantees the handle is
    # flushed and closed even if writing fails.
    with open(dest_path, 'w', encoding='utf-8') as f:
        # A single join avoids the quadratic cost of growing a string in a loop.
        f.write("".join(entry + "  " for entry in cleaned_entries))

# Fixed seed so the train/test partition is identical on every run.
SPLIT_SEED = 42

# Split the data into training and test sets (15% held out for evaluation)
train, test = train_test_split(data, test_size=0.15, random_state=SPLIT_SEED)

# Write the training set to a text file
build_text_files(train, 'train_dataset.txt')

# Write the test set to a text file
build_text_files(test, 'test_dataset.txt')

print("Train dataset length: " + str(len(train)))
print("Test dataset length: " + str(len(test)))




Train dataset length: 145504
Test dataset length: 25678


# Model + Tokenizer


In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Pretrained checkpoint that is fine-tuned in this notebook.
BASE_MODEL_ID = "danghuy1999/gpt2-viwiki"

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(BASE_MODEL_ID)

# Point at the text files actually written by the dataset-preparation cell
# (the previous 'poem_train.txt' / 'poem_test.txt' names are never created).
train_path = '/kaggle/working/train_dataset.txt'
test_path = '/kaggle/working/test_dataset.txt'

config.json:   0%|          | 0.00/916 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/773k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/431k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

In [7]:
# Check how candidate marker strings are encoded:
#   "<|startoftext|>" is not a known token (it is split into several ids),
#   "<|endoftext|>" maps to a single id, and the empty string yields no ids.
for probe in ("<|startoftext|>", "<|endoftext|>", ""):
    print(tokenizer.encode(probe))

[28, 92, 1472, 1632, 1247, 19862, 92, 30]
[0]
[]


In [8]:
# Register the newline character as an extra token and report the vocabulary size before/after.
vocab_size_before = len(tokenizer)
tokenizer.add_tokens(["\n"])
vocab_size_after = len(tokenizer)
print(vocab_size_before, vocab_size_after, sep="\n")

50257
50258


In [9]:
# Display the tokenizer's repr to inspect its special tokens and the added-token table.
tokenizer

GPT2TokenizerFast(name_or_path='danghuy1999/gpt2-viwiki', vocab_size=50257, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50257: AddedToken("
", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
}

# Resize the model's word embeddings

In [10]:
# Grow the embedding matrix so that every tokenizer id has a row.
model.resize_token_embeddings(len(tokenizer))

# Start the newly added token (last row) from an all-zero embedding.
# Assigning a scalar broadcasts over the whole row, so the hidden size no
# longer has to be hard-coded (it was 768 before).
with torch.no_grad():
    model.transformer.wte.weight[-1, :] = 0.0

print(model.transformer.wte.weight.shape)

torch.Size([50258, 768])


In [11]:
from transformers import TextDataset,DataCollatorForLanguageModeling

def load_dataset(train_path, test_path, tokenizer, block_size=100):
    """Build the train/test datasets and the collator used for causal-LM training.

    Args:
        train_path: Path to the training text file.
        test_path: Path to the evaluation text file.
        tokenizer: Tokenizer passed to both datasets and to the collator.
        block_size: ``block_size`` forwarded to each ``TextDataset``
            (default 100, the value previously hard-coded).

    Returns:
        Tuple ``(train_dataset, test_dataset, data_collator)``.

    NOTE(review): ``TextDataset`` is reported as deprecated by recent
    transformers releases — confirm against the installed version.
    """
    def _text_dataset(file_path):
        # Both splits share identical settings; only the source file differs.
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=block_size,
        )

    train_dataset = _text_dataset(train_path)
    test_dataset = _text_dataset(test_path)

    # mlm=False: the collator is configured without masked-language-model masking.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset, test_dataset, data_collator
# Both text files live in the Kaggle working directory.
WORKING_DIR = '/kaggle/working'
train_path = WORKING_DIR + '/train_dataset.txt'
test_path = WORKING_DIR + '/test_dataset.txt'

# Tokenize both files and create the collator in one call.
train_dataset, test_dataset, data_collator = load_dataset(
    train_path,
    test_path,
    tokenizer,
)



# Initialize Trainer with TrainingArguments and GPT-2 model

In [12]:
# Environment variables for the Weights & Biases logging used by the Trainer (report_to='wandb').
# WANDB_PROJECT: project name the run is logged under.
# WANDB_WATCH=all: presumably enables logging of both gradients and parameters — confirm against the transformers W&B integration docs.
%env WANDB_PROJECT=GPT2-POEM
%env WANDB_WATCH=all

env: WANDB_PROJECT=GPT2-POEM
env: WANDB_WATCH=all


In [13]:
import os

# exist_ok=True keeps the cell re-runnable: without it a second execution
# raises FileExistsError because the directory is already there.
os.makedirs('/kaggle/working/GPT2_Poet', exist_ok=True)

In [14]:
import wandb
from transformers import TrainerCallback
from transformers import Trainer, TrainingArguments,AutoModelWithLMHead,EarlyStoppingCallback

class StopTrainingCallback(TrainerCallback):
    """Trainer callback that requests a stop once a step budget is exceeded.

    Args:
        max_steps: Step budget. Training is flagged to stop at the end of the
            first step whose ``global_step`` is strictly greater than this
            value. Defaults to 1500, the previously hard-coded limit.
    """

    def __init__(self, max_steps=1500):
        self.max_steps = max_steps

    def on_step_end(self, args, state, control, **kwargs):
        """Set ``control.should_training_stop`` once the budget is exceeded."""
        # Strict comparison is kept on purpose: with the default budget the
        # flag is raised at step 1501, i.e. after step 1500 has completed.
        if state.global_step > self.max_steps:
            control.should_training_stop = True

training_args = TrainingArguments(
    # --- Output / Hub ---
    output_dir="/kaggle/working/GPT2_Poet",
    overwrite_output_dir=True,  # overwrite the content of the output directory
    push_to_hub=True,
    hub_model_id="GPT2_Poet",
    save_strategy='steps',
    save_total_limit=10,
    # --- Optimisation ---
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,  # 32 x 2 = 64 sequences per optimizer update per device
    learning_rate=5e-4,
    weight_decay=0.5,
    warmup_steps=1000,
    dataloader_num_workers=2,
    # Label smoothing is intentionally left off for this run (see run_name).
    # --- Evaluation / model selection ---
    evaluation_strategy='steps',
    eval_steps=500,  # number of update steps between two evaluations
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    # --- Logging ---
    report_to='wandb',
    run_name='Run 6 - w/o label smoothing',
    logging_steps=5,
)

# Two stopping mechanisms run side by side: early stopping with a patience of 5,
# and the custom step-budget callback defined in this cell.
training_callbacks = [
    EarlyStoppingCallback(early_stopping_patience=5),
    StopTrainingCallback(),
]

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    callbacks=training_callbacks,
)

In [15]:
# Run fine-tuning. wandb asks for an API key on first use; the returned TrainOutput is shown as the cell result.
trainer.train()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc




Step,Training Loss,Validation Loss
500,5.6605,5.524101
1000,5.1077,5.046876
1500,4.8414,4.75812


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=1501, training_loss=5.545836874995527, metrics={'train_runtime': 3961.1211, 'train_samples_per_second': 207.044, 'train_steps_per_second': 1.617, 'total_flos': 9804626265600000.0, 'train_loss': 5.545836874995527, 'epoch': 1.17})

In [None]:
# Push the fine-tuned model to the 'GPT2_Poet' repository on the Hugging Face Hub.
model.push_to_hub('GPT2_Poet')

In [None]:
# Push the tokenizer to the same 'GPT2_Poet' Hub repository as the model.
tokenizer.push_to_hub('GPT2_Poet')

# Test the model

In [16]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [18]:
from transformers import AutoTokenizer, AutoModelForCausalLM


# Hub repository holding the published fine-tuned checkpoint.
HUB_CHECKPOINT = "tuanle/GPT2_Poet"

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Reload tokenizer and weights from the Hub, then move the model onto the selected device.
tokenizer = AutoTokenizer.from_pretrained(HUB_CHECKPOINT)
model = AutoModelForCausalLM.from_pretrained(HUB_CHECKPOINT)
model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/589 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/773k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/431k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.06M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/13.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.04k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

In [25]:
# Prompt and length bounds (in tokens) for the generated poems.
text = "hôm nay"
min_length = 60
max_length = 100

# Encode the prompt as a tensor on the same device as the model.
input_ids = tokenizer.encode(text, return_tensors='pt').to(device)

# Decoding settings collected in one place: sampling combined with beam search,
# nucleus filtering, and no repeated bigrams; three sequences are returned.
generation_kwargs = dict(
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    max_length=max_length,
    min_length=min_length,
    top_p=0.8,
    num_beams=10,
    no_repeat_ngram_size=2,
    num_return_sequences=3,
)
sample_outputs = model.generate(input_ids, **generation_kwargs)

# Decode and print each returned sequence, numbered from 1.
for rank, token_ids in enumerate(sample_outputs, start=1):
    poem = tokenizer.decode(token_ids.tolist(), skip_special_tokens=True)
    print(">> Generated text {}\n\n{}".format(rank, poem))
    print('\n---')

>> Generated text 1

hôm nay trời đã sang đông
nhớ mùa thu đến bên cầu đợi chờ
đêm thu hoa nở bên đường
mênh mông sóng vỗ mênh mông cõi lòng
vần thơ lục bát thẫn thờ
mà nghe tiếng sáo vi vu lời ca
hỏi rằng ai ở đâu đây
thưa rằng ta ở đây đây đâu mà
một mình một bóng trăng vàng
người ta gặp gỡ một mình gặp nhau
cớ sao gặp lại một người
để cho ai lại gặp

---
>> Generated text 2

hôm nay trời đã sang đông
nhớ mùa thu đến bên cầu đợi chờ
đêm thu hoa nở bên đường
mênh mông sóng vỗ mênh mông cõi lòng
vần thơ lục bát thẫn thờ
mà nghe tiếng sáo vi vu lời ca
hỏi rằng ai ở đâu đây
thưa rằng ta ở đây đây đâu mà
một mình một bóng trăng vàng
người ta gặp gỡ một mình gặp nhau
cớ sao gặp lại một người
để cho mình lại gặp

---
>> Generated text 3

hôm nay trời đã sang đông
nhớ mùa thu đến bên cầu đợi chờ
đêm thu hoa nở bên đường
mênh mông sóng vỗ mênh mông cõi lòng
vần thơ lục bát thẫn thờ
mà nghe tiếng sáo vi vu lời ca
hỏi rằng ai ở đâu đây
thưa rằng ta ở đây đây đâu mà
một mình một bóng trăng vàng
