In [1]:
# !pip install --quiet pytorch_lightning
# !pip install --quiet transformers

In [2]:
import os

# Enumerate GPUs in PCI bus order and make only device index 0 visible.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "0",
    }
)
# Optional CUDA debugging switches, left disabled:
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
# os.environ['TORCH_USE_CUDA_DSA'] = '1'

In [3]:
import pandas as pd
import numpy as np
import torch
from pathlib import Path
from sklearn.model_selection import train_test_split
import textwrap
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning import Trainer

from transformers import(
    AdamW,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

In [4]:
from peft import PeftModel, LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

In [5]:
from datasets import Dataset, load_metric

In [43]:
torch.cuda.current_device()

0

In [7]:
def get_device_and_set_seed(seed):
    """Seed every random number generator the notebook uses and pick a device.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices) and switches cuDNN to deterministic, non-benchmarking mode.

    Args:
        seed: Integer seed applied to every generator.

    Returns:
        ``torch.device("cuda")`` when CUDA is available, otherwise
        ``torch.device("cpu")``.
    """
    # Local import keeps this cell self-contained; `random` is not imported
    # in the notebook's import cell.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN pick kernels at runtime, which can differ
    # between runs; pin it off alongside the deterministic flag.
    torch.backends.cudnn.benchmark = False
    return torch.device("cuda" if torch.cuda.is_available() else "cpu")


SEED = 123
device = get_device_and_set_seed(SEED)

In [8]:
device

device(type='cuda')

## Load the original (base) model

In [7]:
# Base checkpoint, loaded with 8-bit weights and automatic device placement.
model = AutoModelForSeq2SeqLM.from_pretrained(
    "google/mt5-base",
    device_map="auto",
    load_in_8bit=True,
)
# model.to(device)
# Show whether the first parameter tensor ended up on a CUDA device.
print(next(iter(model.parameters())).is_cuda)

In [9]:
# Tokenizer for the same checkpoint, followed by a sample encoding that the
# next cell inspects token by token.
tokenizer = AutoTokenizer.from_pretrained("google/mt5-base", use_fast=True)
labels = tokenizer('tôi thích bạn', truncation=True, padding=True, max_length=256)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [10]:
tokenizer.convert_ids_to_tokens(labels['input_ids'])

['▁tô', 'i', '▁th', 'ích', '▁b', 'ạn', '</s>']

## Prepare Data

In [8]:
train_path = './data/address_data.csv'

In [9]:
# Read the address table from `train_path` and preview its first rows.
data_df = pd.read_csv(train_path)
data_df.head()

Unnamed: 0,input_address,filter_address,mistake_address
0,"Thửa đất số 11, Tờ bản đồ số 39, Ấp Hoàng Việt...","Xã Tân Phước, Huyện Tân Hồng, Tỉnh Đồng Tháp",Thửa đất số11 Tờbản đồ số 39 p Hoàng Việt Xã T...
1,"Số 27, Đường Thiên Hộ Dương, Khóm 3, Phường An...","Đường Thiên Hộ Dương, Phường An Thạnh, Thành p...",số 27 đường thiên hộ dương khóm 3 phường an th...
2,"Tổ 20, Khóm An Lợi, Phường An Bình A, Thành ph...","Phường An Bình A, Thành phố Hồng Ngự, Tỉnh Đồn...",To 20 Khom An Lợi Phường An Binh A Thành phố H...
3,"Thôn Thiếp Trì, Xã Thái Đào, Huyện Lạng Giang,...","Xã Thái Đào, Huyện Lạng Giang, Tỉnh Bắc Giang",thôn thiếp trì xã thái đào huyện lạng giang tỉ...
4,"Số nhà 335 đường Trường Chinh, Thị Trấn Thắng,...","đường Trường Chinh, Thị Trấn Thắng, Huyện Hiệp...","ố nhà 335 đường Trưng Chinh, Thị rấn Thắng, Hu..."


In [10]:
# data_df = data_df[:500000]
# data_df

In [11]:
# Discard every row that has a missing value in any column.
data_df = data_df.dropna()

In [12]:
SEED

123

In [31]:
# Hold out 10% of the rows as the test split; `random_state=SEED` keeps the
# partition the same from run to run.
train_df, test_df = train_test_split(data_df, test_size=0.1, random_state=SEED)

In [14]:
def preprocess_function(examples, padding="max_length"):
    """Tokenize a batch of source/target strings for seq2seq training.

    Both sides are truncated to 256 tokens and padded to the longest sequence
    in the batch. Pad positions in the targets are replaced by -100 so they
    are ignored by the loss.

    Args:
        examples: Batch mapping with an ``"inputs"`` list (source texts) and a
            ``"labels"`` list (target texts), i.e. what ``Dataset.map`` passes
            with ``batched=True``.
        padding: Accepted for backward compatibility. The tokenizer always
            pads to the longest item in the batch, so target pad tokens are
            now masked regardless of this value (previously they were masked
            only for ``"max_length"`` and otherwise leaked into the loss).

    Returns:
        The tokenizer output for the inputs (``input_ids``, ``attention_mask``)
        with an added ``"labels"`` entry holding the masked target ids.
    """
    # Tokenize inputs.
    model_inputs = tokenizer(
        examples["inputs"], max_length=256, truncation=True, padding=True
    )

    # Tokenize targets with the `text_target` keyword argument.
    targets = tokenizer(
        text_target=examples["labels"], max_length=256, truncation=True, padding=True
    )

    # The tokenizer has padded the targets, so every pad id in them is filler:
    # swap it for -100 so padding never contributes to the loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(token if token != pad_id else -100) for token in sequence]
        for sequence in targets["input_ids"]
    ]
    return model_inputs

In [15]:
# Training corpus: noisy address -> cleaned address. A further 10% is split
# off as the validation set; later cells read it as `train_data['train']`
# and `train_data['test']`, so this cell must run for them to work.
dict_obj = {'inputs': train_df['mistake_address'], 'labels': train_df['filter_address']}
dataset = Dataset.from_dict(dict_obj)
# Seed the split so the train/validation partition is reproducible.
dataset = dataset.train_test_split(test_size=0.1, seed=SEED)
train_data = dataset.map(preprocess_function, batched=True, remove_columns=['inputs'], num_proc=8)

In [17]:
# Held-out evaluation set. Targets must be the cleaned `filter_address`
# column (as in the training set); using `mistake_address` for both sides
# would score the model against its own noisy input.
dict_obj = {'inputs': test_df['mistake_address'], 'labels': test_df['filter_address']}
dataset = Dataset.from_dict(dict_obj)
test_data = dataset.map(preprocess_function, batched=True, remove_columns=['inputs'], num_proc=8)

In [None]:
train_data['train'].column_names

In [None]:
len(test_data)

In [27]:
# Collator used for training batches; returns PyTorch tensors with lengths
# padded to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    return_tensors="pt",
    pad_to_multiple_of=8,
)
# Smoke test: collate one training example and show the label tensor shape.
data_collator([train_data['train'][2]])['labels'].shape

## PEFT

In [30]:
# Run the PEFT preparation helper on the int-8 model before adding adapters.
model = prepare_model_for_int8_training(model)

# LoRA adapter settings: rank-16 adapters on the "q" and "v" modules.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q", "v"],
    r=16,
    lora_alpha=8,
    lora_dropout=0.05,
    bias="none",
)

# Attach the adapters and report how many parameters are trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1769472 || all params: 584170752 || trainable%: 0.3029032169005271


In [26]:
# NOTE(review): `training_args` is assigned again by a later cell with
# different settings (output dir "lora_T5_address_model/", lr 1e-4, 10 epochs).
# When the notebook runs top to bottom, this configuration is overwritten
# before the trainer is built.
training_args = Seq2SeqTrainingArguments(
    "T5_address_model/",
    do_train=True,
    do_eval=True,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    num_train_epochs=15,
    learning_rate=1e-5,
    warmup_ratio=0.05,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=2,
    logging_dir='./log',
    group_by_length=True,
    load_best_model_at_end=True,
    save_total_limit=1,
    fp16=True,
)

In [31]:
output_dir = "lora_T5_address_model"

In [32]:
# Training configuration for the LoRA run; this assignment replaces any
# earlier `training_args`.
training_args = Seq2SeqTrainingArguments(
    # Output locations and checkpoint retention.
    output_dir="lora_T5_address_model/",
    logging_dir=f"{output_dir}/logs",
    save_total_limit=1,
    # Evaluate and checkpoint once per epoch.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    # load_best_model_at_end=True,
    # Optimisation schedule.
    num_train_epochs=10,
    learning_rate=1e-4,  # higher learning rate
    per_device_train_batch_size=8,
    # gradient_accumulation_steps=4,
    group_by_length=True,
    # Logging cadence.
    logging_steps=100,
)

In [33]:
len(train_data["train"]) / 4

101250.0

In [34]:
300 * 5

1500

## Training

In [35]:
import evaluate
import numpy as np

rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Score generated token ids against reference token ids with ROUGE.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            token ids.

    Returns:
        Dict of ROUGE scores plus ``gen_len`` (mean number of non-pad tokens
        per prediction), each rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    # Some model outputs arrive as a tuple; the token ids are the first item.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is a masking/fill sentinel, not a vocabulary id. Replace it on both
    # sides so decoding never sees a negative id.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    # Generated length = tokens that are not padding.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Assemble the trainer from the LoRA-wrapped model, the tokenized
# train/validation splits and the padding collator, then start training.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_data["train"],
    eval_dataset=train_data["test"],
)

trainer.train()

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
[34m[1mwandb[0m: Currently logged in as: [33mlenghia11a4[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss


## Inference

In [82]:
# NOTE(review): "merge_lora_t5_1" is the directory written by
# `model.save_pretrained('merge_lora_t5_1')` further down this section, so on
# a fresh run this load only succeeds once that export already exists.
model = AutoModelForSeq2SeqLM.from_pretrained("merge_lora_t5_1", torch_dtype=torch.bfloat16, device_map="auto")

In [74]:
model = AutoModelForSeq2SeqLM.from_pretrained("google/mt5-base", torch_dtype=torch.bfloat16, device_map="auto")

In [75]:
model.hf_device_map

{'': 0}

In [46]:
peft_model_id = './lora_T5_address_model/checkpoint-183458'

In [47]:
model = PeftModel.from_pretrained(model, peft_model_id)

In [48]:
# model

In [49]:
model = model.merge_and_unload()

In [50]:
# Rewrite each parameter's storage as a contiguous tensor before the model
# is saved.
for weight in model.parameters():
    weight.data = weight.data.contiguous()

In [51]:
model.save_pretrained('merge_lora_t5_1')

### Test 

In [19]:
import time

In [33]:
idx = 9
test_df.iloc[idx]['mistake_address']

'SN 172C đường Minh Lang, Phường Tin Cát,Thàn phố Vit Trì, Phú Thọ'

In [90]:
test_df.iloc[idx]['filter_address']

'đường Minh Lang, Phường Tiên Cát, Thành phố Việt Trì,  Phú Thọ'

In [95]:
train_df.iloc[0]['mistake_address']

In [92]:
# Sample text for a manual inference check; the commented line switches to a
# row from the held-out split instead.
# t = test_df.iloc[idx]['mistake_address']
t = 'đường Lam Sơn phường Tân Sơn thành phố Thanh Hoá'
# Encode as PyTorch tensors (`input_ids` and `attention_mask`) and display them.
b = tokenizer(t, return_tensors='pt')
b

{'input_ids': tensor([[  355,  2241, 47314,   320,  2238,   690,  2241,   366,  1534,   320,
          2238,   394,  3255,   690,  1263,   259, 19404,  1824,   471,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [93]:
# Generate a correction for the sample input and time the call.
model.eval()
print('Input: ', t)
# Send the inputs to whichever device holds the model instead of assuming
# a GPU is present.
target_device = model.device
# perf_counter is a monotonic clock intended for measuring intervals.
st = time.perf_counter()
with torch.no_grad():
    outputs = model.generate(
        input_ids=b['input_ids'].to(target_device),
        attention_mask=b['attention_mask'].to(target_device),
        max_length=256,
    )
end = time.perf_counter() - st  # elapsed seconds
print('Prediction times: ', end)
print('Output: ', tokenizer.decode(outputs[0], skip_special_tokens=True))

Input:  đường Lam Sơn phường Tân Sơn thành phố Thanh Hoá
Prediction times:  0.5218467712402344
Output:  đường Lam Sơn, Phường Tân Sơn, Thành phố Thanh Hóa, Thanh Hóa


In [29]:
# model.to('cuda')

In [15]:
# for param in model.parameters():
#     print(param.dtype)

### Eval metrics

In [52]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

In [57]:
import tqdm
import torch 
import numpy as np
# Generate predictions for the whole test set in batches, decode predictions
# and references to text, and accumulate them into the ROUGE metric.
metrics = rouge

max_target_length = 256
dataloader = torch.utils.data.DataLoader(test_data, collate_fn=data_collator, batch_size=32)

predictions = []
references = []
# Make sure dropout is off even if the model was last used for training.
model.eval()
for batch in tqdm.tqdm(dataloader, desc="Evaluating"):
    with torch.no_grad():
        generated = model.generate(
            # Follow the model's device rather than hard-coding 'cuda'.
            input_ids=batch['input_ids'].to(model.device),
            attention_mask=batch['attention_mask'].to(model.device),
            max_length=max_target_length,
        )
    outputs = tokenizer.batch_decode(
        generated, clean_up_tokenization_spaces=False, skip_special_tokens=True
    )

    # Labels use -100 for ignored positions; map those back to the pad id so
    # they can be decoded (and then dropped as special tokens).
    label_ids = batch['labels'].numpy()
    labels = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
    actuals = tokenizer.batch_decode(
        labels, clean_up_tokenization_spaces=False, skip_special_tokens=True
    )

    predictions.extend(outputs)
    references.extend(actuals)
    metrics.add_batch(predictions=outputs, references=actuals)


metrics.compute()

{'rouge1': 0.23835126750408075,
 'rouge2': 0.14379510590522415,
 'rougeL': 0.2161130252997705,
 'rougeLsum': 0.21546445876754589}

In [26]:
# Number of predictions that match their reference string exactly.
correct = sum(1 for pred, ref in zip(predictions, references) if pred == ref)
correct

5341

In [27]:
correct/ len(predictions)

0.9945996275605214

In [28]:
predictions[0]

'nguyễn văn tiến'

In [29]:
references[0]

'nguyễn văn tiến'

In [30]:
a= next(iter(dataloader))

In [31]:
tokenizer.decode(a['input_ids'][0], skip_special_tokens=True)

'nguyễn văn tiến thì dạ bên không cho'