In [1]:
# !pip install --quiet pytorch_lightning
# !pip install --quiet transformers

In [5]:
import re
text = "ngày mười tháng mười một một chín chín ba tháng mười"

# pattern to find 'mười' right after 'tháng'
pattern = r'(?<=tháng )\bmười\b'

match = re.search(pattern, text)

In [24]:
import re

def change_string(text):
    pattern = r'\btháng một\b'
    new_text = re.sub(r'\btháng một\b', 'tháng mười', text)
    return new_text

# Test the function
text = 'ngày hai mươi ba tháng một một năm chín chín tư năm'
print(change_string(text))

ngày hai mươi ba tháng mười một năm chín chín tư năm


In [21]:
match

In [15]:
import re

def check_string(text):
    pattern = r'\btháng mười\b(?=.*\b\w+\b)'
    match = re.search(pattern, text)
    print(match)
    return match is not None

# Test the function
print(check_string('tháng mười năm hai không'))  # True
print(check_string('tháng mười'))  # False
print(check_string('tháng hai năm hai không'))  # False

<re.Match object; span=(0, 10), match='tháng mười'>
True
None
False
None
False


In [10]:
text[match.span()[1] + 1:]

'một một chín chín ba tháng mười'

In [2]:
import pandas as pd
import numpy as np
import torch
from pathlib import Path
from sklearn.model_selection import train_test_split
import textwrap

from torch.utils.data import Dataset,DataLoader, random_split
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning import Trainer

from transformers import(
    AdamW,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)

In [3]:
from datasets import Dataset

In [4]:
def get_device_and_set_seed(seed):
    """ Set all seeds to make results reproducible """
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    np.random.seed(seed)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    return device
    
SEED = 123
device = get_device_and_set_seed(SEED)

In [5]:
model = AutoModelForSeq2SeqLM.from_pretrained("VietAI/vit5-base")
model.to(device)

T5ForConditionalGeneration(
  (shared): Embedding(36096, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(36096, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [6]:
next(model.parameters()).is_cuda

True

In [5]:
tokenizer = AutoTokenizer.from_pretrained("VietAI/vit5-base")  

In [6]:
labels = tokenizer(
        'tôi thích bạn', max_length=256, truncation=True, padding=True
    )

In [7]:
labels

{'input_ids': [671, 1470, 1113, 1], 'attention_mask': [1, 1, 1, 1]}

## Prepare Data

In [8]:
train_path = './UIT-ViNames-Dataset/UIT-ViNames/Train.csv'
val_path = './UIT-ViNames-Dataset/UIT-ViNames/Val.csv'
test_path = './UIT-ViNames-Dataset/UIT-ViNames/Test.csv'
null_path = './UIT-ViNames-Dataset/UIT-ViNames/Null.csv'
null_path_1 = './UIT-ViNames-Dataset/UIT-ViNames/Null_1.csv'

In [9]:
train_df = pd.read_csv(train_path)
train_df.head()

Unnamed: 0,Full_Names,Gender,sentence
0,ngô xuân tùng,1,ngô xuân tùng một à
1,bùi dương thảo vy,0,bùi dương thảo vy rồi vầng anh cho
2,lưu thế huy,1,ừ một ơ lưu thế huy
3,nguyễn thị vân,0,ơ ờ nguyễn thị vân là có cái
4,dương minh long,1,chị ạ năm mình một dương minh long


In [10]:
val_df = pd.read_csv(val_path)
test_df = pd.read_csv(test_path)
null_df = pd.read_csv(null_path)
null_df_1 = pd.read_csv(null_path_1)

In [11]:
train_df = pd.concat([train_df, null_df, null_df_1], ignore_index = False)
train_df.head()

Unnamed: 0,Full_Names,Gender,sentence
0,ngô xuân tùng,1.0,ngô xuân tùng một à
1,bùi dương thảo vy,0.0,bùi dương thảo vy rồi vầng anh cho
2,lưu thế huy,1.0,ừ một ơ lưu thế huy
3,nguyễn thị vân,0.0,ơ ờ nguyễn thị vân là có cái
4,dương minh long,1.0,chị ạ năm mình một dương minh long


In [12]:
train_df[-5:]

Unnamed: 0,Full_Names,Gender,sentence
995,,,dạ vâng ạ em xin phép có thể nói chuyện với a ...
996,,,ờ dạ vâng ạ
997,,,a lô dạ vâng em chào anh ạ anh cho em hỏi số m...
998,,,đó để em trao đổi với các chuyên gia của bên e...
999,,,dạ rồi ok em cảm ơn em chào anh ạ


In [13]:
def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["inputs"], max_length=256, truncation=True, padding=True
    )
    
    
    labels = tokenizer(
        examples["labels"], max_length=256, truncation=True, padding=True
    )
    model_inputs['labels'] = labels['input_ids']
    model_inputs['input_ids'] = model_inputs['input_ids']
    return model_inputs

In [14]:
dict_obj = {'inputs': train_df['sentence'], 'labels': train_df['Full_Names']}
dataset = Dataset.from_dict(dict_obj)
train_data = dataset.map(preprocess_function, batched=True, remove_columns=['inputs'], num_proc=8)

                

#1:   0%|          | 0/3 [00:00<?, ?ba/s]

#0:   0%|          | 0/3 [00:00<?, ?ba/s]

#2:   0%|          | 0/3 [00:00<?, ?ba/s]

#3:   0%|          | 0/3 [00:00<?, ?ba/s]

#4:   0%|          | 0/3 [00:00<?, ?ba/s]

#5:   0%|          | 0/3 [00:00<?, ?ba/s]

#6:   0%|          | 0/3 [00:00<?, ?ba/s]

#7:   0%|          | 0/3 [00:00<?, ?ba/s]

In [15]:
dict_obj = {'inputs': val_df['sentence'], 'labels': val_df['Full_Names']}
dataset = Dataset.from_dict(dict_obj)
val_data = dataset.map(preprocess_function, batched=True, remove_columns=['inputs'], num_proc=8)

                

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

In [16]:
dict_obj = {'inputs': test_df['sentence'], 'labels': test_df['Full_Names']}
dataset = Dataset.from_dict(dict_obj)
test_data = dataset.map(preprocess_function, batched=True, remove_columns=['inputs'], num_proc=8)

                

#1:   0%|          | 0/1 [00:00<?, ?ba/s]

#0:   0%|          | 0/1 [00:00<?, ?ba/s]

#4:   0%|          | 0/1 [00:00<?, ?ba/s]

#3:   0%|          | 0/1 [00:00<?, ?ba/s]

#2:   0%|          | 0/1 [00:00<?, ?ba/s]

#7:   0%|          | 0/1 [00:00<?, ?ba/s]

#6:   0%|          | 0/1 [00:00<?, ?ba/s]

#5:   0%|          | 0/1 [00:00<?, ?ba/s]

In [17]:
train_data.__getitem__(0)

{'labels': [6043, 4211, 4694, 1, 0, 0, 0, 0, 0],
 'input_ids': [6043, 4211, 4694, 68, 536, 1, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0]}

In [48]:
len(test_data)

5370

In [19]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")


training_args = Seq2SeqTrainingArguments("tmp/",
                                      do_train=True,
                                      do_eval=True,
                                      evaluation_strategy='steps',
                                      eval_steps=2475,
                                      num_train_epochs=15,
                                      learning_rate=1e-5,
                                      warmup_ratio=0.05,
                                      weight_decay=0.01,
                                      per_device_train_batch_size=4,
                                      per_device_eval_batch_size=4,
                                      logging_dir='./log',
                                      group_by_length=True,
                                      load_best_model_at_end=True,
                                      save_steps=2475,
                                      save_total_limit=1,
                                      #eval_steps=1,
                                      #evaluation_strategy="steps",
                                      # evaluation_strategy="no",
                                      fp16=True,
                                      )

NameError: name 'model' is not defined

## Training

In [None]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator,
)

trainer.train()

Using cuda_amp half precision backend
***** Running training *****
  Num examples = 19795
  Num Epochs = 15
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 37125
  Number of trainable parameters = 225950976
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
2475,0.0167,0.010165
4950,0.0057,0.007088
7425,0.0029,0.005444
9900,0.0049,0.00509
12375,0.0024,0.006284
14850,0.0007,0.007116


***** Running Evaluation *****
  Num examples = 2686
  Batch size = 8
Saving model checkpoint to tmp/checkpoint-2475
Configuration saved in tmp/checkpoint-2475/config.json
Configuration saved in tmp/checkpoint-2475/generation_config.json
Model weights saved in tmp/checkpoint-2475/pytorch_model.bin
Deleting older checkpoint [tmp/checkpoint-21150] due to args.save_total_limit
Deleting older checkpoint [tmp/checkpoint-22325] due to args.save_total_limit
Deleting older checkpoint [tmp/checkpoint-23500] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2686
  Batch size = 8
Saving model checkpoint to tmp/checkpoint-4950
Configuration saved in tmp/checkpoint-4950/config.json
Configuration saved in tmp/checkpoint-4950/generation_config.json
Model weights saved in tmp/checkpoint-4950/pytorch_model.bin
Deleting older checkpoint [tmp/checkpoint-2475] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2686
  Batch size = 8
Saving model checkpoi

In [25]:
print('hello')

hello


In [None]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator,
)

trainer.train()

Using cuda_amp half precision backend
***** Running training *****
  Num examples = 18795
  Num Epochs = 30
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 35250
  Number of trainable parameters = 225950976
You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.0929,0.008796
2,0.0097,0.005408
3,0.0045,0.005494
4,0.0021,0.00576
5,0.0031,0.004866
6,0.0011,0.004718
7,0.0019,0.007651
8,0.0007,0.004296
9,0.0006,0.005472
10,0.0009,0.006687


***** Running Evaluation *****
  Num examples = 2686
  Batch size = 16
Saving model checkpoint to tmp/checkpoint-1175
Configuration saved in tmp/checkpoint-1175/config.json
Configuration saved in tmp/checkpoint-1175/generation_config.json
Model weights saved in tmp/checkpoint-1175/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2686
  Batch size = 16
Saving model checkpoint to tmp/checkpoint-2350
Configuration saved in tmp/checkpoint-2350/config.json
Configuration saved in tmp/checkpoint-2350/generation_config.json
Model weights saved in tmp/checkpoint-2350/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2686
  Batch size = 16
Saving model checkpoint to tmp/checkpoint-3525
Configuration saved in tmp/checkpoint-3525/config.json
Configuration saved in tmp/checkpoint-3525/generation_config.json
Model weights saved in tmp/checkpoint-3525/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 2686
  Batch size = 16
Saving model checkpoint to tm

## Inference

In [20]:
from datasets import load_metric
metric = load_metric("rouge")

  metric = load_metric("rouge")


In [21]:
model = AutoModelForSeq2SeqLM.from_pretrained("./tmp/checkpoint-7425")
model.to('cuda')

T5ForConditionalGeneration(
  (shared): Embedding(36096, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(36096, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [24]:
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

In [25]:
import tqdm
import torch 
import numpy as np
metrics = load_metric('rouge')

max_target_length = 256
dataloader = torch.utils.data.DataLoader(test_data, collate_fn=data_collator, batch_size=32)

predictions = []
references = []
for i, batch in enumerate(dataloader):
  outputs = model.generate(
      input_ids=batch['input_ids'].to('cuda'),
      max_length=max_target_length,
      attention_mask=batch['attention_mask'].to('cuda'),
  )
  outputs = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in outputs]

  labels = np.where(batch['labels'] != -100,  batch['labels'], tokenizer.pad_token_id)
  actuals = [tokenizer.decode(out, clean_up_tokenization_spaces=False, skip_special_tokens=True) for out in labels]
  predictions.extend(outputs)
  references.extend(actuals)
  metrics.add_batch(predictions=outputs, references=actuals)


metrics.compute()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'rouge1': AggregateScore(low=Score(precision=0.9988018200762616, recall=0.998607608406491, fmeasure=0.9986347304154567), mid=Score(precision=0.9993805976766871, recall=0.9992200939966306, fmeasure=0.9992122872988795), high=Score(precision=0.999738926576217, recall=0.999624013478762, fmeasure=0.9995694423257271)),
 'rouge2': AggregateScore(low=Score(precision=0.9981314400993173, recall=0.9978736809435135, fmeasure=0.9978791822185674), mid=Score(precision=0.9989075108628183, recall=0.9987212911235258, fmeasure=0.9986936873389385), high=Score(precision=0.9995065952824334, recall=0.9993792675356922, fmeasure=0.9992694402121775)),
 'rougeL': AggregateScore(low=Score(precision=0.9988835018178597, recall=0.9986260973663208, fmeasure=0.9987033673961049), mid=Score(precision=0.9993805976766871, recall=0.9992294049835948, fmeasure=0.9992270898974809), high=Score(precision=0.9997396027312231, recall=0.9996311519021017, fmeasure=0.9995801538881147)),
 'rougeLsum': AggregateScore(low=Score(precisi

In [26]:
correct = 0
correct += sum(o==a for o, a in zip(predictions, references))
correct

5341

In [27]:
correct/ len(predictions)

0.9945996275605214

In [28]:
predictions[0]

'nguyễn văn tiến'

In [29]:
references[0]

'nguyễn văn tiến'

In [30]:
a= next(iter(dataloader))

In [31]:
tokenizer.decode(a['input_ids'][0], skip_special_tokens=True)

'nguyễn văn tiến thì dạ bên không cho'

In [52]:
t = 'nguyễn ờ thảo linh'
b = tokenizer(t, return_tensors='pt')
b

{'input_ids': tensor([[1337,  278,  559, 1407, 2448,    1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [53]:
outputs = model.generate(
      input_ids=b['input_ids'].to('cuda'),
      max_length=max_target_length,
      attention_mask=b['attention_mask'].to('cuda'),
  )
outputs

tensor([[   0, 1337,  278,  559, 1407, 2448,    1]], device='cuda:0')

In [54]:
tokenizer.decode(outputs[0], skip_special_tokens=True)

'nguyễn ờ thảo linh'