In [1]:
import torch
import torchtext
from torchtext.data import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import torch.nn as nn
from torch import Tensor
from typing import Tuple

In [2]:
df = pd.read_csv('data/final_data.csv')

In [3]:
df.head(5)

Unnamed: 0,text_clean,text_clean_no_accent
0,chây ì nộp phạt nguội,chay i nop phat nguoi
1,cháu đòi tiền cơm dì đòi tiền nhà,chau doi tien com di doi tien nha
2,đà nẵng nghiên cứu tiện ích nhắn tin khi vi ph...,da nang nghien cuu tien ich nhan tin khi vi ph...
3,khó xử vụ mẹ tuổi trộm xe hơi của con gái,kho xu vu me tuoi trom xe hoi cua con gai
4,thay đổi về đăng ký chuyển nhượng xe từ bạn cầ...,thay doi ve dang ky chuyen nhuong xe tu ban ca...


In [4]:
# tokenize by space
tokenizer = get_tokenizer(tokenizer=None)

In [5]:
def build_vocab(iter_text_data, tokenizer):
    counter = Counter()
    for line in iter_text_data:#df.text_clean.to_numpy():
        counter.update(tokenizer(line))
    return Vocab(counter, min_freq=8, specials=['<unk>', '<pad>', '<bos>', '<eos>'])

In [6]:
%%time
tone_vocab = build_vocab(df.text_clean.to_numpy(), tokenizer)
no_tone_vocab = build_vocab(df.text_clean_no_accent.to_numpy(), tokenizer)

CPU times: user 1min 27s, sys: 240 ms, total: 1min 27s
Wall time: 1min 27s


In [7]:
freq = np.array([f for w, f in tone_vocab.freqs.most_common()])
# plt.hist(freq)
from scipy.stats import iqr
# iqr(freq, rng=(25, 75))
np.percentile(freq, [10 ,90])

array([ 1., 82.])

In [8]:
def data_process(tone_array, no_tone_array, tone_vocab, no_tone_vocab, tokenizer):
    data = []
    for (tone_str, no_tone_str) in tqdm(zip(tone_array, no_tone_array)):
#         print(tone_str, no_tone_str)
#         break
        tone_tensor_ = torch.tensor([tone_vocab[token] for token in tokenizer(tone_str)],
                                dtype=torch.long)
        no_tone_tensor_ = torch.tensor([no_tone_vocab[token] for token in tokenizer(no_tone_str)],
                                dtype=torch.long)
        data.append((tone_tensor_, no_tone_tensor_))
    return data

In [68]:
train_range = 1000#int(df.shape[0]*1/100)
test_range = 100#int(df.shape[0]*99/100)
df_train = df.iloc[:train_range, :]
df_test = df.iloc[-test_range:, :]
df_train.shape, df_test.shape

((1000, 2), (100, 2))

In [69]:
len(tone_vocab), len(no_tone_vocab)

(29246, 23902)

In [70]:
# %%time
train_data = data_process(df_train.text_clean.to_numpy(), 
                          df_train.text_clean_no_accent.to_numpy(), 
                          tone_vocab, no_tone_vocab, tokenizer)
# val_data = data_process(df_test.text_clean.to_numpy(), 
#                           df_test.text_clean_no_accent.to_numpy(), 
#                           tone_vocab, no_tone_vocab, tokenizer)

1000it [00:00, 5960.44it/s]


In [71]:
val_data = data_process(df_test.text_clean.to_numpy(), 
                          df_test.text_clean_no_accent.to_numpy(), 
                          tone_vocab, no_tone_vocab, tokenizer)

100it [00:00, 1552.87it/s]


In [47]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [48]:
BATCH_SIZE = 128
PAD_IDX = tone_vocab['<pad>']
BOS_IDX = tone_vocab['<bos>']
EOS_IDX = tone_vocab['<eos>']

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def generate_batch(data_batch):
    tone_batch, no_tone_batch = [], []
    for (tone_item, no_tone_item) in data_batch:
        tone_batch.append(torch.cat([torch.tensor([BOS_IDX]), tone_item, torch.tensor([EOS_IDX])], dim=0))
        no_tone_batch.append(torch.cat([torch.tensor([BOS_IDX]), no_tone_item, torch.tensor([EOS_IDX])], dim=0))
        
    tone_batch = pad_sequence(tone_batch, padding_value=PAD_IDX)
    no_tone_batch = pad_sequence(no_tone_batch, padding_value=PAD_IDX)
    return no_tone_batch, tone_batch

In [72]:
%%time
train_iter = DataLoader(train_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)
valid_iter = DataLoader(val_data, batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)

CPU times: user 3.4 s, sys: 5.58 ms, total: 3.4 s
Wall time: 3.35 s


In [73]:
for X, y in train_iter:
    print(X.shape, y.shape)
    break

torch.Size([25, 128]) torch.Size([25, 128])


In [58]:
class Encoder(nn.Module):
    def __init__(self,
                 vocab_size: int,
                 embed_size: int,
                 num_hiddens: int,
                 num_layers: int,
                 dropout: float):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size, num_hiddens, num_layers, dropout=dropout)

    def forward(self,
                X: Tensor) -> Tuple[Tensor]:
        # X: (batch_size, num_steps)
        X = self.embedding(X)
        # X: (batch_size, num_steps, vocab_size)
        # swap time steps axis 
        X = X.permute(1, 0, 2)
        # X: (num_steps, batch_size, vocab_size)
        
        output, state = self.rnn(X)

        return output, state

In [59]:
encoder = Encoder(10, 5, 1, 2, 0.1)
X = torch.ones((3, 5), dtype=torch.long)
y, state = encoder(X)
y.shape, state.shape

(torch.Size([5, 3, 1]), torch.Size([2, 3, 1]))

In [60]:
class Decoder(nn.Module):
    def __init__(self,
                 vocab_size: int,
                 embed_size: int,
                 num_hiddens: int,
                 num_layers: int,
                 dropout: float = 0.0):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.rnn = nn.GRU(embed_size + num_hiddens, num_hiddens, num_layers, dropout=dropout)
        self.dense = nn.Linear(num_hiddens, vocab_size)
    
    def init_state(self, enc_outputs):
        return enc_outputs[1]
    
    def forward(self,
                X: Tensor,
                state: Tensor) -> Tuple[Tensor]:
        X = self.embedding(X).permute(1, 0, 2)
        
        # state: (num_layers, num_batch, num_hiddens)
        # broadcast context to perform broadcast sum: (num_steps, 1, 1)
        context = state[-1].repeat(X.shape[0], 1, 1)
        
        X_context = torch.cat([X, context], dim=2)
        output, state = self.rnn(X_context, state)
        # output (num_steps, batch_size, num_hiddens) -> (num_steps, batch_size, vocab_size)
        output = self.dense(output).permute(1, 0, 2)
        return output, state

In [61]:
decoder = Decoder(10, 5, 1, 2, 0.0)
X = torch.ones((3, 5), dtype=torch.long)
state = decoder.init_state(encoder(X))
y, state = decoder(X, state)
y.shape, state.shape

(torch.Size([3, 5, 10]), torch.Size([2, 3, 1]))

In [62]:
class Seq2Seq(nn.Module):
    def __init__(self,
                 encoder: nn.Module,
                 decoder: nn.Module,
                 device: torch.device):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self,
                enc_X: Tensor,
                dec_X: Tensor) -> Tensor:
        enc_outputs = self.encoder(enc_X)
        dec_state = self.decoder.init_state(enc_outputs)
        
        return self.decoder(dec_X, dec_state)

In [78]:
def train_seq2seq(net, data_iter, lr, num_epochs, no_tone_vocab, device):
    def xavier_init_weights(m):
        if type(m) == nn.Linear:
            nn.init.xavier_uniform_(m.weight)
        if type(m) == nn.GRU:
            for param in m._flat_weights_names:
                if "weight" in param:
                    nn.init.xavier_uniform_(m._parameters[param])
    net.apply(xavier_init_weights)
    net.to(device)
    
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    
    PAD_IDX = tone_vocab.stoi['<pad>']
    loss = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
    net.train()
    
    for epoch in range(num_epochs):
        epoch_loss = 0
        for batch in tqdm(data_iter):
            optimizer.zero_grad()
            
            X, Y = [x.to(device) for x in batch]
            bos = torch.tensor([no_tone_vocab['<bos>']]*Y.shape[0], device=device).reshape(-1, 1)
            dec_input = torch.cat([bos, Y[:, :-1]], 1) # teacher forcing
            
            Y_hat, _ = net(X, dec_input)
            
            l = loss(Y_hat.permute(0, 2, 1), Y)#nn.functional.one_hot(Y, len(no_tone_vocab)).squeeze())
            
            l.sum().backward()
            
            # clip gradient
            torch.nn.utils.clip_grad_norm_(net.parameters(), 1)
            optimizer.step()
            epoch_loss += l.item()
        if ((epoch + 1) % 50 == 0):
            print(f'Loss {epoch_loss/len(data_iter):.10f}')

In [77]:
tone_vocab_size = len(tone_vocab)
no_tone_vocab_size = len(no_tone_vocab)
# ENC_EMB_DIM = 256
# DEC_EMB_DIM = 256
# ENC_HID_DIM = 512
# DEC_HID_DIM = 512
# ATTN_DIM = 64
# ENC_DROPOUT = 0.5
# DEC_DROPOUT = 0.5

embed_size = 32
num_hiddens = 64
num_layers = 2
dropout = 0.5

lr = 0.05
num_epochs = 10

enc = Encoder(no_tone_vocab_size, embed_size, num_hiddens, num_layers, dropout)

dec = Decoder(tone_vocab_size, embed_size, num_hiddens, num_layers, dropout)

net = Seq2Seq(enc, dec, device).to(device)

In [75]:
def count_parameters(model: nn.Module):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


print(f'The model has {count_parameters(net):,} trainable parameters')

The model has 3,701,566 trainable parameters


In [79]:
train_seq2seq(net, train_iter, lr, num_epochs, no_tone_vocab, device)

 12%|█▎        | 1/8 [00:52<06:09, 52.75s/it]


KeyboardInterrupt: 

In [None]:
def predict_seq2seq(net, sentence, no_tone_vocab, tone_vocab, ):
    net.eval()
    
    
    