### A3

In [None]:
from sklearn.model_selection import train_test_split

# Locations of the two corpus files (presumably line-aligned translations — TODO confirm)
myanmar_file_path = './app/dataset/data_myn.txt'
english_file_path = './app/dataset/data_eng.txt'

# Load each file as a list of lines; trailing newline characters are kept
with open(myanmar_file_path, encoding='utf-8') as myfile:
    myanmar_lines = list(myfile)

with open(english_file_path, encoding='utf-8') as engfile:
    english_lines = list(engfile)

# Match line i of the Myanmar file with line i of the English file.
# zip() stops at the shorter of the two lists.
paired_lines = [*zip(myanmar_lines, english_lines)]

In [None]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

# Batch size constant. It is not referenced in the visible cells; presumably it is
# passed as `batch_size` to the DataLoader(s) created elsewhere — TODO confirm.
BATCH_SIZE = 64

# Compose several callables into a single left-to-right pipeline
def sequential_transforms(*transforms):
    """Return a function that feeds its input through ``transforms`` in order.

    The returned function yields an empty list when called with ``None``;
    otherwise each transform receives the output of the previous one and the
    final result is returned.
    """
    def _pipeline(txt_input):
        if txt_input is not None:
            current = txt_input
            for step in transforms:
                current = step(current)
            return current
        return []
    return _pipeline

# function to add BOS/EOS and create tensor for input sequence indices
def tensor_transform(token_ids):
    """Wrap token indices with SOS_IDX / EOS_IDX and return them as a 1-D int64 tensor.

    Args:
        token_ids: sequence of integer vocabulary indices; may be empty.

    Returns:
        A ``torch.long`` tensor laid out as ``[SOS_IDX, *token_ids, EOS_IDX]``.
    """
    # The dtype is pinned explicitly: torch.tensor([]) infers float32, so an
    # empty token list (e.g. a blank line) would otherwise not produce an
    # integer index tensor.
    return torch.cat((torch.tensor([SOS_IDX], dtype=torch.long),
                      torch.tensor(token_ids, dtype=torch.long),
                      torch.tensor([EOS_IDX], dtype=torch.long)))

# Per-language pipelines that turn a raw string into a tensor of token indices
text_transform = {}
for ln in (SRC_LANGUAGE, TRG_LANGUAGE):
    text_transform[ln] = sequential_transforms(
        token_transform[ln],  # tokenization
        vocab_transform[ln],  # numericalization
        tensor_transform,     # add BOS/EOS and create tensor
    )

# function to collate data samples into batch tensors
def collate_batch(batch):
    """Collate raw samples into padded source/target index tensors.

    Args:
        batch: iterable of dicts mapping a language key to its raw text.
            Entries keyed by SRC_LANGUAGE or TRG_LANGUAGE are collected; a
            non-empty entry under any other key must still have a pipeline in
            ``text_transform`` (it is transformed before the key is checked).

    Returns:
        Tuple ``(src_batch, src_len_batch, trg_batch)``:
          * src_batch     -- source indices padded with PAD_IDX, [batch, src len]
          * src_len_batch -- int64 tensor with one length per source sequence;
                             empty sources report the padded length instead of 0
          * trg_batch     -- target indices padded with PAD_IDX, [batch, trg len]
    """
    src_batch, src_len_batch, trg_batch = [], [], []
    for lang_data in batch:
        for lang, tokens in lang_data.items():
            # Empty / missing text becomes a zero-length tensor so the sample
            # still occupies a row in the padded batch.
            processed_text = text_transform[lang](tokens) if tokens else torch.empty(0, dtype=torch.int64)
            if lang == SRC_LANGUAGE:
                src_batch.append(processed_text)
                src_len_batch.append(processed_text.size(0))
            elif lang == TRG_LANGUAGE:
                trg_batch.append(processed_text)

    # batch_first=True: per the original note, needed because the model uses linear layers
    src_batch = pad_sequence(src_batch, padding_value=PAD_IDX, batch_first=True)

    # Set the length of empty sequences to the maximum (padded) length.
    # With batch_first=True the padded tensor is [batch, seq], so the sequence
    # length is dim 1; dim 0 is the batch size and must not be used here.
    max_len = src_batch.size(1)
    src_len_batch = [length if length > 0 else max_len for length in src_len_batch]

    trg_batch = pad_sequence(trg_batch, padding_value=PAD_IDX, batch_first=True)

    return src_batch, torch.tensor(src_len_batch, dtype=torch.int64), trg_batch

### A2

In [None]:
# Load the 'KaungHtetCho/Harry_Potter_LSTM' dataset from Hugging Face.
# NOTE(review): `datasets` is not imported in the visible cells — confirm that
# `import datasets` runs earlier in the notebook.
dataset = datasets.load_dataset('KaungHtetCho/Harry_Potter_LSTM')

In [None]:
# tokenization

from torchtext.data.utils import get_tokenizer

# built-in 'basic_english' tokenizer from torchtext
tokenizer = get_tokenizer('basic_english')


def tokenize_data(example, tokenizer):
    """Return ``{'tokens': [...]}`` holding the tokenized ``example['text']``."""
    return {'tokens': tokenizer(example['text'])}


# Replace the raw 'text' column with a 'tokens' column
tokenized_dataset = dataset.map(
    tokenize_data,
    remove_columns=['text'],
    fn_kwargs={'tokenizer': tokenizer},
)

In [None]:
# numericalization

from torchtext.vocab import build_vocab_from_iterator

# Build the vocabulary from the training split only, with a minimum token frequency of 3
vocab = build_vocab_from_iterator(
    tokenized_dataset['train']['tokens'],
    min_freq=3,
)

# Place the special tokens at indices 0 and 1
for special_index, special_token in enumerate(['<unk>', '<eos>']):
    vocab.insert_token(special_token, special_index)

# Tokens missing from the vocabulary resolve to the '<unk>' index
vocab.set_default_index(vocab['<unk>'])

In [None]:
def get_data(dataset, vocab, batch_size):
    """Flatten a tokenized dataset into a ``[batch_size, seq_len]`` tensor of token ids.

    Every non-empty example contributes its tokens followed by ``'<eos>'``;
    examples with no tokens are skipped. The resulting id stream is truncated
    to a multiple of ``batch_size`` and reshaped into ``batch_size`` rows.

    Args:
        dataset: iterable of examples, each exposing a list under ``'tokens'``.
        vocab: mapping from token to integer index (looked up as ``vocab[token]``).
        batch_size: number of rows in the returned tensor.

    Returns:
        ``torch.long`` tensor of shape ``[batch_size, len(stream) // batch_size]``.
    """
    data = []

    for example in dataset:
        tokens = example['tokens']
        if tokens:
            # Build a new list rather than appending in place: list.append()
            # returns None and would mutate the caller's example, adding an
            # extra '<eos>' every time the same examples are processed again.
            data.extend(vocab[token] for token in [*tokens, '<eos>'])

    data = torch.tensor(data, dtype=torch.long)

    # Drop the tail that does not fill a complete column across all rows
    num_batches = data.shape[0] // batch_size
    data = data[:num_batches * batch_size]

    return data.view(batch_size, num_batches)  # [batch size, seq len]

# Alternatively, the batching could be done with a PyTorch DataLoader.

In [None]:
batch_size = 128

# Numericalize and batch every split with the same vocabulary and batch size
train_data, valid_data, test_data = (
    get_data(tokenized_dataset[split], vocab, batch_size)
    for split in ('train', 'validation', 'test')
)