In [40]:
# imports
import string
import torch
import pandas as pd
import datasets
from datasets import Dataset
import torchtext
from torchtext.vocab import Vocab, build_vocab_from_iterator
from collections import Counter

import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\linda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\linda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [41]:
# Read the training split (zu/en sentence pairs) and preview the first rows.
train_df = pd.read_csv(filepath_or_buffer=r"../Dataset/zu-en.training.csv")
train_df.head(n=3)

Unnamed: 0,zu,en
0,"Lomkhakha kufanele uthuthukiswe, lawa amazwi k...","This sector needs to be developed, These are ..."
1,Yonke imibuzo: Ucingo: 031- 311 3154 (Shaks Ra...,All questions: Phone: 031- 311 3154 (Shaks Ram...
2,Axhumanisa umphakathi noMkhandlu ngoba abika k...,They connect the community with the Council be...


In [42]:
# Read the evaluation split (zu/en sentence pairs) and preview the first rows.
eval_df = pd.read_csv(filepath_or_buffer=r"../Dataset/zu-en.eval.csv")
eval_df.head(n=3)

Unnamed: 0,zu,en
0,Ikomidi elihlelela imidlalo ye-2013 Orange Afr...,The 2013 Orange Africa Cup of Nations (known a...
1,Futhi ipolitiki akuwona umdlalo wabantu abanga...,And politics is not a game for immature people.
2,Ikhasi lethu lakwa e-Careers likubeka ngokucac...,Our e-Careers page makes it clear that if you ...


In [43]:
# Read the test split (zu/en sentence pairs) and preview the first rows.
test_df = pd.read_csv(filepath_or_buffer=r"../Dataset/zu-en.test.csv")
test_df.head(n=3)

Unnamed: 0,zu,en
0,NONDUDUZO NGCONGO SEKUVELILE ukuthi ezokuvakas...,NONDUDUZO NGCONGO It has come to light that to...
1,Umkhankaso uzobe usezindaweni zokubhukuda ezis...,The campaign will be at the swimming pools in ...
2,SinguMasipala siyakuqonda ukukhala kwabantu ka...,As a Municipality we understand the cries of t...


In [44]:
# Wrap each pandas DataFrame in a Hugging Face Dataset so the later steps can
# use Dataset.map for row-wise transformations.
train_data, eval_data, test_data = [
    Dataset.from_pandas(frame) for frame in (train_df, eval_df, test_df)
]

# Print the feature schema of every split to confirm they line up.
print(f"{train_data.features}\n{eval_data.features}\n{test_data.features}")

{'zu': Value(dtype='string', id=None), 'en': Value(dtype='string', id=None)}
{'zu': Value(dtype='string', id=None), 'en': Value(dtype='string', id=None)}
{'zu': Value(dtype='string', id=None), 'en': Value(dtype='string', id=None)}


In [45]:
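# NOTE: superseded draft -- an earlier pandas-based tokenizer that strips punctuation.
# It refers to a `df` that is not defined in the cells above; kept for reference only.
# # tokenize function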
# def tokenize(src_data, column):
#   data_copy = src_data.copy()
#   for index,row in data_copy.iterrows():
#     col_text = row[column]
#     col_text_without_punctuation = ''.join([char for char in col_text if char not in string.punctuation])
#     token = word_tokenize(col_text_without_punctuation)
#     data_copy.at[index, column] = token
#   return data_copy

# # tokenize zu
# df_copy = tokenize(df, 'zu')
# # zulu_tokenized = df_copy['zu']

# # tokenize en
# df_copy = tokenize(df_copy, 'en')
# # english_tokenized = df_copy['en']

# df_copy.head(3)

In [46]:
# Row-level tokenizer, meant to be applied to a dataset through its .map function.
def tokenize_example(example, max_length, lower, sos_token, eos_token):
    """Tokenize the Zulu and English text of one example.

    Args:
        example: Mapping with the raw sentences under the keys "zu" and "en".
        max_length: Maximum number of word tokens kept per sentence; the
            start/end markers are added afterwards and do not count.
        lower: When truthy, every kept token is lower-cased.
        sos_token: Marker placed in front of each token list.
        eos_token: Marker placed at the end of each token list.

    Returns:
        A dict with two new fields, "zu_tokens" and "en_tokens", each a list
        of the form [sos_token, <tokens...>, eos_token].
    """

    def _prepare(text):
        # Truncate first, then lower-case, then wrap with the markers.
        words = word_tokenize(text)[:max_length]
        if lower:
            words = [word.lower() for word in words]
        return [sos_token, *words, eos_token]

    return {
        "zu_tokens": _prepare(example["zu"]),
        "en_tokens": _prepare(example["en"]),
    }

    
# TODO: maybe try spaCy instead (treat Zulu as English and tokenize both languages with en_nlp?)

In [47]:
# Tokenization settings shared by all three splits.
max_length, lower = 500, True
sos_token, eos_token = "<sos>", "<eos>"

# Keyword arguments forwarded to tokenize_example for every row.
fn_kwargs = {
    "max_length": max_length,
    "lower": lower,
    "sos_token": sos_token,
    "eos_token": eos_token,
}

# Add the zu_tokens / en_tokens fields to every split.
train_data, eval_data, test_data = [
    split.map(tokenize_example, fn_kwargs=fn_kwargs)
    for split in (train_data, eval_data, test_data)
]

Map: 100%|████████████████████████████████████████████████████████████████| 4960/4960 [00:01<00:00, 4141.05 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████| 498/498 [00:00<00:00, 3761.39 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████| 487/487 [00:00<00:00, 3986.90 examples/s]


In [56]:
# build vocabulary
# Only the training split is counted, so tokens that appear solely in the
# eval/test splits are not part of either vocabulary.
min_freq = 1    # maybe change to 2

# Count token frequencies per language in one pass over each token column.
zu_counter = Counter(
    token for tokens in train_data["zu_tokens"] for token in tokens
)
en_counter = Counter(
    token for tokens in train_data["en_tokens"] for token in tokens
)

# Build each vocabulary exactly once, straight from the counts, so min_freq is
# applied. A separate build_vocab_from_iterator pass is unnecessary here: its
# result would be overwritten by these assignments and it does not take
# min_freq into account.
zu_vocab = Vocab(zu_counter, min_freq=min_freq)
en_vocab = Vocab(en_counter, min_freq=min_freq)

4960lines [00:00, 259712.47lines/s]
4960lines [00:00, 206130.77lines/s]


In [49]:
# Show the first ten entries of the Zulu vocabulary via its index-to-token list.
# torchtext adds the <unk> and <pad> tokens on its own; the tokenization step
# above only inserts <sos> and <eos>.
# <unk> is for words that occur in eval and test but not in train.
zu_vocab.itos[:10]

['<unk>', '<pad>', '.', '<eos>', '<sos>', ',', 'ukuthi', ':', '(', ')']

In [63]:
# Row-level converter from tokens to vocabulary indices, for use with .map.
def numericalize_example(example, zu_vocab , en_vocab):
    """Look up the vocabulary index of every token in one example.

    Args:
        example: Mapping holding the token lists "zu_tokens" and "en_tokens".
        zu_vocab: Object exposing a token -> index mapping as ``.stoi``,
            used for the Zulu tokens.
        en_vocab: Same, used for the English tokens.

    Returns:
        A dict with two new fields, "zu_ids" and "en_ids", each a list of
        indices in the same order as the corresponding token list.

    Note:
        What happens for a token missing from ``.stoi`` depends on the mapping
        type: a plain dict raises KeyError. Presumably the torchtext mapping
        falls back to the <unk> index -- TODO confirm.
    """

    def _to_ids(vocab, tokens):
        table = vocab.stoi
        return [table[token] for token in tokens]

    return {
        "zu_ids": _to_ids(zu_vocab, example["zu_tokens"]),
        "en_ids": _to_ids(en_vocab, example["en_tokens"]),
    }

In [64]:
# Apply numericalize_example to every row of each split through .map, passing
# the two vocabularies as keyword arguments.
fn_kwargs = {"zu_vocab": zu_vocab, "en_vocab": en_vocab}

train_data, eval_data, test_data = [
    split.map(numericalize_example, fn_kwargs=fn_kwargs)
    for split in (train_data, eval_data, test_data)
]

Map: 100%|████████████████████████████████████████████████████████████████| 4960/4960 [00:00<00:00, 6394.89 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████| 498/498 [00:00<00:00, 5725.98 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████| 487/487 [00:00<00:00, 6191.18 examples/s]


In [66]:
# Show the first training example: alongside the raw text and the token lists
# it now carries the new zu_ids and en_ids features added by numericalize_example.
train_data[0]

{'zu': 'Lomkhakha kufanele uthuthukiswe, lawa amazwi kaSomlomo, uLogie Naidoo ekhuluma kwinkomfa nombukiso wobuciko ebuse Durban ICC ngesonto elidlule.',
 'en': 'This sector needs to be developed,  These are the words of Speaker, Logie Naidoo, from a conference and art exhibition held at the Durban ICC last week.',
 'zu_tokens': ['<sos>',
  'lomkhakha',
  'kufanele',
  'uthuthukiswe',
  ',',
  'lawa',
  'amazwi',
  'kasomlomo',
  ',',
  'ulogie',
  'naidoo',
  'ekhuluma',
  'kwinkomfa',
  'nombukiso',
  'wobuciko',
  'ebuse',
  'durban',
  'icc',
  'ngesonto',
  'elidlule',
  '.',
  '<eos>'],
 'en_tokens': ['<sos>',
  'this',
  'sector',
  'needs',
  'to',
  'be',
  'developed',
  ',',
  'these',
  'are',
  'the',
  'words',
  'of',
  'speaker',
  ',',
  'logie',
  'naidoo',
  ',',
  'from',
  'a',
  'conference',
  'and',
  'art',
  'exhibition',
  'held',
  'at',
  'the',
  'durban',
  'icc',
  'last',
  'week',
  '.',
  '<eos>'],
 'zu_ids': [4,
  5501,
  43,
  20930,
  5,
  1898,
  

In [71]:
# Have the index columns returned as PyTorch tensors; every other column is
# still returned (output_all_columns=True).
data_type = "torch"
format_columns = ["zu_ids", "en_ids"]

train_data, eval_data, test_data = [
    split.with_format(
        type=data_type,
        columns=format_columns,
        output_all_columns=True,
    )
    for split in (train_data, eval_data, test_data)
]

# Confirm the new type of the indices.
type(train_data[0]["en_ids"])

torch.Tensor