In [1]:
# install bnunicodenormalizer
!pip install bnunicodenormalizer



In [2]:
!pip install swifter



### imports necessaries

In [3]:
import pandas as pd

from tqdm.auto import tqdm  # for progress bar for iterative elements

from bnunicodenormalizer import Normalizer
bnorm = Normalizer()

import swifter

### Read the data and find unique unicodes

In [4]:
# load train and validation data [n.b. csv files]
train_df = pd.read_csv('../data/given/train.csv')
train_df = train_df[['sentence']]

val_df = pd.read_csv('../data/given/validation.csv')
val_df = val_df[['sentence']]

In [5]:
sens = train_df['sentence'].tolist() + val_df['sentence'].tolist()
print('Number of total sentences: ', len(sens))

Number of total sentences:  214697


In [6]:
# Non normalized vocabularies (unicodes)
vocab = []

for sen in tqdm(sens):
    sen = sen.replace('"', '')
    for c in sen:
        if c not in vocab:
            vocab.append(c)

non_norm_vocab = sorted(vocab)

print("Non normalized vocab(unicodes): [total: {}]".format(len(non_norm_vocab)))
for index, vocab in enumerate(non_norm_vocab):
    if not index % 20: print('\n')
    print("'{}'".format(vocab), end=', ')

  0%|          | 0/214697 [00:00<?, ?it/s]

Non normalized vocab(unicodes): [total: 90]


' ', '!', ''', ',', '-', '.', '/', ':', ';', '=', '?', 'A', 'B', 'V', '©', '।', '॥', 'ঁ', 'ং', 'ঃ', 

'অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 'ঋ', 'এ', 'ঐ', 'ও', 'ঔ', 'ক', 'খ', 'গ', 'ঘ', 'ঙ', 'চ', 'ছ', 'জ', 'ঝ', 

'ঞ', 'ট', 'ঠ', 'ড', 'ঢ', 'ণ', 'ত', 'থ', 'দ', 'ধ', 'ন', 'প', 'ফ', 'ব', 'ভ', 'ম', 'য', 'র', 'ল', 'শ', 

'ষ', 'স', 'হ', '়', 'া', 'ি', 'ী', 'ু', 'ূ', 'ৃ', 'ে', 'ৈ', 'ো', 'ৌ', '্', 'ৎ', 'ড়', 'ঢ়', 'য়', 'ৰ', 

'৵', '৷', '–', '—', '‘', '’', '‚', '“', '”', '…', 

### Unicodes that look the same, but are not same

In [7]:
print("Index of '–': ", non_norm_vocab.index('–'))
print("Index of '—': ", non_norm_vocab.index('—'))
print("Are they same? : ", '–' == '—')

Index of '–':  82
Index of '—':  83
Are they same? :  False


In [8]:
print("Index of '।': ", non_norm_vocab.index('।'))
print("Index of '৷': ", non_norm_vocab.index('৷'))
print("Are they same? : ", '।' == '৷')

Index of '।':  15
Index of '৷':  81
Are they same? :  False


In [9]:
print("Index of ',': ", non_norm_vocab.index(','))
print("Index of '‚': ", non_norm_vocab.index('‚'))
print("Are they same? : ", ',' == '‚')

Index of ',':  3
Index of '‚':  86
Are they same? :  False


But the most dangerous thing in this text is _Nukta_ '্'

In [10]:
print("Index of Nukta: ", non_norm_vocab.index('্'))

Index of Nukta:  74


* What is this Nukta '্' mean, after breaking the word?
* Let's consider an example: 'কেন্দ্রীয়'=='কেন্দ্রীয়'
    * Both look the same... are they?

In [11]:
'কেন্দ্রীয়'=='কেন্দ্রীয়'

False

But why? ... **Because the first one contains Nukta '্'**

In [12]:
print("First one: ", [f for f in 'কেন্দ্রীয়'])
print("Second one: ", [f for f in 'কেন্দ্রীয়'])

First one:  ['ক', 'ে', 'ন', '্', 'দ', '্', 'র', 'ী', 'য', '়']
Second one:  ['ক', 'ে', 'ন', '্', 'দ', '্', 'র', 'ী', 'য়']


### Normalization
Now let's solve these problems by normalizing

In [13]:
# Normalize method to normalize a single sentence
def normalize(sen):
    _words = [bnorm(word)['normalized'] for word in sen.split()]
    return ' '.join([word for word in _words if word is not None])

In [14]:
val_df['sentence'] = val_df['sentence'].swifter.apply(lambda x: normalize(x))
train_df['sentence'] = train_df['sentence'].swifter.apply(lambda x: normalize(x))

sens = train_df['sentence'].tolist() + val_df['sentence'].tolist()
print('Number of total sentences:', len(sens))

Pandas Apply:   0%|          | 0/7747 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/206950 [00:00<?, ?it/s]

Number of total sentences: 214697


In [17]:
vocab = []

for sen in tqdm(sens):
    sen = sen.replace('"', '')
    for c in sen:
        if c not in vocab:
            vocab.append(c)

norm_vocab = sorted(vocab)

print("Normalized vocab(unicodes): [total: {}]".format(len(norm_vocab)))
for index, vocab in enumerate(norm_vocab):
    if not index % 20: print('\n')
    print("'{}'".format(vocab), end=', ')

  0%|          | 0/214697 [00:00<?, ?it/s]

Normalized vocab(unicodes): [total: 74]


' ', '!', ''', ',', '-', '.', ':', ';', '=', '?', '।', 'ঁ', 'ং', 'ঃ', 'অ', 'আ', 'ই', 'ঈ', 'উ', 'ঊ', 

'ঋ', 'এ', 'ঐ', 'ও', 'ঔ', 'ক', 'খ', 'গ', 'ঘ', 'ঙ', 'চ', 'ছ', 'জ', 'ঝ', 'ঞ', 'ট', 'ঠ', 'ড', 'ঢ', 'ণ', 

'ত', 'থ', 'দ', 'ধ', 'ন', 'প', 'ফ', 'ব', 'ভ', 'ম', 'য', 'র', 'ল', 'শ', 'ষ', 'স', 'হ', 'া', 'ি', 'ী', 

'ু', 'ূ', 'ৃ', 'ে', 'ৈ', 'ো', 'ৌ', '্', 'ৎ', 'ড়', 'ঢ়', 'য়', '—', '”', 

In [19]:
# Removed symbols
remove_count = 0
print("Removed symbols: ")
for c in non_norm_vocab:
    if c not in norm_vocab:
        print("'{}'".format(c), end=', ')
        remove_count += 1

print("Total removed symbols: ", remove_count)

Removed symbols: 
'/', 'A', 'B', 'V', '©', '॥', '়', 'ৰ', '৵', '৷', '–', '‘', '’', '‚', '“', '…', Total removed symbols:  16


* We will also remove '—' from normalized vocab since '-' has the same functional value
* We also have to consider "\u200d" to cover words like - র‍্যাব, "র‍্যাকেট", "র‍্যাশানাল"

In [20]:
words = ["র‍্যাব", "র‍্যাকেট", "র‍্যাশানাল"]
for word in words:
    print(word)

র‍্যাব
র‍্যাকেট
র‍্যাশানাল


basically any word that has

In [24]:
print('র'+'\u200d'+'্'+'য')

র‍্য


in the text, simply adding 'র'+'্'+'য' result in -

In [25]:
print('র'+'্'+'য')

র্য


final vocab we can go with (or even remove some puntuations and numbers if we want)

In [26]:
vocab=[ '\u200d',
        ' ','!',"'",',','-','.',':',';','=','?','।',
        'ঁ','ং','ঃ',
        'অ','আ','ই','ঈ','উ','ঊ','ঋ','এ','ঐ','ও','ঔ',
        'ক','খ','গ','ঘ','ঙ',
        'চ','ছ','জ','ঝ','ঞ',
        'ট','ঠ','ড','ঢ','ণ',
        'ত','থ','দ','ধ','ন',
        'প','ফ','ব','ভ','ম',
        'য','র','ল',
        'শ','ষ','স','হ',
        'া','ি','ী','ু','ূ','ৃ','ে','ৈ','ো','ৌ','্',
        'ৎ','ড়','ঢ়','য়',
        '০','১','২','৩','৪','৫','৬','৭','৮','৯']

to explore non norm text try the following -

In [33]:
import unicodedata

text = "বায়ান্ন"
new_text = unicodedata.normalize('NFKC', text)

print("text: ", text, [t for t in text])
print("new text: ", new_text, [t for t in new_text])
print("are they same?: ", text == new_text)

text:  বায়ান্ন ['ব', 'া', 'য়', 'া', 'ন', '্', 'ন']
new text:  বায়ান্ন ['ব', 'া', 'য', '়', 'া', 'ন', '্', 'ন']
are they same?:  False


this will cause problem in WER and CER calculation