In [3]:
# Load the cleaned training corpus; every newline-delimited line is treated as one sentence.
with open('data/cleaned_train.txt', 'r', encoding='utf-8') as f:
    big_string = f.read()

# split('\n') yields a trailing empty string when the file ends with a newline,
# so the count printed below may include empty entries.
sentences = big_string.split('\n')

print(f"Total sentences: {len(sentences)}")
print("First 3 sentences:")
# Iterate over a slice rather than range(3): indexing sentences[i] would raise
# IndexError on a corpus with fewer than three lines.
for i, sentence in enumerate(sentences[:3]):
    print(f"Sentence {i}: {sentence[:50]}...")

Total sentences: 50001
First 3 sentences:
Sentence 0: وَلَوْ جَمَعَ ثُمَّ عَلِمَ تَرْكَ رُكْنٍ مِنْ الْأ...
Sentence 1: قَالَ أَبُو زَيْدٍ أَهْلُ تِهَامَةَ يُؤَنِّثُونَ ا...
Sentence 2: بِمَنْزِلَةِ أَهْلِ الذِّمَّةِ إذَا دَخَلُوا قَرْي...


In [4]:
# Collect the lines that are empty or consist solely of whitespace, then report how many there are.
empty_sentences = [line for line in sentences if not line.strip()]
print(f"Empty sentences: {len(empty_sentences)}")

Empty sentences: 1


In [6]:
# Keep only the lines that still contain at least one non-whitespace character.
sentences = [line for line in sentences if line.strip()]
print(f"Total sentences after removing empty ones: {len(sentences)}")

Total sentences after removing empty ones: 50000


In [14]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

# Train a BPE subword tokenizer on the cleaned sentences and save it to disk.
#
# unk_token is set on the model itself: listing "[UNK]" only in the trainer's
# special_tokens reserves a vocabulary slot for it but does not tell the BPE
# model to use it, so symbols outside the learned vocabulary would have no
# token to map to at encode time.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
# Split on whitespace/punctuation first so merges never cross word boundaries.
tokenizer.pre_tokenizer = Whitespace()

trainer = BpeTrainer(
    vocab_size=20000,
    special_tokens=["[UNK]", "[PAD]", "[CLS]", "[SEP]"]
)

tokenizer.train_from_iterator(sentences, trainer)
tokenizer.save('tokenizer.json')

print("BPE tokenizer trained!")
print(f"Vocabulary size: {tokenizer.get_vocab_size()}")

# Sanity check: show how the first training sentence is segmented.
test_tokens = tokenizer.encode(sentences[0]).tokens
print(f"First sentence: {sentences[0]}")
print(f"First sentence tokens: {test_tokens}")




BPE tokenizer trained!
Vocabulary size: 20000
First sentence: وَلَوْ جَمَعَ ثُمَّ عَلِمَ تَرْكَ رُكْنٍ مِنْ الْأُولَى بَطَلَتَا وَيُعِيدُهُمَا جَامِعًا ، أَوْ مِنْ الثَّانِيَةِ ، فَإِنْ لَمْ يَطُلْ تَدَارَكَ ، وَإِلَّا فَبَاطِلَةٌ وَلَا جَمَعَ ، وَلَوْ جَهِلَ أَعَادَهُمَا لِوَقْتَيْهِمَا
First sentence tokens: ['وَلَوْ', 'جَمَعَ', 'ثُمَّ', 'عَلِمَ', 'تَرْكَ', 'رُكْنٍ', 'مِنْ', 'الْأُولَى', 'بَطَلَ', 'تَا', 'وَيُ', 'عِيدُ', 'هُمَا', 'جَامِ', 'عًا', '،', 'أَوْ', 'مِنْ', 'الثَّانِيَةِ', '،', 'فَإِنْ', 'لَمْ', 'يَطُلْ', 'تَدَا', 'رَكَ', '،', 'وَإِلَّا', 'فَ', 'بَاطِلَةٌ', 'وَلَا', 'جَمَعَ', '،', 'وَلَوْ', 'جَهِلَ', 'أَعَادَ', 'هُمَا', 'لِ', 'وَقْتَ', 'يْهِمَا']


In [None]:
import pickle

# Build a letter -> integer index lookup from the stored letter collection and persist it.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open('utils/arabic_letters.pickle', 'rb') as f:
    letters = pickle.load(f)

# The container type of `letters` is not visible here. If it is a set, the
# iteration order of its string elements can differ between interpreter
# processes (string hash randomization), which would make the saved indices
# change on every re-run. Sorting gives a reproducible mapping; any other
# container keeps its stored order. `letters` itself is left untouched.
if isinstance(letters, (set, frozenset)):
    ordered_letters = sorted(letters)
else:
    ordered_letters = letters

letter2idx = {letter: idx for idx, letter in enumerate(ordered_letters)}

with open('utils/letter2idx.pickle', 'wb') as f:
    pickle.dump(letter2idx, f)
