In [None]:
import pickle
import re
import numpy as np

from tensorflow.keras.preprocessing.sequence import pad_sequences
from utils.utils import separate_diacritics
from utils.utils import preprocess

In [None]:
# Read the raw corpora in binary mode and decode explicitly as UTF-8, so the
# text is taken exactly as stored (no newline translation by text mode).
with open('data/train.txt', 'rb') as train_file:
    train_data = train_file.read().decode('utf-8')

# The unpickled object is iterated and its items are joined into one string,
# which is the form handed to preprocess() as its second argument.
with open("utils/diacritics.pickle", "rb") as diacritics_file:
    diacritics_chars = "".join(pickle.load(diacritics_file))

with open('data/val.txt', 'rb') as val_file:
    val_data = val_file.read().decode('utf-8')

# --- training split ---
print(f"Original train data size: {len(train_data)} characters")
cleaned_train_data = preprocess(train_data, diacritics_chars)
print(f"Cleaned train data size: {len(cleaned_train_data)} characters")

# --- validation split ---
print(f"Original val data size: {len(val_data)} characters")
cleaned_val_data = preprocess(val_data, diacritics_chars)
print(f"Cleaned val data size: {len(cleaned_val_data)} characters")

# Persist both cleaned corpora (train first, then val) for the next cell to reload.
for cleaned_path, cleaned_text in (
    ('data/cleaned_train.txt', cleaned_train_data),
    ('data/cleaned_val.txt', cleaned_val_data),
):
    with open(cleaned_path, 'w', encoding='utf-8') as cleaned_file:
        cleaned_file.write(cleaned_text)

In [23]:
# Reload the cleaned corpora from disk as UTF-8 text.
with open('data/cleaned_train.txt', 'r', encoding='utf-8') as train_file:
    train_data = train_file.read()

with open('data/cleaned_val.txt', 'r', encoding='utf-8') as val_file:
    val_data = val_file.read()

# Characters that end a "sentence". This is a set, so a character listed more
# than once has no extra effect.
split_punct = {",", ".", "،", ":", "?", "؟", "؛", "«", "»", "،", "\n"}

# Build the character-class pattern once and apply it to both splits.
# Order inside a character class does not matter, so set ordering is irrelevant.
sentence_boundary = f"[{re.escape(''.join(split_punct))}]"
train_sentences = re.split(sentence_boundary, train_data)
val_sentences = re.split(sentence_boundary, val_data)

print(f"Total train_sentences: {len(train_sentences)}")
print(f"Total val_sentences: {len(val_sentences)}")

Total train_sentences: 225723
Total val_sentences: 11015


In [24]:
# Keep only fragments that still contain something after whitespace is stripped.
# The fragments themselves are left unstripped here.
train_sentences = [fragment for fragment in train_sentences if fragment.strip()]
val_sentences = [fragment for fragment in val_sentences if fragment.strip()]

print(f"Total train sentences after removing empty ones: {len(train_sentences)}")
print(f"Total val sentences after removing empty ones: {len(val_sentences)}")

Total train sentences after removing empty ones: 185074
Total val sentences after removing empty ones: 9000


In [25]:
# Disabled one-off step, kept for reference only. When enabled it loads
# utils/arabic_letters.pickle, adds ' ' and '<PAD>' to it, assigns each entry
# its enumeration position as an index, and writes the result to
# utils/letter2idx.pickle.
# NOTE(review): `letters` is extended with .add(), so it is presumably a set.
# If so, enumerate() order (and therefore every index) is not guaranteed to be
# the same between interpreter runs, and regenerating the file could disagree
# with data already encoded using the saved mapping -- confirm before re-running.

# with open('utils/arabic_letters.pickle', 'rb') as f:
#     letters = pickle.load(f)

# letters.add(' ')
# letters.add('<PAD>')

# letter2idx = {letter: idx for idx, letter in enumerate(letters)}

# with open('utils/letter2idx.pickle', 'wb') as f:
#     pickle.dump(letter2idx, f)


In [26]:
# Disabled one-off step, kept for reference only. When enabled it loads
# utils/diacritic2id.pickle, adds a '<PAD>' key, rebuilds the mapping so that
# each key's index is its enumeration position (the previously stored values
# are not reused), and writes the result back to the SAME file it read from.
# NOTE(review): because the input file is overwritten, the original mapping is
# not recoverable from disk afterwards -- keep a backup before re-enabling.

# with open('utils/diacritic2id.pickle', 'rb') as f:
#     diacritic2id = pickle.load(f)

# diacritic2id['<PAD>'] = len(diacritic2id)

# diacritic2idx = {diacritic: idx for idx, diacritic in enumerate(diacritic2id)}

# with open('utils/diacritic2id.pickle', 'wb') as f:
#     pickle.dump(diacritic2idx, f)


In [27]:
# Load the two lookup tables used to turn characters and diacritics into ids.
with open('utils/diacritic2id.pickle', 'rb') as mapping_file:
    diacritic2idx = pickle.load(mapping_file)

with open('utils/letter2idx.pickle', 'rb') as mapping_file:
    letter2idx = pickle.load(mapping_file)


def _encode_sentences(sentences):
    """Encode sentences as parallel lists of letter ids and diacritic ids.

    Each sentence is stripped, split with separate_diacritics(), and both
    returned sequences are mapped through letter2idx / diacritic2idx.
    Returns a pair (letter_id_sequences, diacritic_id_sequences), with one
    inner list per input sentence, in input order.
    A character or diacritic missing from its table raises KeyError.
    """
    letter_id_sequences = []
    diacritic_id_sequences = []
    for raw_sentence in sentences:
        letters, marks = separate_diacritics(raw_sentence.strip(), diacritic2idx)
        letter_id_sequences.append([letter2idx[letter] for letter in letters])
        diacritic_id_sequences.append([diacritic2idx[mark] for mark in marks])
    return letter_id_sequences, diacritic_id_sequences


X_train, y_train = _encode_sentences(train_sentences)
X_val, y_val = _encode_sentences(val_sentences)

# Show the first encoded example of each split as a quick visual check.
print(X_train[0])
print(y_train[0])
print(X_val[0])
print(y_val[0])

[28, 12, 28, 37, 15, 11, 20, 37, 33, 11, 37, 20, 12, 11, 37, 14, 8, 25, 37, 8, 25, 3, 37, 11, 3, 37, 17, 12, 27, 28, 12, 10, 37, 29, 9, 12, 14, 17, 37, 28, 1, 20, 1, 6, 23, 11, 17, 37, 15, 17, 11, 20, 17]
[0, 0, 6, 14, 0, 0, 0, 14, 2, 8, 14, 0, 4, 0, 14, 0, 6, 0, 14, 2, 6, 5, 14, 4, 6, 14, 14, 6, 2, 14, 0, 14, 14, 0, 0, 0, 0, 14, 14, 0, 2, 4, 14, 2, 2, 0, 14, 14, 0, 14, 4, 1, 14]
[17, 12, 26, 23, 17, 6, 32, 37, 0, 17, 23, 8, 32]
[14, 14, 8, 0, 14, 0, 4, 14, 0, 14, 4, 0, 1]


In [28]:
# Inputs and labels should contain the same number of sequences in each split.
for inputs, labels in ((X_train, y_train), (X_val, y_val)):
    print(len(inputs), len(labels))

185074 185074
9000 9000


In [29]:
# The padded width is the longest sequence seen in either split, so nothing
# is truncated.
max_train_length = max(map(len, X_train))
max_val_length = max(map(len, X_val))
max_length = max(max_train_length, max_val_length)
print(f"Maximum sequence length: {max_length}")

# Inputs and labels each have their own padding id.
letter_pad_id = letter2idx['<PAD>']
diacritic_pad_id = diacritic2idx['<PAD>']


def _pad_post(sequences, pad_value):
    """Right-pad every sequence to max_length using pad_value."""
    return pad_sequences(
        sequences,
        maxlen=max_length,
        padding='post',
        value=pad_value,
    )


X_train_padded = _pad_post(X_train, letter_pad_id)
y_train_padded = _pad_post(y_train, diacritic_pad_id)
X_val_padded = _pad_post(X_val, letter_pad_id)
y_val_padded = _pad_post(y_val, diacritic_pad_id)

# --- training split report ---
print(f"X_train_padded shape: {X_train_padded.shape}")
print(f"y_train_padded shape: {y_train_padded.shape}")

print(f"First sequence before padding: {len(X_train[0])} chars")
print(f"First sequence after padding: {len(X_train_padded[0])} chars")
print(f"Sample padded X: {X_train_padded[0][:15]}...")
print(f"Sample padded y: {y_train_padded[0][:15]}...")

print(f"PAD token index: {letter2idx['<PAD>']}")
print(f"PAD diacritic index: {diacritic2idx['<PAD>']}")

# --- validation split report ---
print(f"X_val_padded shape: {X_val_padded.shape}")
print(f"y_val_padded shape: {y_val_padded.shape}")

print(f"First val sequence before padding: {len(X_val[0])} chars")
print(f"First val sequence after padding: {len(X_val_padded[0])} chars")
print(f"Sample padded X_val: {X_val_padded[0][:15]}...")
print(f"Sample padded y_val: {y_val_padded[0][:15]}...")

print(f"PAD token index: {letter2idx['<PAD>']}")
print(f"PAD diacritic index: {diacritic2idx['<PAD>']}")

Maximum sequence length: 1236
X_train_padded shape: (185074, 1236)
y_train_padded shape: (185074, 1236)
First sequence before padding: 53 chars
First sequence after padding: 1236 chars
Sample padded X: [28 12 28 37 15 11 20 37 33 11 37 20 12 11 37]...
Sample padded y: [ 0  0  6 14  0  0  0 14  2  8 14  0  4  0 14]...
PAD token index: 13
PAD diacritic index: 15
X_val_padded shape: (9000, 1236)
y_val_padded shape: (9000, 1236)
First val sequence before padding: 13 chars
First val sequence after padding: 1236 chars
Sample padded X_val: [17 12 26 23 17  6 32 37  0 17 23  8 32 13 13]...
Sample padded y_val: [14 14  8  0 14  0  4 14  0 14  4  0  1 15 15]...
PAD token index: 13
PAD diacritic index: 15


In [30]:
# The padded objects already expose .shape (they are printed that way in the
# padding cell), i.e. they are NumPy arrays. np.array() would duplicate each of
# them in memory; np.asarray() returns an existing ndarray as-is and only
# converts if it is handed something else (e.g. a plain list).
X_train_padded = np.asarray(X_train_padded)
y_train_padded = np.asarray(y_train_padded)

X_val_padded = np.asarray(X_val_padded)
y_val_padded = np.asarray(y_val_padded)

# Final shape check before writing to disk.
print(f"X_train_padded shape: {X_train_padded.shape}")
print(f"y_train_padded shape: {y_train_padded.shape}")

print(f"X_val_padded shape: {X_val_padded.shape}")
print(f"y_val_padded shape: {y_val_padded.shape}")

# Persist the padded splits as .npy files under data/.
np.save('data/X_train.npy', X_train_padded)
np.save('data/y_train.npy', y_train_padded)

np.save('data/X_val.npy', X_val_padded)
np.save('data/y_val.npy', y_val_padded)

X_train_padded shape: (185074, 1236)
y_train_padded shape: (185074, 1236)
X_val_padded shape: (9000, 1236)
y_val_padded shape: (9000, 1236)
