In [24]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


In [4]:
# load data
with open('dataset/cleaned_train_data_with_diacritics.txt', 'r', encoding='utf-8') as file:
    training_data_diacritized_lines = file.readlines()
with open('dataset/cleaned_train_data_without_diacritics.txt', 'r', encoding='utf-8') as file:
    training_data_lines = file.readlines()
with open('dataset/cleaned_val_data_with_diacritics.txt', 'r', encoding='utf-8') as file:
    validation_data_diacritized_lines = file.readlines()
with open('dataset/cleaned_val_data_without_diacritics.txt', 'r', encoding='utf-8') as file:
    validation_data_lines = file.readlines()

training_data = ""
for line in training_data_lines:
    training_data += ''.join(line.split()).strip()
training_data_diacritized = ""
for line in training_data_diacritized_lines:
    training_data_diacritized += ''.join(line.split()).strip()
validation_data = ""
for line in validation_data_lines:
    validation_data += ''.join(line.split()).strip()
validation_data_diacritized = ""
for line in validation_data_diacritized_lines:
    validation_data_diacritized += ''.join(line.split()).strip()
    
print(training_data[:100])

قولهأوقطعالأوليدهإلخقالالزركشيابنعرفةقولهبلفظيقتضيهكإنكارغيرحديثبالإسلاموجوبماعلموجوبهمنالدينضرورةكإ


In [5]:
# define the labels and their corresponding indices
labels = {
    # fatha
    '\u064E':0,
    # damma
    '\u064F':1,
    # kasra
    '\u0650':2,
    # shadda
    '\u0651':3,
    # sukun
    '\u0652':4,
    # tanween_fatha
    '\u064B':5,
    # tanween_damma
    '\u064C':6,
    # tanween_kasra
    '\u064D':7
}

sequence_to_labels = {
    # fatha
    0:'\u064E',
    # damma
    1:'\u064F',
    # kasra
    2:'\u0650',
    # shadda
    3:'\u0651',
    # sukun
    4:'\u0652',
    # tanween_fatha
    5:'\u064B',
    # tanween_damma
    6:'\u064C',
    # tanween_kasra
    7:'\u064D'
}

In [6]:
# Tokenize the text into sequences at the character level
unique_chars = set(''.join(training_data + validation_data))
diacritization = list(labels.keys())

char_to_index = {char: idx for idx, char in enumerate(unique_chars)}
index_to_char = {idx: char for idx, char in enumerate(unique_chars)}

print(char_to_index)

def text_to_sequence(text):
    return [char_to_index[char] for char in text]

train_sequence = text_to_sequence(training_data)
validation_sequences = text_to_sequence(validation_data)

print("Number of unique characters: ", len(unique_chars))
print(unique_chars)
print(train_sequence[:10])

{'ح': 0, 'ى': 1, 'إ': 2, 'ز': 3, 'ذ': 4, 'آ': 5, 'ف': 6, 'ن': 7, 'س': 8, 'د': 9, 'ع': 10, 'ظ': 11, 'ا': 12, 'ؤ': 13, 'ط': 14, 'غ': 15, 'ة': 16, 'ئ': 17, 'ش': 18, 'م': 19, 'ك': 20, 'ق': 21, 'خ': 22, 'أ': 23, 'ث': 24, 'ء': 25, 'ه': 26, 'ص': 27, 'و': 28, 'ي': 29, 'ب': 30, 'ج': 31, 'ل': 32, 'ض': 33, 'ر': 34, 'ت': 35}
Number of unique characters:  36
{'ح', 'ى', 'إ', 'ز', 'ذ', 'آ', 'ف', 'ن', 'س', 'د', 'ع', 'ظ', 'ا', 'ؤ', 'ط', 'غ', 'ة', 'ئ', 'ش', 'م', 'ك', 'ق', 'خ', 'أ', 'ث', 'ء', 'ه', 'ص', 'و', 'ي', 'ب', 'ج', 'ل', 'ض', 'ر', 'ت'}
[21, 28, 32, 26, 23, 28, 21, 14, 10, 12]


In [None]:
embedding_dim = 100


In [41]:
# Create the embedding layer
embedding = nn.Embedding(len(unique_chars), 100)
# Get sequences of unique chars
sequences = torch.tensor([idx for idx, _ in index_to_char.items()])
# Apply the embedding layer to get the embedding vectors
embedding_vectors = embedding(sequences)

print(embedding_vectors.shape)
print(embedding_vectors)


torch.Size([36, 100])
tensor([[ 1.5781, -0.0868,  1.5066,  ...,  0.5900, -1.2241, -0.7719],
        [-0.5115, -0.4769, -1.1611,  ..., -0.8252, -0.2730, -1.1118],
        [ 0.2217,  1.0923,  0.0639,  ..., -1.4227, -0.4039, -2.2380],
        ...,
        [ 0.3264,  0.8257,  0.9764,  ..., -0.4155, -0.7212, -0.0115],
        [ 1.9498, -0.1098,  0.3070,  ...,  0.1984, -1.2506, -1.3537],
        [-0.0625, -2.2053, -1.8244,  ..., -0.8968,  0.5208,  0.1004]],
       grad_fn=<EmbeddingBackward0>)


In [45]:
print(embedding_vectors.shape)
print(index_to_char)
# Print character embeddings
for idx, char in index_to_char.items():
    print(f'{char}: {embedding_vectors[idx]}')

torch.Size([36, 100])
{0: 'ح', 1: 'ى', 2: 'إ', 3: 'ز', 4: 'ذ', 5: 'آ', 6: 'ف', 7: 'ن', 8: 'س', 9: 'د', 10: 'ع', 11: 'ظ', 12: 'ا', 13: 'ؤ', 14: 'ط', 15: 'غ', 16: 'ة', 17: 'ئ', 18: 'ش', 19: 'م', 20: 'ك', 21: 'ق', 22: 'خ', 23: 'أ', 24: 'ث', 25: 'ء', 26: 'ه', 27: 'ص', 28: 'و', 29: 'ي', 30: 'ب', 31: 'ج', 32: 'ل', 33: 'ض', 34: 'ر', 35: 'ت'}
ح: tensor([ 1.5781, -0.0868,  1.5066, -0.0182,  0.0481, -0.2390, -0.0972,  0.2942,
         1.3686,  0.5327, -0.7098,  0.3583,  0.0358, -0.0853,  0.9069, -1.2180,
        -1.9639, -0.5615, -3.1505,  0.3883,  0.8185,  0.2087, -1.7405, -0.2391,
        -1.1850,  0.2003, -0.0325, -0.2991, -0.1429, -1.3346, -0.0176, -0.8611,
        -1.4873, -0.0081,  0.5568, -0.8177,  1.3593,  0.3312, -1.2980,  0.2650,
         2.1433,  1.8328, -0.6150, -1.1588,  1.5270,  0.0733,  1.8880, -1.2962,
         0.1995,  0.8504, -0.1595, -1.2263, -2.0970, -0.7885,  1.4385, -0.1767,
        -0.8182, -0.2204, -0.7979,  1.0873,  0.1261,  1.5843, -0.3487,  0.1182,
         0.2236, -1.