In [16]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim


In [12]:
# load data
with open('dataset/cleaned_train_data_with_diacritics.txt', 'r', encoding='utf-8') as file:
    training_data_diacritized_lines = file.readlines()
with open('dataset/cleaned_train_data_without_diacritics.txt', 'r', encoding='utf-8') as file:
    training_data_lines = file.readlines()
with open('dataset/cleaned_val_data_with_diacritics.txt', 'r', encoding='utf-8') as file:
    validation_data_diacritized_lines = file.readlines()
with open('dataset/cleaned_val_data_without_diacritics.txt', 'r', encoding='utf-8') as file:
    validation_data_lines = file.readlines()

training_data = ""
for line in training_data_lines:
    training_data += ''.join(line.split()).strip()
training_data_diacritized = ""
for line in training_data_diacritized_lines:
    training_data_diacritized += ''.join(line.split()).strip()
validation_data = ""
for line in validation_data_lines:
    validation_data += ''.join(line.split()).strip()
validation_data_diacritized = ""
for line in validation_data_diacritized_lines:
    validation_data_diacritized += ''.join(line.split()).strip()
    
print(training_data[:100])

قولهأوقطعالأوليدهإلخقالالزركشيابنعرفةقولهبلفظيقتضيهكإنكارغيرحديثبالإسلاموجوبماعلموجوبهمنالدينضرورةكإ


In [13]:
# define the labels and their corresponding indices
labels = {
    # fatha
    '\u064E':0,
    # damma
    '\u064F':1,
    # kasra
    '\u0650':2,
    # shadda
    '\u0651':3,
    # sukun
    '\u0652':4,
    # tanween_fatha
    '\u064B':5,
    # tanween_damma
    '\u064C':6,
    # tanween_kasra
    '\u064D':7
}

sequence_to_labels = {
    # fatha
    0:'\u064E',
    # damma
    1:'\u064F',
    # kasra
    2:'\u0650',
    # shadda
    3:'\u0651',
    # sukun
    4:'\u0652',
    # tanween_fatha
    5:'\u064B',
    # tanween_damma
    6:'\u064C',
    # tanween_kasra
    7:'\u064D'
}

In [17]:
# Tokenize the text into sequences at the character level
unique_chars = set(''.join(training_data + validation_data))
diacritization = list(labels.keys())

char_to_index = {char: idx for idx, char in enumerate(unique_chars)}
index_to_char = {idx: char for idx, char in enumerate(unique_chars)}

print(char_to_index)

def text_to_sequence(text):
    return [char_to_index[char] for char in text]

train_sequence = text_to_sequence(training_data)
validation_sequences = text_to_sequence(validation_data)

print("Number of unique characters: ", len(unique_chars))
print(unique_chars)
print(train_sequence[:10])

{'ى': 0, 'ل': 1, 'ط': 2, 'ه': 3, 'ث': 4, 'س': 5, 'ف': 6, 'ظ': 7, 'غ': 8, 'ح': 9, 'ي': 10, 'ص': 11, 'إ': 12, 'ت': 13, 'ج': 14, 'ب': 15, 'ئ': 16, 'ا': 17, 'د': 18, 'و': 19, 'ء': 20, 'أ': 21, 'ز': 22, 'ؤ': 23, 'ذ': 24, 'ك': 25, 'ة': 26, 'ن': 27, 'ق': 28, 'ر': 29, 'آ': 30, 'خ': 31, 'ع': 32, 'ش': 33, 'م': 34, 'ض': 35}
Number of unique characters:  36
{'ى', 'ل', 'ط', 'ه', 'ث', 'س', 'ف', 'ظ', 'غ', 'ح', 'ي', 'ص', 'إ', 'ت', 'ج', 'ب', 'ئ', 'ا', 'د', 'و', 'ء', 'أ', 'ز', 'ؤ', 'ذ', 'ك', 'ة', 'ن', 'ق', 'ر', 'آ', 'خ', 'ع', 'ش', 'م', 'ض'}
[28, 19, 1, 3, 21, 19, 28, 2, 32, 17]


In [18]:
# Implementing word embedding using CBOW
# CBOW context window size
context_window = 2

# Define CBOW model
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()
        # embedding layer 
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # linear layer
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context):
        # The forward method specifies how data flows through the model.
        embedded = self.embeddings(context).sum(dim=1)
        output = self.linear(embedded)
        return output
    
# Instantiate CBOW model
embedding_dim = 50
vocab_size = len(unique_chars)
cbow_model = CBOW(vocab_size, embedding_dim)

print(cbow_model)

CBOW(
  (embeddings): Embedding(36, 50)
  (linear): Linear(in_features=50, out_features=36, bias=True)
)
