In [24]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F


In [4]:
# load data
with open('dataset/cleaned_train_data_with_diacritics.txt', 'r', encoding='utf-8') as file:
    training_data_diacritized_lines = file.readlines()
with open('dataset/cleaned_train_data_without_diacritics.txt', 'r', encoding='utf-8') as file:
    training_data_lines = file.readlines()
with open('dataset/cleaned_val_data_with_diacritics.txt', 'r', encoding='utf-8') as file:
    validation_data_diacritized_lines = file.readlines()
with open('dataset/cleaned_val_data_without_diacritics.txt', 'r', encoding='utf-8') as file:
    validation_data_lines = file.readlines()

training_data = ""
for line in training_data_lines:
    training_data += ''.join(line.split()).strip()
training_data_diacritized = ""
for line in training_data_diacritized_lines:
    training_data_diacritized += ''.join(line.split()).strip()
validation_data = ""
for line in validation_data_lines:
    validation_data += ''.join(line.split()).strip()
validation_data_diacritized = ""
for line in validation_data_diacritized_lines:
    validation_data_diacritized += ''.join(line.split()).strip()
    
print(training_data[:100])

قولهأوقطعالأوليدهإلخقالالزركشيابنعرفةقولهبلفظيقتضيهكإنكارغيرحديثبالإسلاموجوبماعلموجوبهمنالدينضرورةكإ


In [5]:
# define the labels and their corresponding indices
labels = {
    # fatha
    '\u064E':0,
    # damma
    '\u064F':1,
    # kasra
    '\u0650':2,
    # shadda
    '\u0651':3,
    # sukun
    '\u0652':4,
    # tanween_fatha
    '\u064B':5,
    # tanween_damma
    '\u064C':6,
    # tanween_kasra
    '\u064D':7
}

sequence_to_labels = {
    # fatha
    0:'\u064E',
    # damma
    1:'\u064F',
    # kasra
    2:'\u0650',
    # shadda
    3:'\u0651',
    # sukun
    4:'\u0652',
    # tanween_fatha
    5:'\u064B',
    # tanween_damma
    6:'\u064C',
    # tanween_kasra
    7:'\u064D'
}

In [6]:
# Tokenize the text into sequences at the character level
unique_chars = set(''.join(training_data + validation_data))
diacritization = list(labels.keys())

char_to_index = {char: idx for idx, char in enumerate(unique_chars)}
index_to_char = {idx: char for idx, char in enumerate(unique_chars)}

print(char_to_index)

def text_to_sequence(text):
    return [char_to_index[char] for char in text]

train_sequence = text_to_sequence(training_data)
validation_sequences = text_to_sequence(validation_data)

print("Number of unique characters: ", len(unique_chars))
print(unique_chars)
print(train_sequence[:10])

{'ح': 0, 'ى': 1, 'إ': 2, 'ز': 3, 'ذ': 4, 'آ': 5, 'ف': 6, 'ن': 7, 'س': 8, 'د': 9, 'ع': 10, 'ظ': 11, 'ا': 12, 'ؤ': 13, 'ط': 14, 'غ': 15, 'ة': 16, 'ئ': 17, 'ش': 18, 'م': 19, 'ك': 20, 'ق': 21, 'خ': 22, 'أ': 23, 'ث': 24, 'ء': 25, 'ه': 26, 'ص': 27, 'و': 28, 'ي': 29, 'ب': 30, 'ج': 31, 'ل': 32, 'ض': 33, 'ر': 34, 'ت': 35}
Number of unique characters:  36
{'ح', 'ى', 'إ', 'ز', 'ذ', 'آ', 'ف', 'ن', 'س', 'د', 'ع', 'ظ', 'ا', 'ؤ', 'ط', 'غ', 'ة', 'ئ', 'ش', 'م', 'ك', 'ق', 'خ', 'أ', 'ث', 'ء', 'ه', 'ص', 'و', 'ي', 'ب', 'ج', 'ل', 'ض', 'ر', 'ت'}
[21, 28, 32, 26, 23, 28, 21, 14, 10, 12]


In [7]:
# Model parameters
# CBOW context window size
context_window = 5
learning_rate = 0.01
embedding_dim = 100
num_epochs = 20

In [31]:
# Implementing word embedding using CBOW

# Define CBOW model
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()
        # embedding layer 
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # linear layer
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context):
        # The forward method specifies how data flows through the model.
        # now, embedded is of size [batch_size, embedded_dim], we want it to be of size [1, embedded_dim]
        # no problem, as it contains arrays, each array of size [1, embedded_dim], and all elements of the arrays are the same
        embedded = self.embeddings(context).mean(dim=1).squeeze(0)[:1, :]
        output = self.linear(embedded)
        probabilities = F.softmax(output, dim=1)
        return probabilities
    
# Instantiate CBOW model
vocab_size = len(unique_chars)
cbow_model = CBOW(vocab_size, embedding_dim)

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to GPU if available
cbow_model = cbow_model.to(device)

print(cbow_model)

CBOW(
  (embeddings): Embedding(36, 100)
  (linear): Linear(in_features=100, out_features=36, bias=True)
)


In [29]:
# Training the CBOW model
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
# The Adam optimizer is an extension to stochastic gradient descent
optimizer = optim.SGD(cbow_model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    total_loss = 0
    for i in range(context_window, len(train_sequence) - context_window):
        # get context window, and get 1-hot encoding for each character in the window
        context = torch.tensor(train_sequence[i-context_window:i] + train_sequence[i+1:i+1+context_window])
        # create 1-hot encoding for each position in the context array
        context = F.one_hot(context, num_classes=vocab_size)
        # get target character
        target = torch.tensor([train_sequence[i]])
        # create 1-hot encoding for target character
        target = F.one_hot(target, num_classes=vocab_size).float()
        
        # Move tensors to GPU if available
        context, target = context.to(device), target.to(device)

        optimizer.zero_grad()
        output = cbow_model(context)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss}')



torch.Size([1, 100])
tensor([1.], device='cuda:0', grad_fn=<SumBackward1>)
torch.Size([1, 36])
tensor([[0.0315, 0.0129, 0.0162, 0.0326, 0.0226, 0.0202, 0.0319, 0.0206, 0.0324,
         0.0175, 0.0184, 0.0305, 0.0154, 0.0649, 0.0062, 0.0389, 0.0529, 0.0199,
         0.0198, 0.0100, 0.0206, 0.0330, 0.0202, 0.0313, 0.0137, 0.0452, 0.0055,
         0.0581, 0.0176, 0.0304, 0.0166, 0.0232, 0.0244, 0.0482, 0.0400, 0.0566]],
       device='cuda:0', grad_fn=<SoftmaxBackward0>)
torch.Size([1, 100])
tensor([1.], device='cuda:0', grad_fn=<SumBackward1>)
torch.Size([1, 36])
tensor([[0.0315, 0.0129, 0.0162, 0.0326, 0.0226, 0.0202, 0.0319, 0.0206, 0.0324,
         0.0175, 0.0184, 0.0305, 0.0154, 0.0649, 0.0062, 0.0389, 0.0529, 0.0199,
         0.0198, 0.0100, 0.0206, 0.0330, 0.0202, 0.0313, 0.0137, 0.0452, 0.0055,
         0.0580, 0.0179, 0.0304, 0.0166, 0.0232, 0.0244, 0.0482, 0.0400, 0.0566]],
       device='cuda:0', grad_fn=<SoftmaxBackward0>)
torch.Size([1, 100])
tensor([1.], device='cuda:0', gra

KeyboardInterrupt: 

In [101]:
# Get character embeddings
char_embeddings = cbow_model.embeddings.weight.detach().cpu().numpy()

print(char_embeddings.shape)

# Print character embeddings
for idx, char in index_to_char.items():
    print(f'{char}: {char_embeddings[idx]}')

(36, 20)
ى: [-0.02491802  0.03899876 -0.0858681  -0.05633762 -0.01706139 -0.00362067
 -0.00908654 -0.04218103  0.01387355 -0.01394024  0.0263774   0.00491693
 -0.0042674   0.01571666  0.07048664  0.02466692  0.01340346  0.01285902
  0.06158666  0.0092001 ]
ل: [-0.530507    0.4280205   0.62792784  2.230431    0.39901805  0.90821904
  0.5206144   1.1459837   0.42858866  1.4255207  -0.7981823  -0.65918475
  0.56598693 -0.12815654  0.12077828 -0.69428    -1.1426365  -1.6433033
 -0.21235164 -0.4322641 ]
ط: [-1.3274553   0.9719781  -1.3855953  -0.02773895 -0.27838656 -1.1500709
  1.051754    0.53278327 -1.4847202   0.6330003  -0.37871987  0.32195097
  0.20924708 -0.49564987 -2.1044495  -1.3350732  -0.37433508  1.0329229
 -0.67105204 -0.25660402]
ه: [ 1.5211954  -1.4997439   0.64794385  0.2514645  -0.7202958   0.8731724
  0.47653553  1.475459    0.42314875  0.90437627  0.57568526 -0.2645905
  0.2805538  -0.14809914 -0.38314816 -1.5234482  -0.8347636  -0.10202003
  0.7203185   0.04670541]
ث: [