In [16]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim


In [12]:
# load data
with open('dataset/cleaned_train_data_with_diacritics.txt', 'r', encoding='utf-8') as file:
    training_data_diacritized_lines = file.readlines()
with open('dataset/cleaned_train_data_without_diacritics.txt', 'r', encoding='utf-8') as file:
    training_data_lines = file.readlines()
with open('dataset/cleaned_val_data_with_diacritics.txt', 'r', encoding='utf-8') as file:
    validation_data_diacritized_lines = file.readlines()
with open('dataset/cleaned_val_data_without_diacritics.txt', 'r', encoding='utf-8') as file:
    validation_data_lines = file.readlines()

training_data = ""
for line in training_data_lines:
    training_data += ''.join(line.split()).strip()
training_data_diacritized = ""
for line in training_data_diacritized_lines:
    training_data_diacritized += ''.join(line.split()).strip()
validation_data = ""
for line in validation_data_lines:
    validation_data += ''.join(line.split()).strip()
validation_data_diacritized = ""
for line in validation_data_diacritized_lines:
    validation_data_diacritized += ''.join(line.split()).strip()
    
print(training_data[:100])

قولهأوقطعالأوليدهإلخقالالزركشيابنعرفةقولهبلفظيقتضيهكإنكارغيرحديثبالإسلاموجوبماعلموجوبهمنالدينضرورةكإ


In [13]:
# define the labels and their corresponding indices
labels = {
    # fatha
    '\u064E':0,
    # damma
    '\u064F':1,
    # kasra
    '\u0650':2,
    # shadda
    '\u0651':3,
    # sukun
    '\u0652':4,
    # tanween_fatha
    '\u064B':5,
    # tanween_damma
    '\u064C':6,
    # tanween_kasra
    '\u064D':7
}

sequence_to_labels = {
    # fatha
    0:'\u064E',
    # damma
    1:'\u064F',
    # kasra
    2:'\u0650',
    # shadda
    3:'\u0651',
    # sukun
    4:'\u0652',
    # tanween_fatha
    5:'\u064B',
    # tanween_damma
    6:'\u064C',
    # tanween_kasra
    7:'\u064D'
}

In [17]:
# Tokenize the text into sequences at the character level
unique_chars = set(''.join(training_data + validation_data))
diacritization = list(labels.keys())

char_to_index = {char: idx for idx, char in enumerate(unique_chars)}
index_to_char = {idx: char for idx, char in enumerate(unique_chars)}

print(char_to_index)

def text_to_sequence(text):
    return [char_to_index[char] for char in text]

train_sequence = text_to_sequence(training_data)
validation_sequences = text_to_sequence(validation_data)

print("Number of unique characters: ", len(unique_chars))
print(unique_chars)
print(train_sequence[:10])

{'ى': 0, 'ل': 1, 'ط': 2, 'ه': 3, 'ث': 4, 'س': 5, 'ف': 6, 'ظ': 7, 'غ': 8, 'ح': 9, 'ي': 10, 'ص': 11, 'إ': 12, 'ت': 13, 'ج': 14, 'ب': 15, 'ئ': 16, 'ا': 17, 'د': 18, 'و': 19, 'ء': 20, 'أ': 21, 'ز': 22, 'ؤ': 23, 'ذ': 24, 'ك': 25, 'ة': 26, 'ن': 27, 'ق': 28, 'ر': 29, 'آ': 30, 'خ': 31, 'ع': 32, 'ش': 33, 'م': 34, 'ض': 35}
Number of unique characters:  36
{'ى', 'ل', 'ط', 'ه', 'ث', 'س', 'ف', 'ظ', 'غ', 'ح', 'ي', 'ص', 'إ', 'ت', 'ج', 'ب', 'ئ', 'ا', 'د', 'و', 'ء', 'أ', 'ز', 'ؤ', 'ذ', 'ك', 'ة', 'ن', 'ق', 'ر', 'آ', 'خ', 'ع', 'ش', 'م', 'ض'}
[28, 19, 1, 3, 21, 19, 28, 2, 32, 17]


In [98]:
# Implementing word embedding using CBOW
# CBOW context window size
context_window = 1

# Define CBOW model
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()
        # embedding layer 
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # linear layer
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context):
        # The forward method specifies how data flows through the model.
        embedded = self.embeddings(context).sum(dim=1)
        output = self.linear(embedded)
        # now, output is of size [batch_size, vocab_size], we want it to be of size [1, vocab_size]
        # no problem, as it contains arrays, each array of size [1, vocab_size], and all elements of the arrays are the same
        output = output.squeeze(0)[:1, :]
        return output
    
# Instantiate CBOW model
embedding_dim = 20
vocab_size = len(unique_chars)
cbow_model = CBOW(vocab_size, embedding_dim)

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to GPU if available
cbow_model = cbow_model.to(device)

print(cbow_model)

CBOW(
  (embeddings): Embedding(36, 20)
  (linear): Linear(in_features=20, out_features=36, bias=True)
)


In [99]:
# Training the CBOW model
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
# The Adam optimizer is an extension to stochastic gradient descent
optimizer = optim.SGD(cbow_model.parameters(), lr=0.01)

# Training loop
# TODO: make it 50
num_epochs = 3
for epoch in range(num_epochs):
    total_loss = 0
    for i in range(context_window, len(train_sequence) - context_window):
        # get context window, and get 1-hot encoding for each character in the window
        context = torch.tensor(train_sequence[i-context_window:i] + train_sequence[i+1:i+1+context_window])
        # create 1-hot encoding for each position in the context array
        context = torch.nn.functional.one_hot(context, num_classes=vocab_size)
        # get target character
        target = torch.tensor([train_sequence[i]])
        # create 1-hot encoding for target character
        target = torch.nn.functional.one_hot(target, num_classes=vocab_size).float()
        
        # Move tensors to GPU if available
        context, target = context.to(device), target.to(device)

        optimizer.zero_grad()
        output = cbow_model(context)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss}')



Epoch 1/3, Loss: 578282.0670684576
Epoch 2/3, Loss: 575837.2111845016
Epoch 3/3, Loss: 575845.1363406181


In [101]:
# Get character embeddings
char_embeddings = cbow_model.embeddings.weight.detach().cpu().numpy()

print(char_embeddings.shape)

# Print character embeddings
for idx, char in index_to_char.items():
    print(f'{char}: {char_embeddings[idx]}')

(36, 20)
ى: [-0.02491802  0.03899876 -0.0858681  -0.05633762 -0.01706139 -0.00362067
 -0.00908654 -0.04218103  0.01387355 -0.01394024  0.0263774   0.00491693
 -0.0042674   0.01571666  0.07048664  0.02466692  0.01340346  0.01285902
  0.06158666  0.0092001 ]
ل: [-0.530507    0.4280205   0.62792784  2.230431    0.39901805  0.90821904
  0.5206144   1.1459837   0.42858866  1.4255207  -0.7981823  -0.65918475
  0.56598693 -0.12815654  0.12077828 -0.69428    -1.1426365  -1.6433033
 -0.21235164 -0.4322641 ]
ط: [-1.3274553   0.9719781  -1.3855953  -0.02773895 -0.27838656 -1.1500709
  1.051754    0.53278327 -1.4847202   0.6330003  -0.37871987  0.32195097
  0.20924708 -0.49564987 -2.1044495  -1.3350732  -0.37433508  1.0329229
 -0.67105204 -0.25660402]
ه: [ 1.5211954  -1.4997439   0.64794385  0.2514645  -0.7202958   0.8731724
  0.47653553  1.475459    0.42314875  0.90437627  0.57568526 -0.2645905
  0.2805538  -0.14809914 -0.38314816 -1.5234482  -0.8347636  -0.10202003
  0.7203185   0.04670541]
ث: [