In [16]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim


In [12]:
# load data
with open('dataset/cleaned_train_data_with_diacritics.txt', 'r', encoding='utf-8') as file:
    training_data_diacritized_lines = file.readlines()
with open('dataset/cleaned_train_data_without_diacritics.txt', 'r', encoding='utf-8') as file:
    training_data_lines = file.readlines()
with open('dataset/cleaned_val_data_with_diacritics.txt', 'r', encoding='utf-8') as file:
    validation_data_diacritized_lines = file.readlines()
with open('dataset/cleaned_val_data_without_diacritics.txt', 'r', encoding='utf-8') as file:
    validation_data_lines = file.readlines()

training_data = ""
for line in training_data_lines:
    training_data += ''.join(line.split()).strip()
training_data_diacritized = ""
for line in training_data_diacritized_lines:
    training_data_diacritized += ''.join(line.split()).strip()
validation_data = ""
for line in validation_data_lines:
    validation_data += ''.join(line.split()).strip()
validation_data_diacritized = ""
for line in validation_data_diacritized_lines:
    validation_data_diacritized += ''.join(line.split()).strip()
    
print(training_data[:100])

قولهأوقطعالأوليدهإلخقالالزركشيابنعرفةقولهبلفظيقتضيهكإنكارغيرحديثبالإسلاموجوبماعلموجوبهمنالدينضرورةكإ


In [13]:
# define the labels and their corresponding indices
labels = {
    # fatha
    '\u064E':0,
    # damma
    '\u064F':1,
    # kasra
    '\u0650':2,
    # shadda
    '\u0651':3,
    # sukun
    '\u0652':4,
    # tanween_fatha
    '\u064B':5,
    # tanween_damma
    '\u064C':6,
    # tanween_kasra
    '\u064D':7
}

sequence_to_labels = {
    # fatha
    0:'\u064E',
    # damma
    1:'\u064F',
    # kasra
    2:'\u0650',
    # shadda
    3:'\u0651',
    # sukun
    4:'\u0652',
    # tanween_fatha
    5:'\u064B',
    # tanween_damma
    6:'\u064C',
    # tanween_kasra
    7:'\u064D'
}

In [17]:
# Tokenize the text into sequences at the character level
unique_chars = set(''.join(training_data + validation_data))
diacritization = list(labels.keys())

char_to_index = {char: idx for idx, char in enumerate(unique_chars)}
index_to_char = {idx: char for idx, char in enumerate(unique_chars)}

print(char_to_index)

def text_to_sequence(text):
    return [char_to_index[char] for char in text]

train_sequence = text_to_sequence(training_data)
validation_sequences = text_to_sequence(validation_data)

print("Number of unique characters: ", len(unique_chars))
print(unique_chars)
print(train_sequence[:10])

{'ى': 0, 'ل': 1, 'ط': 2, 'ه': 3, 'ث': 4, 'س': 5, 'ف': 6, 'ظ': 7, 'غ': 8, 'ح': 9, 'ي': 10, 'ص': 11, 'إ': 12, 'ت': 13, 'ج': 14, 'ب': 15, 'ئ': 16, 'ا': 17, 'د': 18, 'و': 19, 'ء': 20, 'أ': 21, 'ز': 22, 'ؤ': 23, 'ذ': 24, 'ك': 25, 'ة': 26, 'ن': 27, 'ق': 28, 'ر': 29, 'آ': 30, 'خ': 31, 'ع': 32, 'ش': 33, 'م': 34, 'ض': 35}
Number of unique characters:  36
{'ى', 'ل', 'ط', 'ه', 'ث', 'س', 'ف', 'ظ', 'غ', 'ح', 'ي', 'ص', 'إ', 'ت', 'ج', 'ب', 'ئ', 'ا', 'د', 'و', 'ء', 'أ', 'ز', 'ؤ', 'ذ', 'ك', 'ة', 'ن', 'ق', 'ر', 'آ', 'خ', 'ع', 'ش', 'م', 'ض'}
[28, 19, 1, 3, 21, 19, 28, 2, 32, 17]


In [96]:
# Implementing word embedding using CBOW
# CBOW context window size
context_window = 1

# Define CBOW model
class CBOW(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()
        # embedding layer 
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # linear layer
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, context):
        # The forward method specifies how data flows through the model.
        embedded = self.embeddings(context).sum(dim=1)
        output = self.linear(embedded)
        # now, output is of size [batch_size, vocab_size], we want it to be of size [1, vocab_size]
        # no problem, as it contains arrays, each array of size [1, vocab_size], and all elements of the arrays are the same
        output = output.squeeze(0)[:1, :]
        return output
    
# Instantiate CBOW model
embedding_dim = 20
vocab_size = len(unique_chars)
cbow_model = CBOW(vocab_size, embedding_dim)

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to GPU if available
cbow_model = cbow_model.to(device)

print(cbow_model)

CBOW(
  (embeddings): Embedding(36, 50)
  (linear): Linear(in_features=50, out_features=36, bias=True)
)


In [97]:
# Training the CBOW model
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
# The Adam optimizer is an extension to stochastic gradient descent
optimizer = optim.SGD(cbow_model.parameters(), lr=0.01)

# Training loop
# TODO: make it 50
num_epochs = 3
for epoch in range(num_epochs):
    total_loss = 0
    for i in range(context_window, len(train_sequence) - context_window):
        # get context window, and get 1-hot encoding for each character in the window
        context = torch.tensor(train_sequence[i-context_window:i] + train_sequence[i+1:i+1+context_window])
        # create 1-hot encoding for each position in the context array
        context = torch.nn.functional.one_hot(context, num_classes=vocab_size)
        # get target character
        target = torch.tensor([train_sequence[i]])
        # create 1-hot encoding for target character
        target = torch.nn.functional.one_hot(target, num_classes=vocab_size).float()
        
        # Move tensors to GPU if available
        context, target = context.to(device), target.to(device)

        optimizer.zero_grad()
        output = cbow_model(context)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss}')



KeyboardInterrupt: 

In [89]:
# Get character embeddings
char_embeddings = cbow_model.embeddings.weight.detach().numpy()

print(char_embeddings.shape)

# Print character embeddings
for idx, char in index_to_char.items():
    print(f'{char}: {char_embeddings[idx]}')

(36, 50)
ى: [ 0.00693972 -0.02702003  0.04268251 -0.02664188 -0.0603625   0.03998989
  0.02844273 -0.0110855   0.02957191 -0.06428575  0.01258888 -0.09369535
 -0.03476622  0.01324764  0.02049855 -0.05624948 -0.02535872  0.01915876
 -0.01098966  0.04600329 -0.01591044  0.0281318   0.03814961 -0.02958758
  0.00338939  0.01921298  0.0182109   0.01304104  0.01887392  0.02705112
 -0.0191082  -0.05718313 -0.02194045 -0.0413122   0.03207927  0.02224422
 -0.01703501  0.05798257 -0.06179385 -0.04293663  0.02309863 -0.05869397
  0.04289445  0.02252923  0.01320013  0.01109794 -0.04698668  0.09566227
 -0.00432673 -0.05210153]
ل: [-0.4372555  -0.49678177 -0.9207096   0.8717278   2.2575328  -0.2881389
  0.22909097 -0.15122634 -1.1989657   1.2530845  -0.13212207  2.94664
  1.1043385   0.17992999 -2.0714617   0.6319345   0.50512385 -0.0947079
  0.930131   -1.3984638   1.3758057  -0.11205965 -1.1877218  -0.37510827
  0.09779697 -0.04472011 -0.69965106 -0.50004363 -0.3746642  -0.9417012
 -0.06043649  1.