In [1]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pickle
import numpy as np
import re

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu"); print(device)

cuda


In [6]:
# load data
with open('dataset/cleaned_train_data_with_diacritics.txt', 'r', encoding='utf-8') as file:
    training_data_diacritized = re.compile(r'[\n\r\t\s]').sub('', file.read())
with open('dataset/cleaned_train_data_without_diacritics.txt', 'r', encoding='utf-8') as file:
    training_data = re.compile(r'[\n\r\t\s]').sub('', file.read())
with open('dataset/cleaned_val_data_with_diacritics.txt', 'r', encoding='utf-8') as file:
    validation_data_diacritized = re.compile(r'[\n\r\t\s]').sub('', file.read())
with open('dataset/cleaned_val_data_without_diacritics.txt', 'r', encoding='utf-8') as file:
    validation_data = re.compile(r'[\n\r\t\s]').sub('', file.read())

print(training_data[:100])

قولهأوقطعالأوليدهإلخقالالزركشيابنعرفةقولهبلفظيقتضيهكإنكارغيرحديثبالإسلاموجوبماعلموجوبهمنالدينضرورةكإ


In [8]:
# define the labels and their corresponding indices
labels = {
    # fatha
    '\u064E':0,
    # damma
    '\u064F':1,
    # kasra
    '\u0650':2,
    # shadda
    '\u0651':3,
    # sukun
    '\u0652':4,
    # tanween_fatha
    '\u064B':5,
    # tanween_damma
    '\u064C':6,
    # tanween_kasra
    '\u064D':7
}

sequence_to_labels = {
    # fatha
    0:'\u064E',
    # damma
    1:'\u064F',
    # kasra
    2:'\u0650',
    # shadda
    3:'\u0651',
    # sukun
    4:'\u0652',
    # tanween_fatha
    5:'\u064B',
    # tanween_damma
    6:'\u064C',
    # tanween_kasra
    7:'\u064D'
}

In [9]:
# Tokenize the text into sequences at the character level
unique_chars = set(''.join(training_data + validation_data))
diacritization = list(labels.keys())

char_to_index = {char: idx for idx, char in enumerate(unique_chars)}
index_to_char = {idx: char for idx, char in enumerate(unique_chars)}

print(char_to_index)

def text_to_sequence(text):
    return [char_to_index[char] for char in text]

train_sequence = text_to_sequence(training_data)
validation_sequences = text_to_sequence(validation_data)

print("Number of unique characters: ", len(unique_chars))
print(unique_chars)
print(train_sequence[:10])

{'د': 0, 'ظ': 1, 'إ': 2, 'ه': 3, 'ش': 4, 'س': 5, 'غ': 6, 'ض': 7, 'ع': 8, 'ء': 9, 'ط': 10, 'ا': 11, 'ج': 12, 'ي': 13, 'ب': 14, 'خ': 15, 'أ': 16, 'ف': 17, 'ن': 18, 'ؤ': 19, 'ر': 20, 'ح': 21, 'ئ': 22, 'ى': 23, 'ذ': 24, 'ل': 25, 'ز': 26, 'ث': 27, 'آ': 28, 'و': 29, 'ت': 30, 'م': 31, 'ك': 32, 'ق': 33, 'ص': 34, 'ة': 35}
Number of unique characters:  36
{'د', 'ظ', 'إ', 'ه', 'ش', 'س', 'غ', 'ض', 'ع', 'ء', 'ط', 'ا', 'ج', 'ي', 'ب', 'خ', 'أ', 'ف', 'ن', 'ؤ', 'ر', 'ح', 'ئ', 'ى', 'ذ', 'ل', 'ز', 'ث', 'آ', 'و', 'ت', 'م', 'ك', 'ق', 'ص', 'ة'}
[33, 29, 25, 3, 16, 29, 33, 10, 8, 11]


In [10]:
embedding_dim = 100


In [11]:
# Create the embedding layer
embedding = nn.Embedding(len(unique_chars), 100)
# Get sequences of unique chars
sequences = torch.tensor([idx for idx, _ in index_to_char.items()])
# Apply the embedding layer to get the embedding vectors
embedding_vectors = embedding(sequences)

print(embedding_vectors.shape)
print(embedding_vectors)


torch.Size([36, 100])
tensor([[ 0.0828,  1.2768,  0.3655,  ...,  0.6516,  0.4314, -1.0468],
        [ 1.9343, -0.5438, -0.7299,  ..., -0.9210,  0.6727, -0.6077],
        [-1.1366, -1.4670, -1.5183,  ...,  1.8812,  0.3244,  0.9405],
        ...,
        [ 0.5995,  2.3082,  1.1846,  ...,  0.9747, -0.2665,  0.7691],
        [ 0.1318,  1.6042, -0.4952,  ...,  0.4760, -0.9791,  0.1288],
        [-1.4619, -0.5068,  0.5793,  ..., -0.2832, -0.3736,  0.3120]],
       grad_fn=<EmbeddingBackward0>)


In [12]:
print(embedding_vectors.shape)
print(index_to_char)
# Print character embeddings
for idx, char in index_to_char.items():
    print(f'{char}: {embedding_vectors[idx]}')

torch.Size([36, 100])
{0: 'د', 1: 'ظ', 2: 'إ', 3: 'ه', 4: 'ش', 5: 'س', 6: 'غ', 7: 'ض', 8: 'ع', 9: 'ء', 10: 'ط', 11: 'ا', 12: 'ج', 13: 'ي', 14: 'ب', 15: 'خ', 16: 'أ', 17: 'ف', 18: 'ن', 19: 'ؤ', 20: 'ر', 21: 'ح', 22: 'ئ', 23: 'ى', 24: 'ذ', 25: 'ل', 26: 'ز', 27: 'ث', 28: 'آ', 29: 'و', 30: 'ت', 31: 'م', 32: 'ك', 33: 'ق', 34: 'ص', 35: 'ة'}
د: tensor([ 0.0828,  1.2768,  0.3655,  1.4360, -0.6314, -1.6149,  0.6157,  1.8519,
        -0.0042,  0.5802,  0.0650,  0.0211,  0.1921,  0.1649,  1.1179,  1.0314,
         0.6597, -0.0376,  1.3617,  0.0397,  0.3935, -1.1012, -2.2355, -0.3061,
        -1.2561,  2.5552, -0.9204,  1.3314, -1.0655,  0.6096, -0.9947, -1.1143,
        -0.9542,  0.0387,  1.3768, -0.0646, -0.9791,  0.2031, -1.1209, -0.9599,
        -1.0819, -1.2827, -1.3782, -2.4927, -0.0544, -0.4447,  0.1446,  0.5300,
        -0.8599, -1.3236,  0.0833, -0.5426, -0.8103,  1.3424,  0.4589, -0.1129,
        -0.6621, -0.1347,  1.1366,  1.0769, -0.4538, -0.1665, -0.7314,  0.6223,
         0.4506, -0.

In [13]:
# save the embedding vectors in pickle file
with open('embedding_vectors.pickle', 'wb') as file:
    pickle.dump(embedding_vectors, file)

In [14]:
# load the embedding vectors from pickle file
with open('embedding_vectors.pickle', 'rb') as file:
    embedding_vectors = pickle.load(file)

In [20]:
# extract the labels of the training data based on the diacritization labels
train_labels = []

is_bared = False
for char in training_data_diacritized:
    if char in diacritization and char != '\u0651':
        train_labels.append(labels[char])
        is_bared = False
    elif is_bared == True:
        train_labels.append(-1)
        is_bared = False
    else: 
        is_bared = True

print(len(train_labels))

In [19]:
print(len(train_sequence))
print(train_sequence[:20])
print(training_data_diacritized[:20]) # 8771713
print(training_data[:10])

8351478
[33, 29, 25, 3, 16, 29, 33, 10, 8, 11, 25, 16, 29, 25, 13, 0, 3, 2, 25, 15]
قَوْلُهُأَوْقَطَعَال
قولهأوقطعا


# B-LSTM Model
## Creating The Model

In [74]:
# Create the model B-LSTM
class SequenceModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim,  hidden_dim, output_dim,  num_layers, drop_prob=0.5):
        super().__init__()

        # Network Dimensions
        self.vocab_size, self.hidden_dim, self.output_dim  = vocab_size, hidden_dim, output_dim
        self.num_layers = num_layers
    
        # Embedding Layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # LSTM Layer
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=self.hidden_dim, num_layers=num_layers, batch_first=True)
        
        # Drop out layer
        self.dropout = nn.Dropout(drop_prob)
    
        # Output Layer
        self.output = nn.Linear(self.hidden_dim, output_dim)      # Or make it linear and use Sofrmax (in loss)

    def forward(self, x):
        
        x = self.embedding(x)                                      # B * Seq_Len * embedding_dim
        
        lstm_out, _ = self.lstm(x)                                     # B * Seq_Len * hidden_dim
        
        lstm_out =  lstm_out[:, -1, :]                                      # B * hidden_dim

        drop_out = self.dropout(lstm_out)                                   # B * hidden_dim

        
        out = self.output(drop_out)                                         # B * output_dim
        
        return out
        

## Initializing The Model

In [76]:
num_layers = 2
vocab_size = 36
embedding_dim = 100
output_dim = 8
hidden_dim = 100
drop_prob = 0.5
lr=0.001
clip = 5
epochs = 5 
WeightedLoss = True
# w = torch.tensor(np.load("../../Dataset/w1.npy").astype(np.float32)).to(device)
w = torch.tensor([0.3, 0.25, 0.7, 0.5, 0.5, 0.25, 0.25, 0.25]).to(device)
model = SequenceModel(vocab_size, embedding_dim,  hidden_dim, output_dim,  num_layers).to(device)

print(model)

SequenceModel(
  (embedding): Embedding(36, 100)
  (lstm): LSTM(100, 100, num_layers=2, batch_first=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (output): Linear(in_features=100, out_features=8, bias=True)
)


## Training the model

In [77]:
# loss and optimization functions
if WeightedLoss:
    criterion = nn.CrossEntropyLoss(weight=w).to(device)
else:
    criterion = nn.CrossEntropyLoss().to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [None]:
epoch_tr_loss, epoch_vl_loss = [], []

for epoch in range(epochs):
    train_losses = []
    for inputs, labels in tqdm(train_loader):
        
        inputs, labels = inputs.to(device), labels.to(device)   
        
        output = model(inputs)
        loss = criterion(output, labels)
        
        optimizer.zero_grad()
        
        loss.backward()
        
        optimizer.step()
        
        train_losses.append(loss.item())

        nn.utils.clip_grad_norm_(model.parameters(), clip)
        
    val_losses = []
    for inputs, labels in valid_loader:
        with torch.no_grad():
            inputs, labels = inputs.to(device), labels.to(device)
            output = model(inputs)
            val_loss = criterion(output, labels)
            val_losses.append(val_loss.item())

    epoch_tr_loss.append(np.mean(train_losses))
    epoch_vl_loss.append(np.mean(val_losses))
    print(f'Epoch {epoch+1}') 
    print(25*'==')
    if epoch == epochs-1 or epoch_vl_loss[-1] > epoch_tr_loss[-1]:
        F1 = f1_score(labels.cpu().numpy(), output.argmax(1).cpu().numpy(), average='macro')
        Report = classification_report(labels.cpu().numpy(), output.argmax(1).cpu().numpy(), output_dict=True)
        print("Done")
        print(classification_report(labels.cpu().numpy(), output.argmax(1).cpu().numpy()))
        print("Macro F1 Score: ", F1)         
        break