In [1]:
from util import *

In [2]:
EXTRACT_FEATRURES = False

In [3]:
train_dir = 'Dataset/train_clean.txt'
val_dir = 'Dataset/val_clean.txt'
test_dir = 'Dataset/test_clean.txt'

In [4]:
with open(train_dir, 'r', encoding='utf8') as f:
    train = f.read()
with open(val_dir, 'r', encoding='utf8') as f:
    val = f.read()
with open(test_dir, 'r', encoding='utf8') as f:
    test = f.read()

In [5]:
max_len = 500

In [6]:
X_train_dir = 'pkl_dir/X_train.txt'
y_train_dir = 'pkl_dir/y_train.txt'
X_val_dir = 'pkl_dir/X_val.txt'
y_val_dir = 'pkl_dir/y_val.txt'
X_test_dir = 'pkl_dir/X_test.txt'
y_test_dir = 'pkl_dir/y_test.txt'

if EXTRACT_FEATRURES:
    X_train, y_train = extract_features(train, max_len)
    X_val, y_val = extract_features(val, max_len)
    X_test, y_test = extract_features(test, max_len)

    with open(X_train_dir, 'wb') as f:
        pickle.dump(X_train, f)
    with open(y_train_dir, 'wb') as f:
        pickle.dump(y_train, f)
    with open(X_val_dir, 'wb') as f:
        pickle.dump(X_val, f)
    with open(y_val_dir, 'wb') as f:
        pickle.dump(y_val, f)
    with open(X_test_dir, 'wb') as f:
        pickle.dump(X_test, f)
    with open(y_test_dir, 'wb') as f:
        pickle.dump(y_test, f)

else:
    with open(X_train_dir, 'rb') as f:
        X_train = pickle.load(f)
    with open(y_train_dir, 'rb') as f:
        y_train = pickle.load(f)
    with open(X_val_dir, 'rb') as f:
        X_val = pickle.load(f)
    with open(y_val_dir, 'rb') as f:
        y_val = pickle.load(f)
    with open(X_test_dir, 'rb') as f:
        X_test = pickle.load(f)
    with open(y_test_dir, 'rb') as f:
        y_test = pickle.load(f)

In [7]:
j = 0
i = 20
print(chr(X_train[j][i]))
print(id2diacritic[y_train[j][i]])

و
َّ


In [8]:
del train, val, test

In [9]:
test_len = 10000
np.random.seed(42)
indices = np.arange(test_len)
np.random.shuffle(indices)
X_test = X_test[indices]
y_test = y_test[indices]

In [10]:
sentence_encoder = LabelEncoder().fit(X_train.flatten())
X_train = sentence_encoder.transform(X_train.flatten()).reshape(X_train.shape).astype(np.int16)
X_val = sentence_encoder.transform(X_val.flatten()).reshape(X_val.shape).astype(np.int16)
X_test = sentence_encoder.transform(X_test.flatten()).reshape(X_test.shape).astype(np.int16)

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self):
        super(Model, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=np.unique(X_train).shape[0], embedding_dim=25)

        # Add Conv1d layer
        self.conv1d = nn.Conv1d(in_channels=25, out_channels=256, kernel_size=11, padding=5)
        self.bn_conv = nn.BatchNorm1d(max_len)

        self.lstm1 = nn.LSTM(input_size=256, hidden_size=256, batch_first=True, bidirectional=True)
        self.bn1 = nn.BatchNorm1d(max_len)
        self.dropout1 = nn.Dropout(p=0.5)

        self.lstm2 = nn.LSTM(input_size=512, hidden_size=256, batch_first=True, bidirectional=True)
        self.bn2 = nn.BatchNorm1d(max_len)
        self.dropout2 = nn.Dropout(p=0.5)

        self.dense1 = nn.Linear(in_features=512, out_features=512)
        self.bn3 = nn.BatchNorm1d(max_len)

        self.dense2 = nn.Linear(in_features=512, out_features=512)
        self.bn4 = nn.BatchNorm1d(max_len)
        
        self.dense3 = nn.Linear(in_features=512, out_features=np.unique(y_train).shape[0])

    def forward(self, x):
        x = self.embedding(x)

        # Apply Conv1d layer
        x = x.permute(0, 2, 1)  # Change to (batch_size, channels, seq_len)
        x = F.relu(self.conv1d(x))
        x = x.permute(0, 2, 1)  # Change back to (batch_size, seq_len, channels)
        x = self.bn_conv(x)

        x, _ = self.lstm1(x)
        x = self.bn1(x)
        x = self.dropout1(x)

        x, _ = self.lstm2(x)
        x = self.bn2(x)
        x = self.dropout2(x)

        x = F.relu(self.bn3(self.dense1(x)))
        x = F.relu(self.bn4(self.dense2(x)))

        x = self.dense3(x)
        return x

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = Model().to(device)
model = torch.load('model_conv.ckpt')
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.9, 0.999), eps=1e-7)


In [17]:
batch_size = 200

In [18]:
def batch_eval(x_np, y_np):
    X_tensor = torch.tensor(x_np, dtype=torch.int32)
    y_tensor = torch.tensor(y_np, dtype=torch.int32)
    model.eval()
    predictions = torch.zeros_like(y_tensor)

    with torch.no_grad():
        for i in range(0, len(X_tensor), batch_size):
            inputs = X_tensor[i:i+batch_size].to(device)
            outputs = model(inputs)
            _, predictions[i:i+batch_size] = torch.max(outputs.data, 2)
            del inputs, outputs

    return predictions.cpu().numpy()

In [19]:
def compute_DER(X, y):
    predictions = batch_eval(X, y)
    ignore = {'!', '«', ']', '[', '}', ':', '"', '-', '»', '؛', ')', '،', '؟', '(', '{', '/', ' ', PAD, SOS, EOS}
    cnt = 0
    for itm in ignore:
        cnt += np.sum(X == sentence_encoder.transform(np.array([ord(itm)]))[0])
    accuracy = (np.sum(predictions == y) - cnt) / (predictions.shape[0] * predictions.shape[1] - cnt)
    return 1 - accuracy

In [21]:
num_epochs = 30
for epoch in range(num_epochs):
    model.train()

    X_train_tensor = torch.tensor(X_train, dtype=torch.int64).to(device)
    y_train_tensor = torch.tensor(y_train, dtype=torch.int64).to(device)
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    
    accuracy = 0
    loss_cum_sum = 0
    len_ = 0
    for i, (x, y) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = model(x)
        loss = criterion(outputs.view(-1, np.unique(y_train).shape[0]), y.view(-1))
        loss.backward()
        optimizer.step()
        _, predicted = torch.max(outputs.data, 2)
        accuracy += (predicted == y).sum().item()
        loss_cum_sum += loss.item()
        len_ += y.size(0) * y.size(1)
        if (i + 1) % 1 == 0:
            print('Step [{}/{}], Epoch [{}/{}], Loss: {:.4f}, Accuracy: {:.4f}'.format(i + 1, len(train_loader), epoch + 1, num_epochs, loss_cum_sum / (i + 1), accuracy / len_), end='\r')
    print()
    del X_train_tensor, y_train_tensor, train_dataset, train_loader
    print('DER: {:.5f}'.format(compute_DER(X_val, y_val)))

Step [3/200], Epoch [1/30], Loss: 0.0790, Accuracy: 0.9732

KeyboardInterrupt: 

In [22]:
torch.save(model, 'model_conv.ckpt')

In [None]:
print('DER: {:.5f}'.format(compute_DER(X_test, y_test)))

DER: 0.02614


In [None]:
def diacritize_string(sentence_test_str, model, sentence_encoder, max_len):
    sentence = SOS + sentence_test_str + EOS
    sentence_no_diacritics, labels = extract_data_single(sentence)
    sentence_no_diac_clamped, labels_clamped = clamp_sentence(sentence_no_diacritics, labels, max_len)
    sentence, labels_encoded = encode_sentences(sentence_no_diac_clamped, labels_clamped)
    sentence = sentence_encoder.transform(sentence.reshape(-1)).reshape(1, -1)
    
    sentence = torch.tensor(sentence, dtype=torch.int32).to(device)
    outputs = model(sentence)
    _, pred = torch.max(outputs.data, 2)
    pred = pred.cpu().numpy().reshape(-1)

    sentence = ''
    for i in range(len(sentence_no_diacritics)):
        sentence += sentence_no_diacritics[i]
        sentence += id2diacritic[pred[i]]
            
    return sentence

In [None]:
sentence_test_str = 'كَانَ مَعَهُ أَلْفٌ فَقَالَ هِيَ مُضَارَبَةٌ لِفُلَانٍ بِالنِّصْفِ وَقَدْ رَبِحَ أَلْفًا فَقَالَ فُلَانٌ هِيَ بِضَاعَةٌ فَالْقَوْلُ قَوْلُ رَبِّ الْمَالِ'
print(sentence_test_str)
print(diacritize_string(sentence_test_str, model, sentence_encoder, max_len)[1:-1])

كَانَ مَعَهُ أَلْفٌ فَقَالَ هِيَ مُضَارَبَةٌ لِفُلَانٍ بِالنِّصْفِ وَقَدْ رَبِحَ أَلْفًا فَقَالَ فُلَانٌ هِيَ بِضَاعَةٌ فَالْقَوْلُ قَوْلُ رَبِّ الْمَالِ
كَانَ مَعَهُ أَلْفٌ فَقَالَ هِيَ مُضَارَبَةٌ لِفُلَانٍ بِالنِّصْفِ وَقَدْ رَبِحَ أَلْفًا فَقَالَ فُلَانٌ هِيَ بِضَاعَةٌ فَالْقَوْلُ قَوْلُ رَبِّ الْمَالِ
