In [1]:
# Import necessary libraries
import torch
import torch.nn as nn
import pickle
import re
import sys
import os
sys.path.insert(0, os.path.abspath('../..'))

from preprocessing.preprocess import preprocess_data

In [None]:
limit = 1000
dataset_path='../../dataset'
#  preprocess and save the data
preprocess_data(data_type='train', limit=limit, dataset_path=dataset_path)

In [6]:
# load data
with open(f'{dataset_path}/cleaned_train_data_without_diacritics.txt', 'r', encoding='utf-8') as file:
    training_data = re.compile(r'[\n\r\t\s]').sub('', file.read())
with open(f'{dataset_path}/cleaned_val_data_without_diacritics.txt', 'r', encoding='utf-8') as file:
    validation_data = re.compile(r'[\n\r\t\s]').sub('', file.read())

print(training_data[:100])

قولهأوقطعالأوليدهإلخقالالزركشيابنعرفةقولهبلفظيقتضيهكإنكارغيرحديثبالإسلاموجوبماعلموجوبهمنالدينضرورةكإ


In [11]:
# Tokenize the text into sequences at the character level
unique_chars = set(''.join(training_data + validation_data))

char_to_index = {char: idx for idx, char in enumerate(unique_chars)}
index_to_char = {idx: char for idx, char in enumerate(unique_chars)}

In [8]:
# Parameters
embedding_dim = 100

In [12]:
# Create the embedding layer
embedding = nn.Embedding(len(unique_chars), embedding_dim)
# Get sequences of unique chars
sequences = torch.tensor([idx for idx, _ in index_to_char.items()])
# Apply the embedding layer to get the embedding vectors
embedding_vectors = embedding(sequences)

print(embedding_vectors.shape)
print(embedding_vectors)

torch.Size([36, 100])
tensor([[ 0.0520, -1.0540,  0.3679,  ..., -0.1937,  0.7133, -0.1218],
        [-0.0401, -0.1679, -0.9302,  ...,  0.5472,  0.5104,  0.3971],
        [-1.6366, -0.5670,  0.3022,  ..., -0.3640, -0.8381, -1.5639],
        ...,
        [ 0.8510,  0.3840, -1.5808,  ...,  2.9645,  0.0687, -0.3998],
        [ 1.7538, -0.9311, -0.7679,  ...,  0.5655,  1.7029,  1.7577],
        [-1.5972, -0.4510, -1.2270,  ...,  1.0900,  1.4372,  1.9056]],
       grad_fn=<EmbeddingBackward0>)


In [13]:
# Print character embeddings
for idx, char in index_to_char.items():
    print(f'{char}: {embedding_vectors[idx]}')

ع: tensor([ 0.0520, -1.0540,  0.3679, -0.4531, -0.8351,  0.1102,  0.9444, -1.0055,
         2.2741,  1.3318, -0.2730,  0.2096,  0.1382, -1.5930, -0.4084, -0.1038,
         0.7027, -1.0251, -0.9216,  1.7645,  0.3640,  2.0650, -0.1007,  0.3099,
        -1.3778, -1.7314, -0.1767, -0.6114,  0.1121, -0.5083,  0.0935,  0.3929,
         1.3313,  0.2099,  0.5503, -1.0919, -0.8916,  0.0978,  0.3524, -1.3674,
         0.3764,  0.0353,  0.0049,  1.7620, -1.4374,  1.3112, -1.1892, -0.5316,
         0.3872,  0.3704,  2.5682, -1.2289,  0.0413,  0.6754, -0.0164, -0.0908,
         1.6015, -1.1818,  0.8009,  0.6013,  0.3764, -0.3357, -0.4114, -0.8059,
        -1.2952, -0.5601, -0.8545,  0.1091, -0.3991, -0.4318,  0.8881, -0.3530,
         0.7602,  1.0309, -1.7459, -0.7075, -0.5135, -0.9154,  0.0798, -1.2567,
        -0.5984,  1.3134, -0.7120, -0.7108, -0.4308, -1.9611, -0.9993, -0.7097,
        -0.2716, -1.3274,  0.3083,  2.5460, -2.0643,  0.0582,  0.7822,  0.1307,
         0.3475, -0.1937,  0.7133, -0

In [14]:
# save the embedding vectors
with open(f'{dataset_path}/char_embedding_vectors.pkl', 'wb') as file:
    pickle.dump(embedding_vectors, file)

In [15]:
# test loading the embedding vectors
del embedding_vectors
with open(f'{dataset_path}/char_embedding_vectors.pkl', 'rb') as file:
    embedding_vectors = pickle.load(file)
    
print(embedding_vectors.shape)
print(embedding_vectors)

torch.Size([36, 100])
tensor([[ 0.0520, -1.0540,  0.3679,  ..., -0.1937,  0.7133, -0.1218],
        [-0.0401, -0.1679, -0.9302,  ...,  0.5472,  0.5104,  0.3971],
        [-1.6366, -0.5670,  0.3022,  ..., -0.3640, -0.8381, -1.5639],
        ...,
        [ 0.8510,  0.3840, -1.5808,  ...,  2.9645,  0.0687, -0.3998],
        [ 1.7538, -0.9311, -0.7679,  ...,  0.5655,  1.7029,  1.7577],
        [-1.5972, -0.4510, -1.2270,  ...,  1.0900,  1.4372,  1.9056]],
       requires_grad=True)
