In [None]:
# install a library to work with docx files
# pip install python-docx

In [29]:
# Import Python libraries and helper functions (in utils) 
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
from collections import Counter
from utils import getText, softmax, relu, get_batches, compute_pca, get_dict, cosine_similarity, euclidean_distance
import re #  Load the Regex-modul


nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mohammad\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Download sentence tokenizer
nltk.data.path.append('.')

In [4]:
# get arabic stopwords
arb_stopwords = set(nltk.corpus.stopwords.words("arabic"))

In [55]:
path = './data/almizan-2.docx'
data = getText(path)

In [54]:
# tokenize and process the data
def tokenize(corpus):
    #  Punktuations are replaced by ' '
    data = re.sub(r'[=*":(),!؟;-]', ' ', corpus)
    #  Tokenize string to words
    data = ' '.join(data.split('،'))
    data = nltk.word_tokenize(data) 
    for i, word in enumerate(data):
        data[i] = re.sub('[ َ ُ ِ ّةء ً ٌ ٍ ْ]', '', word)

    #  drop non-alphabetical tokens
    data = [ ch for ch in data if not(ch.isnumeric()) or ch.isalpha()]  
    data = [ ch for ch in data if ch not in arb_stopwords]
    return data


In [56]:
data = tokenize(data)
print("Number of tokens:", len(data),'\n', data[:30]) #  print data sample

Number of tokens: 94982 
 ['بسم', 'الله', 'الرحمن', 'الرحيم', 'پايگاه', 'قرآن', 'شناسي', 'حوزه', 'علميه', 'ميبد', 'تفسير', 'الميزان', 'السيد', 'الطباطبائي', 'الجز', 'الثاني', 'سور', 'البقر', 'يأيها', 'امنوا', 'كتب', 'عليكم', 'الصيام', 'كتب', 'قبلكم', 'لعلكم', 'تتقون', 'أياما', 'معدودت', 'منكم']


In [57]:
# Compute the frequency distribution of the words in the dataset (vocabulary)
fdist = nltk.FreqDist(word for word in data)
print("Size of vocabulary: ", len(fdist))
print("Most frequent tokens: ", fdist.most_common(20)) # print the 20 most frequent words and their freq.

Size of vocabulary:  19308
Most frequent tokens:  [('الله', 2444), ('.', 2101), ('تعالى', 1806), ('قوله', 1292), ('قال', 1060), ('الآي', 652), ('السلام', 458), ('الإنسان', 338), ('الناس', 314), ('سبحانه', 280), ('أنه', 267), ('صلى', 257), ('وآله', 249), ('وسلم', 249), ('بن', 239), ('أمر', 230), ('معنى', 224), ('إليه', 218), ('الكلام', 216), ('القرآن', 215)]


In [9]:
# get_dict creates two dictionaries, converting words to indices and viceversa.
word2Ind, Ind2word = get_dict(data)
V = len(word2Ind)
print("Size of vocabulary: ", V)

Size of vocabulary:  21364


In [10]:
# example of word to index mapping
# replace هذا with 'this' because of nice format otherwise we have an ugly format
print(f"Index of the word 'way' :  {word2Ind['صراط']}")
print(f"Word which has index 12578:  {Ind2word[12578]}")

Index of the word 'way' :  12578
Word which has index 12578:  صراط


# Training the Model

In [11]:
def initialize_model(N,V, random_seed=1):
    '''
    Inputs: 
        N:  dimension of hidden vector 
        V:  dimension of vocabulary
        random_seed: random seed for consistent results in the unit tests
     Outputs: 
        W1, W2, b1, b2: initialized weights and biases
    '''
    
    np.random.seed(random_seed)
    # W1 has shape (N,V)
    W1 = np.random.rand(N,V)
    
    # W2 has shape (V,N)
    W2 = np.random.rand(V, N)
    
    # b1 has shape (N,1)
    b1 = np.random.rand(N,1)
    
    # b2 has shape (V,1)
    b2 = np.random.rand(V,1)
    
    return W1, W2, b1, b2

In [12]:
# Test initialize_model function.
tmp_N = 4
tmp_V = 10
tmp_W1, tmp_W2, tmp_b1, tmp_b2 = initialize_model(tmp_N,tmp_V)
assert tmp_W1.shape == ((tmp_N,tmp_V))
assert tmp_W2.shape == ((tmp_V,tmp_N))
print(f"tmp_W1.shape: {tmp_W1.shape}")
print(f"tmp_W2.shape: {tmp_W2.shape}")
print(f"tmp_b1.shape: {tmp_b1.shape}")
print(f"tmp_b2.shape: {tmp_b2.shape}")

tmp_W1.shape: (4, 10)
tmp_W2.shape: (10, 4)
tmp_b1.shape: (4, 1)
tmp_b2.shape: (10, 1)


In [13]:
def forward_prop(x, W1, W2, b1, b2):
    '''
    Inputs: 
        x:  average one hot vector for the context 
        W1, W2, b1, b2:  matrices and biases to be learned
     Outputs: 
        z:  output score vector
    '''
    
    # Calculate h
    h = np.dot(W1, x) + b1
  
    # Apply the relu on h, 
    # store the relu in h
    h = relu(h)

    # Calculate z
    z = np.dot(W2, h) + b2

    return z, h

In [14]:
# Test the forward_prop function

# Create some inputs
tmp_N = 2
tmp_V = 3
tmp_x = np.array([[0,1,0]]).T

tmp_W1, tmp_W2, tmp_b1, tmp_b2 = initialize_model(N=tmp_N,V=tmp_V, random_seed=1)

print(f"x has shape {tmp_x.shape}")
print(f"N is {tmp_N} and vocabulary size V is {tmp_V}")

# call function
tmp_z, tmp_h = forward_prop(tmp_x, tmp_W1, tmp_W2, tmp_b1, tmp_b2)
print("call forward_prop")
print()
# Look at output
print(f"z has shape {tmp_z.shape}")
print("z has values:")
print(tmp_z)

print()

print(f"h has shape {tmp_h.shape}")
print("h has values:")
print(tmp_h)

x has shape (3, 1)
N is 2 and vocabulary size V is 3
call forward_prop

z has shape (3, 1)
z has values:
[[0.55379268]
 [1.58960774]
 [1.50722933]]

h has shape (2, 1)
h has values:
[[0.92477674]
 [1.02487333]]


In [15]:
# compute_cost: cross-entropy cost function
def compute_cost(y, yhat, batch_size):

    # cost function 
    logprobs = np.multiply(np.log(yhat),y)
    cost = -1/batch_size * np.sum(logprobs)
    cost = np.squeeze(cost)
    return cost

In [16]:
# Test the compute_cost function
tmp_C = 2
tmp_N = 50
tmp_batch_size = 4
tmp_word2Ind, tmp_Ind2word = get_dict(data)
tmp_V = len(word2Ind)

tmp_x, tmp_y = next(get_batches(data, tmp_word2Ind, tmp_V,tmp_C, tmp_batch_size))
        
print(f"tmp_x.shape {tmp_x.shape}")
print(f"tmp_y.shape {tmp_y.shape}")

tmp_W1, tmp_W2, tmp_b1, tmp_b2 = initialize_model(tmp_N,tmp_V)

print(f"tmp_W1.shape {tmp_W1.shape}")
print(f"tmp_W2.shape {tmp_W2.shape}")
print(f"tmp_b1.shape {tmp_b1.shape}")
print(f"tmp_b2.shape {tmp_b2.shape}")

tmp_z, tmp_h = forward_prop(tmp_x, tmp_W1, tmp_W2, tmp_b1, tmp_b2)
print(f"tmp_z.shape: {tmp_z.shape}")
print(f"tmp_h.shape: {tmp_h.shape}")

tmp_yhat = softmax(tmp_z)
print(f"tmp_yhat.shape: {tmp_yhat.shape}")

tmp_cost = compute_cost(tmp_y, tmp_yhat, tmp_batch_size)
print("call compute_cost")
print(f"tmp_cost {tmp_cost:.4f}")

tmp_x.shape (21364, 4)
tmp_y.shape (21364, 4)
tmp_W1.shape (50, 21364)
tmp_W2.shape (21364, 50)
tmp_b1.shape (50, 1)
tmp_b2.shape (21364, 1)
tmp_z.shape: (21364, 4)
tmp_h.shape: (50, 4)
tmp_yhat.shape: (21364, 4)
call compute_cost
tmp_cost 13.0368


In [17]:
def back_prop(x, yhat, y, h, W1, W2, b1, b2, batch_size):
    '''
    Inputs: 
        x:  average one hot vector for the context 
        yhat: prediction (estimate of y)
        y:  target vector
        h:  hidden vector (see eq. 1)
        W1, W2, b1, b2:  matrices and biases  
        batch_size: batch size 
     Outputs: 
        grad_W1, grad_W2, grad_b1, grad_b2:  gradients of matrices and biases   
    '''
    # Compute l1 as W2^T (Yhat - Y)
    l1 = np.dot(W2.T, yhat - y)

    # Apply relu to l1
    l1 = relu(l1)

    # compute the gradient for W1
    grad_W1 = (1/batch_size) * np.dot(l1, x.T)

    # Compute gradient of W2
    grad_W2 = (1/batch_size) * np.dot(yhat - y, h.T)
    
    # compute gradient for b1
    grad_b1 = (1/batch_size) * np.sum(l1, axis=1, keepdims=True)

    # compute gradient for b2
    grad_b2 = (1/batch_size) * np.sum((yhat - y), axis=1, keepdims=True)
    
    return grad_W1, grad_W2, grad_b1, grad_b2

In [18]:
# Test the back_prop function
tmp_C = 2
tmp_N = 50
tmp_batch_size = 4
tmp_word2Ind, tmp_Ind2word = get_dict(data)
tmp_V = len(word2Ind)


# get a batch of data
tmp_x, tmp_y = next(get_batches(data, tmp_word2Ind, tmp_V,tmp_C, tmp_batch_size))

print("get a batch of data")
print(f"tmp_x.shape {tmp_x.shape}")
print(f"tmp_y.shape {tmp_y.shape}")

print()
print("Initialize weights and biases")
tmp_W1, tmp_W2, tmp_b1, tmp_b2 = initialize_model(tmp_N,tmp_V)

print(f"tmp_W1.shape {tmp_W1.shape}")
print(f"tmp_W2.shape {tmp_W2.shape}")
print(f"tmp_b1.shape {tmp_b1.shape}")
print(f"tmp_b2.shape {tmp_b2.shape}")

print()
print("Forwad prop to get z and h")
tmp_z, tmp_h = forward_prop(tmp_x, tmp_W1, tmp_W2, tmp_b1, tmp_b2)
print(f"tmp_z.shape: {tmp_z.shape}")
print(f"tmp_h.shape: {tmp_h.shape}")

print()
print("Get yhat by calling softmax")
tmp_yhat = softmax(tmp_z)
print(f"tmp_yhat.shape: {tmp_yhat.shape}")

tmp_m = (2*tmp_C)
tmp_grad_W1, tmp_grad_W2, tmp_grad_b1, tmp_grad_b2 = back_prop(tmp_x, tmp_yhat, tmp_y, tmp_h, tmp_W1, tmp_W2, tmp_b1, tmp_b2, tmp_batch_size)

print()
print("call back_prop")
print(f"tmp_grad_W1.shape {tmp_grad_W1.shape}")
print(f"tmp_grad_W2.shape {tmp_grad_W2.shape}")
print(f"tmp_grad_b1.shape {tmp_grad_b1.shape}")
print(f"tmp_grad_b2.shape {tmp_grad_b2.shape}")

get a batch of data
tmp_x.shape (21364, 4)
tmp_y.shape (21364, 4)

Initialize weights and biases
tmp_W1.shape (50, 21364)
tmp_W2.shape (21364, 50)
tmp_b1.shape (50, 1)
tmp_b2.shape (21364, 1)

Forwad prop to get z and h
tmp_z.shape: (21364, 4)
tmp_h.shape: (50, 4)

Get yhat by calling softmax
tmp_yhat.shape: (21364, 4)

call back_prop
tmp_grad_W1.shape (50, 21364)
tmp_grad_W2.shape (21364, 50)
tmp_grad_b1.shape (50, 1)
tmp_grad_b2.shape (21364, 1)


In [66]:
def gradient_descent(data, word2Ind, N, V, num_iters, alpha=0.01, 
                     random_seed=282, initialize_model=initialize_model, 
                     get_batches=get_batches, forward_prop=forward_prop, 
                     softmax=softmax, compute_cost=compute_cost, 
                     back_prop=back_prop):
    
    '''
    This is the gradient_descent function
    
      Inputs: 
        data:      text
        word2Ind:  words to Indices
        N:         dimension of hidden vector  
        V:         dimension of vocabulary 
        num_iters: number of iterations  
        random_seed: random seed to initialize the model's matrices and vectors
        initialize_model: implementation of the function to initialize the model
        get_batches: function to get the data in batches
        forward_prop: implementation of the function to perform forward propagation
        softmax: implementation of the softmax function
        compute_cost: cost function (Cross entropy)
        back_prop: implementation of the function to perform backward propagation
     Outputs: 
        W1, W2, b1, b2:  updated matrices and biases after num_iters iterations

    '''
    W1, W2, b1, b2 = initialize_model(N,V, random_seed=random_seed) #W1=(N,V) and W2=(V,N)

#     batch_size = 512
    batch_size = 256
    iters = 0
    C = 10
    for x, y in get_batches(data, word2Ind, V, C, batch_size):
        # get z and h
        z, h = forward_prop(x, W1, W2, b1, b2)
                
        # get yhat
        yhat = softmax(z)
        
        # get cost
        cost = compute_cost(y, yhat, batch_size)
        if ( (iters+1) % 10 == 0):
            print(f"iters: {iters + 1} cost: {cost:.6f}")
            
        # get gradients
        grad_W1, grad_W2, grad_b1, grad_b2 = back_prop(x, yhat, y, h, W1, W2, b1, b2, batch_size)
        
        # update weights and biases
        W1 = W1 - alpha * grad_W1
        W2 = W2 - alpha * grad_W2
        b1 = b1 - alpha * grad_b1
        b2 = b2 - alpha * grad_b2

        iters +=1 
        if iters == num_iters: 
            break
        if iters % 100 == 0:
            if cost < 8.0:
                alpha *= 0.4
            else:
                alpha *= 0.66
            
    return W1, W2, b1, b2

In [65]:
# test gradient_descent function
data = getText(path)
data = tokenize(data)
C = 2
N = 300
word2Ind, Ind2word = get_dict(data)
V = len(word2Ind)
num_iters = 1000
print("Call gradient_descent")
W1, W2, b1, b2 = gradient_descent(data, word2Ind, N, V, num_iters)

Call gradient_descent
iters: 10 cost: 24.037803
iters: 20 cost: 15.549697
iters: 30 cost: 26.401212
iters: 40 cost: 21.733962
iters: 50 cost: 21.932020
iters: 60 cost: 12.955119
iters: 70 cost: 9.094137
iters: 80 cost: 16.779770
iters: 90 cost: 14.748732
iters: 100 cost: 13.836443
iters: 110 cost: 16.422891
iters: 120 cost: 7.378118
iters: 130 cost: 5.993194
iters: 140 cost: 5.649824
iters: 150 cost: 14.666813
iters: 160 cost: 8.195413
iters: 170 cost: 20.653989
iters: 180 cost: 19.179364
iters: 190 cost: 10.840543
iters: 200 cost: 16.612573
iters: 210 cost: 19.501566
iters: 220 cost: 10.001060
iters: 230 cost: 15.930420
iters: 240 cost: 4.261406
iters: 250 cost: 11.207951
iters: 260 cost: 19.882439
iters: 270 cost: 15.615543
iters: 280 cost: 9.894656
iters: 290 cost: 13.299403
iters: 300 cost: 14.486851
iters: 310 cost: 8.284487
iters: 320 cost: 12.977202
iters: 330 cost: 14.620620
iters: 340 cost: 7.088863
iters: 350 cost: 5.461401
iters: 360 cost: 7.402425
iters: 370 cost: 9.679696


In [33]:
# To extract word vectors we have three approaches
# First approach: to assume W1 as word vectors 
# embs = W1.T

# Second approach: to assume W2 as word vectors 
# embs = W2

# Third approach: or to assume the mean of W1+W2 as word vectors 
embs = (W1.T + W2)/2.0

word_embeddings = {}
for i, vec in enumerate(embs):
    word_embeddings[Ind2word[i]] = vec

# Visualizing the word vectors

In [None]:
# download 2 libraries for dispalying arabic text on a plot properly
# pip install python-bidi

In [None]:
# pip install --upgrade arabic-reshaper

In [None]:
# visualizing the word vectors here
from bidi.algorithm import get_display
import matplotlib.pyplot as plt
import arabic_reshaper

%config InlineBackend.figure_format = 'svg'
words = ['كُتِب', 'صراط','الآية','المستقيم', 'الْعُسرَ','الكلام','واحدة',
         'التسهيل','التخفيف','معدودة']
 
# given a list of words and the embeddings, it returns a matrix with all the embeddings
idx = [word2Ind[word] for word in words]
X = embs[idx, :]
print(X.shape, idx)  # X.shape:  Number of words of dimension N each 

In [None]:
result= compute_pca(X, 4)
plt.scatter(result[:, 1], result[:, 3])
for i, word in enumerate(words):
    reshaped_text = arabic_reshaper.reshape(word)
    artext = get_display(reshaped_text)
    plt.annotate(artext, xy=(result[i, 1], result[i, 3]))
plt.show()

# Evaluation of word vectors

In [None]:
# get vectors
v = word_embeddings['صراط']
w = word_embeddings['مستقيم']

from numpy import linalg

# Calculate Euclidean distance d
d = linalg.norm(v-w)
print("The Euclidean distance between v and w is: ", d)

# Calculate Cosine similarity c
c = np.dot(v,w) / (linalg.norm(v)*linalg.norm(w))
print("The Cosine similarity of v and w is: ", c)

In [None]:
# let's evaluate some word vectors
words = ['كُتِب', 'سياق','المستقيم','صراط', 'الْعُسرَ','الكلام','واحدة',
         'التسهيل','التخفيف','معدودة']
for i in range(len(words)-1):
    v = word_embeddings[words[i]]
    w = word_embeddings[words[i+1]]
    
    # Calculate Euclidean distance d
    d = linalg.norm(v-w)
    print(f"The Euclidean distance between {words[i]} and {words[i+1]} is: {d}")

    # Calculate Cosine similarity c
    c = np.dot(v,w) / (linalg.norm(v)*linalg.norm(w))
    print(f"The Cosine similarity of {words[i]} and {words[i+1]} is: {c}")
    print()

In [None]:
# Create a dataframe out of the dictionary embedding.
keys = word_embeddings.keys()
df = []
for key in keys:
    df.append(word_embeddings[key])
    
embedding = pd.DataFrame(data=df, index=keys)

In [None]:
embedding.head(10)

In [None]:
# Define a function to find K closest words to a vector:
def find_closest_word(word, k, embeddings=word_embeddings):
    
    most_closest_words = []
    word_emb = embeddings[word]
    similar_word = ''
    
    for w in embeddings.keys():
        if word != w:
            # get the word embedding
            w_emb = embeddings[w]
            # calculating cosine similarity
            cur_similarity = cosine_similarity(word_emb, w_emb)
            # store the similar_word as a tuple, which contains the word and the similarity
            similar_word = (w, cur_similarity)
            # append each tuple to list
            most_closest_words.append(similar_word)
    # sort based on more similarity
    most_closest_words.sort(key=lambda y: -y[1])
    return most_closest_words[:k]
    

In [None]:
find_closest_word('موهبة',20)

In [None]:
find_closest_word('موهبة',20)

In [None]:
find_closest_word('صراط',20)

In [None]:
find_closest_word('صراط',20)

In [41]:
# pip install pickle-mixin

In [34]:
import pickle # to store word embeddings result

In [35]:
data = {'embeddings': word_embeddings}
with open('embeddings.pkl', 'wb') as file:
    pickle.dump(data, file)

In [36]:
# with open('embeddings.pkl', 'rb') as file:
#     saved_data = pickle.load(file)

# new_embeddings = saved_data['embeddings']