## Input Libraries

In [1]:
import fasttext
from hazm import Normalizer, word_tokenize
import numpy as np
import pandas as pd
import re

## Import Dataset

In [2]:
data = pd.read_excel('/home/mahdi/word_embedding_Narenjestan/dataset/narenjestan_orginal.xlsx')

In [3]:
data.drop('Categories', axis=1 , inplace = True)

## Preprocessing

In [5]:
def is_valid_token(token):
    # Persian Unicode range: \u0600 - \u06FF 
    return re.fullmatch(r'[\u0600-\u06FF]+', token) is not None

In [6]:
normalizer = Normalizer()
def preprocess_text(text):
    
    normalized_text = normalizer.normalize(text)
    tokens = word_tokenize(normalized_text)
    return ' '.join(tokens)

In [7]:
normalizer = Normalizer()

def preprocess_persian_sentence(sentence):
    # Normalize the sentence (this handles common orthographic issues)
    normalized_sentence = normalizer.normalize(sentence)
    # Tokenize the sentence into words
    tokens = word_tokenize(normalized_sentence)
    return tokens


In [24]:
# Example
sample_sentence = "چگونه می‌توانم رمز عبور خود را تغییر دهم؟"
sample_sentence = '. لیبلیلل و بیبل و بیلیل منم .سلام.. خوبی. هوا سرده'

tokens = preprocess_persian_sentence(sample_sentence)
print(tokens)

['.', 'لیبلیلل', 'و', 'بیبل', 'و', 'بیلیل', 'منم', '.', 'سلام', '..', 'خوبی', '.', 'هوا', 'سرده']


In [12]:
training_filename = "/home/mahdi/word_embedding_Narenjestan/dataset/preprocessed_dataset_Q.txt"
with open(training_filename, "w", encoding="utf-8") as f:
    for idx, row in data.iterrows():
        # Preprocess question and answer separately
        preprocessed_question = preprocess_text(row["Question"])
        # preprocessed_answer = preprocess_text(row["Answer"])
        # Write each preprocessed sentence as a new line in the file
        f.write(preprocessed_question + "\n")
        # f.write(preprocessed_answer + "\n")

## Using FastText

In [13]:
model = fasttext.train_unsupervised('/home/mahdi/word_embedding_Narenjestan/dataset/preprocessed_dataset.txt', model='skipgram', dim=200, ws=5, epoch=20)
print("FastText model trained.")

Read 0M words
Number of words:  1733
Number of labels: 0
Progress:  84.5% words/sec/thread:  243734 lr:  0.007767 avg.loss:  2.061281 ETA:   0h 0m 0s

FastText model trained.


Progress: 100.0% words/sec/thread:  240436 lr:  0.000000 avg.loss:  2.017205 ETA:   0h 0m 0s


### Save model

In [14]:
model.save_model("/home/mahdi/word_embedding_Narenjestan/model/persian_fasttext_model_v3.bin")

### Load model

In [15]:

model = fasttext.load_model("./model/persian_fasttext_model_v3.bin")

# word_vector = model.get_word_vector("سلام")
# print(word_vector)




## Sentence Embedding

In [20]:
# get sentence embeddings ---> using average
def sentence_embedding(sentence, model):
    normalized = normalizer.normalize(sentence)
    tokens = word_tokenize(normalized)
    # Filter out tokens that are not valid Persian words
    valid_tokens = [token for token in tokens if is_valid_token(token)]
    
    print(len(tokens), len(valid_tokens))
    word_vectors = [model.get_word_vector(token) for token in valid_tokens]
    if not word_vectors:
        return np.zeros(model.get_dimension())
    return np.mean(word_vectors, axis=0)

## Calculate similarity

In [21]:
def cosine_similarity(vec1, vec2):
   
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    if norm1 == 0 or norm2 == 0:
        return 0.0  
    return dot_product / (norm1 * norm2)

## Example and Test the model

In [23]:
sentence1 = data.iloc[0]["Question"]
# sentence2 = data.iloc[1]["Question"]
# sentence2 = 'سلام.. خوبی. هوا سرده'
sentence2 = '. لیبلیلل و بیبل و بیلیل منم .سلام.. خوبی. هوا سرده'
sentence2 = normalizer.normalize(sentence2)
# sentence1 = 'یسبیسب لیبتات ککننتنک ععععغعو نمتم.'
emb1 = sentence_embedding(sentence1, model)
emb2 = sentence_embedding(sentence2, model)
similarity = cosine_similarity(emb1, emb2)

print("Sentence 1:", sentence1)
print("Sentence 2:", sentence2)
print("Cosine Similarity:", similarity)

13 12
14 10
Sentence 1: جهت اصلاح تاریخ اسناد اسکن شده از چه طریق می بایست اقدام گردد؟
Sentence 2: . لیبلیلل و بیبل و بیلیل منم. سلام.. خوبی. هوا سرده
Cosine Similarity: 0.61641866


### Using TF-IDF weights

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

with open(training_filename, "r", encoding="utf-8") as f:
    corpus = f.readlines()

tfidf_vectorizer = TfidfVectorizer(token_pattern=r'[\u0600-\u06FF]+')
tfidf_vectorizer.fit(corpus)

In [26]:
# def sentence_embedding_weighted(sentence, model, vectorizer):
#     normalized = normalizer.normalize(sentence)
#     tokens = word_tokenize(normalized)
#     valid_tokens = [token for token in tokens if is_valid_token(token)]
#     if not valid_tokens:
#         return np.zeros(model.get_dimension())
    
#     # Get TF-IDF weights for the tokens
#     weights = []
#     for token in valid_tokens:
#         try:
#             # Get the weight; if token is unseen, assign a small weight (e.g., 0.0)
#             weight = vectorizer.idf_[vectorizer.vocabulary_[token]]
#         except KeyError:
#             weight = 0.0
#         weights.append(weight)
    
#     # Get word vectors and compute weighted average
#     word_vectors = [model.get_word_vector(token) for token in valid_tokens]
#     weighted_vec = np.average(word_vectors, axis=0, weights=weights)
#     return weighted_vec

In [27]:
def sentence_embedding_weighted(sentence, model, vectorizer):
    normalized = normalizer.normalize(sentence)
    tokens = word_tokenize(normalized)
    valid_tokens = [token for token in tokens if is_valid_token(token)]
    
    if not valid_tokens:
        return np.zeros(model.get_dimension())
    
    weights = []
    for token in valid_tokens:
        
        try:
            weight = vectorizer.idf_[vectorizer.vocabulary_[token]]
        except KeyError:
            weight = 0.00
        weights.append(weight)
    
    # Get the corresponding word vectors
    word_vectors = [model.get_word_vector(token) for token in valid_tokens]
    
    
    if np.sum(weights) == 0:
        return np.mean(word_vectors, axis=0)
    
    weighted_vec = np.average(word_vectors, axis=0, weights=weights)
    return weighted_vec

In [36]:
# Example usage:
sentence1 = data.iloc[0]["Question"]
# sentence2 = 'اصلاح تاریخ اسناد اسکن شده'
# sentence2 = data.iloc[1]["Question"]
# sentence2 = 'ینی برم داخل سایت اقدام کنم'
# sentence2 = ' .من الان نمیدونم چی باید بگم. اینکه درست کارنمیکنه سلام.. خوبی. هوا سرده'
# sentence2 = ' تاخحهح ضصضص ق ثصضصثض ممم '
# sentence2 = '. لیبلیلل و بیبل و بیلیل منم .سلام.. خوبی. هوا سرده'
sentence1 = 'سشیب سیبب صثقخهصثق صثهق صثقصثقخص صحثخقصثق خح'
# sentence1 = 'هوا خیلی سرده'
sentence2 = 'اسناد اسکن شده رو چجوری تاریخشون رو اصلاح کنم'
# sentence2 = 'مدیر بانک ملت کیه'
emb1 = sentence_embedding_weighted(sentence1, model, tfidf_vectorizer)
emb2 = sentence_embedding_weighted(sentence2, model, tfidf_vectorizer)
similarity = cosine_similarity(emb1, emb2)
print("Sentence 1:", sentence1)
print("Sentence 2:", sentence2)
print("Cosine Similarity:", similarity)

Sentence 1: سشیب سیبب صثقخهصثق صثهق صثقصثقخص صحثخقصثق خح
Sentence 2: اسناد اسکن شده رو چجوری تاریخشون رو اصلاح کنم
Cosine Similarity: 0.2728028487784654


In [50]:
print(emb1)

[-0.0742312   0.01529884 -0.08503975 -0.06978415  0.09423897  0.14119788
 -0.06587392 -0.07845861  0.05789316 -0.01308028  0.12882135 -0.13347987
 -0.00179481  0.0058102  -0.12328282 -0.06123976  0.07276879 -0.06208477
 -0.02829298 -0.06030533 -0.04819648 -0.12052343 -0.03484042 -0.05421103
 -0.00143709  0.14497285  0.2067364   0.06911253 -0.07858744  0.01158994
 -0.04882151 -0.17099192 -0.03603374 -0.20018625  0.07949732  0.05324553
  0.01187842  0.03828251  0.14524337  0.02032678  0.28883006  0.17388081
  0.00342117 -0.05048787  0.07155843  0.03800254 -0.00280513 -0.05785795
  0.0173445   0.04581879 -0.12574502 -0.03897532 -0.29394641 -0.20729682
  0.08222646  0.10048467 -0.31940025 -0.15310051  0.21027264  0.06941052
 -0.02333741  0.00050289 -0.0478079   0.19091916 -0.16045903  0.14716901
 -0.23328159  0.01163705  0.19255248 -0.02965643 -0.18234849  0.01993098
 -0.13545732 -0.06003323  0.11391118 -0.00136041  0.06958413  0.0972524
 -0.05816548  0.11131536  0.11828363  0.09341313  0.