## Import Libraries

In [23]:
import fasttext
from hazm import Normalizer, word_tokenize
import numpy as np
import pandas as pd
import re

## Import Dataset

In [8]:
# Step 2: Load the Persian QA dataset.
# The source is an Excel workbook read with pd.read_excel (not a CSV). Later cells
# read its "Question" and "Answer" columns and drop its "Categories" column.
# NOTE(review): the path is absolute and machine-specific — consider a configurable data directory.
data = pd.read_excel('/home/mahdi/word_embedding_Narenjestan/dataset/narenjestan_orginal.xlsx')

In [10]:
# Drop the "Categories" column, which the preprocessing cell below does not read.
# Reassigning the result (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it once the column is already gone no
# longer raises KeyError.
data = data.drop(columns='Categories', errors='ignore')

## Preprocessing

In [24]:
def is_valid_token(token):
    """Return True if ``token`` looks like a Persian/Arabic-script word.

    A token is accepted when every character is either in the Arabic Unicode
    block (U+0600-U+06FF) or is the zero-width non-joiner (ZWNJ, U+200C), and
    at least one character is a letter or digit.

    Compared with a bare block-range match this:
      * keeps words that contain a ZWNJ, which the tokenizer emits as a
        single token;
      * rejects tokens made only of punctuation or marks (question mark,
        comma, ...) that live in the same Unicode block;
      * returns False for non-string input instead of raising TypeError.
    """
    if not isinstance(token, str):
        return False
    # Every character must be Arabic-block or ZWNJ; the empty string fails here.
    if re.fullmatch(r'[\u0600-\u06FF\u200c]+', token) is None:
        return False
    # Require real content: punctuation, diacritics and ZWNJ are not alphanumeric.
    return any(ch.isalnum() for ch in token)


In [12]:
normalizer = Normalizer()


def preprocess_text(text):
    """Normalize ``text`` with hazm, tokenize it, and return the tokens joined by single spaces."""
    return ' '.join(word_tokenize(normalizer.normalize(text)))

In [3]:
normalizer = Normalizer()


def preprocess_persian_sentence(sentence):
    """Return the list of word tokens for ``sentence``.

    The sentence is first passed through the hazm ``Normalizer`` and the
    normalized text is then split with ``word_tokenize``.
    """
    return word_tokenize(normalizer.normalize(sentence))


In [4]:
# Example usage: tokenize one sample question and print the resulting token list.
# In the recorded output the zero-width non-joiner (\u200c) stays inside its token,
# and the trailing question mark becomes a token of its own.
sample_sentence = "چگونه می‌توانم رمز عبور خود را تغییر دهم؟"
tokens = preprocess_persian_sentence(sample_sentence)
print(tokens)

['چگونه', 'می\u200cتوانم', 'رمز', 'عبور', 'خود', 'را', 'تغییر', 'دهم', '؟']


In [15]:
training_filename = "/home/mahdi/word_embedding_Narenjestan/dataset/preprocessed_dataset.txt"

# Build the FastText training corpus: one preprocessed sentence per line, with each
# row contributing its question first and then its answer.
with open(training_filename, "w", encoding="utf-8") as f:
    # Iterate the two columns directly; this yields the same values in the same
    # order as iterrows() without building a Series for every row.
    for question, answer in zip(data["Question"], data["Answer"]):
        for cell in (question, answer):
            # Empty spreadsheet cells are expected to arrive as NaN/None; skip them
            # rather than letting the normalizer fail on a non-string value.
            if pd.isna(cell):
                continue
            # str() guards against cells that were parsed as numbers.
            f.write(preprocess_text(str(cell)) + "\n")

## Using Fasttext

In [16]:
# Train skip-gram FastText embeddings (300 dimensions, window 5, 5 epochs) on the
# preprocessed corpus. The path comes from `training_filename`, set in the cell that
# writes the corpus, so the file that is written and the file that is trained on
# cannot drift apart.
model = fasttext.train_unsupervised(training_filename, model='skipgram', dim=300, ws=5, epoch=5)
print("FastText model trained.")

Read 0M words
Number of words:  1733
Number of labels: 0


FastText model trained.


Progress: 100.0% words/sec/thread:  120421 lr:  0.000000 avg.loss:  2.563948 ETA:   0h 0m 0s


### Save model

# Persist the trained model to disk. The path is relative, so the file lands in
# the kernel's current working directory.
model.save_model("persian_fasttext_model.bin")

## Sentence Embedding

In [17]:
# Sentence embedding as the unweighted average of word vectors
def sentence_embedding(sentence, model):
    """Embed ``sentence`` as the plain mean of its tokens' word vectors.

    The sentence is normalized and tokenized; tokens rejected by
    ``is_valid_token`` are ignored. If no token remains, a zero vector of
    length ``model.get_dimension()`` is returned.
    """
    vectors = []
    for tok in word_tokenize(normalizer.normalize(sentence)):
        if is_valid_token(tok):
            vectors.append(model.get_word_vector(tok))
    if len(vectors) == 0:
        return np.zeros(model.get_dimension())
    return np.mean(vectors, axis=0)

## Calculate similarity

In [18]:
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    If either vector has zero Euclidean norm the similarity is undefined and
    0.0 is returned instead.
    """
    numerator = np.dot(vec1, vec2)
    magnitudes = (np.linalg.norm(vec1), np.linalg.norm(vec2))
    # A zero norm is falsy, so all() fails exactly when either norm equals 0.
    if not all(magnitudes):
        return 0.0
    return numerator / (magnitudes[0] * magnitudes[1])

## Example: Testing the Model

In [22]:
# Sanity check of the unweighted sentence embedding on two hand-written inputs.
# Alternative inputs taken from the dataset:
# sentence1 = data.iloc[0]["Question"]
# sentence2 = data.iloc[1]["Question"]
sentence2 = 'سلام.. خوبی. هوا سرده'
# NOTE(review): sentence1 looks like random keystrokes rather than real Persian,
# yet the recorded output reports a similarity of about 0.97 — worth investigating
# before relying on these scores.
sentence1 = 'یسبیسب لیبتات ککننتنک ععععغعو نمتم.'
emb1 = sentence_embedding(sentence1, model)
emb2 = sentence_embedding(sentence2, model)
similarity = cosine_similarity(emb1, emb2)

print("Sentence 1:", sentence1)
print("Sentence 2:", sentence2)
print("Cosine Similarity:", similarity)

Sentence 1: یسبیسب لیبتات ککننتنک ععععغعو نمتم.
Sentence 2: سلام.. خوبی. هوا سرده
Cosine Similarity: 0.9711847


### Using IDF weights (from a fitted TF-IDF vectorizer)

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Each line of the preprocessed training file is treated as one document.
with open(training_filename, "r", encoding="utf-8") as f:
    corpus = f.readlines()

# The token pattern includes the zero-width non-joiner (U+200C) alongside the
# Arabic block. The tokenizer keeps ZWNJ inside words (see the 'می\u200cتوانم'
# token in the tokenizer example output), so without it the vectorizer would split
# such words into fragments and skew both the vocabulary and the document frequencies.
tfidf_vectorizer = TfidfVectorizer(token_pattern=r'[\u0600-\u06FF\u200c]+')
tfidf_vectorizer.fit(corpus)

In [29]:
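# NOTE: superseded draft, kept for reference only. The active definition in the
# next cell adds a fallback to a plain mean when all token weights are zero.
# def sentence_embedding_weighted(sentence, model, vectorizer):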
#     normalized = normalizer.normalize(sentence)
#     tokens = word_tokenize(normalized)
#     valid_tokens = [token for token in tokens if is_valid_token(token)]
#     if not valid_tokens:
#         return np.zeros(model.get_dimension())
    
#     # Get TF-IDF weights for the tokens
#     weights = []
#     for token in valid_tokens:
#         try:
#             # Get the weight; if token is unseen, assign a small weight (e.g., 0.0)
#             weight = vectorizer.idf_[vectorizer.vocabulary_[token]]
#         except KeyError:
#             weight = 0.0
#         weights.append(weight)
    
#     # Get word vectors and compute weighted average
#     word_vectors = [model.get_word_vector(token) for token in valid_tokens]
#     weighted_vec = np.average(word_vectors, axis=0, weights=weights)
#     return weighted_vec

In [30]:
def sentence_embedding_weighted(sentence, model, vectorizer):
    """Embed ``sentence`` as an IDF-weighted average of its tokens' word vectors.

    The sentence is normalized and tokenized, and tokens rejected by
    ``is_valid_token`` are dropped. Each remaining token is weighted by the
    entry of ``vectorizer.idf_`` at its ``vectorizer.vocabulary_`` index;
    tokens missing from the vocabulary get weight 0.0. Note that only the IDF
    component is used, not a per-sentence TF-IDF score.

    Returns a zero vector of length ``model.get_dimension()`` when no token
    survives filtering, and the unweighted mean when every weight is zero.
    """
    kept = [tok for tok in word_tokenize(normalizer.normalize(sentence)) if is_valid_token(tok)]
    if len(kept) == 0:
        return np.zeros(model.get_dimension())

    idf = vectorizer.idf_
    vocab = vectorizer.vocabulary_
    weights = []
    for tok in kept:
        position = vocab.get(tok)
        # Tokens the vectorizer never saw contribute nothing to the average.
        weights.append(0.0 if position is None else idf[position])

    vectors = [model.get_word_vector(tok) for tok in kept]

    # A weighted average cannot be normalized by a zero weight sum, so use the plain mean.
    if np.sum(weights) == 0:
        return np.mean(vectors, axis=0)
    return np.average(vectors, axis=0, weights=weights)

In [34]:
# Example usage: repeat the similarity check with the weighted sentence embedding.
# Alternative inputs taken from the dataset:
# sentence1 = data.iloc[0]["Question"]
# sentence2 = data.iloc[1]["Question"]
sentence2 = 'سلام.. خوبی. هوا سرده'
# NOTE(review): sentence1 looks like random keystrokes rather than real Persian,
# yet the recorded output still reports a similarity of about 0.93.
sentence1 = 'یسبیسب لیبتات ککننتنک ععععغعو نمتم.'
emb1 = sentence_embedding_weighted(sentence1, model, tfidf_vectorizer)
emb2 = sentence_embedding_weighted(sentence2, model, tfidf_vectorizer)
similarity = cosine_similarity(emb1, emb2)
print("Sentence 1:", sentence1)
print("Sentence 2:", sentence2)
print("Cosine Similarity:", similarity)

Sentence 1: یسبیسب لیبتات ککننتنک ععععغعو نمتم.
Sentence 2: سلام.. خوبی. هوا سرده
Cosine Similarity: 0.9285773
