## Import Libraries

In [5]:
import fasttext
from hazm import Normalizer, word_tokenize
import numpy as np
import pandas as pd

## Import Dataset

In [8]:
# Step 2: Load the Persian QA dataset from an Excel workbook (.xlsx).
# Later cells read the "Question" and "Answer" columns and drop the "Categories" column.
# NOTE(review): the path is absolute and machine-specific; adjust it before running elsewhere.
data = pd.read_excel('/home/mahdi/word_embedding_Narenjestan/dataset/narenjestan_orginal.xlsx')  # read with pandas' Excel reader, not as CSV

In [10]:
# Drop the "Categories" column, which the rest of the notebook does not use.
# Rebinding `data` (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the column is already gone no longer raises KeyError.
data = data.drop(columns=['Categories'], errors='ignore')

## Preprocessing

In [12]:
normalizer = Normalizer()


def preprocess_text(text):
    """Normalize and tokenize Persian text, returning the tokens joined by single spaces.

    Args:
        text: Raw text. Missing values (None / NaN, as pandas yields for empty
            spreadsheet cells) produce an empty string; any other non-string
            scalar is converted with ``str`` before processing.

    Returns:
        str: The normalized text rebuilt from its word tokens, separated by
        single spaces.
    """
    if not isinstance(text, str):
        # The normalizer works on strings, so handle missing / numeric cells
        # explicitly instead of letting them fail inside Hazm.
        if pd.isna(text):
            return ""
        text = str(text)
    normalized_text = normalizer.normalize(text)
    tokens = word_tokenize(normalized_text)
    return ' '.join(tokens)

In [3]:
normalizer = Normalizer()


def preprocess_persian_sentence(sentence):
    """Return the list of word tokens for a Persian sentence.

    The sentence is first passed through the Hazm normalizer; the normalized
    text is then split into words with ``word_tokenize``.
    """
    return word_tokenize(normalizer.normalize(sentence))


In [4]:
# Example usage:
# Example usage: tokenize one sample question to sanity-check the preprocessing helper.
sample_sentence = "چگونه می‌توانم رمز عبور خود را تغییر دهم؟"
tokens = preprocess_persian_sentence(sample_sentence)
print(tokens)  # shows the resulting list of word tokens

['چگونه', 'می\u200cتوانم', 'رمز', 'عبور', 'خود', 'را', 'تغییر', 'دهم', '؟']


In [15]:
training_filename = "/home/mahdi/word_embedding_Narenjestan/dataset/preprocessed_dataset.txt"

# Build the training corpus: one preprocessed sentence per line, with every
# question immediately followed by its answer.
with open(training_filename, "w", encoding="utf-8") as corpus_file:
    for _, qa_row in data.iterrows():
        # Preprocess both texts first, then write them out together.
        corpus_lines = [preprocess_text(qa_row[column]) + "\n" for column in ("Question", "Answer")]
        corpus_file.writelines(corpus_lines)

## Using Fasttext

In [16]:
# Train unsupervised skip-gram embeddings on the preprocessed corpus
# (300-dimensional vectors, context window of 5, 5 epochs).
model = fasttext.train_unsupervised(
    '/home/mahdi/word_embedding_Narenjestan/dataset/preprocessed_dataset.txt',
    model='skipgram',
    dim=300,
    ws=5,
    epoch=5,
)
print("FastText model trained.")

Read 0M words
Number of words:  1733
Number of labels: 0


FastText model trained.


Progress: 100.0% words/sec/thread:  120421 lr:  0.000000 avg.loss:  2.563948 ETA:   0h 0m 0s


### Save model

# Persist the trained model; the file name is relative, so it is written to the current working directory.
model.save_model("persian_fasttext_model.bin")

## Sentence Embedding

In [17]:
# get sentence embeddings ---> using average
def sentence_embedding(sentence, model):
    """Embed a sentence as the element-wise mean of its FastText word vectors.

    Args:
        sentence: Raw sentence text; it is normalized and tokenized with Hazm
            using the module-level ``normalizer``.
        model: Trained FastText model providing ``get_word_vector`` and
            ``get_dimension``.

    Returns:
        numpy.ndarray: 1-D vector of length ``model.get_dimension()``. When the
        sentence yields no non-blank tokens, a float32 zero vector is returned.
    """
    # Preprocess the sentence using Hazm
    tokens = word_tokenize(normalizer.normalize(sentence))
    # Whitespace-only tokens carry no content, so they are left out of the average.
    word_vectors = [model.get_word_vector(token) for token in tokens if token.strip()]
    if not word_vectors:
        # Use float32 so the fallback has the same dtype as averaged FastText
        # vectors (np.zeros would otherwise default to float64).
        return np.zeros(model.get_dimension(), dtype=np.float32)
    return np.mean(word_vectors, axis=0)

## Calculate similarity

In [18]:
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between ``vec1`` and ``vec2``.

    If either vector has zero Euclidean length the similarity is undefined,
    and ``0.0`` is returned instead of dividing by zero.
    """
    numerator = np.dot(vec1, vec2)
    magnitudes = [np.linalg.norm(vector) for vector in (vec1, vec2)]
    if any(magnitude == 0 for magnitude in magnitudes):
        return 0.0
    return numerator / (magnitudes[0] * magnitudes[1])

## Example and Test the model

In [22]:
# Compare two hand-written inputs instead of dataset rows.
# NOTE(review): sentence1 appears to be random keystrokes, yet the recorded output of this
# cell reports a similarity of about 0.97 with a real sentence — the averaged embeddings
# may not be discriminative on this corpus; worth investigating.
sentence1 = 'یسبیسب لیبتات ککننتنک ععععغعو نمتم.'
sentence2 = 'سلام.. خوبی. هوا سرده'
emb1, emb2 = (sentence_embedding(text, model) for text in (sentence1, sentence2))
similarity = cosine_similarity(emb1, emb2)

for label, value in (
    ("Sentence 1:", sentence1),
    ("Sentence 2:", sentence2),
    ("Cosine Similarity:", similarity),
):
    print(label, value)

Sentence 1: یسبیسب لیبتات ککننتنک ععععغعو نمتم.
Sentence 2: سلام.. خوبی. هوا سرده
Cosine Similarity: 0.9711847
