## Import Libraries

In [5]:
import fasttext
from hazm import Normalizer, word_tokenize
import numpy as np
import pandas as pd

## Import Dataset

In [8]:
# Step 2: Load the Persian QA dataset.
# The source is an Excel workbook read with pd.read_excel, so no text-encoding
# option is involved. Later cells read its "Question" and "Answer" columns and
# drop its "Categories" column.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
dataset_path = '/home/mahdi/word_embedding_Narenjestan/dataset/narenjestan_orginal.xlsx'
data = pd.read_excel(dataset_path)

# Fail early, with a readable message, if the text columns used later are absent.
missing_columns = {"Question", "Answer"} - set(data.columns)
if missing_columns:
    raise ValueError(f"Dataset is missing required column(s): {sorted(missing_columns)}")

In [10]:
# Drop the "Categories" column, which the following cells do not read.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second run finds the column already gone and is a
# no-op rather than raising KeyError.
data = data.drop(columns=['Categories'], errors='ignore')

## Preprocessing

In [12]:
normalizer = Normalizer()


def preprocess_text(text):
    """Normalize and tokenize text, returning it as one space-joined line.

    Args:
        text: Text to clean. Missing values (None / NaN, e.g. an empty
            spreadsheet cell) yield an empty string; any other non-string
            value is converted with ``str`` before processing.

    Returns:
        str: The tokens of the normalized text joined by single spaces.
    """
    if not isinstance(text, str):
        # pandas represents empty cells as NaN (a float) and may load
        # numeric-looking cells as numbers; the normalizer works on str.
        if pd.isna(text):
            return ''
        text = str(text)

    normalized_text = normalizer.normalize(text)
    tokens = word_tokenize(normalized_text)
    return ' '.join(tokens)

In [3]:
normalizer = Normalizer()


def preprocess_persian_sentence(sentence):
    """Return the list of word tokens for a sentence.

    The sentence is first run through the module-level hazm ``normalizer``
    and the normalized text is then split into tokens with ``word_tokenize``.
    """
    return word_tokenize(normalizer.normalize(sentence))


In [4]:
# Example usage: tokenize one sample question and print the resulting token list.
# The sample is Persian for "How can I change my password?".
sample_sentence = "چگونه می‌توانم رمز عبور خود را تغییر دهم؟"
tokens = preprocess_persian_sentence(sample_sentence)
print(tokens)

['چگونه', 'می\u200cتوانم', 'رمز', 'عبور', 'خود', 'را', 'تغییر', 'دهم', '؟']


In [15]:
training_filename = "/home/mahdi/word_embedding_Narenjestan/dataset/preprocessed_dataset.txt"

# Write the training corpus: one preprocessed sentence per line, with each
# question immediately followed by its answer.
with open(training_filename, "w", encoding="utf-8") as f:
    # Iterate the two text columns directly; iterrows() would build a Series
    # for every row and hand back an index that is never used.
    for question, answer in zip(data["Question"], data["Answer"]):
        f.write(preprocess_text(question) + "\n")
        f.write(preprocess_text(answer) + "\n")

## Using fastText

In [None]:
# Train skip-gram fastText word embeddings on the preprocessed corpus.
# `training_filename` is the corpus path assigned in the cell that writes the
# file; reusing it avoids a second copy of the same path literal drifting apart.
model = fasttext.train_unsupervised(
    training_filename,
    model='skipgram',
    dim=300,  # size of each word vector
    ws=5,     # context window size
    epoch=5,  # number of passes over the corpus
)