In [2]:
import pandas as pd
data = pd.read_excel('/home/mahdi/word_embedding_Narenjestan/dataset/narenjestan_khowledgebase_editable.xlsx')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18875 entries, 0 to 18874
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   سوال     18875 non-null  object
 1   سطح سوم  18874 non-null  object
 2   تعداد    18875 non-null  int64 
 3   جواب     18874 non-null  object
dtypes: int64(1), object(3)
memory usage: 590.0+ KB


## Merged and generate txt file

In [None]:
import pandas as pd

data = pd.read_excel('/home/mahdi/word_embedding_Narenjestan/dataset/narenjestan_khowledgebase_editable.xlsx')
df = data.drop(['سطح سوم' , 'تعداد'], axis=1)

print("Columns in Excel:", df.columns.tolist())

col1 = "سوال"
col2 = "جواب"

merged_series = df[col1].astype(str) + " " + df[col2].astype(str)

merged_series.to_csv("MEC-merge_Narenjestan_khnowledgebase-V0.1.txt", index=False, header=False)

print("Merged text file saved as 'output.txt'.")

Columns in Excel: ['سوال', 'جواب']
Merged text file saved as 'output.txt'.


## Using TF-IDF

In [None]:
from hazm import Normalizer, WordTokenizer, Stemmer, stopwords_list

normalizer = Normalizer()
tokenizer = WordTokenizer()
stemmer = Stemmer()
persian_stopwords = set(stopwords_list())

def hazm_tokenizer(doc):
    # Normalize the entire document
    doc_norm = normalizer.normalize(doc)
    # Tokenize
    tokens = tokenizer.tokenize(doc_norm)
    # Keep only alphabetic tokens (drop punctuation, numbers)
    words = [t for t in tokens if t.isalpha()]
    # Drop stop-words
    words = [w for w in words if w not in persian_stopwords]
    return words


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18875 entries, 0 to 18874
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   سوال    18875 non-null  object
 1   جواب    18874 non-null  object
dtypes: object(2)
memory usage: 295.1+ KB


#### All documents in one merged file

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Read all lines from the merged file (each line is one document)
with open("MEC-merge_Narenjestan_khnowledgebase-V0.1.txt", "r", encoding="utf-8") as f:
    # documents = [line.strip() for line in f if line.strip()]
    full_text = f.read()

# Wrap it in a list so that vectorizer sees exactly one document
documents = [full_text]


# Initialize TF-IDF vectorizer (example for English; adapt tokenizer for Persian if needed)
vectorizer = TfidfVectorizer(
    tokenizer=hazm_tokenizer,
    lowercase=False,      # Hazm’s Normalizer already handles lowercasing/unifying forms
    preprocessor=None,    # we do all normalization inside hazm_tokenizer
    token_pattern=None,   # disable scikit-learn’s default regex tokenizer
    min_df=1,             # drop tokens that appear in fewer than 2 documents
    max_df=1.0           # drop tokens that appear in >85% of docs
)

# Fit on all documents at once
X_tfidf = vectorizer.fit_transform(documents)
# X_tfidf now has shape (n_docs × n_terms)

# Inspect vocabulary size and top tokens per document
print("Number of documents:", len(documents))
print("Number of unique tokens after filtering:", len(vectorizer.get_feature_names_out()))

# Example
import numpy as np
feature_names = vectorizer.get_feature_names_out()
dense0 = X_tfidf[0].todense().A1
top100_idx = np.argsort(dense0)[-100:][::-1]
print("Top 100 terms in doc 0:", [(feature_names[i],dense0[i]) for i in top100_idx])


# Extract nonzero (term_index, score) pairs for this single document
row0 = X_tfidf[0].tocoo()                   # sparse row in COO format
nonzero_pairs = list(zip(row0.col, row0.data))
nonzero_pairs.sort(key=lambda x: x[1], reverse=True)

# Write each (term, score) tuple to a text file, one per line
with open("tfidf_results.txt", "w", encoding="utf-8") as out_file:
    for idx, score in nonzero_pairs:      # <-- iterate over nonzero_pairs, not dense0
        term = feature_names[idx]
        out_file.write(f"{term}\t{score:.6f}\n")

print("Saved TF-IDF results to 'tfidf_results.txt'")


Number of documents: 1
Number of unique tokens after filtering: 5342
Top 100 terms in doc 0: [('پردازش', 0.3751819401139471), ('شماره', 0.3254274087081434), ('بررسی', 0.3107954290554311), ('جهت', 0.2789357072546294), ('اطلاعات', 0.2223509851486994), ('خطای', 0.21820356574267788), ('گروه', 0.21228696742919262), ('فنی', 0.2088356184129929), ('ارجاع', 0.2073129644352577), ('دریافت', 0.2032380523615093), ('حساب', 0.16683937156041131), ('پیغام', 0.15900857967491613), ('طریق', 0.1435210135013812), ('مشتری', 0.1389675530346303), ('گزارش', 0.13418206910460545), ('صورت', 0.13399355004069538), ('درخواست', 0.12159479622199468), ('سامانه', 0.10964558786338721), ('کارت', 0.09985709800651824), ('چک', 0.09357796303166746), ('رفع', 0.09189579292293146), ('ثبت', 0.09095319760338111), ('استعلام', 0.08896649669909808), ('وضعیت', 0.08339793358052372), ('مشکل', 0.08046863735669034), ('کد', 0.07735082206894689), ('قرارداد', 0.07429101264702191), ('اقدام', 0.06853393046453748), ('مبلغ', 0.06255932628523375),

#### consider each row as one document

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# with open("MEC-merge_Narenjestan_khnowledgebase-V0.1.txt", "r", encoding="utf-8") as f:
#     documents = [line.strip() for line in f if line.strip()]

documents = merged_series 

# documents = merged_series

vectorizer = TfidfVectorizer(
    tokenizer=hazm_tokenizer,
    lowercase=False,      # Hazm’s Normalizer already handles lowercasing/unifying forms
    preprocessor=None,    # we do all normalization inside hazm_tokenizer
    token_pattern=None,   # disable scikit-learn’s default regex tokenizer
    min_df=2,             # drop tokens that appear in fewer than 2 documents
    max_df=0.85           # drop tokens that appear in >85% of docs
)



X_tfidf = vectorizer.fit_transform(documents)
# X_tfidf.shape == (n_rows, n_unique_tokens)
feature_names = vectorizer.get_feature_names_out()
print("TF-IDF matrix shape:", X_tfidf.shape)
print("Vocabulary size:", len(feature_names))



# -----------------------------
# Convert to CSC to efficiently sum over rows for each column
X_csc = X_tfidf.tocsc()

# Sum TF-IDF weights column by column
# This yields a NumPy array of length n_tokens, where each entry is sum of column i.
global_tfidf_sum = np.array(X_csc.sum(axis=0)).ravel()

# Pair each token with its summed TF-IDF and sort descending
token_and_sum = list(zip(feature_names, global_tfidf_sum))
token_and_sum.sort(key=lambda x: x[1], reverse=True)

# Write results to a file: one "token<TAB>global_tfidf_sum" per line
with open("tfidf_results_pertoken-V0.1.txt", "w", encoding="utf-8") as out_f:
    out_f.write("token\tglobal_tfidf_sum\n")
    for tok, score in token_and_sum:
        out_f.write(f"{tok}\t{score:.6f}\n")

print("Saved per-token aggregated TF-IDF to 'global_tfidf_per_token.txt'.")


TF-IDF matrix shape: (18875, 3350)
Vocabulary size: 3350
Saved per-token aggregated TF-IDF to 'global_tfidf_per_token.txt'.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

with open("MEC-merge_Narenjestan_khnowledgebase-V0.1.txt", "r", encoding="utf-8") as f:
    documents = [line.strip() for line in f if line.strip()]

# documents = merged_series

# Build & fit TfidfVectorizer
vectorizer = TfidfVectorizer(
    tokenizer=hazm_tokenizer,
    lowercase=False,      # Hazm’s Normalizer already handles lowercasing/unifying forms
    preprocessor=None,    # we do all normalization inside hazm_tokenizer
    token_pattern=None,   # disable scikit-learn’s default regex tokenizer
    min_df=2,             # drop tokens that appear in fewer than 2 documents
    max_df=0.85           # drop tokens that appear in >85% of docs
)

X_tfidf = vectorizer.fit_transform(documents)
# X_tfidf is a scipy.sparse matrix of shape (n_docs × n_terms)


# Extract vocabulary
feature_names = vectorizer.get_feature_names_out()
# feature_names[i] is the i-th token (stemmed Persian word)

print(f"Total unique tokens (after stemming filtering): {len(feature_names)}")
# e.g. “Total unique tokens …: 12,345”


# Find top‐N highest TF-IDF terms per document
def top_n_terms_for_doc(doc_vector, feature_names, n=5):
    
    # Convert sparse 1×n_terms row to a 1D numpy array
    dense = doc_vector.todense().A1
    # Get indices of the top‐n weights
    top_idxs = np.argsort(dense)[-n:][::-1]
    return [(feature_names[i], float(dense[i])) for i in top_idxs]

# Example: print top 5 terms for the first 5 documents
for doc_idx in range(10):
    doc_vec = X_tfidf[doc_idx]  
    top_terms = top_n_terms_for_doc(doc_vec, feature_names, n=5)
    print(f"\nDocument #{doc_idx} top 5 terms:")
    for term, score in top_terms:
        print(f"  {term}: {score:.4f}")

Total unique tokens (after stemming filtering): 3350

Document #0 top 5 terms:
  شناسه: 0.4651
  انتقالی: 0.3406
  مانده: 0.2957
  کاربر: 0.2460
  کدملی: 0.2267

Document #1 top 5 terms:
  مسائل: 0.4805
  تماس: 0.3820
  رمز: 0.3500
  بازنشستگان: 0.2461
  پیگیری: 0.2169

Document #2 top 5 terms:
  بازخرید: 0.4430
  گواهی: 0.3701
  فروش: 0.3204
  ردیف: 0.3021
  سپرده: 0.2863

Document #3 top 5 terms:
  حقوقی: 0.5224
  کارت: 0.4340
  مشتری: 0.2850
  انتخاب: 0.2132
  نماینده: 0.1993

Document #4 top 5 terms:
  چک: 0.4634
  کلیه: 0.2564
  درخواست: 0.2271
  وضعیت: 0.2097
  تحویل: 0.2018

Document #5 top 5 terms:
  ازدواج: 0.4920
  مرکزی: 0.3339
  بانک: 0.2587
  درخواست: 0.2494
  سامانه: 0.2334

Document #6 top 5 terms:
  کارت: 0.5201
  درخواست: 0.3295
  تحویل: 0.2929
  حذف: 0.2513
  ارسال: 0.2346

Document #7 top 5 terms:
  شهاب: 0.4350
  سو: 0.3876
  چک: 0.3387
  برگشتی: 0.2615
  استعلام: 0.2017

Document #8 top 5 terms:
  ا: 0.3883
  مسدودی: 0.3320
  مرکزی: 0.2943
  بانک: 0.2280
  برگشتی: 

## NGRAM (bigram and Trigram)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# with open("MEC-merge_Narenjestan_khnowledgebase-V0.1.txt", "r", encoding="utf-8") as f:
#     documents = [line.strip() for line in f if line.strip()]

documents = merged_series 

# documents = merged_series

vectorizer = TfidfVectorizer(
    tokenizer=hazm_tokenizer,
    lowercase=False,     # Hazm normalizer already lowercases
    preprocessor=None,
    token_pattern=None,  # rely solely on hazm_tokenizer
    ngram_range=(1,3),  #  unigrams , bigram and trigrams
    min_df=2,            # drop n-grams that appear in fewer than 2 docs
    max_df=0.85          # drop n-grams that appear in >85% of docs
)

X_tfidf = vectorizer.fit_transform(documents)
# X_tfidf.shape == (n_rows, n_unique_tokens)
feature_names = vectorizer.get_feature_names_out()
print("TF-IDF matrix shape:", X_tfidf.shape)
print("Vocabulary size:", len(feature_names))


X_tfidf = vectorizer.fit_transform(documents)
# Now X_tfidf has shape (18000 rows, n_ngrams columns)

feature_names = vectorizer.get_feature_names_out()
print("TF-IDF matrix shape:", X_tfidf.shape)
print("Number of unique uni-gram/2-gram/3-gram tokens:", len(feature_names))

# -----------------------------
# Convert to CSC to efficiently sum over rows for each column
X_csc = X_tfidf.tocsc()

# Sum TF-IDF weights column by column
# This yields a NumPy array of length n_tokens, where each entry is sum of column i.
global_tfidf_sum = np.array(X_csc.sum(axis=0)).ravel()


# Pair each n-gram with its aggregated TF-IDF
token_and_score = list(zip(feature_names, global_tfidf_sum))
token_and_score.sort(key=lambda x: x[1], reverse=True)  # sort by descending score

# Write to a file, one "n-gram<TAB>score" per line
with open("tfidf_results_pertoken_123gram-V0.2.txt", "w", encoding="utf-8") as out_f:
    out_f.write("ngram\tglobal_tfidf_sum\n")
    for ngram, score in token_and_score:
        out_f.write(f"{ngram}\t{score:.6f}\n")

print("Saved corpus-wide TF-IDF for bigrams/trigrams to 'global_tfidf_ngrams.txt'")

TF-IDF matrix shape: (18875, 85841)
Vocabulary size: 85841
TF-IDF matrix shape: (18875, 85841)
Number of unique uni-gram/2-gram/3-gram tokens: 85841
Saved corpus-wide TF-IDF for bigrams/trigrams to 'global_tfidf_ngrams.txt'


In [14]:
### Test to see the difference between unigram only and unigram and bigram without normalization
# Unigrams-only, no normalization
vec1 = TfidfVectorizer(ngram_range=(1,1), tokenizer=hazm_tokenizer,
                       token_pattern=None, lowercase=False,
                       use_idf=True, norm=None)
X1 = vec1.fit_transform(documents)

# Unigrams+bi-grams, no normalization
vec2 = TfidfVectorizer(ngram_range=(1,2), tokenizer=hazm_tokenizer,
                       token_pattern=None, lowercase=False,
                       use_idf=True, norm=None)
X2 = vec2.fit_transform(documents)

# Now check the raw TFxIDF for "شماره":
idx1 = vec1.vocabulary_.get("شماره")
idx2 = vec2.vocabulary_.get("شماره")
raw1 = X1[:, idx1].sum()   # sum of TF×IDF across all docs
raw2 = X2[:, idx2].sum()   # sum of TF×IDF across all docs

print("Raw sum for 'شماره' (unigrams only):", raw1)
print("Raw sum for 'شماره' (uni+bi-grams):", raw2)


Raw sum for 'شماره' (unigrams only): 34064.71289657068
Raw sum for 'شماره' (uni+bi-grams): 34064.71289657068


## Count unique unigram, bigram and trigram terms

In [19]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Read the corpus (one document per line)

documents = merged_series 

# Create a CountVectorizer to count unigrams, bigrams, and trigrams
vectorizer = CountVectorizer(
    tokenizer=hazm_tokenizer,
    lowercase=False,     # Hazm already handles casing
    token_pattern=None,  # disable sklearn’s default regex
    ngram_range=(1, 3)   # unigrams, bigrams, trigrams
)

# 4. Fit and transform to get a document-term matrix of counts
X_counts = vectorizer.fit_transform(documents)  # shape: (n_docs, n_ngrams)

# 5. Sum counts over all documents for each n-gram (column)
#    Convert to CSC for efficient column sum
X_csc = X_counts.tocsc()
ngram_counts = np.array(X_csc.sum(axis=0)).ravel()  # length = number of n-grams

# 6. Invert the vocabulary mapping to get index → n-gram
vocab_inv = {v: k for k, v in vectorizer.vocabulary_.items()}

# 7. Create a DataFrame of (ngram, count) and sort descending
ngram_list = [(vocab_inv[i], ngram_counts[i]) for i in range(len(ngram_counts))]
df_counts = pd.DataFrame(ngram_list, columns=["ngram", "count"])
df_counts = df_counts.sort_values(by="count", ascending=False).reset_index(drop=True)

# # 8. Save full counts to a CSV file
# df_counts.to_csv("ngram_counts.csv", index=False)

# 9. (Optional) Save to a text file as "ngram<TAB>count"
with open("ngram_counts.txt", "w", encoding="utf-8") as out_f:
    for _, row in df_counts.iterrows():
        out_f.write(f"{row['ngram']}\t{int(row['count'])}\n")

# 10. Display top 20 n-grams to the user
# 10. Print the top 20 n-grams
print("\nTop 20 n-grams by count:")
print(df_counts.head(20).to_string(index=False))



Top 20 n-grams by count:
         ngram  count
        پردازش  25872
         شماره  22441
         بررسی  21432
           جهت  19235
       اطلاعات  15333
          خطای  15047
          گروه  14639
           فنی  14401
         ارجاع  14296
      گروه فنی  14286
     فنی ارجاع  14198
گروه فنی ارجاع  14193
        دریافت  14015
     جهت بررسی  13914
    بررسی گروه  13852
بررسی گروه فنی  13850
جهت بررسی گروه  13845
          حساب  11505
         پیغام  10965
    پیغام خطای  10339
