In [2]:
from selenium import webdriver
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from sentence_transformers import SentenceTransformer, util
import stanza
import time




options = webdriver.ChromeOptions()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)

# load url 
BASE_URL = "https://www.digikala.com"
driver.get(BASE_URL)
time.sleep(5)  # Wait for JS to load


soup = BeautifulSoup(driver.page_source, 'html.parser')

# link
linked_text = []
for a in soup.find_all('a', href=True):
    text = a.get_text(strip=True)
    url = a['href']

    # short text or None 
    if not text or len(text) < 3:
        continue

    #  url
    full_url = urljoin(BASE_URL, url)

    linked_text.append((text, full_url))

driver.quit()
linked_text

[('سوپرمارکت', 'https://www.digikala.com/main/food-beverage/'),
 ('پرفروش\u200cترین\u200cها', 'https://www.digikala.com/best-selling/'),
 ('خرید کالای کارکرده',
  'https://www.digikala.com/landing/used/?utm_source=DKHEADER&utm_medium=HEADERBTM&utm_campaign=DKHEADER-USED&utm_id=DK-HEADER-USEDLANDING'),
 ('شگفت\u200cانگیزها', 'https://www.digikala.com/incredible-offers/'),
 ('سوالی دارید؟', 'https://www.digikala.com/faq/'),
 ('در دیجی\u200cکالا بفروشید',
  'https://www.digikala.com/landings/seller-introduction/'),
 ('همه موارد این دسته',
  'https://www.digikala.com/search/category-mobile-phone/'),
 ('انتخاب موبایل', 'https://www.digikala.com/landing/mobile/'),
 ('همه موارد این دسته',
  'https://www.digikala.com/search/category-mobile-phone/apple/'),
 ('آیفون ۱۶', 'https://www.digikala.com/tags/iphone-16/'),
 ('آیفون ۱۶ ای',
  'https://www.digikala.com/product/dkp-19656793/%DA%AF%D9%88%D8%B4%DB%8C-%D9%85%D9%88%D8%A8%D8%A7%DB%8C%D9%84-%D8%A7%D9%BE%D9%84-%D9%85%D8%AF%D9%84-iphone-16e-hn-%D8

In [3]:

#############################################################
# linked_text = linked_text[:40]
#############################################################
# pipline 
stanza.download('fa')  # Run once
nlp = stanza.Pipeline('fa')

#  query
input_text = "قیمت تلفن همراه امروز قیمت تلفن گران شد فردا ارزان میشود "

# keywords 
doc = nlp(input_text)
keywords = []
for sentence in doc.sentences:
    for word in sentence.words:
        # if word.upos in ['NOUN', 'PROPN', 'ADJ', 'VERB']:#include more 
        if word.upos in ['NOUN', 'PROPN']:
            keywords.append(word.text)


unique_keywords = list(set(keywords))
keyword_text = " ".join(unique_keywords)
unique_keywords

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 433kB [00:00, 2.29MB/s]                    
2025-07-28 14:48:07 INFO: Downloaded file to C:\Users\MM\stanza_resources\resources.json
2025-07-28 14:48:07 INFO: Downloading default packages for language: fa (Persian) ...
2025-07-28 14:48:07 INFO: File exists: C:\Users\MM\stanza_resources\fa\default.zip
2025-07-28 14:48:10 INFO: Finished downloading models and saved to C:\Users\MM\stanza_resources
2025-07-28 14:48:10 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 433kB [00:00, 2.27MB/s]                    
2025-07-28 14:48:11 INFO: Downloaded file to C:\Users\MM\stanza_resources\resources.json
2025-07-28 14:48:11 INFO: Loading these models fo

['تلفن', 'امروز', 'فردا', 'قیمت']

In [4]:



model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')  # Better multilingual model

# encode  query
reference_embedding = model.encode(keyword_text, convert_to_tensor=True)

# encode links
texts = [text for text, _ in linked_text]
text_embeddings = model.encode(texts, convert_to_tensor=True)

# similarity
cosine_scores = util.pytorch_cos_sim(reference_embedding, text_embeddings)[0]

# filter 
sorted_results = sorted(
    zip(linked_text, cosine_scores.tolist()),
    key=lambda x: x[1],
    reverse=True
)

# 
top_n = 10
top_links = sorted_results[:top_n]

# --- Display Results ---
print("\n--- top  links ---\n")
for (text, url), score in top_links:
    print(f"{score:.2f} | {text} --> {url}")



--- top  links ---

0.69 | قیمت گوشی --> https://www.digikala.com/search/category-mobile-phone/
0.67 | تلفن --> https://www.digikala.com/search/category-telephone/
0.66 | تماس --> tel:02161930000
0.65 | گوشی ناتینگ فون --> https://www.digikala.com/search/category-mobile-phone/nothing/
0.65 | گوشی میان رده --> https://www.digikala.com/search/facet/category-mobile-phone/division-mid-range/
0.64 | گوشی بلک ویو --> https://www.digikala.com/search/category-mobile-phone/blackview/
0.63 | گوشی تا ۵ میلیون تومان --> https://www.digikala.com/search/facet/category-mobile-phone/from-35000000-up-to-59000000/
0.63 | گوشی آنر --> https://www.digikala.com/search/category-mobile-phone/honor/
0.62 | گوشی یک ترابایت --> https://www.digikala.com/search/facet/category-mobile-phone/internal-memory-1-tb/
0.62 | هولدر گوشی موبایل --> https://www.digikala.com/search/category-cell-phone-holder/
