In [42]:
!pip -q install stanza requests beautifulsoup4 matplotlib

In [52]:
import requests, re
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from collections import Counter
import matplotlib.pyplot as plt
import stanza

Initialize Persian NLP

In [53]:
stanza.download("fa")
nlp = stanza.Pipeline("fa", processors="tokenize,pos")

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: fa (Persian) ...
INFO:stanza:File exists: /root/stanza_resources/fa/default.zip


KeyboardInterrupt: 

web scraping

In [54]:
url = "https://trtfarsi.com"
html = requests.get(url, timeout=10).text
soup = BeautifulSoup(html, "html.parser")

titles, links = [], []
for a in soup.select("span a,div a,span, .title a,div,a")[:5]:
    title = a.get_text(strip=True)
    titles.append(title)
    links.append(a.get("href"))

texts = []
for L in links:
    try:
        if not L or not L.startswith("http"):
            L = urljoin(url, L)
        p = BeautifulSoup(requests.get(L, timeout=8).text, "html.parser")
        para = p.find("p")
        texts.append(para.get_text(strip=True)[:300] if para else "")
    except:
        texts.append("")

print(f"۵ خبر از TRT فارسی: {len(titles)} تا")

۵ خبر از TRT فارسی: 5 تا


Clean + normalize Persian text

In [55]:
raw_text = " ".join(titles + texts)
clean_text = re.sub(r'[^\u0600-\u06FF\s]', ' ', raw_text)  # keep Persian chars only
clean_text = re.sub(r'\s+', ' ', clean_text).strip()

Process with Stanza

In [56]:
doc = nlp(clean_text)

Extract nouns and proper nouns

In [57]:
nouns = [w.text for s in doc.sentences for w in s.words if w.upos in ["NOUN", "PROPN"]]
counts = Counter(nouns)

In [58]:

persian_stopwords = {
    "ها", "های", "هایی", "ای", "آن", "این", "و", "را", "با", "در", "از", "که",
    "به", "برای", "است", "شود", "کرد", "می", "بر", "تا", "اما"
}

def clean_token(t):
    # remove suffixes like "ها" or "های" from nouns
    t = re.sub(r"(‌ها|ها|های)$", "", t)
    return t.strip()

filtered_nouns = []
for w in nouns:
    base = clean_token(w)
    if base and base not in persian_stopwords and len(base) > 2:
        filtered_nouns.append(base)

counts = Counter(filtered_nouns)


Print top 5

In [59]:
print("\n۵ اسم یا نام پرتکرار:")
for w, c in counts.most_common(5):
    print(f"   {w} → {c} بار")



۵ اسم یا نام پرتکرار:
   ترکیه → 18 بار
   ایران → 16 بار
   آمریکا → 14 بار
   کشور → 12 بار
   اتحادیه → 12 بار
