<a href="https://colab.research.google.com/github/MehraeenTimas/nlp-course/blob/main/mehraeen_persianNER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip -q install stanza requests beautifulsoup4 matplotlib

In [None]:
import requests, re
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from collections import Counter
import matplotlib.pyplot as plt
import stanza

Initialize the Persian NLP pipeline (Stanza)

In [None]:
# Fetch the Persian ("fa") Stanza models, then build a pipeline that only
# tokenizes and POS-tags; the noun extraction further down reads `upos`.
stanza.download(lang="fa")
nlp = stanza.Pipeline(lang="fa", processors="tokenize,pos")

Web scraping: collect headlines and article snippets from TRT Farsi

In [None]:
url = "https://trtfarsi.com"
MAX_HEADLINES = 5     # how many homepage links to follow
SNIPPET_CHARS = 300   # characters kept from each article's first paragraph

# Fetch the homepage and fail loudly on an HTTP error instead of silently
# parsing an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, "html.parser")

# Collect (title, link) pairs from real anchors only. Selecting bare
# <div>/<span> containers yields elements without an href whose text is the
# whole page, so restrict the selection to <a href=...>.
# NOTE(review): "a[href]" is deliberately generic and will also pick up
# navigation links; tighten the selector once the site's headline markup is
# confirmed.
titles, links = [], []
for a in soup.select("a[href]"):
    title = a.get_text(" ", strip=True)
    # Resolve relative hrefs and drop the fragment so duplicates compare equal.
    link = urljoin(url, a["href"]).split("#", 1)[0]
    if not title:
        continue
    if not link.startswith(("http://", "https://")):
        continue  # skip mailto:, javascript:, tel:, ...
    if link.rstrip("/") == url.rstrip("/") or link in links:
        continue  # skip the homepage itself and repeated links
    titles.append(title)
    links.append(link)
    if len(titles) == MAX_HEADLINES:
        break

# For each article keep a short snippet of its first paragraph. A failed
# request contributes an empty snippet so `texts` stays aligned with `links`.
texts = []
for link in links:
    try:
        article_response = requests.get(link, timeout=8)
        article_response.raise_for_status()
    except requests.RequestException as exc:
        # Best effort: report the failure but keep going with the other links.
        print(f"warning: could not fetch {link}: {exc}")
        texts.append("")
        continue
    article = BeautifulSoup(article_response.text, "html.parser")
    para = article.find("p")
    texts.append(para.get_text(strip=True)[:SNIPPET_CHARS] if para else "")

print(f"۵ خبر از TRT فارسی: {len(titles)} تا")

Clean + normalize Persian text

In [None]:
# Merge headlines and article snippets into one string for the pipeline.
raw_text = " ".join(titles + texts)

# Keep Arabic-script characters (U+0600-U+06FF), whitespace, and the
# zero-width non-joiner (U+200C). ZWNJ sits outside the Arabic block but joins
# Persian compounds such as "می‌شود" and "کتاب‌ها"; replacing it with a space
# would split those words into fragments before tokenization.
persian_only = re.sub(r'[^\u0600-\u06FF\u200c\s]', ' ', raw_text)

# Collapse whitespace runs and drop ZWNJs left dangling at token edges
# (e.g. after a neighbouring Latin character was removed).
tokens = (tok.strip('\u200c') for tok in persian_only.split())
clean_text = ' '.join(tok for tok in tokens if tok)

Process with Stanza

In [None]:
# Run the Stanza pipeline created earlier (`nlp`) over the cleaned text;
# the resulting document is what the noun-extraction cell iterates over.
doc = nlp(clean_text)

Extract nouns and proper nouns

In [None]:
# Walk every word of every sentence and keep the surface form of those
# tagged as common nouns (NOUN) or proper nouns (PROPN).
nouns = []
for sentence in doc.sentences:
    for word in sentence.words:
        if word.upos == "NOUN" or word.upos == "PROPN":
            nouns.append(word.text)

# Raw frequency table of the extracted nouns.
counts = Counter(nouns)

In [None]:

# Function words and stray suffix fragments that should never be reported
# as frequent nouns, even if the tagger labels them NOUN/PROPN.
persian_stopwords = {
    # plural / indefinite suffix fragments
    "ها", "های", "هایی", "ای",
    # demonstratives
    "آن", "این",
    # conjunctions and particles
    "و", "را", "که", "تا", "اما",
    # prepositions
    "با", "در", "از", "به", "برای", "بر",
    # verb forms and the verbal prefix
    "است", "شود", "کرد", "می",
}

# Plural suffix "ها" / "های" at the end of a token, optionally attached with a
# zero-width non-joiner (written as an explicit \u200c escape so the invisible
# character cannot get lost when the source is edited).
_PLURAL_SUFFIX_RE = re.compile(r"\u200c?(?:های|ها)$")


def clean_token(t):
    """Strip a trailing Persian plural suffix from a token.

    Removes a final "ها" or "های", together with the zero-width non-joiner
    (U+200C) that may attach it to the stem, so that "کتاب‌ها", "کتاب‌های",
    "کتابها" and "کتابهای" all reduce to "کتاب".

    Args:
        t: The token text.

    Returns:
        The token without surrounding whitespace, without the plural suffix,
        and without a dangling ZWNJ at either edge. May be an empty string
        (e.g. when the token is just "ها").
    """
    # Strip whitespace first so the `$`-anchored suffix match sees the real end.
    stem = _PLURAL_SUFFIX_RE.sub("", t.strip())
    # A leftover ZWNJ would make the stem compare unequal to the bare word.
    return stem.strip("\u200c").strip()

# Reduce every extracted noun to its stem, then keep only stems that are
# longer than two characters and are not stopwords.
stems = (clean_token(noun) for noun in nouns)
filtered_nouns = [
    stem
    for stem in stems
    if len(stem) > 2 and stem not in persian_stopwords
]

# Frequency table of the filtered stems (replaces the raw noun counts).
counts = Counter(filtered_nouns)


Print top 5

In [None]:
# Report the five most frequent stems together with their counts.
top_five = counts.most_common(5)
print("\n۵ اسم یا نام پرتکرار:")
for noun, freq in top_five:
    print(f"   {noun} → {freq} بار")
