In [1]:
import html
import json
import os
from urllib.parse import urlparse

import numpy as np
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

import nltk
# Fetch the NLTK corpora into the local nltk_data directory; packages that are
# already installed are reported as up-to-date and not downloaded again.
# The stop-word list is read via nltk.corpus.stopwords and the lemmatizer is
# WordNetLemmatizer (presumably backed by the 'wordnet' package). No visible
# cell calls an NLTK tokenizer, so 'punkt' looks unused here — TODO confirm
# before removing.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# RSS feed URLs polled by fetch_india_rss(); one entry per feed endpoint.
# The host part of each URL is later stored as the article's `source`.
INDIA_RSS_FEEDS = [
    "https://www.thehindu.com/news/national/feeder/default.rss",
    "https://www.thehindu.com/news/international/feeder/default.rss",
    "https://indianexpress.com/section/india/feed/",
    "https://www.hindustantimes.com/rss/india/rssfeed.xml",
    "https://www.business-standard.com/rss/latest.rss",
    "https://economictimes.indiatimes.com/rssfeedsdefault.cms",
    "https://sports.ndtv.com/rss/all",
    "https://www.isro.gov.in/feeds.xml",
    "https://indianexpress.com/section/entertainment/feed/",
    "https://indianexpress.com/section/technology/feed/"
]

# Sanity check: report how many feeds are configured.
print("Loaded Indian RSS sources:", len(INDIA_RSS_FEEDS))


Loaded Indian RSS sources: 10


In [3]:
import feedparser
from datetime import datetime

def fetch_india_rss(feeds=None):
    """Download RSS feeds and return their entries as a single DataFrame.

    Parameters
    ----------
    feeds : iterable of str, optional
        Feed URLs to poll. Defaults to ``INDIA_RSS_FEEDS``.

    Returns
    -------
    pandas.DataFrame
        One row per feed entry with the columns ``title``, ``text`` (the
        entry summary, ``""`` when absent), ``url``, ``published`` (``""``
        when absent) and ``source`` (network location of the feed URL).
        The columns exist even when no entry could be fetched, so
        downstream column access does not fail on an empty result.

    Notes
    -----
    Best-effort: a feed that cannot be fetched or parsed is reported on
    stdout and skipped; the remaining feeds are still processed.
    """
    columns = ["title", "text", "url", "published", "source"]
    if feeds is None:
        feeds = INDIA_RSS_FEEDS

    articles = []
    for feed in feeds:
        print("Fetching:", feed)
        try:
            parsed = feedparser.parse(feed)
            entries = parsed.entries
            # feedparser signals unreachable or malformed feeds through the
            # `bozo` flag instead of raising, so the except clause alone would
            # miss them. `bozo` can also be set on feeds that still parsed
            # usable entries, hence only an empty result counts as a failure.
            if parsed.get("bozo") and not entries:
                print("Error fetching:", feed, parsed.get("bozo_exception"))
                continue
            source = urlparse(feed).netloc
            for entry in entries:
                articles.append({
                    "title": entry.get("title"),
                    "text": entry.get("summary", ""),
                    "url": entry.get("link"),
                    "published": entry.get("published", ""),
                    "source": source
                })
        except Exception as e:
            print("Error fetching:", feed, e)
    print("Total India articles:", len(articles))
    return pd.DataFrame(articles, columns=columns)

# Poll every configured feed once and preview the first rows of the result.
df_india = fetch_india_rss()
df_india.head()


Fetching: https://www.thehindu.com/news/national/feeder/default.rss
Fetching: https://www.thehindu.com/news/international/feeder/default.rss
Fetching: https://indianexpress.com/section/india/feed/
Fetching: https://www.hindustantimes.com/rss/india/rssfeed.xml
Fetching: https://www.business-standard.com/rss/latest.rss
Fetching: https://economictimes.indiatimes.com/rssfeedsdefault.cms
Fetching: https://sports.ndtv.com/rss/all
Fetching: https://www.isro.gov.in/feeds.xml
Fetching: https://indianexpress.com/section/entertainment/feed/
Fetching: https://indianexpress.com/section/technology/feed/
Total India articles: 876


Unnamed: 0,title,text,url,published,source
0,Pakistani drone sighted along International Bo...,"The drone, which was seen coming from the Chak...",https://www.thehindu.com/news/national/pakista...,"Sat, 22 Nov 2025 09:29:51 +0530",www.thehindu.com
1,Bank staff under scanner as Telangana cybercri...,Cyber Security Bureau (TGCSB) is also pushing ...,https://www.thehindu.com/news/national/telanga...,"Sat, 22 Nov 2025 09:29:18 +0530",www.thehindu.com
2,Four former MLAs in the fray in local body pol...,While the UDF has fielded Anil Akkara and K.S....,https://www.thehindu.com/news/national/kerala/...,"Sat, 22 Nov 2025 09:03:00 +0530",www.thehindu.com
3,"Supreme Court issues notice to Centre, BCI on ...",The petition argued that the Bombay High Court...,https://www.thehindu.com/news/national/supreme...,"Sat, 22 Nov 2025 08:59:35 +0530",www.thehindu.com
4,‘Kodi Kada’ is in non-stop election mode,"Since 1984, the small shop is the primary sour...",https://www.thehindu.com/news/national/kerala/...,"Sat, 22 Nov 2025 08:59:00 +0530",www.thehindu.com


In [4]:
# Integer label ids mapped to readable category names.
category_map = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}

ag = load_dataset("ag_news")

# Build the AG News frame from the train split in one pass:
#   category - readable name for the integer `label`
#   source   - constant tag so rows can be told apart after merging
#   title    - everything before the first "." of `text`; this is only an
#              approximation (a text starting with "Wall St." yields "Wall St")
df_ag = (
    ag["train"]
    .to_pandas()
    .assign(
        category=lambda frame: frame["label"].map(category_map),
        source="ag_news",
        title=lambda frame: frame["text"].str.split(".").str[0],
    )
    [["title", "text", "source", "category"]]
)
df_ag.head()


Unnamed: 0,title,text,source,category
0,Wall St,Wall St. Bears Claw Back Into the Black (Reute...,ag_news,Business
1,Carlyle Looks Toward Commercial Aerospace (Reu...,Carlyle Looks Toward Commercial Aerospace (Reu...,ag_news,Business
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,Oil and Economy Cloud Stocks' Outlook (Reuters...,ag_news,Business
3,Iraq Halts Oil Exports from Main Southern Pipe...,Iraq Halts Oil Exports from Main Southern Pipe...,ag_news,Business
4,"Oil prices soar to all-time record, posing new...","Oil prices soar to all-time record, posing new...",ag_news,Business


In [5]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# English stop words as a set, so the membership test in clean_text is a hash lookup.
STOP = set(stopwords.words("english"))
# Shared lemmatizer instance reused for every token.
lemm = WordNetLemmatizer()

def clean_text(text):
    """Normalise raw article text into a space-joined string of lemmatised tokens.

    Steps, in order: decode HTML entities, drop HTML tags, drop URLs, replace
    every character other than ASCII letters and spaces with a space,
    lower-case, split on whitespace, discard stop words and tokens of two
    characters or fewer, lemmatise what remains.

    Parameters
    ----------
    text : str or None
        Raw text. ``None``, float NaN and any other non-string value are
        treated as missing.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``""`` for missing input.
    """
    # Missing values show up as None or as float NaN in pandas columns; neither
    # can be passed to re.sub, so treat every non-string as "no text".
    if not isinstance(text, str):
        return ""
    # Decode entities first: otherwise "&amp;" / "&quot;" survive as the tokens
    # "amp" / "quot", and escaped markup (&lt;b&gt;) would not be seen as tags.
    text = html.unescape(text)
    # Strip tags before URLs so a link inside an attribute does not make the
    # URL pattern swallow the adjacent visible text.
    text = re.sub(r"<[^>]+>", " ", text)
    text = re.sub(r"http\S+|www\S+", " ", text)
    text = re.sub(r"[^a-zA-Z ]", " ", text)
    tokens = text.lower().split()
    tokens = [lemm.lemmatize(t) for t in tokens if t not in STOP and len(t) > 2]
    return " ".join(tokens)


In [6]:
# Add a `clean_text` column to each corpus, announcing each pass before it
# starts (the AG News frame is by far the larger of the two).
for banner, frame in (
    ("Cleaning AG News...", df_ag),
    ("Cleaning Indian RSS...", df_india),
):
    print(banner)
    frame["clean_text"] = frame["text"].apply(clean_text)


Cleaning AG News...
Cleaning Indian RSS...


In [7]:
# Stack both corpora into one frame, discard rows without text and renumber
# the index. Columns that exist in only one corpus (category, url, published)
# are left empty for rows coming from the other one.
df_merged = (
    pd.concat([df_ag, df_india], ignore_index=True)
    .dropna(subset=["text"])
    .reset_index(drop=True)
)

print("Merged dataset size:", len(df_merged))
df_merged.head()


Merged dataset size: 120876


Unnamed: 0,title,text,source,category,clean_text,url,published
0,Wall St,Wall St. Bears Claw Back Into the Black (Reute...,ag_news,Business,wall bear claw back black reuters reuters shor...,,
1,Carlyle Looks Toward Commercial Aerospace (Reu...,Carlyle Looks Toward Commercial Aerospace (Reu...,ag_news,Business,carlyle look toward commercial aerospace reute...,,
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,Oil and Economy Cloud Stocks' Outlook (Reuters...,ag_news,Business,oil economy cloud stock outlook reuters reuter...,,
3,Iraq Halts Oil Exports from Main Southern Pipe...,Iraq Halts Oil Exports from Main Southern Pipe...,ag_news,Business,iraq halt oil export main southern pipeline re...,,
4,"Oil prices soar to all-time record, posing new...","Oil prices soar to all-time record, posing new...",ag_news,Business,oil price soar time record posing new menace e...,,


In [8]:
# Persist the merged corpus so later steps can reload it without re-fetching.
# The directory is created on demand; exist_ok makes the cell safe to re-run.
os.makedirs("data/topic_corpus", exist_ok=True)
# NOTE(review): the file name mentions "bbc", but only AG News and the Indian
# RSS feeds are merged in the visible cells — confirm whether the name is stale.
MERGED_PATH = "data/topic_corpus/ag_bbc_india_merged.csv"

# index=False: the index was just reset and carries no information.
df_merged.to_csv(MERGED_PATH, index=False)
MERGED_PATH


'data/topic_corpus/ag_bbc_india_merged.csv'

In [9]:
from sentence_transformers import SentenceTransformer

# Name of the sentence-transformers checkpoint used to embed documents.
# Alternative noted by the author: "all-mpnet-base-v2" (labelled "Strongest").
EMBEDDER = "all-MiniLM-L6-v2" # Faster CPU

# Load the embedding model once; it is passed to BERTopic as embedding_model.
sbert = SentenceTransformer(EMBEDDER)
print("Loaded SBERT:", EMBEDDER)



Loaded SBERT: all-MiniLM-L6-v2


In [10]:
from bertopic import BERTopic
import umap
import hdbscan

# Dimensionality reduction applied to the sentence embeddings before clustering.
# UMAP is stochastic: without a fixed random_state every run produces different
# topics, which makes the saved model and the tables in this notebook
# impossible to reproduce. The seed matches the one used for sampling the
# corpus. (Fixing the seed can make UMAP slower, since it limits parallelism.)
umap_model = umap.UMAP(
    n_neighbors=15,
    n_components=5,
    metric='cosine',
    low_memory=True,
    random_state=42
)

# Density-based clustering of the reduced embeddings.
# min_cluster_size is the smallest group that may become a topic;
# prediction_data=True keeps the extra data needed to score new documents.
hdb_model = hdbscan.HDBSCAN(
    min_cluster_size=20,
    min_samples=5,
    metric='euclidean',
    prediction_data=True,
    cluster_selection_method='eom'
)

# Topic model wired to the preloaded sentence embedder and the two components
# above. n_gram_range=(1,2) allows one- and two-word topic keywords.
# calculate_probabilities=True makes fit_transform return a probability for
# every (document, topic) pair, i.e. a 2-D array rather than one value per
# document.
topic_model = BERTopic(
    embedding_model=sbert,
    n_gram_range=(1,2),
    calculate_probabilities=True,
    verbose=True,
    umap_model=umap_model,
    hdbscan_model=hdb_model
)

print("BERTopic model ready.")


BERTopic model ready.


In [13]:
# Upper bound on the number of documents used for training, to keep embedding
# and clustering time manageable.
SAMPLE_SIZE = 30000

# DataFrame.sample raises when asked for more rows than exist (sampling is
# without replacement), so cap the request at the corpus size. The fixed seed
# keeps the selection reproducible.
df_merged = (
    df_merged
    .sample(min(SAMPLE_SIZE, len(df_merged)), random_state=42)
    .reset_index(drop=True)
)


In [14]:
# Train on the raw `text` column (not `clean_text`); astype(str) guarantees
# every document is a string before it reaches the embedder.
docs = df_merged["text"].astype(str).tolist()

print("Training BERTopic on", len(docs), "documents...")
# topics: one topic id per document; probs: the probabilities returned
# alongside them (their shape depends on how the model was configured).
topics, probs = topic_model.fit_transform(docs)

print("Training complete!")


2025-11-22 17:29:15,551 - BERTopic - Embedding - Transforming documents to embeddings.


Training BERTopic on 30000 documents...


Batches:   0%|          | 0/938 [00:00<?, ?it/s]

2025-11-22 17:32:19,959 - BERTopic - Embedding - Completed ✓
2025-11-22 17:32:19,961 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2025-11-22 17:32:26,868 - BERTopic - Dimensionality - Completed ✓
2025-11-22 17:32:26,883 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-11-22 17:34:38,541 - BERTopic - Cluster - Completed ✓
2025-11-22 17:34:38,644 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-11-22 17:34:44,087 - BERTopic - Representation - Completed ✓


Training complete!


In [15]:
# Everything produced by training is written below this directory.
MODEL_DIR = "backend/app/ml/models/topics"
os.makedirs(MODEL_DIR, exist_ok=True)

# 1) The fitted model itself.
topic_model.save(f"{MODEL_DIR}/bertopic_india_global")

# 2) The per-topic summary table.
topic_info = topic_model.get_topic_info()
topic_info.to_csv(f"{MODEL_DIR}/bertopic_info_india_global.csv", index=False)

# 3) The keyword list of every topic id listed in the summary table.
keywords_by_topic = {}
for topic_id in topic_info.Topic:
    keywords_by_topic[topic_id] = topic_model.get_topic(topic_id)

with open(f"{MODEL_DIR}/bertopic_keywords_india_global.json", "w") as out_file:
    json.dump(keywords_by_topic, out_file, indent=2)

topic_info.head()




Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,9602,-1_of_the_to_and,"[of, the, to, and, in, for, on, that, 39, is]",[Digital Eye on Ivan The Internet is a powerfu...
1,0,1171,0_sox_red sox_red_yankees,"[sox, red sox, red, yankees, series, boston, a...",[Yankees and Red Sox ready to rumble (Sports N...
2,1,581,1_no_ranked_oklahoma_bowl,"[no, ranked, oklahoma, bowl, state, yards, poi...","[BCS still ranks as a failure FORT LAUDERDALE,..."
3,2,389,2_cricket_test_australia_india,"[cricket, test, australia, india, england, tro...",[Cricket: Harbhajan #39;s dual strike halts Au...
4,3,271,3_darfur_sudan_region_in darfur,"[darfur, sudan, region, in darfur, darfur regi...",[Talks begin in Abuja on conflict in Sudan's D...


In [16]:
# Share of documents whose assigned topic id is -1, i.e. the rows reported
# below as the outlier rate.
is_outlier = np.asarray(topics) == -1
frac_outliers = is_outlier.mean()
print("Outlier rate:", frac_outliers)

topic_info.head(10)


Outlier rate: 0.32006666666666667


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,9602,-1_of_the_to_and,"[of, the, to, and, in, for, on, that, 39, is]",[Digital Eye on Ivan The Internet is a powerfu...
1,0,1171,0_sox_red sox_red_yankees,"[sox, red sox, red, yankees, series, boston, a...",[Yankees and Red Sox ready to rumble (Sports N...
2,1,581,1_no_ranked_oklahoma_bowl,"[no, ranked, oklahoma, bowl, state, yards, poi...","[BCS still ranks as a failure FORT LAUDERDALE,..."
3,2,389,2_cricket_test_australia_india,"[cricket, test, australia, india, england, tro...",[Cricket: Harbhajan #39;s dual strike halts Au...
4,3,271,3_darfur_sudan_region_in darfur,"[darfur, sudan, region, in darfur, darfur regi...",[Talks begin in Abuja on conflict in Sudan's D...
5,4,271,4_points_nba_scored_knicks,"[points, nba, scored, knicks, points and, lake...",[Rockets Nip Kings As NBA Travels to China (AP...
6,5,243,5_peoplesoft_oracle_peoplesoft inc_takeover,"[peoplesoft, oracle, peoplesoft inc, takeover,...",[PeopleSoft Rejects Oracle Bid PHILADELPHIA/S...
7,6,241,6_arsenal_chelsea_manchester_league,"[arsenal, chelsea, manchester, league, champio...",[Manchester United Ends Arsenal's Streak (AP) ...
8,7,240,7_stocks_oil_oil prices_prices,"[stocks, oil, oil prices, prices, higher, reut...",[Stocks Seen Flat as Microsoft Weighs NEW YOR...
9,8,237,8_us airways_delta_airways_pilots,"[us airways, delta, airways, pilots, airlines,...",[US Air Pilots OK 18 Percent Pay Cut CHICAGO ...


In [None]:
# Render BERTopic's built-in overview plot of the fitted topics.
topic_model.visualize_topics()


In [None]:
# Keyword bar charts, limited to 20 topics via top_n_topics.
topic_model.visualize_barchart(top_n_topics=20)


In [None]:
# Render BERTopic's built-in plot of how the fitted topics group hierarchically.
topic_model.visualize_hierarchy()


In [None]:
# One topic id per document, in the same row order as the training documents.
df_merged["bertopic_topic"] = topics

# The model is created with calculate_probabilities=True, so `probs` holds one
# probability per (document, topic) pair. A 2-D array cannot be stored in a
# single DataFrame column (the assignment raises), so keep each document's
# highest topic probability. A 1-D result — one value per document — is
# stored unchanged.
probs_array = np.asarray(probs)
if probs_array.ndim == 2:
    df_merged["bertopic_prob"] = probs_array.max(axis=1)
else:
    df_merged["bertopic_prob"] = probs

# Save the sampled corpus together with its topic assignments.
df_merged.to_csv("data/topic_corpus/ag_bbc_india_with_topics.csv", index=False)

df_merged.head()
