In [56]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import spacy

nlp = spacy.load("en_core_web_sm")
model = SentenceTransformer("all-MiniLM-L6-v2")

print("All imports working now! ðŸš€")


All imports working now! ðŸš€


In [57]:
import pandas as pd
import spacy
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

nlp = spacy.load("en_core_web_sm")
model = SentenceTransformer("all-MiniLM-L6-v2")

data = [
    {"id": "N1", "title": "HDFC Bank announces 15% dividend, board approves stock buyback"},
    {"id": "N2", "title": "RBI raises repo rate by 25bps to 6.75%, citing inflation concerns"},
    {"id": "N3", "title": "Reserve Bank hikes interest rates by 0.25% to fight rising prices"},
    {"id": "N4", "title": "ICICI Bank opens 500 new branches across Tier-2 cities"},
    {"id": "N5", "title": "Central bank increases policy rate by 25 basis points, signals hawkish stance"},
    {"id": "N6", "title": "Banking sector NPAs decline to 5-year low, credit growth at 16%"},
]

df = pd.DataFrame(data)
df


Unnamed: 0,id,title
0,N1,"HDFC Bank announces 15% dividend, board approv..."
1,N2,"RBI raises repo rate by 25bps to 6.75%, citing..."
2,N3,Reserve Bank hikes interest rates by 0.25% to ...
3,N4,ICICI Bank opens 500 new branches across Tier-...
4,N5,Central bank increases policy rate by 25 basis...
5,N6,"Banking sector NPAs decline to 5-year low, cre..."


In [58]:
company_to_symbol = {
    "HDFC Bank": "HDFCBANK",
    "ICICI Bank": "ICICIBANK",
}

company_to_sector = {
    "HDFC Bank": "Banking",
    "ICICI Bank": "Banking",
}

sector_keywords = {
    "banking sector": "Banking",
    "banking": "Banking",
}

regulator_keywords = {
    "RBI": "RBI",
    "Reserve Bank": "RBI",
    "Central bank": "RBI",
    "central bank": "RBI",
}


In [59]:
def extract_entities(text: str):
    doc = nlp(text)
    
    companies = set()
    sectors = set()
    regulators = set()
    
    lower_text = text.lower()
    
    # 1) spaCy ORG â†’ company
    for ent in doc.ents:
        if ent.label_ == "ORG":
            for comp in company_to_symbol.keys():
                if comp.lower() in ent.text.lower():
                    companies.add(comp)
    
    # 2) direct company substring
    for comp in company_to_symbol.keys():
        if comp.lower() in lower_text:
            companies.add(comp)
    
    # 3) sector keywords
    for kw, sector in sector_keywords.items():
        if kw in lower_text:
            sectors.add(sector)
    
    # 4) regulator keywords
    for kw, reg in regulator_keywords.items():
        if kw.lower() in lower_text:
            regulators.add(reg)
    
    return {
        "companies": list(companies),
        "sectors": list(sectors),
        "regulators": list(regulators),
    }

entity_results = df["title"].apply(extract_entities)
df["companies"] = entity_results.apply(lambda x: x["companies"])
df["sectors"] = entity_results.apply(lambda x: x["sectors"])
df["regulators"] = entity_results.apply(lambda x: x["regulators"])

df[["id", "title", "companies", "sectors", "regulators"]]


Unnamed: 0,id,title,companies,sectors,regulators
0,N1,"HDFC Bank announces 15% dividend, board approv...",[HDFC Bank],[],[]
1,N2,"RBI raises repo rate by 25bps to 6.75%, citing...",[],[],[RBI]
2,N3,Reserve Bank hikes interest rates by 0.25% to ...,[],[],[RBI]
3,N4,ICICI Bank opens 500 new branches across Tier-...,[ICICI Bank],[],[]
4,N5,Central bank increases policy rate by 25 basis...,[],[],[RBI]
5,N6,"Banking sector NPAs decline to 5-year low, cre...",[],[Banking],[]


In [60]:
def map_stock_impact(companies, sectors, regulators):
    impacts = []

    # 1) Direct company impacts
    for comp in companies:
        symbol = company_to_symbol.get(comp)
        if symbol:
            impacts.append({
                "symbol": symbol,
                "confidence": 1.0,
                "type": "direct",
                "source": comp,
            })

    # 2) Sector-wide impacts (Banking sector â†’ all Banking companies)
    if sectors:
        for sector in sectors:
            for comp, sec in company_to_sector.items():
                if sec == sector:
                    symbol = company_to_symbol.get(comp)
                    if symbol:
                        impacts.append({
                            "symbol": symbol,
                            "confidence": 0.7,
                            "type": "sector",
                            "source": sector,
                        })

    # 3) Regulator impacts (RBI â†’ all Banking companies)
    if regulators:
        for reg in regulators:
            if reg == "RBI":
                for comp, sec in company_to_sector.items():
                    if sec == "Banking":
                        symbol = company_to_symbol.get(comp)
                        if symbol:
                            impacts.append({
                                "symbol": symbol,
                                "confidence": 0.6,
                                "type": "regulatory",
                                "source": reg,
                            })

    return impacts

df["impacted_stocks"] = df.apply(
    lambda row: map_stock_impact(row["companies"], row["sectors"], row["regulators"]),
    axis=1
)

df[["id", "title", "companies", "sectors", "regulators", "impacted_stocks"]]


Unnamed: 0,id,title,companies,sectors,regulators,impacted_stocks
0,N1,"HDFC Bank announces 15% dividend, board approv...",[HDFC Bank],[],[],"[{'symbol': 'HDFCBANK', 'confidence': 1.0, 'ty..."
1,N2,"RBI raises repo rate by 25bps to 6.75%, citing...",[],[],[RBI],"[{'symbol': 'HDFCBANK', 'confidence': 0.6, 'ty..."
2,N3,Reserve Bank hikes interest rates by 0.25% to ...,[],[],[RBI],"[{'symbol': 'HDFCBANK', 'confidence': 0.6, 'ty..."
3,N4,ICICI Bank opens 500 new branches across Tier-...,[ICICI Bank],[],[],"[{'symbol': 'ICICIBANK', 'confidence': 1.0, 't..."
4,N5,Central bank increases policy rate by 25 basis...,[],[],[RBI],"[{'symbol': 'HDFCBANK', 'confidence': 0.6, 'ty..."
5,N6,"Banking sector NPAs decline to 5-year low, cre...",[],[Banking],[],"[{'symbol': 'HDFCBANK', 'confidence': 0.7, 'ty..."


In [61]:
print("N2:", df.loc[df["id"]=="N2", "impacted_stocks"].iloc[0])
print("N6:", df.loc[df["id"]=="N6", "impacted_stocks"].iloc[0])


N2: [{'symbol': 'HDFCBANK', 'confidence': 0.6, 'type': 'regulatory', 'source': 'RBI'}, {'symbol': 'ICICIBANK', 'confidence': 0.6, 'type': 'regulatory', 'source': 'RBI'}]
N6: [{'symbol': 'HDFCBANK', 'confidence': 0.7, 'type': 'sector', 'source': 'Banking'}, {'symbol': 'ICICIBANK', 'confidence': 0.7, 'type': 'sector', 'source': 'Banking'}]


In [62]:
# Create embeddings for all news titles (if not already done)
titles = df["title"].tolist()
news_embeddings = model.encode(titles, convert_to_numpy=True)
news_embeddings.shape


(6, 384)

In [63]:
def understand_query(query: str):
    q_lower = query.lower()
    
    q_companies = []
    q_sectors = []
    q_regulators = []
    
    # Check for known companies in query text
    for comp in company_to_symbol.keys():
        if comp.lower() in q_lower:
            q_companies.append(comp)
    
    # Check for sector keywords in query
    for kw, sector in sector_keywords.items():
        if kw in q_lower:
            q_sectors.append(sector)
    
    # Check for regulator keywords in query
    for kw, reg in regulator_keywords.items():
        if kw.lower() in q_lower:
            q_regulators.append(reg)
    
    # If company is mentioned, also add its sector (so company â†’ sector news)
    for comp in q_companies:
        sector = company_to_sector.get(comp)
        if sector and sector not in q_sectors:
            q_sectors.append(sector)
    
    return {
        "companies": q_companies,
        "sectors": q_sectors,
        "regulators": q_regulators,
    }

# Quick test:
print("HDFC Bank news ->", understand_query("HDFC Bank news"))
print("Banking sector update ->", understand_query("Banking sector update"))
print("RBI policy changes ->", understand_query("RBI policy changes"))
print("Interest rate impact ->", understand_query("Interest rate impact"))


HDFC Bank news -> {'companies': ['HDFC Bank'], 'sectors': ['Banking'], 'regulators': []}
Banking sector update -> {'companies': [], 'sectors': ['Banking', 'Banking'], 'regulators': []}
RBI policy changes -> {'companies': [], 'sectors': [], 'regulators': ['RBI']}
Interest rate impact -> {'companies': [], 'sectors': [], 'regulators': []}


In [64]:
def search_news(query: str, top_k: int = 5):
    ents = understand_query(query)
    
    q_companies = set(ents["companies"])
    q_sectors = set(ents["sectors"])
    q_regulators = set(ents["regulators"])
    
    # Start with all news
    candidates = df.copy()
    
    # If query mentions company/sector/regulator â†’ filter by those first
    if q_companies or q_sectors or q_regulators:
        def row_matches(row):
            has_company = bool(q_companies.intersection(row["companies"]))
            has_sector = bool(q_sectors.intersection(row["sectors"]))
            has_reg = bool(q_regulators.intersection(row["regulators"]))
            return has_company or has_sector or has_reg
        
        mask = candidates.apply(row_matches, axis=1)
        candidates = candidates[mask]
    
    # If filtering removed everything, fall back to all news
    if candidates.empty:
        candidates = df.copy()
    
    # Semantic similarity between query and candidate titles
    q_emb = model.encode([query], convert_to_numpy=True)
    cand_titles = candidates["title"].tolist()
    cand_embs = model.encode(cand_titles, convert_to_numpy=True)
    
    sims = cosine_similarity(q_emb, cand_embs)[0]
    candidates = candidates.copy()
    candidates["similarity"] = sims
    
    # Sort by similarity (descending)
    results = candidates.sort_values(by="similarity", ascending=False).head(top_k)
    
    return results[["id", "title", "companies", "sectors", "regulators", "impacted_stocks", "similarity"]]


In [54]:
print("Query: HDFC Bank news")
display(search_news("HDFC Bank news"))

print("\nQuery: Banking sector update")
display(search_news("Banking sector update"))

print("\nQuery: RBI policy changes")
display(search_news("RBI policy changes"))

print("\nQuery: Interest rate impact")
display(search_news("Interest rate impact"))


Query: HDFC Bank news


Unnamed: 0,id,title,companies,sectors,regulators,impacted_stocks,similarity
0,N1,"HDFC Bank announces 15% dividend, board approv...",[HDFC Bank],[],[],"[{'symbol': 'HDFCBANK', 'confidence': 1.0, 'ty...",0.540365
5,N6,"Banking sector NPAs decline to 5-year low, cre...",[],[Banking],[],"[{'symbol': 'HDFCBANK', 'confidence': 0.7, 'ty...",0.245887



Query: Banking sector update


Unnamed: 0,id,title,companies,sectors,regulators,impacted_stocks,similarity
5,N6,"Banking sector NPAs decline to 5-year low, cre...",[],[Banking],[],"[{'symbol': 'HDFCBANK', 'confidence': 0.7, 'ty...",0.537151



Query: RBI policy changes


Unnamed: 0,id,title,companies,sectors,regulators,impacted_stocks,similarity
1,N2,"RBI raises repo rate by 25bps to 6.75%, citing...",[],[],[RBI],"[{'symbol': 'HDFCBANK', 'confidence': 0.6, 'ty...",0.45481
4,N5,Central bank increases policy rate by 25 basis...,[],[],[RBI],"[{'symbol': 'HDFCBANK', 'confidence': 0.6, 'ty...",0.432617
2,N3,Reserve Bank hikes interest rates by 0.25% to ...,[],[],[RBI],"[{'symbol': 'HDFCBANK', 'confidence': 0.6, 'ty...",0.339448



Query: Interest rate impact


Unnamed: 0,id,title,companies,sectors,regulators,impacted_stocks,similarity
2,N3,Reserve Bank hikes interest rates by 0.25% to ...,[],[],[RBI],"[{'symbol': 'HDFCBANK', 'confidence': 0.6, 'ty...",0.591648
1,N2,"RBI raises repo rate by 25bps to 6.75%, citing...",[],[],[RBI],"[{'symbol': 'HDFCBANK', 'confidence': 0.6, 'ty...",0.498819
4,N5,Central bank increases policy rate by 25 basis...,[],[],[RBI],"[{'symbol': 'HDFCBANK', 'confidence': 0.6, 'ty...",0.462694
5,N6,"Banking sector NPAs decline to 5-year low, cre...",[],[Banking],[],"[{'symbol': 'HDFCBANK', 'confidence': 0.7, 'ty...",0.443357
3,N4,ICICI Bank opens 500 new branches across Tier-...,[ICICI Bank],[],[],"[{'symbol': 'ICICIBANK', 'confidence': 1.0, 't...",0.193613


In [65]:
print("N2 impacts:", df.loc[df["id"]=="N2", "impacted_stocks"].iloc[0])
print("N6 impacts:", df.loc[df["id"]=="N6", "impacted_stocks"].iloc[0])


N2 impacts: [{'symbol': 'HDFCBANK', 'confidence': 0.6, 'type': 'regulatory', 'source': 'RBI'}, {'symbol': 'ICICIBANK', 'confidence': 0.6, 'type': 'regulatory', 'source': 'RBI'}]
N6 impacts: [{'symbol': 'HDFCBANK', 'confidence': 0.7, 'type': 'sector', 'source': 'Banking'}, {'symbol': 'ICICIBANK', 'confidence': 0.7, 'type': 'sector', 'source': 'Banking'}]
