In [4]:
pip install newspaper3k pandas tqdm langdetect ollama

Defaulting to user installation because normal site-packages is not writeable
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     -------------------------------------- 981.5/981.5 kB 6.5 MB/s eta 0:00:00
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py): started
  Building wheel for langdetect (setup.py): finished with status 'done'
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993251 sha256=60ace21539cd091c3ba6a407bb9cae6c89f3c582d300ad3a0f9fcab9b652653a
  Stored in directory: c:\users\acer\appdata\local\pip\cache\wheels\c1\67\88\e844b5b022812e15a52e4eaa38a1e709e99f06f6639d7e3ba7
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9
Note: you may need to re

### Scrape News Articles from multiple news channels using keywords (+Embedding)

In [None]:
import os
import time
import json
import sqlite3
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from newspaper import build
from urllib.parse import urlparse
from langdetect import detect
import ollama

# === CONFIG ===
# Text file read by the main cell: one search keyword per non-blank line.
keywords_file = "search_keywords.txt"
# CSV that save_to_files() appends matched articles to.
output_file = "Scraped_News_Articles.csv"
# SQLite database that save_to_files() appends the same rows to (table "articles").
db_file = "Scraped_News_Articles.db"
# Number of matched articles buffered before they are embedded and written out.
chunk_size = 10

# === LIST OF NEWS SOURCES ===
# Base URLs handed to newspaper's build() one by one, for every keyword.
news_sources = [
    "https://timesofindia.indiatimes.com",
    "https://www.ndtv.com",
    "https://www.hindustantimes.com",
    "https://indianexpress.com",
    "https://www.news18.com",
    "https://www.thehindu.com",
    "https://www.deccanherald.com"
]

# === UTILITY: Generate Embeddings ===
def generate_embeddings(texts, model='nomic-embed-text'):
    """Embed each text with an Ollama embedding model.

    Args:
        texts: Iterable of strings to embed.
        model: Name of the Ollama embedding model (defaults to the model the
            notebook has always used).

    Returns:
        A list with one embedding per input text, in input order. Texts whose
        embedding call fails are represented by an all-zero float vector so the
        output stays aligned with the input.
    """
    fallback_dim = 768  # used only when the real dimension cannot be determined
    vectors = []
    known_dim = None

    for text in texts:
        try:
            vector = ollama.embeddings(model=model, prompt=text)['embedding']
        except Exception as e:
            print(f"⚠️ Failed to embed text: {e}")
            vector = None  # placeholder; replaced by a zero vector below
        else:
            if known_dim is None and vector:
                known_dim = len(vector)
        vectors.append(vector)

    # Only work out a placeholder dimension when a failure actually happened;
    # this avoids an extra model round-trip on every successful batch.
    if any(vector is None for vector in vectors):
        if known_dim is None:
            try:
                known_dim = len(ollama.embeddings(model=model, prompt="test")['embedding'])
            except Exception:
                known_dim = fallback_dim
        vectors = [
            vector if vector is not None else [0.0] * known_dim
            for vector in vectors
        ]

    return vectors

# === UTILITY: Save to CSV and DB ===
def save_to_files(df, csv_file, db_file):
    """Append a chunk of articles to the CSV file and the SQLite database.

    The 'embedding' column is stored as a JSON string in both outputs. The
    caller's DataFrame is left untouched, so the same frame can be saved again
    without double-encoding its embeddings.

    Args:
        df: DataFrame with the article columns, including 'embedding' (a list).
        csv_file: Path of the CSV file to append to (header written only when
            the file does not exist yet).
        db_file: Path of the SQLite database holding the 'articles' table.
    """
    # Work on a derived frame instead of overwriting the caller's column.
    serialised = df.assign(embedding=df['embedding'].apply(json.dumps))
    serialised.to_csv(
        csv_file,
        mode='a',
        header=not os.path.exists(csv_file),
        index=False,
        encoding='utf-8-sig',
    )

    conn = sqlite3.connect(db_file)
    try:
        cursor = conn.cursor()
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS articles (
                search_word TEXT,
                domain TEXT,
                url TEXT,
                language TEXT,
                title TEXT,
                keywords TEXT,
                text TEXT,
                embedding TEXT
            )
        """)
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_url ON articles (url)")
        serialised.to_sql('articles', conn, if_exists='append', index=False)
        conn.commit()
    finally:
        # Release the database handle even when a write fails.
        conn.close()

# === MAIN SCRAPER ===
def _flush_article_chunk(articles_info, texts_to_embed, output_file, db_file):
    """Embed the buffered article texts, persist the rows, then empty both buffers."""
    embeddings = generate_embeddings(texts_to_embed)
    for article_info, embedding in zip(articles_info, embeddings):
        article_info["embedding"] = embedding
    save_to_files(pd.DataFrame(articles_info), output_file, db_file)
    texts_to_embed.clear()
    articles_info.clear()


def scrape_keyword_from_sources(query, sources, chunk_size=10, output_file=None, db_file=None):
    """Scrape every source and save the articles that mention `query`.

    For each base URL the source is built with newspaper, every article is
    downloaded and parsed, and articles whose title or text contains the query
    (case-insensitive) are buffered. Each time `chunk_size` matches have
    accumulated they are embedded and appended to the CSV / SQLite outputs;
    leftovers are flushed at the end of each source.

    Args:
        query: Keyword or phrase to look for.
        sources: Iterable of news-site base URLs.
        chunk_size: Number of matched articles to buffer before saving.
        output_file: CSV path passed on to save_to_files().
        db_file: SQLite path passed on to save_to_files().

    Returns:
        (total_scraped, total_articles_found): number of matched articles and
        number of articles discovered across all sources that could be built.
    """
    needle = query.lower()
    total_scraped = 0
    total_articles_found = 0

    for base_url in sources:
        try:
            source = build(base_url, memoize_articles=False)
        except Exception as e:
            print(f" Failed to build source {base_url}: {e}")
            continue

        texts_to_embed = []
        articles_info = []

        for article in tqdm(source.articles, desc=f"[{query}] {base_url}", unit="article"):
            try:
                article.download()
                article.parse()

                # Title or text may be missing; treat them as empty strings.
                title = article.title or ""
                text = (article.text or "").strip()

                # Check title and text separately so a match cannot straddle
                # the boundary between the two.
                if not text or (needle not in title.lower() and needle not in text.lower()):
                    continue

                # Keyword extraction is comparatively expensive, so run it only
                # for articles that actually matched.
                article.nlp()

                try:
                    lang = detect(text)
                except Exception:
                    # Language detection can fail on unusual text; keep the article.
                    lang = "unknown"

                domain = urlparse(article.source_url).netloc if article.source_url else urlparse(article.url).netloc

                texts_to_embed.append(text)
                articles_info.append({
                    "search_word": query,
                    "domain": domain,
                    "url": article.url,
                    "language": lang,
                    "title": title,
                    "keywords": ', '.join(article.keywords) if article.keywords else '',
                    "text": text
                })
                total_scraped += 1

                if len(texts_to_embed) >= chunk_size:
                    _flush_article_chunk(articles_info, texts_to_embed, output_file, db_file)
            except Exception as e:
                # Best effort: record the failure and move on to the next article.
                with open("scrape_errors.log", "a", encoding="utf-8") as log:
                    log.write(f"{datetime.now()} - {query} - {getattr(article, 'url', 'unknown')}: {e}\n")
                continue

        # Save whatever is left over for this source.
        if texts_to_embed:
            _flush_article_chunk(articles_info, texts_to_embed, output_file, db_file)

        total_articles_found += len(source.articles)

    return total_scraped, total_articles_found

# === MAIN EXECUTION ===
if __name__ == "__main__":
    # Remember when the run started so the elapsed time can be reported at the end.
    start_time = time.time()

    # The keyword list is mandatory; stop before touching any output files.
    if not os.path.exists(keywords_file):
        raise FileNotFoundError(f"Keyword file '{keywords_file}' not found.")

    # One keyword per line; blank lines are ignored.
    with open(keywords_file, 'r', encoding='utf-8') as f:
        stripped_lines = (line.strip() for line in f)
        query_list = [keyword for keyword in stripped_lines if keyword]

    # Start from a clean slate: drop the outputs of any previous run.
    for f in (output_file, db_file, "scrape_errors.log"):
        if os.path.exists(f):
            os.remove(f)

    # Scrape all sources once per keyword, collecting (keyword, matched, found) tuples.
    keyword_results = []
    for idx, query in enumerate(query_list, 1):
        print(f"\n🔍 [{idx}/{len(query_list)}] Processing keyword: '{query}'")
        scraped, total = scrape_keyword_from_sources(
            query,
            news_sources,
            chunk_size=chunk_size,
            output_file=output_file,
            db_file=db_file,
        )
        keyword_results.append((query, scraped, total))

    # Final report.
    print("\n Scraping Summary:")
    for idx, (query, scraped, total) in enumerate(keyword_results, 1):
        print(f"{idx}. {query}: {scraped}/{total} articles matched and scraped")

    print(f"\n Results saved to:\n {output_file}\n {db_file}")
    print(f" Total time: {round(time.time() - start_time, 2)} seconds")





🔍 [1/9] Processing keyword: 'modi'


[modi] https://timesofindia.indiatimes.com: 100%|██████████| 266/266 [03:37<00:00,  1.22article/s]
[modi] https://www.ndtv.com: 100%|██████████| 908/908 [12:12<00:00,  1.24article/s]  
[modi] https://www.hindustantimes.com: 0article [00:00, ?article/s]
[modi] https://indianexpress.com: 100%|██████████| 362/362 [03:44<00:00,  1.61article/s]
[modi] https://www.news18.com: 100%|██████████| 2337/2337 [27:35<00:00,  1.41article/s]  
[modi] https://www.thehindu.com: 100%|██████████| 65/65 [00:33<00:00,  1.94article/s]
[modi] https://www.deccanherald.com: 100%|██████████| 102/102 [01:20<00:00,  1.26article/s]



🔍 [2/9] Processing keyword: 'elon musk'


[elon musk] https://timesofindia.indiatimes.com: 100%|██████████| 330/330 [02:30<00:00,  2.20article/s]
[elon musk] https://www.ndtv.com: 100%|██████████| 903/903 [17:03<00:00,  1.13s/article]  
[elon musk] https://www.hindustantimes.com: 0article [00:00, ?article/s]
[elon musk] https://indianexpress.com: 100%|██████████| 364/364 [04:08<00:00,  1.47article/s]
[elon musk] https://www.news18.com: 100%|██████████| 2334/2334 [32:13<00:00,  1.21article/s]   
[elon musk] https://www.thehindu.com: 100%|██████████| 66/66 [00:24<00:00,  2.70article/s]
[elon musk] https://www.deccanherald.com: 100%|██████████| 105/105 [03:27<00:00,  1.98s/article]



🔍 [3/9] Processing keyword: 'trending news'


[trending news] https://timesofindia.indiatimes.com: 100%|██████████| 330/330 [01:48<00:00,  3.03article/s]
[trending news] https://www.ndtv.com: 100%|██████████| 906/906 [25:55<00:00,  1.72s/article]   
[trending news] https://www.hindustantimes.com: 0article [00:00, ?article/s]
[trending news] https://indianexpress.com: 100%|██████████| 363/363 [02:01<00:00,  2.99article/s]
[trending news] https://www.news18.com: 100%|██████████| 2338/2338 [49:16<00:00,  1.26s/article]    
[trending news] https://www.thehindu.com: 100%|██████████| 68/68 [00:28<00:00,  2.35article/s]
[trending news] https://www.deccanherald.com: 100%|██████████| 110/110 [01:11<00:00,  1.54article/s]



🔍 [4/9] Processing keyword: 'sports'


[sports] https://timesofindia.indiatimes.com: 100%|██████████| 332/332 [01:58<00:00,  2.81article/s]
[sports] https://www.ndtv.com: 100%|██████████| 907/907 [22:33<00:00,  1.49s/article]    
[sports] https://www.hindustantimes.com: 0article [00:00, ?article/s]
[sports] https://indianexpress.com: 100%|██████████| 367/367 [06:03<00:00,  1.01article/s]
[sports] https://www.news18.com: 100%|██████████| 2363/2363 [36:38<00:00,  1.07article/s]  
[sports] https://www.thehindu.com: 100%|██████████| 68/68 [01:40<00:00,  1.48s/article]
[sports] https://www.deccanherald.com: 100%|██████████| 89/89 [03:08<00:00,  2.11s/article]



🔍 [5/9] Processing keyword: 'business'


[business] https://timesofindia.indiatimes.com: 100%|██████████| 328/328 [04:48<00:00,  1.14article/s]
[business] https://www.ndtv.com: 100%|██████████| 905/905 [20:31<00:00,  1.36s/article]    
[business] https://www.hindustantimes.com: 0article [00:00, ?article/s]
[business] https://indianexpress.com: 100%|██████████| 354/354 [02:36<00:00,  2.27article/s]
[business] https://www.news18.com: 100%|██████████| 2319/2319 [37:39<00:00,  1.03article/s]    
[business] https://www.thehindu.com: 100%|██████████| 68/68 [11:06<00:00,  9.80s/article] 
[business] https://www.deccanherald.com: 100%|██████████| 115/115 [01:31<00:00,  1.26article/s]



🔍 [6/9] Processing keyword: 'india economy'


[india economy] https://timesofindia.indiatimes.com: 100%|██████████| 330/330 [01:29<00:00,  3.68article/s]
[india economy] https://www.ndtv.com: 100%|██████████| 908/908 [1:36:46<00:00,  6.39s/article]      
[india economy] https://www.hindustantimes.com: 0article [00:00, ?article/s]
[india economy] https://indianexpress.com: 100%|██████████| 361/361 [02:04<00:00,  2.91article/s]
[india economy] https://www.news18.com: 100%|██████████| 2342/2342 [15:22<00:00,  2.54article/s]  
[india economy] https://www.thehindu.com: 100%|██████████| 67/67 [00:22<00:00,  2.95article/s]
[india economy] https://www.deccanherald.com: 100%|██████████| 107/107 [02:24<00:00,  1.35s/article]



🔍 [7/9] Processing keyword: 'startup funding'


[startup funding] https://timesofindia.indiatimes.com: 100%|██████████| 328/328 [01:29<00:00,  3.65article/s]
[startup funding] https://www.ndtv.com: 100%|██████████| 906/906 [10:10<00:00,  1.48article/s]  
[startup funding] https://www.hindustantimes.com: 0article [00:00, ?article/s]
[startup funding] https://indianexpress.com: 100%|██████████| 358/358 [03:51<00:00,  1.55article/s]
[startup funding] https://www.news18.com: 100%|██████████| 2344/2344 [16:20<00:00,  2.39article/s]  
[startup funding] https://www.thehindu.com: 100%|██████████| 67/67 [00:22<00:00,  2.94article/s]
[startup funding] https://www.deccanherald.com: 100%|██████████| 104/104 [58:23<00:00, 33.69s/article]    



🔍 [8/9] Processing keyword: 'lok sabha'


[lok sabha] https://timesofindia.indiatimes.com: 0article [00:00, ?article/s]
[lok sabha] https://www.ndtv.com: 0article [00:00, ?article/s]
[lok sabha] https://www.hindustantimes.com: 0article [00:00, ?article/s]
[lok sabha] https://indianexpress.com: 0article [00:00, ?article/s]
[lok sabha] https://www.news18.com: 0article [00:00, ?article/s]
[lok sabha] https://www.thehindu.com: 0article [00:00, ?article/s]
[lok sabha] https://www.deccanherald.com: 0article [00:00, ?article/s]



🔍 [9/9] Processing keyword: 'kashmir'


[kashmir] https://timesofindia.indiatimes.com: 0article [00:00, ?article/s]
[kashmir] https://www.ndtv.com: 0article [00:00, ?article/s]
[kashmir] https://www.hindustantimes.com: 0article [00:00, ?article/s]
[kashmir] https://indianexpress.com: 0article [00:00, ?article/s]
[kashmir] https://www.news18.com: 0article [00:00, ?article/s]
[kashmir] https://www.thehindu.com: 0article [00:00, ?article/s]
[kashmir] https://www.deccanherald.com: 0article [00:00, ?article/s]


 Scraping Summary:
1. modi: 174/4040 articles matched and scraped
2. elon musk: 13/4102 articles matched and scraped
3. trending news: 2/4115 articles matched and scraped
4. sports: 208/4126 articles matched and scraped
5. business: 170/4089 articles matched and scraped
6. india economy: 0/4115 articles matched and scraped
7. startup funding: 0/4107 articles matched and scraped
8. lok sabha: 0/0 articles matched and scraped
9. kashmir: 0/0 articles matched and scraped

 Results saved to:
 Scraped_News_Articles.csv
 Scraped_News_Articles.db
 Total time: 43779.98 seconds





## RAG Pipeline

In [2]:
import os
import re
import json
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from typing import List, Tuple, Dict, Any
import textwrap

# LangChain imports
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
from langchain.embeddings.base import Embeddings

# Download required NLTK resources.
# sent_tokenize() loads its model from 'punkt' on older NLTK releases and from
# 'punkt_tab' on NLTK >= 3.9, so fetch both to work with either version.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('stopwords', quiet=True)

# ------------------- SETUP -------------------
# Directory used as Chroma's persist_directory when creating or loading the store.
CHROMA_DB_PATH = "./chroma_news_csv_db"
# Default CSV read by load_articles_from_csv(); same file name the scraper cell writes.
CSV_FILE_PATH = "Scraped_News_Articles.csv"  

# Custom embedding class to use pre-computed embeddings
class PrecomputedEmbeddings(Embeddings):
    """Embedding adapter for a store whose document vectors are pre-computed.

    Document vectors are added to the collection directly, so only queries are
    embedded here (with Ollama). `embed_documents` exists to satisfy the
    interface and returns placeholder vectors.
    """

    # Vector size used for placeholder/fallback vectors (nomic-embed-text default).
    EMBEDDING_DIM = 768
    # Ollama model used to embed queries.
    QUERY_MODEL = 'nomic-embed-text'

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Return one placeholder zero vector per text.

        This is not expected to be called: embeddings are passed in manually
        when documents are added. A warning is printed if it is.
        """
        print("[!] Warning: embed_documents called on PrecomputedEmbeddings class.")
        # Build an independent list per text; `[[...]] * n` would alias a single
        # inner list, so mutating one vector would change all of them.
        return [[0.0] * self.EMBEDDING_DIM for _ in texts]

    def embed_query(self, text: str) -> List[float]:
        """Embed a query with Ollama; return a zero vector if embedding fails."""
        try:
            import ollama
            return ollama.embeddings(model=self.QUERY_MODEL, prompt=text)['embedding']
        except Exception as e:
            print(f"[!] Error embedding query: {e}")
            # Best-effort fallback: a zero vector of the expected dimension.
            return [0.0] * self.EMBEDDING_DIM

# ------------------- CSV FUNCTIONS -------------------

def load_articles_from_csv(csv_path=None):
    """Load scraped articles from a CSV file.

    Args:
        csv_path: Path of the CSV to read. When omitted, the module-level
            CSV_FILE_PATH is used (looked up at call time, so changing the
            constant in a later cell takes effect without redefining this
            function).

    Returns:
        A list of dicts with the keys search_word, domain, url, language,
        title, keywords, text (all strings; empty cells become "") and
        embedding (a list parsed from its JSON string, or [] when missing or
        unparsable). If the CSV cannot be read, the error is printed and the
        articles collected so far (possibly none) are returned.
    """
    if csv_path is None:
        csv_path = CSV_FILE_PATH

    print(f"[*] Loading articles from CSV: {csv_path}")

    text_fields = ("search_word", "domain", "url", "language", "title", "keywords", "text")

    def _as_text(value):
        """Map missing cells (None / NaN) to '' and everything else to str."""
        return "" if pd.isna(value) else str(value)

    articles = []
    try:
        # Read the CSV using pandas for better handling of complex data
        df = pd.read_csv(csv_path)
        print(f"[*] CSV loaded with {len(df)} rows")

        for row in df.to_dict("records"):
            # Parse embedding from JSON string to actual list
            try:
                embedding = json.loads(row.get("embedding", "[]"))
            except (json.JSONDecodeError, TypeError):
                print(f"[!] Warning: Could not parse embedding for article: {row.get('title', 'Unknown')}")
                embedding = []
            if not isinstance(embedding, list):
                embedding = []

            # pandas represents empty cells as float NaN, which is truthy and
            # not a string; normalise so downstream text handling is safe.
            article = {field: _as_text(row.get(field)) for field in text_fields}
            article["embedding"] = embedding
            articles.append(article)

        print(f"[+] Successfully loaded {len(articles)} articles from CSV")

    except Exception as e:
        print(f"[!] Error loading CSV: {e}")

    return articles

# ------------------- TEXT PREPROCESSING -------------------

def clean_text(text):
    """Clean and normalize text content.

    Collapses all whitespace runs to single spaces, splits the result into
    sentences and drops sentences of three words or fewer (likely noise).

    Args:
        text: Raw article text. Anything that is not a non-empty string
            (None, NaN from an empty CSV cell, ...) yields "".

    Returns:
        The remaining sentences joined by single spaces, or "" if nothing is left.
    """
    # Empty CSV cells arrive as float NaN, which is truthy; reject non-strings
    # up front instead of failing inside re.sub.
    if not isinstance(text, str):
        return ""

    # Remove extra whitespace, tabs, and newlines
    text = re.sub(r'\s+', ' ', text).strip()
    if not text:
        return ""

    # Segment into sentences and keep only those with more than three words
    sentences = sent_tokenize(text)
    return ' '.join(s for s in sentences if len(s.split()) > 3)

def process_documents(articles):
    """Turn articles into chunked Document objects.

    Each article with usable text is cleaned, prefixed with its title and split
    into overlapping chunks; every chunk becomes a Document carrying the
    article's metadata plus its chunk index and the article's list position.
    """
    chunker = RecursiveCharacterTextSplitter(
        chunk_size=1000,  # Larger chunks for more context
        chunk_overlap=200,
        separators=["\n\n", "\n", ". ", " ", ""],
    )

    print(f"[*] Processing {len(articles)} articles...")

    docs = []
    for article_idx, article in enumerate(articles):
        # Articles without text contribute nothing.
        if not article.get("text"):
            continue

        body = clean_text(article["text"])
        # Cleaning may strip everything; skip in that case as well.
        if not body:
            continue

        # Title and body together give each chunk more context.
        pieces = chunker.split_text(f"{article.get('title', '')}\n\n{body}")

        for chunk_idx, piece in enumerate(pieces):
            chunk_meta = dict(
                url=article["url"],
                title=article.get("title", ""),
                source=article.get("domain", ""),
                keywords=article.get("keywords", ""),
                search_word=article.get("search_word", ""),
                language=article.get("language", ""),
                chunk_id=chunk_idx,
                article_id=article_idx,
            )
            docs.append(Document(page_content=piece, metadata=chunk_meta))

    print(f"[+] Created {len(docs)} document chunks")
    return docs

# ------------------- EMBEDDINGS & VECTOR STORE -------------------

def create_chroma_from_articles(articles):
    """Create a Chroma DB using the pre-computed article embeddings from the CSV.

    Articles are skipped when they have no usable text, no embedding, an
    implausibly short embedding (< 10 values) or an all-zero embedding (the
    placeholder the scraper writes when embedding failed, which carries no
    information for similarity search).

    Args:
        articles: List of article dicts as produced by load_articles_from_csv().

    Returns:
        The populated Chroma vector store, or None when no valid documents
        exist or the store could not be created.
    """
    print("[*] Creating Chroma DB from pre-computed article embeddings...")

    docs = []
    embeddings_list = []

    for i, article in enumerate(articles):
        text = article.get("text")
        embedding = article.get("embedding")

        # Skip articles with no (string) text; empty CSV cells may arrive as NaN.
        if not isinstance(text, str) or not text:
            continue

        # Basic validation of the pre-computed embedding.
        if not embedding or len(embedding) < 10:
            continue

        # All-zero vectors are failure placeholders, not real embeddings.
        if not any(embedding):
            continue

        cleaned_text = clean_text(text)

        # Skip if cleaning removed all content
        if not cleaned_text:
            continue

        docs.append(Document(
            page_content=f"{article.get('title', '')}\n\n{cleaned_text}",
            metadata={
                "url": article.get("url", ""),
                "title": article.get("title", ""),
                "source": article.get("domain", ""),
                "keywords": article.get("keywords", ""),
                "search_word": article.get("search_word", ""),
                "language": article.get("language", ""),
                "article_id": i
            }
        ))
        embeddings_list.append(embedding)

    print(f"[+] Created {len(docs)} documents with embeddings")

    if not docs:
        print("[!] No valid documents with embeddings created")
        return None

    embedding_func = PrecomputedEmbeddings()

    # Open (or create) the persisted collection.
    vectorstore = Chroma(
        embedding_function=embedding_func,
        persist_directory=CHROMA_DB_PATH
    )

    try:
        # Drop whatever the collection held before and start from an empty one.
        if os.path.exists(CHROMA_DB_PATH):
            print(f"[*] Clearing existing Chroma collection at {CHROMA_DB_PATH}")
            vectorstore.delete_collection()
            vectorstore = Chroma(
                embedding_function=embedding_func,
                persist_directory=CHROMA_DB_PATH
            )

        texts = [doc.page_content for doc in docs]
        metadatas = [doc.metadata for doc in docs]
        ids = [f"doc_{i}" for i in range(len(docs))]

        # Write straight to the underlying collection so the pre-computed
        # vectors are stored as-is instead of being re-embedded.
        vectorstore._collection.add(
            embeddings=embeddings_list,
            documents=texts,
            metadatas=metadatas,
            ids=ids
        )

        # Older wrappers need an explicit persist(); newer ones persist
        # automatically and no longer provide the method.
        persist = getattr(vectorstore, "persist", None)
        if callable(persist):
            persist()
        print(f"[+] Chroma DB saved at: {CHROMA_DB_PATH}")

        return vectorstore

    except Exception as e:
        # Imported here so error reporting does not depend on import order in the cell.
        import traceback
        print(f"[!] Error creating Chroma DB: {e}")
        print(f"[*] Traceback: {traceback.format_exc()}")
        return None

def load_chroma_db():
    """Open the persisted Chroma store, or return None if it is absent or unloadable."""
    db_present = os.path.exists(CHROMA_DB_PATH)
    if db_present:
        print(f"[*] Loading existing Chroma DB from {CHROMA_DB_PATH}")
        try:
            # Queries are embedded by PrecomputedEmbeddings.embed_query.
            return Chroma(
                persist_directory=CHROMA_DB_PATH,
                embedding_function=PrecomputedEmbeddings()
            )
        except Exception as e:
            print(f"[!] Error loading Chroma DB: {e}")
            return None

    # Nothing on disk to load.
    print(f"[!] Chroma DB not found at {CHROMA_DB_PATH}")
    return None

# ------------------- RAG QUERY SYSTEM -------------------

def query_rag_system(query: str, vectorstore) -> Tuple[str, List[dict]]:
    """Retrieve the five most similar documents for a question.

    Returns a tuple of (retrieved page contents joined by blank lines, list of
    unique article references keyed by URL, in retrieval order). If the
    similarity search raises, ("Error processing query.", []) is returned.
    """
    print(f"[*] Processing query: {query}")

    try:
        relevant_docs = vectorstore.similarity_search(query, k=5)
    except Exception as e:
        print(f"[!] Error in similarity search: {e}")
        return "Error processing query.", []

    # One reference per distinct, non-empty URL; the first hit wins.
    references_by_url = {}
    for doc in relevant_docs:
        meta = doc.metadata
        url = meta.get("url", "")
        if not url or url in references_by_url:
            continue
        references_by_url[url] = {
            "title": meta.get("title", "Unknown Title"),
            "source": meta.get("source", "Unknown Source"),
            "keywords": meta.get("keywords", ""),
            "search_word": meta.get("search_word", ""),
            "url": url,
        }

    # The response text is simply the retrieved chunks, separated by blank lines.
    combined_text = "\n\n".join(doc.page_content for doc in relevant_docs)

    return combined_text, list(references_by_url.values())

def format_rag_response(response_text, article_references):
    """Render the retrieved text plus a numbered SOURCES list as one string.

    Each line of the response text is wrapped to 100 columns. Every reference
    contributes its title, source and URL, followed by the search word and
    keywords when those are non-empty.
    """
    wrapped_lines = [textwrap.fill(line, width=100) for line in response_text.split("\n")]

    # Collect the output fragments and join them once at the end.
    fragments = ["\n".join(wrapped_lines), "\n\nSOURCES:\n"]

    for i, article in enumerate(article_references, 1):
        title = article.get("title", "Article")
        source = article.get("source", "Unknown")
        url = article.get("url", "#")
        keywords = article.get("keywords", "")
        search_word = article.get("search_word", "")

        fragments.append(f"{i}. {title} | {source}\n")
        fragments.append(f"   URL: {url}\n")
        if search_word:
            fragments.append(f"   Search Word: {search_word}\n")
        if keywords:
            fragments.append(f"   Keywords: {keywords}\n")
        fragments.append("\n")

    return "".join(fragments)

# ------------------- MAIN PIPELINE -------------------

def build_rag_pipeline(rebuild=False):
    """Return a ready vector store, reusing the one on disk unless told to rebuild.

    When a store exists at CHROMA_DB_PATH and `rebuild` is false it is loaded;
    if loading fails, or a rebuild was requested, the store is created from the
    CSV articles. Returns None when the CSV yields no articles.
    """
    reuse_existing = not rebuild and os.path.exists(CHROMA_DB_PATH)

    if reuse_existing:
        print(f"[*] Loading existing ChromaDB from {CHROMA_DB_PATH}")
        existing_store = load_chroma_db()
        if existing_store:
            return existing_store
        # Fall through to a fresh build below.
        print("[!] Existing ChromaDB could not be loaded. Building new one...")

    articles = load_articles_from_csv()
    if articles:
        # Index the articles using the embeddings stored alongside them.
        return create_chroma_from_articles(articles)

    print("[!] No articles found in CSV")
    return None

def query_news(query_text, rebuild=False):
    """Answer a question from the news store and return the formatted result.

    Builds or loads the vector store first; if that fails, a fixed failure
    message is returned instead of a formatted response.
    """
    store = build_rag_pipeline(rebuild=rebuild)

    if store:
        # Retrieve matching content, then render it with its sources.
        retrieved_text, references = query_rag_system(query_text, store)
        return format_rag_response(retrieved_text, references)

    return "Failed to build or load the RAG pipeline."

# ------------------- FUNCTION TO GET NEWS RESULTS -------------------

def get_news_results(query):
    """
    Print a query banner and return the news results for the query.

    Args:
        query (str): The query to search for in the news database

    Returns:
        str: Formatted response with relevant news and sources
    """
    banner = "\n======= NEWS RAG QUERY =======\n"
    print(banner)
    print(f"Query: {query}")

    # Delegate to the pipeline and hand its answer straight back.
    return query_news(query)

# Make sure to import traceback for detailed error reporting
import traceback

# Example usage: run two sample queries against the news store and print the formatted results.
if __name__ == "__main__":
    # First example: a question about Modi.
    example_query = "What happened with Modi?"
    results = get_news_results(example_query)
    print(results)
    
    # Second example: a generic "trending news" query.
    example_query2 = "trending news?"
    results2 = get_news_results(example_query2)
    print(results2)



Query: What happened with Modi?
[*] Loading existing ChromaDB from ./chroma_news_csv_db
[*] Loading existing Chroma DB from ./chroma_news_csv_db


  vectorstore = Chroma(


[*] Processing query: What happened with Modi?
Kharge claims PM Modi received ‘intel report’ 3 days before Pahalgam attack, cancelled Kashmir visit

Congress president Mallikarjun Kharge has claimed that Prime Minister Narendra Modi cancelled his
visit to Kashmir as he received an “intelligence report” three days before the Pahalgam terror
attack. Kharge was addressing the Congress’s ‘Sanvidhan Bachao Rally’ in Ranchi. Referring to the
terror attack that claimed 26 innocent lives, he said, “When you knew this, why didn’t you ensure
tight security?” “I was informed, and it’s published in the media as well, that three days before
the attack, an intelligence report was sent to Modiji, following which he cancelled his Kashmir
visit,” he claimed. Story continues below this ad “When you were well informed about the situation,
then why didn’t you communicate the same to your intelligence team, security, Kashmir police, and
the Border Security Force (BSF), and ensure the safety of civilians?” 

In [3]:
if __name__ == "__main__":
    # Example query about the Pahalgam terror attack (the query string itself spells it "phalgam").
    example_query = "tell me about phalgam terror attack?"
    results = get_news_results(example_query)
    print(results)



Query: tell me about phalgam terror attack?
[*] Loading existing ChromaDB from ./chroma_news_csv_db
[*] Loading existing Chroma DB from ./chroma_news_csv_db
[*] Processing query: tell me about phalgam terror attack?
India May Strike At Any Point Along LoC, Warns Pakistan Defence Minister

India May Strike At Any Point Along LoC, Warns Pakistan Defence Minister Curated By : News18.com
Last Updated: May 06, 2025, 08:04 IST Pakistan’s Defence Minister Khawaja Asif on Monday warned that
India could carry out a military strike at any moment along the Line of Control in Kashmir. Pakistan
Defence Minister Khawaja Asif. (IMAGE: X) India-Pak tensions: Amid the escalating tensions between
India and Pakistan in the wake of the Pahalgam terror attack, Pakistan’s Defence Minister Khawaja
Asif has warned that New Delhi could launch a military strike at any moment along the Line of
Control in Jammu and Kashmir “There are reports that India may strike at any point along the LoC…
New Delhi will be gi

In [4]:

# Example query about Elon Musk and AI; prints the formatted result returned by get_news_results().
example_query = "What did Elon Musk recently say about AI?"
results = get_news_results(example_query)
print(results)



Query: What did Elon Musk recently say about AI?
[*] Loading existing ChromaDB from ./chroma_news_csv_db
[*] Loading existing Chroma DB from ./chroma_news_csv_db
[*] Processing query: What did Elon Musk recently say about AI?
Neuralink Gets FDA Nod For Chip That'll Help Mute People Speak, Elon Musk Reacts

Neuralink Gets FDA Nod For Chip That'll Help Mute People Speak, Elon Musk Reacts Curated By :
Trending Desk Last Updated: May 03, 2025, 15:26 IST Neuralink has now asked people to register
through its Patient Registry as it gets ready to begin human trials. Neuralink Communication Chip:
Elon Musk plans to make this technology widely available. (Reuters Image) Neuralink, Elon Musk’s
brain-chip company, has reached yet another significant milestone. A “Breakthrough Device
Designation" has been granted by the US Food and Drug Administration (FDA) to the company for a new
brain implant that will assist those who are unable to speak. The declaration follows Neuralink’s
announcement of t