In [3]:
import json
from pathlib import Path
from collections import Counter
import pandas as pd
import sys

# Make the project root importable so the `src.*` imports below resolve.
# Scripts define __file__; notebooks do not, hence the NameError fallback.
try:
    _script_path = Path(__file__)
except NameError:
    # Jupyter: use the parent of the current working directory as the root
    project_root = Path.cwd().parent
else:
    # Script: the root is two levels above this file's resolved location
    project_root = _script_path.resolve().parent.parent

# Append only once so re-running the cell does not duplicate the entry
_root_entry = str(project_root)
if _root_entry not in sys.path:
    sys.path.append(_root_entry)

from src.core.config import settings
from src.ingestion.loader import DataLoader

def _normalize_label(value):
    """Return *value* as a string label that is hashable and sortable."""
    if value is None:
        # JSON null: treat the same as a missing field
        return "Unknown"
    return value if isinstance(value, str) else str(value)


def profile_raw_data(data_dir=None):
    """Print a profile of the raw JSON articles found under a directory.

    Every ``*.json`` file below the directory is read (recursively) and its
    top-level ``"politician"`` and ``"source"`` fields are tallied. The
    report lists the article count per politician, the source counts per
    politician (most common first) and the sorted set of all sources.

    Files that cannot be read or parsed, or whose top-level JSON value is
    not an object, are reported with an ``Error reading ...`` line and
    skipped; they do not contribute to any count.

    Args:
        data_dir: Directory to scan (``str`` or ``Path``). Defaults to
            ``settings.RAW_DATA_DIR`` when omitted.

    Returns:
        None. The report is written to standard output.
    """
    root = Path(settings.RAW_DATA_DIR if data_dir is None else data_dir)
    print(f"Scanning data in: {root}")

    sources_per_politician = {}
    unique_politician_names = Counter()
    total_articles = 0

    # Sorted so the report order does not depend on filesystem ordering
    for file_path in sorted(root.rglob("*.json")):
        # Keep the try body limited to the I/O and parsing that can fail.
        # JSONDecodeError and UnicodeDecodeError are both ValueError subclasses.
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except (OSError, ValueError, RecursionError) as e:
            print(f"Error reading {file_path.name}: {e}")
            continue

        if not isinstance(data, dict):
            print(
                f"Error reading {file_path.name}: "
                f"expected a JSON object, got {type(data).__name__}"
            )
            continue

        # Normalise both labels before touching any counter: null or
        # non-string values would otherwise break sorted() below or leave
        # the tallies half-updated.
        pol = _normalize_label(data.get("politician", "Unknown"))
        src = _normalize_label(data.get("source", "Unknown"))

        unique_politician_names[pol] += 1
        sources_per_politician.setdefault(pol, Counter())[src] += 1
        total_articles += 1

    # --- Print Results ---
    print("\n" + "="*50)
    print("RAJNEETI DATA PROFILE")
    print("="*50)
    print(f"Total JSON Articles Found: {total_articles}")

    print("\n1. Detected Politician Names (in JSON 'politician' field):")
    for name, count in unique_politician_names.items():
        print(f"   - {name}: {count} articles")

    print("\n2. Media Sources per Politician:")
    for pol, sources in sources_per_politician.items():
        print(f"\n--- {pol} ---")
        for src, count in sources.most_common():
            print(f"   - {src}: {count}")

    print("\n3. Overall Unique Sources Found:")
    all_sources = set()
    for sources in sources_per_politician.values():
        all_sources.update(sources)
    for s in sorted(all_sources):
        print(f"   - {s}")
    print("="*50)

# Run the profile when this code is executed directly. A notebook cell also
# runs with __name__ == "__main__", so executing the cell prints the report.
if __name__ == "__main__":
    profile_raw_data()

Scanning data in: E:\4thYear\Rajneeti\RAG_v3\raw_data

RAJNEETI DATA PROFILE
Total JSON Articles Found: 3043

1. Detected Politician Names (in JSON 'politician' field):
   - Arvind Kejriwal: 128 articles
   - N/A: 993 articles
   - Sheila Dikshit: 14 articles
   - Priyanka Gandhi Vadra: 4 articles
   - Manish Sisodia: 6 articles
   - Aam Aadmi Party (AAP): 1 articles
   - Chandrababu Naidu: 4 articles
   - Mamata Banerjee: 383 articles
   - Narendra Modi: 89 articles
   - Nitin Gadkari: 2 articles
   - Gautam Gambhir: 8 articles
   - Kanchan Chaudhary Bhattacharya: 1 articles
   - Sukhpal Singh Khaira: 1 articles
   - Meera Sanyal: 1 articles
   - Alka Lamba: 4 articles
   - Manmohan Singh: 3 articles
   - Robert Vadra: 1 articles
   - Rahul Gandhi: 17 articles
   - Hardeep Singh Puri: 1 articles
   - Navjot Singh Sidhu: 2 articles
   - Amrinder Singh Raja Warring: 1 articles
   - Hans Raj Hans: 1 articles
   - Rajnath Singh: 2 articles
   - Irom Sharmila: 1 articles
   - N Chandrababu