## Optimized Tourism Data Chunking

Custom Intelligent chunking system for tourism data optimized for RAG systems and small language models (Qwen2.5-1.5B size models). 

**Features**
-  Location hierarchy with GPS coordinates
-  Enhanced category/subcategory classification  
-  Automatic price range detection
-  Traveler-type relevance scoring
-  Seasonal content detection
-  Multiple output formats for different use cases

In [None]:
import json
import os
import re
from typing import List, Dict, Any
from pathlib import Path
from datetime import datetime

In [3]:
def categorize_section(section_name: str) -> str:
    """
    Categorize section by content type for better organization
    """
    section_lower = section_name.lower()
    
    # Accommodation related
    if any(keyword in section_lower for keyword in ['sleep', 'hotel', 'accommodation', 'stay']):
        return 'accommodation'
    
    # Food and dining
    elif any(keyword in section_lower for keyword in ['eat', 'drink', 'food', 'restaurant', 'dining']):
        return 'dining'
    
    # Transportation
    elif any(keyword in section_lower for keyword in ['get there', 'get around', 'transport', 'bus', 'train', 'flight']):
        return 'transport'
    
    # Activities and sightseeing
    elif any(keyword in section_lower for keyword in ['see', 'do', 'activity', 'attraction', 'temple', 'trek']):
        return 'activities'
    
    # Practical information
    elif any(keyword in section_lower for keyword in ['money', 'budget', 'cost', 'price', 'phone', 'internet']):
        return 'practical'
    
    # Safety and health
    elif any(keyword in section_lower for keyword in ['safe', 'health', 'emergency', 'hospital']):
        return 'safety'
    
    # General information
    elif any(keyword in section_lower for keyword in ['introduction', 'understand', 'history', 'culture']):
        return 'general'
    
    else:
        return 'misc'

In [4]:
def extract_price_info(text: str) -> List[str]:
    """
    Extract price information from text for price-aware chunking
    """
    price_patterns = [
        r'‚Çπ[\d,]+',  # Indian Rupees
        r'Rs\.?\s*[\d,]+',  # Rs format
        r'\$[\d,]+',  # US Dollars
        r'[\d,]+\s*rupees?',  # Written rupees
        r'cost[s]?\s*[\d,‚Çπ\$]+',  # Cost mentions
        r'price[s]?\s*[\d,‚Çπ\$]+',  # Price mentions
    ]
    
    prices = []
    for pattern in price_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        prices.extend(matches)
    
    return prices

def extract_location_info(text: str) -> List[str]:
    """
    Extract location references for location-aware chunking
    """
    location_patterns = [
        r'near\s+[\w\s]+',
        r'close\s+to\s+[\w\s]+', 
        r'[\d]+\s*km\s+from\s+[\w\s]+',
        r'located\s+[\w\s]+',
        r'station\s*[\w\s]*',
        r'airport\s*[\w\s]*',
        r'mall\s+road',
        r'main\s+market'
    ]
    
    locations = []
    for pattern in location_patterns:
        matches = re.findall(pattern, text, re.IGNORECASE)
        locations.extend([match.strip() for match in matches])
    
    return locations

In [None]:
# Core Schema Functions - Location and Classification
from datetime import datetime

# GPS coordinates for major Indian tourist destinations
CITY_COORDINATES = {
    # Himachal Pradesh
    'manali': [32.2432, 77.1892],
    'shimla': [31.1048, 77.1734],
    'dharamsala': [32.2190, 76.3234],
    'dalhousie': [32.5448, 75.9715],
    'kasol': [32.0102, 77.2953],
    'kullu': [31.9578, 77.1176],
    
    # Jammu & Kashmir
    'srinagar': [34.0837, 74.7973],
    'gulmarg': [34.0484, 74.3831],
    'pahalgam': [34.0158, 75.3312],
    'sonamarg': [34.2996, 75.2912],
    'katra': [32.9616, 74.9329],
    'jammu': [32.7266, 74.8570],
    
    # Ladakh
    'leh': [34.1526, 77.5771],
    'kargil': [34.5539, 76.1312],
    'nubra valley': [34.5240, 77.6025],
    
    # Uttarakhand
    'dehradun': [30.3165, 78.0322],
    'mussoorie': [30.4598, 78.0664],
    'nainital': [29.3803, 79.4636],
    'rishikesh': [30.0869, 78.2676],
    'haridwar': [29.9457, 78.1642],
    'kedarnath': [30.7346, 79.0669],
    'badrinath': [30.7433, 79.4938],
    'almora': [29.5971, 79.6593],
    'mukteshwar': [29.4779, 79.6425],
    
    # General fallbacks
    'himachal pradesh': [31.1048, 77.1734],
    'jammu and kashmir': [34.0837, 74.7973], 
    'ladakh': [34.1526, 77.5771],
    'uttarakhand': [30.3165, 78.0322],
    'india': [20.5937, 78.9629]
}

def extract_location_info_optimized(destination: str, state: str) -> Dict[str, Any]:
    """Extract structured location information with GPS coordinates"""
    city = destination.lower().strip()
    state_name = state.strip() if state else "Unknown"
    
    # Get coordinates
    coordinates = CITY_COORDINATES.get(city)
    if not coordinates and state_name.lower() in CITY_COORDINATES:
        coordinates = CITY_COORDINATES[state_name.lower()]
    if not coordinates:
        coordinates = CITY_COORDINATES['india']  # Default fallback
    
    return {
        "country": "India",
        "state": state_name,
        "city": destination,
        "coordinates": coordinates
    }

def classify_content_optimized(section_name: str, content: str, category: str) -> Dict[str, str]:
    """Enhanced classification with detailed subcategories"""
    section_lower = section_name.lower()
    content_lower = content.lower()
    
    subcategory_map = {
        'accommodation': {
            'budget': ['budget', 'cheap', 'backpacker', 'hostel', 'guesthouse'],
            'mid_range': ['mid-range', 'moderate', 'standard'],
            'luxury': ['luxury', 'premium', 'resort', 'heritage', 'palace'],
            'homestay': ['homestay', 'family', 'local']
        },
        'dining': {
            'street_food': ['street', 'local', 'dhaba', 'roadside'],
            'restaurant': ['restaurant', 'fine dining', 'cafe'],
            'traditional': ['traditional', 'authentic', 'local cuisine'],
            'international': ['pizza', 'chinese', 'continental']
        },
        'activities': {
            'adventure': ['trek', 'rafting', 'paragliding', 'skiing', 'climbing'],
            'sightseeing': ['temple', 'palace', 'fort', 'museum', 'monument'],
            'nature': ['lake', 'valley', 'peak', 'waterfall', 'garden'],
            'cultural': ['festival', 'market', 'local', 'heritage']
        },
        'transport': {
            'road': ['bus', 'taxi', 'car', 'drive'],
            'rail': ['train', 'railway', 'station'],
            'air': ['flight', 'airport', 'fly'],
            'local': ['rickshaw', 'auto', 'local transport']
        }
    }
    
    subcategory = 'general'
    if category in subcategory_map:
        for sub, keywords in subcategory_map[category].items():
            if any(keyword in content_lower or keyword in section_lower for keyword in keywords):
                subcategory = sub
                break
    
    return {
        "category": category,
        "subcategory": subcategory
    }

def extract_practical_info_optimized(content: str) -> Dict[str, Any]:
    """Extract practical information: prices, contacts, seasonal data"""
    # Price extraction and categorization
    prices = extract_price_info(content)
    price_range = "unknown"
    
    if prices:
        # Extract numeric values from prices
        numeric_prices = []
        for price in prices:
            nums = re.findall(r'[\d,]+', price.replace(',', ''))
            if nums:
                try:
                    numeric_prices.append(int(nums[0]))
                except:
                    pass
        
        if numeric_prices:
            avg_price = sum(numeric_prices) / len(numeric_prices)
            if avg_price < 1000:
                price_range = "budget"
            elif avg_price < 3000:
                price_range = "mid_range"
            else:
                price_range = "luxury"
    
    # Contact information
    has_contact = bool(re.search(r'\+91|phone|contact|email|call|booking', content, re.IGNORECASE))
    
    # Seasonal information
    seasonal = []
    if re.search(r'winter|snow|skiing|december|january|february', content, re.IGNORECASE):
        seasonal.append("winter")
    if re.search(r'summer|may|june|july|august', content, re.IGNORECASE):
        seasonal.append("summer")
    if re.search(r'monsoon|rain|july|august|september', content, re.IGNORECASE):
        seasonal.append("monsoon")
    if re.search(r'spring|march|april|pleasant', content, re.IGNORECASE):
        seasonal.append("spring")
    
    if not seasonal:
        seasonal = ["all_year"]
    
    practical_info = {
        "price_range": price_range,
        "prices": prices[:3],  # Keep top 3 prices
        "has_contact": has_contact,
    }
    
    # Only add seasonal if it's relevant (not all_year)
    if seasonal != ["all_year"]:
        practical_info["seasonal"] = seasonal
    
    return practical_info

def calculate_relevance_scores(content: str, category: str) -> Dict[str, int]:
    """Calculate relevance scores for different traveler types (1-10 scale)"""
    content_lower = content.lower()
    
    # Base scores by category
    base_scores = {
        'accommodation': {'solo_traveler': 6, 'family': 7, 'adventure': 5},
        'dining': {'solo_traveler': 7, 'family': 8, 'adventure': 6},
        'activities': {'solo_traveler': 7, 'family': 6, 'adventure': 9},
        'transport': {'solo_traveler': 8, 'family': 7, 'adventure': 8},
        'practical': {'solo_traveler': 9, 'family': 9, 'adventure': 8},
        'safety': {'solo_traveler': 8, 'family': 10, 'adventure': 7}
    }
    
    scores = base_scores.get(category, {'solo_traveler': 5, 'family': 5, 'adventure': 5})
    
    # Adjust scores based on content
    if any(word in content_lower for word in ['budget', 'cheap', 'backpacker']):
        scores['solo_traveler'] += 2
        scores['family'] -= 1
    
    if any(word in content_lower for word in ['family', 'kids', 'children', 'safe']):
        scores['family'] += 2
        scores['solo_traveler'] += 1
    
    if any(word in content_lower for word in ['trek', 'adventure', 'climbing', 'rafting', 'extreme']):
        scores['adventure'] += 3
        scores['family'] -= 1
        
    if any(word in content_lower for word in ['luxury', 'premium', 'resort']):
        scores['family'] += 1
        scores['solo_traveler'] -= 1
    
    # Cap scores between 1-10
    for key in scores:
        scores[key] = max(1, min(10, scores[key]))
    
    return scores

In [6]:
def create_smart_chunks(content: str, max_chunk_size: int = 250) -> List[str]:
    """
    Create intelligent chunks optimized for 1.5B model
    Split on sentences first, then paragraphs, maintaining context
    """
    if len(content) <= max_chunk_size:
        return [content]
    
    chunks = []
    
    # Split by paragraphs first
    paragraphs = [p.strip() for p in content.split('\n') if p.strip()]
    
    current_chunk = ""
    
    for paragraph in paragraphs:
        # If paragraph alone is too big, split by sentences
        if len(paragraph) > max_chunk_size:
            sentences = re.split(r'[.!?]+', paragraph)
            
            for sentence in sentences:
                sentence = sentence.strip()
                if not sentence:
                    continue
                    
                # If adding this sentence would exceed limit
                if len(current_chunk) + len(sentence) > max_chunk_size and current_chunk:
                    chunks.append(current_chunk.strip())
                    current_chunk = sentence
                else:
                    if current_chunk:
                        current_chunk += ". " + sentence
                    else:
                        current_chunk = sentence
        else:
            # If adding this paragraph would exceed limit
            if len(current_chunk) + len(paragraph) > max_chunk_size and current_chunk:
                chunks.append(current_chunk.strip())
                current_chunk = paragraph
            else:
                if current_chunk:
                    current_chunk += "\n" + paragraph
                else:
                    current_chunk = paragraph
    
    # Add the last chunk if it exists
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    
    return chunks

In [7]:
def process_json_file_optimized(file_path: str, state_name: str = "") -> List[Dict[str, Any]]:
    """
    Process a single JSON file using the new optimized schema
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return []
    
    destination = data.get('title', 'Unknown')
    sections = data.get('sections', {})
    
    all_chunks = []
    chunk_counter = 1
    
    for section_name, section_content in sections.items():
        if not section_content or len(section_content.strip()) < 20:
            continue
            
        # Get basic category
        category = categorize_section(section_name)
        
        # Create smart chunks
        content_chunks = create_smart_chunks(section_content, max_chunk_size=250)
        
        for i, chunk_content in enumerate(content_chunks):
            # Extract location information
            location_info = extract_location_info_optimized(destination, state_name)
            
            # Enhanced classification
            classification = classify_content_optimized(section_name, chunk_content, category)
            
            # Extract practical information
            practical_info = extract_practical_info_optimized(chunk_content)
            
            # Calculate relevance scores
            relevance_scores = calculate_relevance_scores(chunk_content, category)
            
            # Create structured chunk ID
            state_code = {
                'Himachal': 'hp',
                'Jammu': 'jk', 
                'Ladakh': 'lad',
                'Uttarakhand': 'uk',
                'India': 'in'
            }.get(state_name, state_name.lower()[:2] if state_name else 'in')
            
            city_code = destination.lower().replace(' ', '_')[:6]
            cat_code = classification['category'][:3]
            subcat_code = classification['subcategory'][:3]
            
            chunk_id = f"in_{state_code}_{city_code}_{cat_code}_{subcat_code}_{chunk_counter:03d}"
            
            # Build optimized chunk structure
            chunk = {
                'chunk_id': chunk_id,
                'content': chunk_content,
                'location': location_info,
                'classification': classification,
                'practical_info': practical_info,
                'relevance_scores': relevance_scores,
                'last_updated': datetime.now().strftime('%Y-%m-%d')
            }
            
            all_chunks.append(chunk)
            chunk_counter += 1
    
    return all_chunks

In [8]:
def chunk_all_data_optimized(base_path: str = "../json_data") -> Dict[str, List[Dict[str, Any]]]:
    """
    Process all JSON files using the new optimized chunking schema
    """
    base_path = Path(base_path)
    all_chunked_data = {}
    
    # Process main India file
    india_file = base_path / "india.json"
    if india_file.exists():
        print(f"Processing main India file...")
        india_chunks = process_json_file_optimized(str(india_file), "India")
        all_chunked_data["India"] = india_chunks
        print(f"Created {len(india_chunks)} chunks for India")
    
    # Process Him_north folder
    him_north_path = base_path / "Him_north"
    if him_north_path.exists():
        
        # Process state folders
        for state_folder in him_north_path.iterdir():
            if state_folder.is_dir():
                state_name = state_folder.name.title()
                print(f"\nProcessing {state_name} state...")
                
                state_chunks = []
                
                # Process all JSON files in state folder
                for json_file in state_folder.glob("*.json"):
                    file_chunks = process_json_file_optimized(str(json_file), state_name)
                    state_chunks.extend(file_chunks)
                    print(f"  {json_file.name}: {len(file_chunks)} chunks")
                
                all_chunked_data[state_name] = state_chunks
                print(f"Total chunks for {state_name}: {len(state_chunks)}")
        
        # Process files directly in Him_north folder
        for json_file in him_north_path.glob("*.json"):
            file_chunks = process_json_file_optimized(str(json_file), "Himalayan_North")
            all_chunked_data["Himalayan_North"] = file_chunks
            print(f"Himalayan North region: {len(file_chunks)} chunks")
    
    return all_chunked_data

In [9]:
# Execute the NEW OPTIMIZED chunking process
print("üöÄ Starting OPTIMIZED chunking process with new schema...")
print("=" * 70)

chunked_data_optimized = chunk_all_data_optimized()

# Display summary statistics  
print(f"\nüìä OPTIMIZED CHUNKING SUMMARY:")
print("=" * 70)

total_chunks = 0
for state, chunks in chunked_data_optimized.items():
    chunk_count = len(chunks)
    total_chunks += chunk_count
    print(f"{state}: {chunk_count:,} chunks")

print(f"\nüéØ TOTAL OPTIMIZED CHUNKS CREATED: {total_chunks:,}")

# Analyze optimized chunk statistics
all_chunks_flat = []
for chunks in chunked_data_optimized.values():
    all_chunks_flat.extend(chunks)

if all_chunks_flat:
    # Category analysis
    categories = {}
    subcategories = {}
    price_ranges = {}
    states = {}
    
    for chunk in all_chunks_flat:
        cat = chunk['classification']['category']
        subcat = chunk['classification']['subcategory']
        price_range = chunk['practical_info']['price_range']
        state = chunk['location']['state']
        
        categories[cat] = categories.get(cat, 0) + 1
        subcategories[subcat] = subcategories.get(subcat, 0) + 1
        price_ranges[price_range] = price_ranges.get(price_range, 0) + 1
        states[state] = states.get(state, 0) + 1
    
    print(f"\nüìà OPTIMIZED CHUNK ANALYTICS:")
    print("-" * 50)
    
    print(f"\nüè∑Ô∏è  CATEGORY DISTRIBUTION:")
    print("-" * 30)
    for category, count in sorted(categories.items(), key=lambda x: x[1], reverse=True):
        percentage = (count / total_chunks) * 100
        print(f"{category:12}: {count:5,} ({percentage:4.1f}%)")
    
    print(f"\nüîñ SUBCATEGORY DISTRIBUTION:")
    print("-" * 30)
    for subcat, count in sorted(subcategories.items(), key=lambda x: x[1], reverse=True)[:10]:
        percentage = (count / total_chunks) * 100
        print(f"{subcat:12}: {count:5,} ({percentage:4.1f}%)")
    
    print(f"\nüí∞ PRICE RANGE DISTRIBUTION:")
    print("-" * 30)
    for price_range, count in sorted(price_ranges.items(), key=lambda x: x[1], reverse=True):
        percentage = (count / total_chunks) * 100
        print(f"{price_range:12}: {count:5,} ({percentage:4.1f}%)")

print(f"\n‚úÖ Optimized chunking completed successfully!")
print("üéâ New schema features: Location hierarchy, relevance scores, structured classification!")

üöÄ Starting OPTIMIZED chunking process with new schema...
Processing main India file...
Created 1086 chunks for India

Processing Himachal state...
  dalhousie (india).json: 79 chunks
  dharamsala.json: 180 chunks
  bilaspur (himachal pradesh).json: 106 chunks
  kullu.json: 28 chunks
  palampur.json: 30 chunks
  mandi.json: 47 chunks
  shimla.json: 199 chunks
  manali.json: 192 chunks
  himachal pradesh.json: 53 chunks
  jogindernagar.json: 87 chunks
Total chunks for Himachal: 1001

Processing Uttarakhand state...
  nainital.json: 115 chunks
  munsyari.json: 15 chunks
  mussoorie.json: 84 chunks
  uttarakhand.json: 82 chunks
  nanda devi national park.json: 28 chunks
  rajaji national park.json: 15 chunks
  mukteshwar.json: 22 chunks
  haridwar.json: 105 chunks
  kedarnath.json: 47 chunks
  dunagiri.json: 22 chunks
  ghangaria.json: 52 chunks
  almora.json: 36 chunks
  gangotri.json: 31 chunks
  jim corbett national park.json: 50 chunks
  badrinath.json: 40 chunks
  pithoragarh.json:

In [None]:
# Save optimized chunked data in multiple RAG-ready formats
def save_optimized_chunked_data(chunked_data: Dict, output_dir: str = "optimized_chunks"):
    """Save optimized chunked data in multiple formats designed for tourism RAG"""
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True)
    
    print(f"\nüíæ SAVING OPTIMIZED CHUNKED DATA...")
    print("-" * 50)
    
    # Format 1: Flat list of all optimized chunks (for vector database)
    all_chunks_flat = []
    for chunks in chunked_data.values():
        all_chunks_flat.extend(chunks)
    
    flat_file = output_path / "all_chunks_optimized.json"
    with open(flat_file, 'w', encoding='utf-8') as f:
        json.dump(all_chunks_flat, f, indent=2, ensure_ascii=False)
    print(f"‚úÖ Optimized flat format: {flat_file} ({len(all_chunks_flat):,} chunks)")
    
    # Format 2: Organized by state and city (for location-based retrieval)
    location_organized = {}
    for chunk in all_chunks_flat:
        state = chunk['location']['state']
        city = chunk['location']['city']
        
        if state not in location_organized:
            location_organized[state] = {}
        if city not in location_organized[state]:
            location_organized[state][city] = []
        
        location_organized[state][city].append(chunk)
    
    location_file = output_path / "chunks_by_location.json"
    with open(location_file, 'w', encoding='utf-8') as f:
        json.dump(location_organized, f, indent=2, ensure_ascii=False)
    print(f"‚úÖ Location organized: {location_file}")
    
    # Format 3: Organized by category and subcategory (for activity-based search)
    category_organized = {}
    for chunk in all_chunks_flat:
        category = chunk['classification']['category']
        subcategory = chunk['classification']['subcategory']
        
        if category not in category_organized:
            category_organized[category] = {}
        if subcategory not in category_organized[category]:
            category_organized[category][subcategory] = []
            
        category_organized[category][subcategory].append(chunk)
    
    category_file = output_path / "chunks_by_category_optimized.json"
    with open(category_file, 'w', encoding='utf-8') as f:
        json.dump(category_organized, f, indent=2, ensure_ascii=False)
    print(f"‚úÖ Category organized: {category_file}")
    
    # Format 4: High relevance chunks by traveler type
    traveler_types = ['solo_traveler', 'family', 'adventure']
    for traveler_type in traveler_types:
        high_relevance = [chunk for chunk in all_chunks_flat 
                         if chunk['relevance_scores'][traveler_type] >= 7]
        
        traveler_file = output_path / f"high_relevance_{traveler_type}.json"
        with open(traveler_file, 'w', encoding='utf-8') as f:
            json.dump(high_relevance, f, indent=2, ensure_ascii=False)
        print(f"‚úÖ {traveler_type}: {traveler_file} ({len(high_relevance):,} chunks)")
    
    # Format 5: Budget-focused chunks (for price-conscious travelers)
    budget_chunks = [chunk for chunk in all_chunks_flat 
                    if chunk['practical_info']['price_range'] in ['budget', 'mid_range']]
    budget_file = output_path / "budget_focused_chunks.json"
    with open(budget_file, 'w', encoding='utf-8') as f:
        json.dump(budget_chunks, f, indent=2, ensure_ascii=False)
    print(f"‚úÖ Budget focused: {budget_file} ({len(budget_chunks):,} chunks)")
    
    # Format 6: Practical info summary (contacts, prices, seasonal)
    practical_chunks = [chunk for chunk in all_chunks_flat 
                       if chunk['practical_info']['has_contact'] or 
                       len(chunk['practical_info']['prices']) > 0]
    practical_file = output_path / "practical_info_optimized.json"
    with open(practical_file, 'w', encoding='utf-8') as f:
        json.dump(practical_chunks, f, indent=2, ensure_ascii=False)
    print(f"‚úÖ Practical info: {practical_file} ({len(practical_chunks):,} chunks)")
    
    return output_path

# Execute saving
output_directory = save_optimized_chunked_data(chunked_data_optimized)
print(f"\nüìÅ All optimized formats saved to: {output_directory}")

print(f"\nüöÄ RAG-READY DATA GENERATED!")
print("Available formats:")
print("‚Ä¢ all_chunks_optimized.json ‚Üí Vector database with location coordinates")
print("‚Ä¢ chunks_by_location.json ‚Üí State/City hierarchical search")  
print("‚Ä¢ chunks_by_category_optimized.json ‚Üí Activity-based with subcategories")
print("‚Ä¢ high_relevance_[type].json ‚Üí Traveler-specific recommendations")
print("‚Ä¢ budget_focused_chunks.json ‚Üí Price-conscious travel planning")
print("‚Ä¢ practical_info_optimized.json ‚Üí Contact details and pricing")


üíæ SAVING OPTIMIZED CHUNKED DATA...
--------------------------------------------------
‚úÖ Optimized flat format: optimized_chunks/all_chunks_optimized.json (4,160 chunks)
‚úÖ Location organized: optimized_chunks/chunks_by_location.json
‚úÖ Category organized: optimized_chunks/chunks_by_category_optimized.json
‚úÖ solo_traveler: optimized_chunks/high_relevance_solo_traveler.json (1,544 chunks)
‚úÖ family: optimized_chunks/high_relevance_family.json (1,062 chunks)
‚úÖ adventure: optimized_chunks/high_relevance_adventure.json (1,295 chunks)
‚úÖ Budget focused: optimized_chunks/budget_focused_chunks.json (392 chunks)
‚úÖ Practical info: optimized_chunks/practical_info_optimized.json (971 chunks)

üìÅ All optimized formats saved to: optimized_chunks

üöÄ OPTIMIZED RAG-READY DATA!
New enhanced formats:
‚Ä¢ all_chunks_optimized.json ‚Üí Vector database with location coordinates
‚Ä¢ chunks_by_location.json ‚Üí State/City hierarchical search
‚Ä¢ chunks_by_category_optimized.json ‚Üí Activ