In [14]:
"""Spacy & Numpy import & test."""

# !python3 -m spacy download en_core_web_sm -q

import spacy
import numpy as np

# Smoke test: load the small English spaCy pipeline and tag one sentence.
# On success this cell leaves `nlp`, `doc` and `token` defined at notebook level.
try:
    # Load the small English model
    nlp = spacy.load("en_core_web_sm")
    print("spaCy loaded successfully!")
    print(f"Using NumPy version: {np.__version__}") # Check NumPy version

    # Test it: print each token together with its coarse part-of-speech tag
    doc = nlp("This is a test sentence.")
    print("Processed sentence:", doc.text)
    for token in doc:
        print(token.text, token.pos_)

except Exception as e:
    # Any exception raised above is reported as a one-line message instead of a
    # traceback; if loading failed, this cell does not assign `nlp`.
    print(f"An error occurred: {e}")

spaCy loaded successfully!
Using NumPy version: 2.2.4
Processed sentence: This is a test sentence.
This PRON
is AUX
a DET
test NOUN
sentence NOUN
. PUNCT


In [15]:
"""Basic Spacy pipeline test."""

# 'en_core_web_sm' is small and fast, good for starting.
# 'en_core_web_md' or 'en_core_web_lg' are larger but more accurate.
# (Re)load the model; on failure `nlp` is set to None so the analysis below is skipped.
try:
    nlp = spacy.load("en_core_web_sm")
    print("Loaded 'en_core_web_sm' spaCy model.")
# Basic error handling if the model isn't downloaded
except OSError:
    print("spaCy model 'en_core_web_sm' not found.")
    print("Please run: python -m spacy download en_core_web_sm")
    # Depending on the environment, you might need to restart the kernel after download
    # Or exit if the model is critical for subsequent steps
    nlp = None # Set nlp to None to avoid errors later if model loading failed

if nlp:
    # Example Queries (taken from Step 1)
    example_queries = [
        "Top 10 most valuable American IT companies.", # Filtered Ranking
        "Show details for Apple Inc.",                 # Company Info Lookup (using specific name from DB potential)
        "Every French company valued over 1B.",        # Threshold Filtering
        "List all sectors available.",                 # A potential simpler query
        "What is the market cap of Microsoft?",        # Specific detail lookup
    ]

    # Process each query using spaCy
    for query in example_queries:
        print(f"\n--- Analyzing Query: '{query}' ---")
        
        # Process the query with the loaded spaCy model
        doc = nlp(query)
        
        # Print the analysis for each token
        # Using a formatted table similar to the roadmap example
        # The 'Detailed Tag' header text is 12 characters long, so its `<7` width has
        # no effect; the rows pad `token.tag_` to 12 so the columns still line up.
        print(f"{'Token':<15} | {'Lemma':<15} | {'POS':<7} | {'Dep':<10} | {'Detailed Tag':<7} | {'Entity Type'}")
        print("-" * 80)
        for token in doc:
            # Tokens outside any entity have an empty ent_type_, shown as '-'
            print(f"{token.text:<15} | {token.lemma_:<15} | {token.pos_:<7} | {token.dep_:<10} | {token.tag_:<12} | {token.ent_type_ if token.ent_type_ else '-'}")

        # Optional: Display Named Entities found by the base model (precursor to Step 5)
        print("\nBase Named Entities Found:")
        if doc.ents:
            for ent in doc.ents:
                # spacy.explain() supplies the description printed after the label
                print(f"- Entity: '{ent.text}', Label: {ent.label_} ({spacy.explain(ent.label_)})")
        else:
            print("- No named entities found by the base model.")
        
        print("="*80)

else:
    print("\nSkipping NLP analysis because the spaCy model could not be loaded.")

Loaded 'en_core_web_sm' spaCy model.

--- Analyzing Query: 'Top 10 most valuable American IT companies.' ---
Token           | Lemma           | POS     | Dep        | Detailed Tag | Entity Type
--------------------------------------------------------------------------------
Top             | top             | ADJ     | ROOT       | JJ           | -
10              | 10              | NUM     | nummod     | CD           | CARDINAL
most            | most            | ADV     | advmod     | RBS          | -
valuable        | valuable        | ADJ     | amod       | JJ           | -
American        | american        | ADJ     | amod       | JJ           | NORP
IT              | IT              | PROPN   | compound   | NNP          | -
companies       | company         | NOUN    | npadvmod   | NNS          | -
.               | .               | PUNCT   | punct      | .            | -

Base Named Entities Found:
- Entity: '10', Label: CARDINAL (Numerals that do not fall under another type)

In [None]:
"""Adding Gazetteer, PhraseMatcher and EntityRuler to improve identification of key pieces of information."""

# Force reinstall pandas to ensure it's compiled against the current numpy version
# %pip uninstall pandas -y
# %pip install pandas --no-cache-dir

# Also good practice to ensure numpy is reasonably up-to-date, though reinstalling pandas is key
# %pip install --upgrade numpy --no-cache-dir

# print("Reinstalled pandas and updated numpy. Please RESTART THE KERNEL now.")
# print("After restarting, re-run the previous cells (database setup, Step 4) and then try Step 5 again.")

# %%capture -- Allows %pip install without showing massive output
# Re-install spacy if needed and download model again if issues persist
# %pip install spacy -q
# !python -m spacy download en_core_web_sm -q

# %%capture
# %pip install spacy -q
# !python -m spacy download en_core_web_sm -q

"""Step 5 (Refactored v2): Enhanced NER & Term Extraction"""

import spacy
from spacy.matcher import PhraseMatcher
from spacy.pipeline import EntityRuler
import sqlite3
import pandas as pd
import warnings

# --- Configuration ---
# SQLite file and table that the gazetteer loader below reads from.
db_file = 'companies_database.db'
table_name = 'companies'

# --- spaCy Model Loading ---
# Reuse an `nlp` object left in the kernel by an earlier cell when it is a spaCy
# Language; otherwise load the model. On any failure `nlp` is set to None so the
# rest of Step 5 is skipped.
try:
    # At notebook top level `locals()` is the notebook namespace, so this checks
    # whether a previous cell already defined `nlp`.
    if 'nlp' not in locals() or not isinstance(nlp, spacy.language.Language): 
         nlp = spacy.load("en_core_web_sm")
         print("Loaded 'en_core_web_sm' spaCy model for Step 5.")
    else:
        print("'en_core_web_sm' spaCy model already loaded.")
        # Ensure a clean state for the ruler
        # (re-running this cell must not stack a second "entity_ruler" pipe)
        if "entity_ruler" in nlp.pipe_names:
            nlp.remove_pipe("entity_ruler")
            print("Removed existing EntityRuler pipe for clean setup.")
            
except OSError:
    # Handled separately so the message can name the missing model
    print("spaCy model 'en_core_web_sm' not found. Cannot proceed.")
    nlp = None
except Exception as e:
    print(f"An error occurred loading spaCy model: {e}")
    nlp = None

# --- Gazetteer Loading Function (Unchanged) ---
def load_terms_from_db(db_path, table, column_name):
    """Collect the distinct, non-empty values of one column as gazetteer terms.

    Args:
        db_path: Path of the SQLite database file.
        table: Name of the table to read.
        column_name: Name of the column whose distinct values are wanted.

    Returns:
        A ``(lowercase_terms, original_casing_terms)`` tuple of lists. Values are
        stripped of surrounding whitespace and blank values are dropped. Both
        lists come from sets, so their order is unspecified.

    Any exception while reading is caught and printed; in that case the terms
    gathered so far (normally none) are returned.
    """
    lowered = set()
    cased = set()
    connection = None
    try:
        connection = sqlite3.connect(db_path)
        sql = (
            f'SELECT DISTINCT "{column_name}" FROM "{table}" '
            f'WHERE "{column_name}" IS NOT NULL AND "{column_name}" != \'\''
        )
        frame = pd.read_sql_query(sql, connection)
        if column_name not in frame.columns:
            print(f"Warning: Column '{column_name}' not found in table '{table}'.")
        else:
            values = frame[column_name].astype(str)
            # Original-casing terms first, then their lowercase counterparts.
            for raw in values:
                if raw.strip():
                    cased.add(raw.strip())
            for raw in values:
                if raw.strip():
                    lowered.add(raw.lower().strip())
    except Exception as e:
        print(f"Error loading terms for '{column_name}': {e}")
    finally:
        # Close the connection whether or not the read succeeded.
        if connection:
            connection.close()

    lowered.discard('')
    cased.discard('')
    print(f"Loaded {len(lowered)} unique lowercase terms for '{column_name}'.")
    return list(lowered), list(cased)

# --- Main Execution Block for Step 5 Refactoring v2 ---
if nlp:
    # NOTE: this block runs at notebook level (not inside a function), so every
    # name assigned here -- the gazetteer lists, the alias mappings, `matcher`,
    # `test_queries`, `processed_docs`, `identified_items_store`, ... -- stays
    # defined for later cells.

    # 1. Load Gazetteers
    # Each call returns (lowercase terms, original-casing terms) for one column.
    print("\n--- Loading Gazetteers from Database ---")
    lc_companies, unique_companies_orig = load_terms_from_db(db_file, table_name, 'Security')
    lc_sectors, unique_sectors_orig = load_terms_from_db(db_file, table_name, 'Sector')
    lc_countries, unique_countries_orig = load_terms_from_db(db_file, table_name, 'Country')
    lc_industries, unique_industries_orig = load_terms_from_db(db_file, table_name, 'Industry')

    # --- Mappings ---
    country_mapping = { # alias -> canonical (lowercase)
        "american": "usa", "us": "usa", "u.s.": "usa", "u.s.a": "usa",
        "uk": "united kingdom", "u.k.": "united kingdom",
        "french": "france", "german": "germany", "spanish": "spain",
        "indian": "india", # Added
    }
    sector_alias_mapping = { # alias -> canonical (lowercase)
        "it": "information technology", "info tech": "information technology",
        "health": "health care",
    }

    # Combine terms for matching
    all_country_terms_lc = set(lc_countries) | set(country_mapping.keys())
    all_sector_terms_lc = set(lc_sectors) | set(sector_alias_mapping.keys())
    all_industry_terms_lc = set(lc_industries) # Add aliases if needed
    all_company_terms_lc = set(lc_companies) # Add aliases if needed


    # 2. Setup PhraseMatcher (keep focus on multi-word)
    print("\n--- Setting up PhraseMatcher ---")
    matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
    # Matching on LOWER only needs tokenization, so the pattern Docs are built with
    # nlp.make_doc instead of running the whole pipeline over every term.
    # Only terms with more than one token are added here; single-token terms are
    # handled by the EntityRuler below.
    matcher.add("COMPANY_NAME", [d for d in map(nlp.make_doc, all_company_terms_lc) if len(d) > 1])
    matcher.add("SECTOR_TERM", [d for d in map(nlp.make_doc, all_sector_terms_lc) if len(d) > 1])
    matcher.add("COUNTRY_TERM", [d for d in map(nlp.make_doc, all_country_terms_lc) if len(d) > 1])
    matcher.add("INDUSTRY_TERM", [d for d in map(nlp.make_doc, all_industry_terms_lc) if len(d) > 1])
    print("PhraseMatcher patterns added (primarily multi-word).")


    # 3. Setup EntityRuler (Revised Pattern Building)
    print("\n--- Setting up EntityRuler ---")
    if "entity_ruler" in nlp.pipe_names: nlp.remove_pipe("entity_ruler")
    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True}, before="ner") # Explicitly overwrite
    print("EntityRuler added to spaCy pipeline before NER (overwrite=True).")

    # --- Define patterns in separate lists ---

    # Add multi-word company names to Ruler as well for priority (optional but might help Apple Inc.)
    # multi_word_company_patterns = [{"label": "COMPANY_NAME", "pattern": [{"LOWER": token.text} for token in nlp(term)], "id": term}
    #                                for term in lc_companies if len(nlp(term)) > 1]
    # patterns.extend(multi_word_company_patterns)


    # Token counts below use nlp.make_doc (tokenizer only): the ruler has just been
    # added with no patterns, and counting tokens does not need the other components.
    patterns = []

    # Country patterns
    for term in lc_countries:
        if len(nlp.make_doc(term)) == 1:  # Single token countries
            patterns.append({"label": "COUNTRY_TERM", "pattern": [{"LOWER": term}], "id": term})

    # Country aliases
    # Build one {"LOWER": ...} entry per token of the alias. A single dict describes
    # exactly one token, so an alias that tokenizes into several tokens could never
    # match when written as a single {"LOWER": alias} entry.
    for alias, canonical in country_mapping.items():
        alias_tokens = [{"LOWER": tok.lower_} for tok in nlp.make_doc(alias)]
        patterns.append({"label": "COUNTRY_TERM", "pattern": alias_tokens, "id": canonical})

    # Sector patterns - Carefully add single tokens if needed
    for term in lc_sectors:
        if len(nlp.make_doc(term)) == 1:
            patterns.append({"label": "SECTOR_TERM", "pattern": [{"LOWER": term}], "id": term})

    # Sector aliases (tokenized for the same reason as the country aliases;
    # this is what makes the two-word alias "info tech" matchable)
    for alias, canonical in sector_alias_mapping.items():
        alias_tokens = [{"LOWER": tok.lower_} for tok in nlp.make_doc(alias)]
        patterns.append({"label": "SECTOR_TERM", "pattern": alias_tokens, "id": canonical})

    # Explicit multi-word pattern for "info tech"; kept so the pattern list (and the
    # count printed below) stays the same size as before the aliases were tokenized.
    patterns.append({"label": "SECTOR_TERM", "pattern": [{"LOWER": "info"}, {"LOWER": "tech"}], "id": "information technology"})

    # Industry patterns
    for term in lc_industries:
        if len(nlp.make_doc(term)) == 1:
            patterns.append({"label": "INDUSTRY_TERM", "pattern": [{"LOWER": term}], "id": term})

    # Company patterns
    for term in lc_companies:
        if len(nlp.make_doc(term)) == 1:
            patterns.append({"label": "COMPANY_NAME", "pattern": [{"LOWER": term}], "id": term})

    # Fix money value pattern to better handle "1B." cases
    patterns.append({"label": "MONEY_VALUE", "pattern": [
        {"LIKE_NUM": True},
        {"LOWER": {"IN": ["b", "bn", "billion", "m", "mn", "million", "t", "tn", "trillion"]}},
        {"TEXT": ".", "OP": "?"} # Optional period
    ]})

    # Custom patterns (Monetary, Ranking, Ops, Keywords, Cardinal)
    custom_patterns = [
        # Monetary Values (Revised) - Use LIKE_NUM and specific followers
        {"label": "MONEY_VALUE", "pattern": [{"LIKE_NUM": True}, {"LOWER": {"IN": ["b", "bn", "billion", "m", "mn", "million", "t", "tn", "trillion"]}}, {"IS_PUNCT": True, "OP": "?"}]},
        {"label": "MONEY_VALUE", "pattern": [{"TEXT": "$", "OP": "?"}, {"LIKE_NUM": True}, {"LOWER": {"IN": ["b", "bn", "billion", "m", "mn", "million", "t", "tn", "trillion"]}}, {"IS_PUNCT": True, "OP": "?"}]},

        # Cardinal Number (Explicitly add)
        {"label": "CARDINAL", "pattern": [{"POS": "NUM", "ENT_TYPE": ""}]}, # Match NUM token not already part of another entity

        # Ranking
        {"label": "RANKING_MODIFIER", "pattern": [{"LOWER": {"IN": ["top", "most", "highest", "largest", "biggest", "least", "lowest", "smallest"]}}]},

        # Comparison Ops
        {"label": "COMPARISON_OP", "pattern": [{"LOWER": {"IN": ["over", "above", "under", "below"]}}]},
        {"label": "COMPARISON_OP", "pattern": [{"LOWER": "more"}, {"LOWER": "than"}]},
        {"label": "COMPARISON_OP", "pattern": [{"LOWER": "greater"}, {"LOWER": "than"}]},
        {"label": "COMPARISON_OP", "pattern": [{"LOWER": "less"}, {"LOWER": "than"}]},
        {"label": "COMPARISON_OP", "pattern": [{"TEXT": {"IN": [">", "<"]}}]},

        # Value Keywords
        {"label": "VALUE_KEYWORD", "pattern": [{"LOWER": {"IN": ["valued", "value", "marketcap", "worth"]}}]},
        {"label": "VALUE_KEYWORD", "pattern": [{"LOWER": "market"}, {"LOWER": "cap"}]},

        # Column Keywords
        {"label": "COLUMN_SECTOR", "pattern": [{"LOWER": {"IN": ["sector", "sectors"]}}]},
        {"label": "COLUMN_INDUSTRY", "pattern": [{"LOWER": {"IN": ["industry", "industries"]}}]},
        {"label": "COLUMN_COUNTRY", "pattern": [{"LOWER": {"IN": ["country", "countries"]}}]},
        {"label": "COLUMN_FOUNDED", "pattern": [{"LOWER": "founded"}]},
        {"label": "COLUMN_STOCKPRICE", "pattern": [{"LOWER": {"IN": ["stock", "price", "stockprice"]}}]},
    ]
    patterns.extend(custom_patterns)

    # Add all patterns to the ruler
    ruler.add_patterns(patterns)
    print(f"Added {len(patterns)} patterns to EntityRuler.")


    # --- Process Example Queries (Keep same test list) ---
    test_queries = [
        "Top 10 most valuable American IT companies.",
        "Show details for Apple Inc.",
        "Every French company valued over 1B.",
        "List all sectors available.",
        "What is the market cap of Microsoft?",
        "Which German companies are in the Health Care sector?",
        "Find companies worth more than 500 billion dollars",
        "Show financials for Tesla",
        "list info tech companies in the US",
        "Lowest 5 market cap companies in France",
        "Show country and sector for companies in Spain"
    ]

    print("\n--- Analyzing Queries with Refactored Enhanced NER (Step 5 v2) ---")
    processed_docs = {}          # query text -> processed Doc
    identified_items_store = {}  # query text -> list of identified item dicts

    for query in test_queries:
        print(f"\n--- Analyzing Query: '{query}' ---")
        doc = nlp(query)
        processed_docs[query] = doc
        phrase_matches = matcher(doc)

        # --- Collect Entities/Matches (Revised Logic v2) ---
        # Pipeline entities (EntityRuler + NER) are collected first, keyed by their
        # character span; PhraseMatcher results are appended afterwards.
        found_items = []
        pipeline_ents = {}

        for ent in doc.ents:
            span_key = (ent.start_char, ent.end_char)
            ent_id = ent.ent_id_ if ent.ent_id_ else None

            # Attempt normalization based on label and ID/text
            mapped_value = ent.text # Default
            try:
                if ent.label_ == "COUNTRY_TERM":
                    # Use ID if it's a known canonical name, else try mapping the text
                    if ent_id and ent_id in lc_countries: mapped_value = ent_id
                    elif ent.text.lower() in country_mapping: mapped_value = country_mapping[ent.text.lower()]
                elif ent.label_ == "SECTOR_TERM":
                    if ent_id and ent_id in lc_sectors: mapped_value = ent_id
                    elif ent.text.lower() in sector_alias_mapping: mapped_value = sector_alias_mapping[ent.text.lower()]
                # Add Industry/Company normalization if needed using ent_id and lc_ lists

            except Exception as e:
                print(f"  Warning: Error during mapping/normalization for '{ent.text}' ({ent.label_}): {e}")

            # Store the entity under its span (a later entity with the same span would replace it)
            pipeline_ents[span_key] = {
                "text": ent.text, "label": ent.label_, "ent_id": ent_id,
                "mapped_value": mapped_value,
                "start_char": ent.start_char, "end_char": ent.end_char,
                "source": "Pipeline (Ruler/NER)"
            }

        found_items.extend(pipeline_ents.values())

        # Add PhraseMatcher results whose exact character span is not already taken
        # by a pipeline entity. Only identical spans are filtered out: a phrase
        # match that partially overlaps a pipeline entity is still added.
        pipeline_spans = {(item['start_char'], item['end_char']) for item in found_items}
        for match_id, start, end in phrase_matches:
            span = doc[start:end]
            span_chars = (span.start_char, span.end_char)
            if span_chars not in pipeline_spans:
                label = nlp.vocab.strings[match_id]
                term_text = span.text
                mapped_value = term_text # Basic mapping for PhraseMatcher results if needed
                if label == "COUNTRY_TERM" and span.text.lower() in country_mapping: mapped_value = country_mapping[span.text.lower()]
                elif label == "SECTOR_TERM" and span.text.lower() in sector_alias_mapping: mapped_value = sector_alias_mapping[span.text.lower()]

                found_items.append({
                    "text": term_text, "label": label, "mapped_value": mapped_value,
                    "ent_id": None,
                    "start_char": span.start_char, "end_char": span.end_char,
                    "source": "PhraseMatcher"
                })

        # Present items in reading order
        found_items.sort(key=lambda item: item['start_char'])
        identified_items_store[query] = found_items

        # Display Results
        print("\nIdentified Entities & Terms (Refactored Step 5 v2):")
        if found_items:
            for item in found_items:
                details = f"- Text: '{item['text']}', Label: {item['label']}, Source: {item['source']}"
                if item['mapped_value'] != item['text']: details += f" (Mapped: '{item['mapped_value']}')"
                if item['ent_id']: details += f" (ID: {item['ent_id']})"
                print(details)
        else: print("- No specific entities or terms identified.")
        print("="*80)
else:
    print("\nSkipping Step 5 v2 analysis because the spaCy model could not be loaded.")

'en_core_web_sm' spaCy model already loaded.

--- Loading Gazetteers from Database ---
Loaded 491 unique lowercase terms for 'Security'.
Loaded 11 unique lowercase terms for 'Sector'.
Loaded 7 unique lowercase terms for 'Country'.
Loaded 124 unique lowercase terms for 'Industry'.

--- Setting up PhraseMatcher ---
PhraseMatcher patterns added (primarily multi-word).

--- Setting up EntityRuler ---
EntityRuler added to spaCy pipeline before NER (overwrite=True).




Added 189 patterns to EntityRuler.

--- Analyzing Queries with Refactored Enhanced NER (Step 5 v2) ---

--- Analyzing Query: 'Top 10 most valuable American IT companies.' ---

Identified Entities & Terms (Refactored Step 5 v2):
- Text: 'Top', Label: RANKING_MODIFIER, Source: Pipeline (Ruler/NER)
- Text: '10', Label: CARDINAL, Source: Pipeline (Ruler/NER)
- Text: 'most', Label: RANKING_MODIFIER, Source: Pipeline (Ruler/NER)
- Text: 'American', Label: COUNTRY_TERM, Source: Pipeline (Ruler/NER) (Mapped: 'usa') (ID: usa)
- Text: 'IT', Label: SECTOR_TERM, Source: Pipeline (Ruler/NER) (Mapped: 'information technology') (ID: information technology)

--- Analyzing Query: 'Show details for Apple Inc.' ---

Identified Entities & Terms (Refactored Step 5 v2):
- Text: 'Apple Inc.', Label: ORG, Source: Pipeline (Ruler/NER)

--- Analyzing Query: 'Every French company valued over 1B.' ---

Identified Entities & Terms (Refactored Step 5 v2):
- Text: 'French', Label: COUNTRY_TERM, Source: Pipeline (Rul

In [17]:
"""Step 6 (Refactored): Query Structure Parsing (Intent & Parameter Extraction)"""

import re 
import json # For pretty printing the output

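# --- Helper Functions ---
# parse_monetary_value and map_comparison_operator are called by the parser below but are
# not defined in this cell; they must already exist in the kernel (presumably from an
# earlier Step 6 cell -- TODO confirm, or define them here so Restart & Run All works).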

# --- Normalization Helper ---
def normalize_term(term, original_list, alias_mapping=None):
    """Map a found term onto its canonical, original-casing spelling.

    Lookup order:
      1. If the lowercased, stripped term is an alias, resolve it to the alias
         target and return the first entry of ``original_list`` that equals the
         target case-insensitively (or the lowercase target if there is none).
      2. Otherwise return the first entry of ``original_list`` that equals the
         term case-insensitively.
      3. Otherwise return ``term`` unchanged.
    """
    key = term.lower().strip()

    def _first_match(target):
        # First canonical entry whose lowercase form equals `target`, else None.
        return next((entry for entry in original_list if entry.lower() == target), None)

    if alias_mapping and key in alias_mapping:
        canonical = alias_mapping[key]
        hit = _first_match(canonical)
        return canonical if hit is None else hit

    hit = _first_match(key)
    return term if hit is None else hit

# --- COLUMN_MAP (Keep as is, seems reasonable) ---
# Maps an entity/term label to the database column it refers to.
# Insertion order is significant: the parser iterates this dict and stops at the
# first matching COLUMN_* label.
COLUMN_MAP = {
    # Labels that carry a value to filter on
    "COMPANY_NAME": "Security",
    "ORG": "Security",
    "SECTOR_TERM": "Sector",
    "INDUSTRY_TERM": "Industry",
    "COUNTRY_TERM": "Country",
    "GPE": "Country",
    # Keywords that name a column
    "VALUE_KEYWORD": "Marketcap",
    "COLUMN_SECTOR": "Sector",
    "COLUMN_INDUSTRY": "Industry",
    "COLUMN_COUNTRY": "Country",
    "COLUMN_FOUNDED": "Founded",
    "COLUMN_STOCKPRICE": "Stockprice",
    # Default column for monetary values
    "MONEY_VALUE": "Marketcap",
    # Cardinal numbers usually represent LIMIT, not a filter column
    "CARDINAL": None,
}

# --- Main Parsing Function (Refactored) ---
def parse_query_structure_refactored(doc, identified_items,
                                     orig_sectors, orig_industries, orig_countries): # Pass original casing lists
    """
    Parse a spaCy Doc and its identified entities/terms into a structured query description.

    Intent is decided by priority: ``list_values`` (list distinct values of a column),
    ``find_top`` (ranking modifier plus a cardinal), ``lookup_details`` (a company and
    little else), otherwise ``filter_list``. Equality and threshold filters are then
    extracted for every intent except ``list_values``.

    Args:
        doc: Processed query. The code iterates its tokens (``lemma_``, ``is_punct``,
            ``is_stop``, ``dep_``, ``pos_``) and calls ``doc.char_span``.
        identified_items: List of dicts with the keys ``'text'``, ``'label'``,
            ``'start_char'``, ``'end_char'`` and optionally ``'mapped_value'``.
        orig_sectors: Canonical sector names in original casing.
        orig_industries: Canonical industry names in original casing.
        orig_countries: Canonical country names in original casing.

    Returns:
        dict with the keys ``intent``, ``select_cols``, ``filters``, ``limit``,
        ``order_by``, ``distinct`` and ``errors``. Problems are reported as strings
        in ``errors`` rather than raised.

    Notes:
        Besides its parameters the function reads these notebook-level names:
        ``COLUMN_MAP``, ``normalize_term``, ``map_comparison_operator``,
        ``parse_monetary_value``, ``unique_companies_orig``, ``sector_alias_mapping``
        and ``country_mapping``.
    """
    parsed_structure = {
        'intent': None, 'select_cols': [], 'filters': [], 'limit': None,
        'order_by': None, 'distinct': False, 'errors': []
    }

    # Lowercase lookup sets built from the caller-supplied canonical lists
    lc_sectors_set = {s.lower() for s in orig_sectors}
    lc_industries_set = {i.lower() for i in orig_industries}
    lc_countries_set = {c.lower() for c in orig_countries}

    # Group the identified items by label, keeping their order
    items_by_label = {}
    for item in identified_items:
        items_by_label.setdefault(item['label'], []).append(item)

    lemmas = [token.lemma_.lower() for token in doc if not (token.is_punct or token.is_stop)]
    root_verb = next((token for token in doc if token.dep_ == "ROOT" and token.pos_ == "VERB"), None)
    root_lemma = root_verb.lemma_.lower() if root_verb else None

    # --- Refined Intent and Parameter Extraction ---
    column_labels = ["COLUMN_SECTOR", "COLUMN_INDUSTRY", "COLUMN_COUNTRY"]

    has_ranking_modifier = bool(items_by_label.get("RANKING_MODIFIER"))
    has_cardinal = bool(items_by_label.get("CARDINAL"))
    has_company = bool(items_by_label.get("COMPANY_NAME") or items_by_label.get("ORG"))
    has_comparison = bool(items_by_label.get("COMPARISON_OP"))
    # NOTE(review): `lemmas` excludes stop words, and spaCy's English stop list appears
    # to contain "all", so in practice probably only "available" can satisfy the keyword
    # test below -- confirm before relying on "all". Left unchanged on purpose: dropping
    # the stop-word filter would also turn queries like "show all companies in the X
    # sector" into list_values.
    has_list_all_keyword = (
        any(lbl in items_by_label for lbl in column_labels)
        and any(l in lemmas for l in ["all", "available"])
        and root_lemma in ["list", "show"]
    )

    # --- Intent Prioritization ---

    # 1. List distinct column values
    if has_list_all_keyword:
        parsed_structure['intent'] = 'list_values'
        parsed_structure['distinct'] = True
        col_label = next((lbl for lbl in column_labels if lbl in items_by_label), None)
        if col_label and col_label in COLUMN_MAP:
            parsed_structure['select_cols'] = [COLUMN_MAP[col_label]]
        else:
            parsed_structure['errors'].append("Could not determine column for listing values.")
            parsed_structure['select_cols'] = ['*'] # Fallback

    # 2. Top/Bottom N queries
    elif has_ranking_modifier and has_cardinal:
        parsed_structure['intent'] = 'find_top'

        # Limit: the first cardinal found
        limit_item = items_by_label["CARDINAL"][0]
        try:
            parsed_structure['limit'] = int(limit_item['text'])
        except ValueError:
            parsed_structure['errors'].append(f"Could not parse limit value: {limit_item['text']}")

        # Order By
        order_col = "Marketcap" # Default
        order_dir = "DESC"      # Default
        rank_item = items_by_label["RANKING_MODIFIER"][0]

        # An explicit column keyword within 25 characters of the ranking modifier
        # overrides the default order column (first match in COLUMN_MAP order wins).
        proximity_chars = 25
        for lbl, col_name in COLUMN_MAP.items():
            if lbl and col_name and lbl.startswith("COLUMN_") and lbl in items_by_label:
                col_item = items_by_label[lbl][0]
                if abs(col_item['start_char'] - rank_item['start_char']) < proximity_chars:
                    order_col = col_name
                    break

        # Direction: ASC for least/lowest/smallest.
        # Compare the modifier's surface text as well as its lemma: a lemmatizer may
        # reduce a superlative such as "lowest" to its base form ("low"), in which
        # case a lemma-only comparison against these words would never select ASC.
        asc_words = {"least", "lowest", "smallest"}
        rank_span = doc.char_span(rank_item['start_char'], rank_item['end_char'])
        rank_lemma = rank_span[0].lemma_.lower() if rank_span else ""
        if rank_item['text'].lower() in asc_words or rank_lemma in asc_words:
            order_dir = "ASC"

        parsed_structure['order_by'] = {'column': order_col, 'direction': order_dir}
        parsed_structure['select_cols'] = ['Security', order_col]

    # 3. Specific Company Lookup (only if NO ranking and primary focus is company)
    # Heuristic: a company is present, there is no ranking/comparison, and fewer than
    # five items were identified in total.
    elif has_company and not has_ranking_modifier and not has_comparison and len(identified_items) < 5:
        parsed_structure['intent'] = 'lookup_details'
        parsed_structure['select_cols'] = ['*']
        # Get company name (prioritize COMPANY_NAME over the generic ORG label)
        company_item = items_by_label.get("COMPANY_NAME", items_by_label.get("ORG", []))[0]
        # Normalize company name against the DB list if possible (exact match)
        company_name_norm = normalize_term(company_item['text'], unique_companies_orig)
        parsed_structure['filters'].append({
            'column': 'Security', 'operator': '=', 'value': company_name_norm
        })

    # 4. Default to Filtered List
    else:
        parsed_structure['intent'] = 'filter_list'
        # Default columns, can be overridden later
        parsed_structure['select_cols'] = ['Security', 'Marketcap']

    # --- Extract Filters (Common Logic) ---
    # Process filters regardless of intent (except list_values).
    # (column, operator, value-as-string) tuples already added, to skip duplicates.
    added_filter_tuples = set()

    if parsed_structure['intent'] != 'list_values':

        # Priority Labels for Filters: Specific terms first
        filter_labels_priority = ["COMPANY_NAME", "COUNTRY_TERM", "SECTOR_TERM", "INDUSTRY_TERM"]
        # Fallback Labels: Generic NER tags
        filter_labels_fallback = ["ORG", "GPE"]

        for item_label in filter_labels_priority + filter_labels_fallback:
            if item_label not in items_by_label or item_label not in COLUMN_MAP:
                continue
            db_column = COLUMN_MAP[item_label]
            if not db_column:
                continue # Skip labels not mapped to columns (like CARDINAL)

            for item in items_by_label[item_label]:
                # Skip adding company filter if intent is already lookup_details
                if parsed_structure['intent'] == 'lookup_details' and db_column == 'Security':
                    continue

                # Normalize value using appropriate list and mappings
                filter_value_raw = item.get('mapped_value', item['text']) # Start with mapped value if available
                normalized_value = filter_value_raw # Default

                # The canonical lists passed in as parameters are used here, so the
                # caller controls what values are normalized against.
                if item_label in ["SECTOR_TERM", "COLUMN_SECTOR"]:
                    normalized_value = normalize_term(filter_value_raw, orig_sectors, sector_alias_mapping)
                elif item_label in ["INDUSTRY_TERM", "COLUMN_INDUSTRY"]:
                    normalized_value = normalize_term(filter_value_raw, orig_industries) # Add industry alias map if needed
                elif item_label in ["COUNTRY_TERM", "GPE", "COLUMN_COUNTRY"]:
                    # Use mapped value directly if available and valid, otherwise normalize text
                    if item.get('mapped_value') and item['mapped_value'].lower() in lc_countries_set:
                        normalized_value = normalize_term(item['mapped_value'], orig_countries)
                    else:
                        normalized_value = normalize_term(filter_value_raw, orig_countries, country_mapping)
                elif item_label in ["COMPANY_NAME", "ORG"]:
                    normalized_value = normalize_term(filter_value_raw, unique_companies_orig)

                # Guard against generic words being misread as filter values
                # (e.g. "financials"): sector/industry/country values must be known.
                is_known = True
                if db_column == 'Sector' and normalized_value.lower() not in lc_sectors_set: is_known = False
                if db_column == 'Industry' and normalized_value.lower() not in lc_industries_set: is_known = False
                if db_column == 'Country' and normalized_value.lower() not in lc_countries_set: is_known = False
                # No such check for companies: it might be too strict if the DB isn't exhaustive.

                if not is_known:
                    parsed_structure['errors'].append(f"Ignoring potential filter term '{item['text']}' for column '{db_column}' as it's not a known value.")
                    continue # Skip adding this filter

                # Add filter if not duplicate
                filter_tuple = (db_column, '=', str(normalized_value))
                if filter_tuple not in added_filter_tuples:
                    parsed_structure['filters'].append({
                        'column': db_column, 'operator': '=', 'value': normalized_value
                    })
                    added_filter_tuples.add(filter_tuple)

        # Threshold Filters
        comp_ops = items_by_label.get("COMPARISON_OP", [])
        money_vals = items_by_label.get("MONEY_VALUE", [])

        if comp_ops and money_vals:
            # Simple: assume first op applies to first money value, related to Marketcap
            # Could be enhanced by checking proximity or keywords like "valued"
            op_item = comp_ops[0]
            val_item = money_vals[0]

            filter_column = "Marketcap" # Default
            sql_op = map_comparison_operator(op_item['text'])
            numeric_val = parse_monetary_value(val_item['text'])

            if sql_op and numeric_val is not None:
                filter_tuple = (filter_column, sql_op, str(numeric_val))
                if filter_tuple not in added_filter_tuples:
                    parsed_structure['filters'].append({
                        'column': filter_column, 'operator': sql_op, 'value': numeric_val
                    })
                    added_filter_tuples.add(filter_tuple)
            else:
                if not sql_op: parsed_structure['errors'].append(f"Could not map comparison operator: {op_item['text']}")
                if numeric_val is None: parsed_structure['errors'].append(f"Could not parse monetary value: {val_item['text']}")

    # --- Refine Select Columns (Revised Logic) ---
    # Only refine if intent is not specific lookup or list_values
    if parsed_structure['intent'] not in ['lookup_details', 'list_values']:
        explicit_cols = set()
        explicit_col_request = False
        for col_label, col_name in COLUMN_MAP.items():
            if col_name and col_label.startswith("COLUMN_") and col_label in items_by_label:
                explicit_cols.add(col_name)
                explicit_col_request = True

        # If explicit columns were requested, OVERRIDE the defaults
        if explicit_col_request:
            # Always include Security for context, unless explicitly excluded somehow (not handled here)
            final_cols = ['Security'] + sorted(explicit_cols) # Sort for consistency
            parsed_structure['select_cols'] = list(dict.fromkeys(final_cols)) # Keep order, remove duplicates
        elif not parsed_structure['select_cols']: # If defaults were empty (shouldn't happen)
            parsed_structure['select_cols'] = ['Security'] # Fallback

    # Final filter list is already deduplicated by the set logic during addition

    return parsed_structure


# --- Test the Refactored Parser ---
# Runs the parser over the Step 5 results. Relies on notebook-level names from the
# Step 5 cell: `test_queries`, `processed_docs`, `identified_items_store` and the
# `unique_*_orig` lists.
if nlp:
    print("\n--- Testing Refactored Query Structure Parsing (Step 6) ---")
    
    # Use the results stored from the refactored Step 5 run
    
    for query in test_queries: # Use the same list as defined in Step 5 test block
        # Only parse queries for which Step 5 stored both a Doc and its items
        if query in processed_docs and query in identified_items_store:
            print(f"\n--- Parsing Query: '{query}' ---")
            doc = processed_docs[query]
            found_items = identified_items_store[query]
            
            # Call the refactored parsing function
            parsed_result = parse_query_structure_refactored(
                doc, 
                found_items,
                unique_sectors_orig, # Pass original casing list
                unique_industries_orig,
                unique_countries_orig 
            )
            
            # Pretty print the result
            print(json.dumps(parsed_result, indent=2))
            print("="*80)
        else:
            print(f"\n--- Skipping Query: '{query}' (Data not found from Step 5 run) ---")

else:
    print("\nSkipping Step 6 testing because the spaCy model could not be loaded.")


--- Testing Refactored Query Structure Parsing (Step 6) ---

--- Parsing Query: 'Top 10 most valuable American IT companies.' ---
{
  "intent": "find_top",
  "select_cols": [
    "Security",
    "Marketcap"
  ],
  "filters": [
    {
      "column": "Country",
      "operator": "=",
      "value": "USA"
    },
    {
      "column": "Sector",
      "operator": "=",
      "value": "Information Technology"
    }
  ],
  "limit": 10,
  "order_by": {
    "column": "Marketcap",
    "direction": "DESC"
  },
  "distinct": false,
  "errors": []
}

--- Parsing Query: 'Show details for Apple Inc.' ---
{
  "intent": "lookup_details",
  "select_cols": [
    "*"
  ],
  "filters": [
    {
      "column": "Security",
      "operator": "=",
      "value": "Apple Inc."
    }
  ],
  "limit": null,
  "order_by": null,
  "distinct": false,
  "errors": []
}

--- Parsing Query: 'Every French company valued over 1B.' ---
{
  "intent": "filter_list",
  "select_cols": [
    "Security",
    "Marketcap"
  ],
  "fi