In [1]:
"""Spacy & Numpy import & test."""

# !python3 -m spacy download en_core_web_sm -q

import spacy
import numpy as np

try:
    # Load the small English model
    nlp = spacy.load("en_core_web_sm")
    print("spaCy loaded successfully!")
    print(f"Using NumPy version: {np.__version__}") # Check NumPy version

    # Test it
    doc = nlp("This is a test sentence.")
    print("Processed sentence:", doc.text)
    for token in doc:
        print(token.text, token.pos_)

except Exception as e:
    print(f"An error occurred: {e}")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.4 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/nathan/Library/Python/3.11/lib/python/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/nathan/Library/Python/3.11/lib/python/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/nathan/Library/Python/3.11/lib/python/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start(

spaCy loaded successfully!
Using NumPy version: 2.2.4
Processed sentence: This is a test sentence.
This PRON
is AUX
a DET
test NOUN
sentence NOUN
. PUNCT


In [2]:
"""Basic Spacy pipeline test."""

# 'en_core_web_sm' is small and fast, good for starting.
# 'en_core_web_md' or 'en_core_web_lg' are larger but more accurate.
try:
    nlp = spacy.load("en_core_web_sm")
    print("Loaded 'en_core_web_sm' spaCy model.")
# Basic error handling if the model isn't downloaded
except OSError:
    print("spaCy model 'en_core_web_sm' not found.")
    print("Please run: python -m spacy download en_core_web_sm")
    # Depending on the environment, you might need to restart the kernel after download
    # Or exit if the model is critical for subsequent steps
    nlp = None # Set nlp to None to avoid errors later if model loading failed

if nlp:
    # Example Queries (taken from Step 1)
    example_queries = [
        "Top 10 most valuable American IT companies.", # Filtered Ranking
        "Show details for Apple Inc.",                 # Company Info Lookup (using specific name from DB potential)
        "Every French company valued over 1B.",        # Threshold Filtering
        "List all sectors available.",                 # A potential simpler query
        "What is the market cap of Microsoft?",        # Specific detail lookup
    ]

    # Process each query using spaCy
    for query in example_queries:
        print(f"\n--- Analyzing Query: '{query}' ---")
        
        # Process the query with the loaded spaCy model
        doc = nlp(query)
        
        # Print the analysis for each token
        # Using a formatted table similar to the roadmap example
        print(f"{'Token':<15} | {'Lemma':<15} | {'POS':<7} | {'Dep':<10} | {'Detailed Tag':<7} | {'Entity Type'}")
        print("-" * 80)
        for token in doc:
            print(f"{token.text:<15} | {token.lemma_:<15} | {token.pos_:<7} | {token.dep_:<10} | {token.tag_:<12} | {token.ent_type_ if token.ent_type_ else '-'}")

        # Optional: Display Named Entities found by the base model (precursor to Step 5)
        print("\nBase Named Entities Found:")
        if doc.ents:
            for ent in doc.ents:
                print(f"- Entity: '{ent.text}', Label: {ent.label_} ({spacy.explain(ent.label_)})")
        else:
            print("- No named entities found by the base model.")
        
        print("="*80)

else:
    print("\nSkipping NLP analysis because the spaCy model could not be loaded.")

Loaded 'en_core_web_sm' spaCy model.

--- Analyzing Query: 'Top 10 most valuable American IT companies.' ---
Token           | Lemma           | POS     | Dep        | Detailed Tag | Entity Type
--------------------------------------------------------------------------------
Top             | top             | ADJ     | ROOT       | JJ           | -
10              | 10              | NUM     | nummod     | CD           | CARDINAL
most            | most            | ADV     | advmod     | RBS          | -
valuable        | valuable        | ADJ     | amod       | JJ           | -
American        | american        | ADJ     | amod       | JJ           | NORP
IT              | IT              | PROPN   | compound   | NNP          | -
companies       | company         | NOUN    | npadvmod   | NNS          | -
.               | .               | PUNCT   | punct      | .            | -

Base Named Entities Found:
- Entity: '10', Label: CARDINAL (Numerals that do not fall under another type)

In [None]:
"""Adding Gazetteer, PhraseMatcher and EntityRuler to improve identification of key pieces of information."""

# Force reinstall pandas to ensure it's compiled against the current numpy version
# %pip uninstall pandas -y
# %pip install pandas --no-cache-dir

# Also good practice to ensure numpy is reasonably up-to-date, though reinstalling pandas is key
# %pip install --upgrade numpy --no-cache-dir

# print("Reinstalled pandas and updated numpy. Please RESTART THE KERNEL now.")
# print("After restarting, re-run the previous cells (database setup, Step 4) and then try Step 5 again.")

# %%capture -- Allows %pip install without showing massive output
# Re-install spacy if needed and download model again if issues persist
# %pip install spacy -q
# !python -m spacy download en_core_web_sm -q

# %%capture
# %pip install spacy -q
# !python -m spacy download en_core_web_sm -q

"""Step 5 (Refactored v2): Enhanced NER & Term Extraction"""

import spacy
from spacy.matcher import PhraseMatcher
from spacy.pipeline import EntityRuler
import sqlite3
import pandas as pd
import warnings

# --- Configuration ---
db_file = 'companies_database.db'
table_name = 'companies'

# --- spaCy Model Loading ---
try:
    if 'nlp' not in locals() or not isinstance(nlp, spacy.language.Language): 
         nlp = spacy.load("en_core_web_sm")
         print("Loaded 'en_core_web_sm' spaCy model for Step 5.")
    else:
        print("'en_core_web_sm' spaCy model already loaded.")
        # Ensure a clean state for the ruler
        if "entity_ruler" in nlp.pipe_names:
            nlp.remove_pipe("entity_ruler")
            print("Removed existing EntityRuler pipe for clean setup.")
            
except OSError:
    print("spaCy model 'en_core_web_sm' not found. Cannot proceed.")
    nlp = None
except Exception as e:
    print(f"An error occurred loading spaCy model: {e}")
    nlp = None

# --- Gazetteer Loading Function (Unchanged) ---
def load_terms_from_db(db_path, table, column_name):
    terms = set(); original_casing_terms = set()
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        query = f'SELECT DISTINCT "{column_name}" FROM "{table}" WHERE "{column_name}" IS NOT NULL AND "{column_name}" != \'\''
        df = pd.read_sql_query(query, conn)
        if column_name in df.columns:
            original_casing_terms.update(term.strip() for term in df[column_name].astype(str) if term.strip())
            terms.update(term.lower().strip() for term in df[column_name].astype(str) if term.strip())
        else: print(f"Warning: Column '{column_name}' not found in table '{table}'.")
    except Exception as e: print(f"Error loading terms for '{column_name}': {e}")
    finally:
        if conn: conn.close()
    terms.discard(''); original_casing_terms.discard('')
    print(f"Loaded {len(terms)} unique lowercase terms for '{column_name}'.")
    return list(terms), list(original_casing_terms)

# --- Main Execution Block for Step 5 Refactoring v2 ---
if nlp:
    # 1. Load Gazetteers
    print("\n--- Loading Gazetteers from Database ---")
    lc_companies, unique_companies_orig = load_terms_from_db(db_file, table_name, 'Security')
    lc_sectors, unique_sectors_orig = load_terms_from_db(db_file, table_name, 'Sector')
    lc_countries, unique_countries_orig = load_terms_from_db(db_file, table_name, 'Country')
    lc_industries, unique_industries_orig = load_terms_from_db(db_file, table_name, 'Industry')

    # --- Mappings ---
    country_mapping = { # alias -> canonical (lowercase)
        "american": "usa", "us": "usa", "u.s.": "usa", "u.s.a": "usa",
        "uk": "united kingdom", "u.k.": "united kingdom",
        "french": "france", "german": "germany", "spanish": "spain",
        "indian": "india", # Added
    }
    sector_alias_mapping = { # alias -> canonical (lowercase)
        "it": "information technology", "info tech": "information technology",
        "health": "health care",
    }

    # Combine terms for matching
    all_country_terms_lc = set(lc_countries) | set(country_mapping.keys())
    all_sector_terms_lc = set(lc_sectors) | set(sector_alias_mapping.keys())
    all_industry_terms_lc = set(lc_industries) # Add aliases if needed
    all_company_terms_lc = set(lc_companies) # Add aliases if needed


    # 2. Setup PhraseMatcher (keep focus on multi-word)
    print("\n--- Setting up PhraseMatcher ---")
    matcher = PhraseMatcher(nlp.vocab, attr='LOWER') 
    # Filter for docs with more than one token
    matcher.add("COMPANY_NAME", [doc for doc in nlp.pipe(all_company_terms_lc) if len(doc) > 1])
    matcher.add("SECTOR_TERM", [doc for doc in nlp.pipe(all_sector_terms_lc) if len(doc) > 1])
    matcher.add("COUNTRY_TERM", [doc for doc in nlp.pipe(all_country_terms_lc) if len(doc) > 1])
    matcher.add("INDUSTRY_TERM", [doc for doc in nlp.pipe(all_industry_terms_lc) if len(doc) > 1])
    print("PhraseMatcher patterns added (primarily multi-word).")


    # 3. Setup EntityRuler (Revised Pattern Building)
    print("\n--- Setting up EntityRuler ---")
    if "entity_ruler" in nlp.pipe_names: nlp.remove_pipe("entity_ruler")
    ruler = nlp.add_pipe("entity_ruler", config={"overwrite_ents": True}, before="ner") # Explicitly overwrite
    print("EntityRuler added to spaCy pipeline before NER (overwrite=True).")

    # --- Define patterns in separate lists ---
    
    # Add multi-word company names to Ruler as well for priority (optional but might help Apple Inc.)
    # multi_word_company_patterns = [{"label": "COMPANY_NAME", "pattern": [{"LOWER": token.text} for token in nlp(term)], "id": term} 
    #                                for term in lc_companies if len(nlp(term)) > 1]
    # patterns.extend(multi_word_company_patterns)


    # 1. First, ensure we're properly creating patterns
    patterns = []

    # Country patterns
    for term in lc_countries:
        if len(nlp(term)) == 1:  # Single token countries
            patterns.append({"label": "COUNTRY_TERM", "pattern": [{"LOWER": term}], "id": term})

    # Country aliases  
    for alias, canonical in country_mapping.items():
        patterns.append({"label": "COUNTRY_TERM", "pattern": [{"LOWER": alias}], "id": canonical})

    # Sector patterns - Carefully add single tokens if needed
    for term in lc_sectors:
        if len(nlp(term)) == 1:
            patterns.append({"label": "SECTOR_TERM", "pattern": [{"LOWER": term}], "id": term})

    # Sector aliases
    for alias, canonical in sector_alias_mapping.items():
        patterns.append({"label": "SECTOR_TERM", "pattern": [{"LOWER": alias}], "id": canonical})
        
    # Add multi-word sector pattern for "info tech" explicitly
    patterns.append({"label": "SECTOR_TERM", "pattern": [{"LOWER": "info"}, {"LOWER": "tech"}], "id": "information technology"})

    # Industry patterns
    for term in lc_industries:
        if len(nlp(term)) == 1:
            patterns.append({"label": "INDUSTRY_TERM", "pattern": [{"LOWER": term}], "id": term})

    # Company patterns
    for term in lc_companies:
        if len(nlp(term)) == 1:
            patterns.append({"label": "COMPANY_NAME", "pattern": [{"LOWER": term}], "id": term})

    # Fix money value pattern to better handle "1B." cases
    patterns.append({"label": "MONEY_VALUE", "pattern": [
        {"LIKE_NUM": True}, 
        {"LOWER": {"IN": ["b", "bn", "billion", "m", "mn", "million", "t", "tn", "trillion"]}},
        {"TEXT": ".", "OP": "?"} # Optional period
    ]})

    # Custom patterns (Monetary, Ranking, Ops, Keywords, Cardinal) - Revised v4
    custom_patterns = [
        # Monetary Values (Keep previous version - seemed okay for '$500 billion')
        {"label": "MONEY_VALUE", "pattern": [{"LIKE_NUM": True}, {"LOWER": {"IN": ["b", "bn", "billion", "m", "mn", "million", "t", "tn", "trillion"]}}, {"IS_PUNCT": True, "OP": "?"}]},
        {"label": "MONEY_VALUE", "pattern": [{"TEXT": "$", "OP": "?"}, {"LIKE_NUM": True}, {"LOWER": {"IN": ["b", "bn", "billion", "m", "mn", "million", "t", "tn", "trillion"]}}, {"IS_PUNCT": True, "OP": "?"}]},
        # Pattern for simple money amounts like $500 (Needs PRICE_VALUE label?)
        {"label": "PRICE_VALUE", "pattern": [{"TEXT": "$", "OP": "?"}, {"LIKE_NUM": True}]}, # Label specifically for price context? Or reuse MONEY_VALUE? Let's try PRICE_VALUE first.

        # Cardinal Number (for limits, but avoid matching years if possible)
        {"label": "CARDINAL", "pattern": [{"POS": "NUM", "ENT_TYPE": "", "SHAPE": {"NOT_IN": ["dddd"]}}]}, # Avoid tagging 4-digit numbers as CARDINAL initially
        {"label": "YEAR_NUMBER", "pattern": [{"SHAPE": "dddd", "POS": "NUM"}]}, # Specific pattern for years

        # Ranking (Keep as is)
        {"label": "RANKING_MODIFIER", "pattern": [{"LOWER": {"IN": ["top", "most", "highest", "largest", "biggest", "least", "lowest", "smallest"]}}]},
        
        # Comparison Ops (Add before/after)
        {"label": "COMPARISON_OP", "pattern": [{"LOWER": {"IN": ["over", "above", "under", "below", "after", "before"]}}]},
        {"label": "COMPARISON_OP", "pattern": [{"LOWER": "more"}, {"LOWER": "than"}]},
        {"label": "COMPARISON_OP", "pattern": [{"LOWER": "greater"}, {"LOWER": "than"}]},
        {"label": "COMPARISON_OP", "pattern": [{"LOWER": "less"}, {"LOWER": "than"}]},
        {"label": "COMPARISON_OP", "pattern": [{"TEXT": {"IN": [">", "<"]}}]},

        # Value Keywords (Keep as is)
        {"label": "VALUE_KEYWORD", "pattern": [{"LOWER": {"IN": ["valued", "value", "marketcap", "worth"]}}]},
        {"label": "VALUE_KEYWORD", "pattern": [{"LOWER": "market"}, {"LOWER": "cap"}]},

        # Column Keywords (Add variations)
        {"label": "COLUMN_SECTOR", "pattern": [{"LOWER": {"IN": ["sector", "sectors"]}}]},
        {"label": "COLUMN_INDUSTRY", "pattern": [{"LOWER": {"IN": ["industry", "industries"]}}]},
        {"label": "COLUMN_COUNTRY", "pattern": [{"LOWER": {"IN": ["country", "countries", "location", "region"]}}]}, # Added synonyms
        {"label": "COLUMN_FOUNDED", "pattern": [{"LOWER": "founded"}]},
        {"label": "COLUMN_FOUNDED", "pattern": [{"LOWER": "founding"}, {"LOWER": "date"}]}, # Added variation
        {"label": "COLUMN_STOCKPRICE", "pattern": [{"LOWER": {"IN": ["stock", "price", "stockprice", "stockprices"]}}]}, # Added plural
        {"label": "COLUMN_STOCKPRICE", "pattern": [{"LOWER": "stock"}, {"LOWER": "price"}]}, # Ensure multi-word matches
        {"label": "COLUMN_STOCKPRICE", "pattern": [{"LOWER": "stock"}, {"LOWER": "prices"}]}, 

        # Ordering Keyword
        {"label": "ORDERING_KEYWORD", "pattern": [{"LOWER": "ordered"}, {"LOWER": "by"}]},
        {"label": "ORDERING_KEYWORD", "pattern": [{"LOWER": "sorted"}, {"LOWER": "by"}]},
    ]
    patterns.extend(custom_patterns)



    # Add all patterns to the ruler
    ruler.add_patterns(patterns)
    print(f"Added {len(patterns)} patterns to EntityRuler.")


    # --- Process Example Queries (Keep same test list) ---
    test_queries = [
        "Top 10 most valuable American IT companies.",
        "Show details for Apple Inc.",
        "Every French company valued over 1B.",
        "List all sectors available.",
        "What is the market cap of Microsoft?",
        "Which German companies are in the Health Care sector?", 
        "Find companies worth more than 500 billion dollars", 
        "Show financials for Tesla", 
        "list info tech companies in the US", 
        "Lowest 5 market cap companies in France", 
        "Show country and sector for companies in Spain",
        "What sector is Apple in?",
        "Show the founding date for Microsoft.",
        "List stock prices for IT companies.",
        "Companies founded after 1990.",
        "Companies with stock price over $500.",
        "USA companies in IT founded before 2000.",
        "List companies ordered by founding date."
    ]

    print("\n--- Analyzing Queries with Refactored Enhanced NER (Step 5 v2) ---")
    processed_docs = {}
    identified_items_store = {}

    for query in test_queries:
        print(f"\n--- Analyzing Query: '{query}' ---")
        doc = nlp(query)
        processed_docs[query] = doc 
        phrase_matches = matcher(doc)

        # --- Collect Entities/Matches (Revised Logic v2) ---
        # Prioritize pipeline entities (Ruler runs first with overwrite=True)
        found_items = []
        pipeline_ents = {} 
        
        for ent in doc.ents:
            span_key = (ent.start_char, ent.end_char)
            ent_id = ent.ent_id_ if ent.ent_id_ else None 
            
            # Attempt normalization based on label and ID/text
            mapped_value = ent.text # Default
            try:
                if ent.label_ == "COUNTRY_TERM":
                     # Use ID if it's a known canonical name, else try mapping the text
                    if ent_id and ent_id in lc_countries: mapped_value = ent_id 
                    elif ent.text.lower() in country_mapping: mapped_value = country_mapping[ent.text.lower()]
                elif ent.label_ == "SECTOR_TERM":
                    if ent_id and ent_id in lc_sectors: mapped_value = ent_id
                    elif ent.text.lower() in sector_alias_mapping: mapped_value = sector_alias_mapping[ent.text.lower()]
                # Add Industry/Company normalization if needed using ent_id and lc_ lists
                
            except Exception as e:
                 print(f"  Warning: Error during mapping/normalization for '{ent.text}' ({ent.label_}): {e}")

            # Store the entity, allowing overwrite based on Ruler's priority
            pipeline_ents[span_key] = {
                "text": ent.text, "label": ent.label_, "ent_id": ent_id,
                "mapped_value": mapped_value, 
                "start_char": ent.start_char, "end_char": ent.end_char,
                "source": "Pipeline (Ruler/NER)" 
            }

        found_items.extend(pipeline_ents.values())

        # Add non-overlapping PhraseMatcher results
        pipeline_spans = {(item['start_char'], item['end_char']) for item in found_items}
        for match_id, start, end in phrase_matches:
            span = doc[start:end]
            span_chars = (span.start_char, span.end_char)
            if span_chars not in pipeline_spans:
                label = nlp.vocab.strings[match_id]
                term_text = span.text
                mapped_value = term_text # Basic mapping for PhraseMatcher results if needed
                if label == "COUNTRY_TERM" and span.text.lower() in country_mapping: mapped_value = country_mapping[span.text.lower()]
                elif label == "SECTOR_TERM" and span.text.lower() in sector_alias_mapping: mapped_value = sector_alias_mapping[span.text.lower()]
                
                found_items.append({
                    "text": term_text, "label": label, "mapped_value": mapped_value,
                    "ent_id": None, 
                    "start_char": span.start_char, "end_char": span.end_char,
                    "source": "PhraseMatcher"
                })

        found_items.sort(key=lambda item: item['start_char'])
        identified_items_store[query] = found_items 

        # Display Results
        print("\nIdentified Entities & Terms (Refactored Step 5 v2):")
        if found_items:
            for item in found_items:
                details = f"- Text: '{item['text']}', Label: {item['label']}, Source: {item['source']}"
                if item['mapped_value'] != item['text']: details += f" (Mapped: '{item['mapped_value']}')"
                if item['ent_id']: details += f" (ID: {item['ent_id']})"
                print(details)
        else: print("- No specific entities or terms identified.")
        print("="*80)
else:
    print("\nSkipping Step 5 v2 analysis because the spaCy model could not be loaded.")

'en_core_web_sm' spaCy model already loaded.

--- Loading Gazetteers from Database ---
Loaded 491 unique lowercase terms for 'Security'.
Loaded 11 unique lowercase terms for 'Sector'.
Loaded 7 unique lowercase terms for 'Country'.
Loaded 124 unique lowercase terms for 'Industry'.

--- Setting up PhraseMatcher ---
PhraseMatcher patterns added (primarily multi-word).

--- Setting up EntityRuler ---
EntityRuler added to spaCy pipeline before NER (overwrite=True).




Added 189 patterns to EntityRuler.

--- Analyzing Queries with Refactored Enhanced NER (Step 5 v2) ---

--- Analyzing Query: 'Top 10 most valuable American IT companies.' ---

Identified Entities & Terms (Refactored Step 5 v2):
- Text: 'Top', Label: RANKING_MODIFIER, Source: Pipeline (Ruler/NER)
- Text: '10', Label: CARDINAL, Source: Pipeline (Ruler/NER)
- Text: 'most', Label: RANKING_MODIFIER, Source: Pipeline (Ruler/NER)
- Text: 'American', Label: COUNTRY_TERM, Source: Pipeline (Ruler/NER) (Mapped: 'usa') (ID: usa)
- Text: 'IT', Label: SECTOR_TERM, Source: Pipeline (Ruler/NER) (Mapped: 'information technology') (ID: information technology)

--- Analyzing Query: 'Show details for Apple Inc.' ---

Identified Entities & Terms (Refactored Step 5 v2):
- Text: 'Apple Inc.', Label: ORG, Source: Pipeline (Ruler/NER)

--- Analyzing Query: 'Every French company valued over 1B.' ---

Identified Entities & Terms (Refactored Step 5 v2):
- Text: 'French', Label: COUNTRY_TERM, Source: Pipeline (Rul

In [None]:
"""Step 6 (Refactored): Query Structure Parsing (Intent & Parameter Extraction)"""

import re 
import json # For pretty printing the output

# --- Helper Functions (Keep as is) ---
# parse_monetary_value, map_comparison_operator are likely fine


# --- Helper Functions (Should be defined before using them) ---
def map_comparison_operator(op_text):
    """Maps textual comparison operators to SQL operators."""
    op_text = op_text.lower().strip()
    
    # Direct mappings
    if op_text in ["over", "above", "greater than", "more than", ">", "after"]:
        return ">"
    elif op_text in ["under", "below", "less than", "<", "before"]:
        return "<"
    elif op_text in ["equal to", "equals", "is equal to"]:
        return "="
    elif op_text in ["at least", "at most"]:
        return ">=" if "least" in op_text else "<="
    return None

# Corrected version of parse_monetary_value
import re # Make sure re is imported

def parse_monetary_value(text_value):
    """Converts text like '1B', '$500 million', '1.5T', '$500' to a number."""
    if text_value is None: return None
    
    text_value = str(text_value).lower().replace('$', '').replace(',', '').strip()
    multiplier = 1.0 

    # Check for multipliers first
    if 't' in text_value: 
        multiplier = 1e12
        text_value = text_value.replace('trillion', '').replace('tn', '').replace('t', '')
    elif 'b' in text_value: 
        multiplier = 1e9
        text_value = text_value.replace('billion', '').replace('bn', '').replace('b', '')
    elif 'm' in text_value: 
        multiplier = 1e6
        text_value = text_value.replace('million', '').replace('mn', '').replace('m', '')
    
    # Now extract the numeric part using regex
    try:
        # Regex to find float or integer part, ignoring trailing non-numeric chars
        numeric_part = re.findall(r"^[-+]?\d*\.?\d+", text_value.strip())
        if numeric_part:
            return float(numeric_part[0]) * multiplier 
        else: 
            # print(f"DEBUG: No numeric part found in '{text_value}' after cleaning.")
            return None # No number found
    except (ValueError, TypeError, IndexError):
        # print(f"DEBUG: Error converting '{text_value}' to float.")
        return None # Conversion failed

# --- Normalization Helper ---
def normalize_term(term, original_list, alias_mapping=None):
    """Normalizes a found term against a list of canonical terms (original casing) and optional aliases."""
    term_lower = term.lower().strip()
    
    # 1. Check alias mapping first
    if alias_mapping and term_lower in alias_mapping:
        canonical_lower = alias_mapping[term_lower]
        # Find original casing for the canonical name
        for orig_term in original_list:
            if orig_term.lower() == canonical_lower:
                return orig_term
        return canonical_lower # Return lowercase canonical if original casing not found (shouldn't happen ideally)

    # 2. Check direct match (case-insensitive) against original list
    for orig_term in original_list:
        if orig_term.lower() == term_lower:
            return orig_term # Return the original casing

    # 3. Return original term if no match/normalization found
    return term 

# --- COLUMN_MAP (Keep as is, seems reasonable) ---
COLUMN_MAP = {
    "COMPANY_NAME": "Security", "ORG": "Security", 
    "SECTOR_TERM": "Sector", "INDUSTRY_TERM": "Industry",
    "COUNTRY_TERM": "Country", "GPE": "Country",
    "VALUE_KEYWORD": "Marketcap", "COLUMN_SECTOR": "Sector",
    "COLUMN_INDUSTRY": "Industry", "COLUMN_COUNTRY": "Country",
    "COLUMN_FOUNDED": "Founded", "COLUMN_STOCKPRICE": "Stockprice",
    "MONEY_VALUE": "Marketcap", # Default column for monetary values
    "CARDINAL": None # Cardinal numbers usually represent LIMIT, not a filter column
}

# --- Main Parsing Function (Refactored) ---
def parse_query_structure_refactored(doc, identified_items, 
                                     orig_sectors, orig_industries, orig_countries): # Pass original casing lists
    """
    Parses the spaCy Doc and identified entities/terms into a structured query representation. (Refactored)
    """
    parsed_structure = {
        'intent': None, 'select_cols': [], 'filters': [], 'limit': None,
        'order_by': None, 'distinct': False, 'errors': []
    }

    # Create lowercase versions for lookup inside function
    lc_sectors_set = {s.lower() for s in orig_sectors}
    lc_industries_set = {i.lower() for i in orig_industries}
    lc_countries_set = {c.lower() for c in orig_countries}

    items_by_label = {}
    for item in identified_items:
        label = item['label']
        if label not in items_by_label: items_by_label[label] = []
        items_by_label[label].append(item)
        
    lemmas = [token.lemma_.lower() for token in doc if not token.is_punct | token.is_stop]
    root_verb = next((token for token in doc if token.dep_ == "ROOT" and token.pos_ == "VERB"), None)
    root_lemma = root_verb.lemma_.lower() if root_verb else None

    # --- Refined Intent and Parameter Extraction ---

    has_ranking_modifier = bool(items_by_label.get("RANKING_MODIFIER"))
    has_cardinal = bool(items_by_label.get("CARDINAL"))
    has_company = bool(items_by_label.get("COMPANY_NAME") or items_by_label.get("ORG"))
    has_comparison = bool(items_by_label.get("COMPARISON_OP"))
    has_money = bool(items_by_label.get("MONEY_VALUE"))
    has_price = bool(items_by_label.get("PRICE_VALUE")) # New flag
    has_year = bool(items_by_label.get("YEAR_NUMBER")) # New flag
    has_date_keyword = bool(items_by_label.get("COLUMN_FOUNDED")) # New flag
    has_price_keyword = bool(items_by_label.get("COLUMN_STOCKPRICE")) # New flag
    has_ordering_keyword = bool(items_by_label.get("ORDERING_KEYWORD")) # New flag
    
    has_list_all_keyword = any(lbl in items_by_label for lbl in ["COLUMN_SECTOR", "COLUMN_INDUSTRY", "COLUMN_COUNTRY"]) and \
                           any(l in lemmas for l in ["all", "available"]) and \
                           root_lemma in ["list", "show"]

    mentioned_columns = [lbl for lbl in items_by_label if lbl.startswith("COLUMN_")]
    is_specific_col_req = (root_lemma in ["what", "show", "list", "tell", "give"] and len(mentioned_columns) == 1 and has_company)
    # Check for "List all sectors" type

    # --- Intent Prioritization ---
    
    # 1. List distinct column values
    if has_list_all_keyword:
        parsed_structure['intent'] = 'list_values'
        parsed_structure['distinct'] = True
        col_label = next((lbl for lbl in ["COLUMN_SECTOR", "COLUMN_INDUSTRY", "COLUMN_COUNTRY"] if lbl in items_by_label), None)
        if col_label and col_label in COLUMN_MAP:
            parsed_structure['select_cols'] = [COLUMN_MAP[col_label]]
        else:
            parsed_structure['errors'].append("Could not determine column for listing values.")
            parsed_structure['select_cols'] = ['*'] # Fallback

    # 2. Specific Column Lookup ("What is X of Y?" / "Show X for Y?")
    elif is_specific_col_req:
         parsed_structure['intent'] = 'lookup_specific_column'
         # Extract target company
         company_item = items_by_label.get("COMPANY_NAME", items_by_label.get("ORG", []))[0]
         company_name_norm = normalize_term(company_item['text'], unique_companies_orig)
         parsed_structure['filters'].append({ 'column': 'Security', 'operator': '=', 'value': company_name_norm })
         # Extract requested column
         col_label = mentioned_columns[0]
         if col_label in COLUMN_MAP:
              parsed_structure['select_cols'] = [COLUMN_MAP[col_label]]
         else:
              parsed_structure['errors'].append(f"Could not map requested column: {col_label}")
              parsed_structure['select_cols'] = ['*'] # Fallback

 # 3. Top/Bottom N queries
    elif has_ranking_modifier and has_cardinal:
        parsed_structure['intent'] = 'find_top'
        # ... (Keep existing limit extraction logic) ...
        limit_item = items_by_label["CARDINAL"][0]
        try: parsed_structure['limit'] = int(limit_item['text'])
        except ValueError: parsed_structure['errors'].append(f"Could not parse limit value: {limit_item['text']}")

        # Order By (Refined: Check explicit order keyword OR default to Marketcap)
        order_col = "Marketcap" # Default for ranking
        order_dir = "DESC"      # Default for ranking
        rank_item = items_by_label["RANKING_MODIFIER"][0] 
        
        # Check if an explicit ORDERING_KEYWORD and column are mentioned near the ranking modifier
        if has_ordering_keyword:
             order_kw_item = items_by_label["ORDERING_KEYWORD"][0]
             # Find closest COLUMN_* after ordering keyword
             following_cols = [item for lbl in mentioned_columns for item in items_by_label[lbl] if item['start_char'] > order_kw_item['end_char']]
             following_cols.sort(key=lambda x: x['start_char'])
             if following_cols and following_cols[0]['label'] in COLUMN_MAP:
                  order_col = COLUMN_MAP[following_cols[0]['label']]
                  print(f"DEBUG: Explicit ordering column found via keyword: {order_col}")
             # If explicit ordering is found, maybe default direction to ASC? Or keep DESC for 'top'? Let's keep DESC for now.

        # Check direction based on ranking modifier lemma
        rank_token = doc.char_span(rank_item['start_char'], rank_item['end_char'], label=rank_item['label']) 
        if rank_token and len(rank_token) > 0 and rank_token[0].lemma_.lower() in ["least", "low", "small"]:
            order_dir = "ASC"
            
        parsed_structure['order_by'] = {'column': order_col, 'direction': order_dir}
        parsed_structure['select_cols'] = ['Security', order_col] # Select name and the ordering column


    # 4. Specific Company Lookup (Simpler Case - only if focus is company)
    elif has_company and not has_ranking_modifier and not has_comparison and not mentioned_columns and len(identified_items) < 5: 
        parsed_structure['intent'] = 'lookup_details'
        parsed_structure['select_cols'] = ['*'] 
        company_item = items_by_label.get("COMPANY_NAME", items_by_label.get("ORG", []))[0]
        company_name_norm = normalize_term(company_item['text'], unique_companies_orig) 
        parsed_structure['filters'].append({ 'column': 'Security', 'operator': '=', 'value': company_name_norm })
        
    # 5. Default to Filtered List / General Ordering
    else:
        parsed_structure['intent'] = 'filter_list'
        parsed_structure['select_cols'] = ['Security', 'Marketcap'] # Default columns
        
        # Check for explicit ordering request even without ranking modifier
        if has_ordering_keyword:
             order_kw_item = items_by_label["ORDERING_KEYWORD"][0]
             following_cols = [item for lbl in mentioned_columns for item in items_by_label[lbl] if item['start_char'] > order_kw_item['end_char']]
             following_cols.sort(key=lambda x: x['start_char'])
             if following_cols and following_cols[0]['label'] in COLUMN_MAP:
                  order_col = COLUMN_MAP[following_cols[0]['label']]
                  order_dir = 'ASC' # Default direction for non-ranking sort
                  parsed_structure['order_by'] = {'column': order_col, 'direction': order_dir}
                  print(f"DEBUG: General ordering request found: {order_col} {order_dir}")
    
    added_filter_tuples = set() 
    if parsed_structure['intent'] not in ['list_values', 'lookup_specific_column']: # Don't add extra filters for specific lookups
        # Add filters from lookup intent if needed (already done above)
        for f in parsed_structure['filters']: 
              added_filter_tuples.add( (f['column'], f['operator'], str(f['value'])) )
        
        # Priority Labels for Filters: Specific terms first
        filter_labels_priority = ["COMPANY_NAME", "COUNTRY_TERM", "SECTOR_TERM", "INDUSTRY_TERM"]
        # Fallback Labels: Generic NER tags
        filter_labels_fallback = ["ORG", "GPE"] 


        for item_label in filter_labels_priority + filter_labels_fallback:
            if item_label in items_by_label and item_label in COLUMN_MAP:
                db_column = COLUMN_MAP[item_label]
                if not db_column: continue # Skip labels not mapped to columns (like CARDINAL)

                for item in items_by_label[item_label]:
                    # Skip adding company filter if intent is already lookup_details
                    if parsed_structure['intent'] == 'lookup_details' and db_column == 'Security':
                        continue
                        
                    # Normalize value using appropriate list and mappings
                    filter_value_raw = item.get('mapped_value', item['text']) # Start with mapped value if available
                    normalized_value = filter_value_raw # Default

                    if item_label in ["SECTOR_TERM", "COLUMN_SECTOR"]:
                        normalized_value = normalize_term(filter_value_raw, unique_sectors_orig, sector_alias_mapping)
                    elif item_label in ["INDUSTRY_TERM", "COLUMN_INDUSTRY"]:
                         normalized_value = normalize_term(filter_value_raw, unique_industries_orig) # Add industry alias map if needed
                    elif item_label in ["COUNTRY_TERM", "GPE", "COLUMN_COUNTRY"]:
                         # Use mapped value directly if available and valid, otherwise normalize text
                         if item.get('mapped_value') and item['mapped_value'].lower() in lc_countries_set:
                              normalized_value = normalize_term(item['mapped_value'], unique_countries_orig)
                         else:
                              normalized_value = normalize_term(filter_value_raw, unique_countries_orig, country_mapping)
                    elif item_label in ["COMPANY_NAME", "ORG"]:
                         normalized_value = normalize_term(filter_value_raw, unique_companies_orig)

                    # Check for generic words being misinterpreted (e.g., "financials")
                    # If the normalized term isn't in our known DB lists for that column, maybe ignore it?
                    is_known = True
                    if db_column == 'Sector' and normalized_value.lower() not in lc_sectors_set: is_known = False
                    if db_column == 'Industry' and normalized_value.lower() not in lc_industries_set: is_known = False
                    if db_column == 'Country' and normalized_value.lower() not in lc_countries_set: is_known = False
                    # Add check for company? Might be too strict if DB isn't exhaustive.

                    if not is_known:
                         parsed_structure['errors'].append(f"Ignoring potential filter term '{item['text']}' for column '{db_column}' as it's not a known value.")
                         continue # Skip adding this filter

                    # Add filter if not duplicate
                    filter_tuple = (db_column, '=', str(normalized_value))
                    if filter_tuple not in added_filter_tuples:
                        parsed_structure['filters'].append({
                            'column': db_column, 'operator': '=', 'value': normalized_value
                        })
                        added_filter_tuples.add(filter_tuple)

        # --- NEW: Date Filters ---
        if has_comparison and has_year and has_date_keyword:
            # Simple approach: Assume first comparison applies to first year near "founded"
            op_item = items_by_label["COMPARISON_OP"][0]
            year_item = items_by_label["YEAR_NUMBER"][0]
            
            sql_op = map_comparison_operator(op_item['text'])
            year_val = year_item['text'] # Year is treated as string for now
            
            if sql_op and year_val:
                 filter_column = "Founded" 
                 filter_tuple = (filter_column, sql_op, str(year_val))
                 if filter_tuple not in added_filter_tuples:
                      parsed_structure['filters'].append({'column': filter_column, 'operator': sql_op, 'value': year_val})
                      added_filter_tuples.add(filter_tuple)
                 
        # --- NEW: Price Filters ---
        if has_comparison and has_price and has_price_keyword:
            # Simple approach: Assume first comparison applies to first price value near "stock price"
            op_item = items_by_label["COMPARISON_OP"][0]
            price_item = items_by_label["PRICE_VALUE"][0]
            
            sql_op = map_comparison_operator(op_item['text'])
            numeric_val = parse_monetary_value(price_item['text']) # Use helper

            if sql_op and numeric_val is not None:
                 filter_column = "Stockprice"
                 filter_tuple = (filter_column, sql_op, str(numeric_val))
                 if filter_tuple not in added_filter_tuples:
                     parsed_structure['filters'].append({'column': filter_column, 'operator': sql_op, 'value': numeric_val})
                     added_filter_tuples.add(filter_tuple)


        # Threshold Filters
        comp_ops = items_by_label.get("COMPARISON_OP", [])
        money_vals = items_by_label.get("MONEY_VALUE", [])

        if has_comparison and has_money and not has_price_keyword: # Only apply if not explicitly price
            # Simple: assume first op applies to first money value, related to Marketcap
            # Could be enhanced by checking proximity or keywords like "valued"
            op_item = items_by_label["COMPARISON_OP"][0]
            val_item = items_by_label["MONEY_VALUE"][0]
            filter_column = "Marketcap" 
            sql_op = map_comparison_operator(op_item['text'])
            numeric_val = parse_monetary_value(val_item['text'])
            if sql_op and numeric_val is not None:
                filter_tuple = (filter_column, sql_op, str(numeric_val))
                if filter_tuple not in added_filter_tuples:
                    parsed_structure['filters'].append({'column': filter_column, 'operator': sql_op, 'value': numeric_val})
                    added_filter_tuples.add(filter_tuple)

            else:
                 if not sql_op: parsed_structure['errors'].append(f"Could not map comparison operator: {op_item['text']}")
                 if numeric_val is None: parsed_structure['errors'].append(f"Could not parse monetary value: {val_item['text']}")

    # --- Refine Select Columns (Enhanced) ---
    # Only refine if intent allows multiple columns (filter_list, find_top)
    if parsed_structure['intent'] in ['filter_list', 'find_top']:
        # Start with default columns for the intent
        current_cols = set(parsed_structure['select_cols']) 
        explicit_cols = set()
        explicit_col_request = False
        
        for col_label in mentioned_columns:
             if col_label in COLUMN_MAP and COLUMN_MAP[col_label]:
                  explicit_cols.add(COLUMN_MAP[col_label])
                  explicit_col_request = True
        
        # If specific columns were mentioned, use them (plus Security)
        if explicit_col_request:
             final_cols = ['Security'] + sorted(list(explicit_cols))
             parsed_structure['select_cols'] = list(dict.fromkeys(final_cols)) # Keep order, remove duplicates
        # If no explicit request, stick to the intent's defaults
        # elif not current_cols: # Fallback if defaults were empty
        #    parsed_structure['select_cols'] = ['Security']

    # --- Final Cleanup (Deduplication already handled by set) ---
    # parsed_structure['filters'] = unique_filters # Not needed if using set during addition

    
    return parsed_structure


# --- Test the Refactored Parser ---
if nlp:
    print("\n--- Testing Refactored Query Structure Parsing (Step 6) ---")
    
    # Use the results stored from the refactored Step 5 run
    
    for query in test_queries: # Use the same list as defined in Step 5 test block
        if query in processed_docs and query in identified_items_store:
            print(f"\n--- Parsing Query: '{query}' ---")
            doc = processed_docs[query]
            found_items = identified_items_store[query]
            
            # ----> Call the REFFACTORED parsing function <----
            parsed_result = parse_query_structure_refactored(
                doc, 
                found_items,
                unique_sectors_orig, # Pass original casing list
                unique_industries_orig,
                unique_countries_orig 
            )
            
            # Pretty print the result
            print(json.dumps(parsed_result, indent=2))
            print("="*80)
        else:
            print(f"\n--- Skipping Query: '{query}' (Data not found from Step 5 run) ---")

else:
    print("\nSkipping Step 6 testing because the spaCy model could not be loaded.")


--- Testing Refactored Query Structure Parsing (Step 6) ---

--- Parsing Query: 'Top 10 most valuable American IT companies.' ---
{
  "intent": "find_top",
  "select_cols": [
    "Security",
    "Marketcap"
  ],
  "filters": [
    {
      "column": "Country",
      "operator": "=",
      "value": "USA"
    },
    {
      "column": "Sector",
      "operator": "=",
      "value": "Information Technology"
    }
  ],
  "limit": 10,
  "order_by": {
    "column": "Marketcap",
    "direction": "DESC"
  },
  "distinct": false,
  "errors": []
}

--- Parsing Query: 'Show details for Apple Inc.' ---
{
  "intent": "lookup_details",
  "select_cols": [
    "*"
  ],
  "filters": [
    {
      "column": "Security",
      "operator": "=",
      "value": "Apple Inc."
    }
  ],
  "limit": null,
  "order_by": null,
  "distinct": false,
  "errors": []
}

--- Parsing Query: 'Every French company valued over 1B.' ---
{
  "intent": "filter_list",
  "select_cols": [
    "Security",
    "Marketcap"
  ],
  "fi

In [5]:
"""Step 7 (Revised): Convert Parsed Structure to SQL Query (with Error Propagation)"""

# import sqlite3 
import json # Needed for testing block output

# Helper function format_sql_value remains the same
def format_sql_value(value):
    """Formats a Python value for safe inclusion in an SQL query."""
    if isinstance(value, str):
        escaped_value = value.replace("'", "''")
        return f"'{escaped_value}'"
    elif isinstance(value, (int, float)):
        return str(value)
    elif value is None:
        return "NULL"
    elif isinstance(value, bool):
        return "1" if value else "0"
    else:
        escaped_value = str(value).replace("'", "''")
        # Removing the print warning from here, let the parsing step handle warnings
        # print(f"Warning: Formatting unexpected type {type(value)} as string: {value}") 
        return f"'{escaped_value}'"

# Revised SQL Generation function
def generate_sql_from_structure(parsed_structure):
    """
    Translates the structured dictionary from Step 6 into a valid SQLite query,
    and returns any parsing errors encountered in Step 6.

    Args:
        parsed_structure (dict): The dictionary output from parse_query_structure_refactored.

    Returns:
        tuple: (sql_query_string, list_of_parsing_errors)
    """
    parsing_errors = parsed_structure.get('errors', []) # Get errors from input structure
    sql_query = "-- No SQL generated due to invalid input." # Default SQL if structure is bad

    if not parsed_structure or not parsed_structure.get('intent'):
        parsing_errors.append("Invalid or unparsed query structure provided.")
        return (sql_query, parsing_errors)

    # --- Generate SQL based on structure (same logic as before) ---
    try:
        select_parts = []
        from_clause = 'FROM "companies"' 
        where_clause = None
        orderby_clause = None
        limit_clause = None

        select_prefix = "SELECT DISTINCT" if parsed_structure.get('distinct', False) else "SELECT"
        select_cols = parsed_structure.get('select_cols')
        
        if not select_cols: select_parts = ["*"]
        elif select_cols == ['*']: select_parts = ["*"]
        else: select_parts = [f'"{col}"' for col in select_cols]
        select_clause = f"{select_prefix} {', '.join(select_parts)}"

        filters = parsed_structure.get('filters')
        if filters:
            conditions = []
            for f in filters:
                col, op, val = f.get('column'), f.get('operator', '='), f.get('value')
                if not col: continue
                quoted_col = f'"{col}"'
                if val is None:
                     if op == '=': conditions.append(f"{quoted_col} IS NULL")
                     elif op in ['!=', '<>']: conditions.append(f"{quoted_col} IS NOT NULL")
                     else: continue 
                else:
                     formatted_val = format_sql_value(val)
                     conditions.append(f"{quoted_col} {op} {formatted_val}")
            if conditions: where_clause = "WHERE " + " AND ".join(conditions)

        order_by = parsed_structure.get('order_by')
        if order_by:
            col, direction = order_by.get('column'), order_by.get('direction', 'ASC').upper()
            if col and direction in ['ASC', 'DESC']: orderby_clause = f'ORDER BY "{col}" {direction}'

        limit = parsed_structure.get('limit')
        if limit is not None:
            try:
                limit_val = int(limit)
                if limit_val > 0: limit_clause = f"LIMIT {limit_val}"
            except (ValueError, TypeError): pass # Ignore invalid limit

        query_parts = [select_clause, from_clause, where_clause, orderby_clause, limit_clause]
        sql_query = " ".join(filter(None, query_parts)).strip() + ";"

    except Exception as e:
         # Catch potential errors during SQL string construction itself
         parsing_errors.append(f"Error during SQL generation: {e}")
         sql_query = "-- Error occurred during SQL generation."

    # --- Return both the generated SQL and the list of parsing errors ---
    return (sql_query, parsing_errors)


# --- Test with Example Outputs from Step 6 (using the stored results) ---
# (Ensure prerequisite variables and functions are available from previous steps)

if 'parse_query_structure_refactored' in locals() and 'processed_docs' in locals() and 'identified_items_store' in locals():
    print("\n--- Testing SQL Generation (Step 7 - Revised with Error Propagation) ---")
    
    if 'unique_sectors_orig' not in locals(): 
         print("Error: Gazetteer lists not found. Please run Step 5 first.")
    else:
        for query in test_queries: 
            if query in processed_docs and query in identified_items_store:
                print(f"\n--- Generating SQL for Query: '{query}' ---")
                doc = processed_docs[query]
                found_items = identified_items_store[query]
                
                # Parse structure (includes 'errors' list)
                # Apply the ASC/DESC fix directly here for testing
                parsed_struct = parse_query_structure_refactored(
                    doc, found_items, unique_sectors_orig, 
                    unique_industries_orig, unique_countries_orig 
                )
                # Manual fix for ASC/DESC bug if not applied in function definition yet
                if parsed_struct['intent'] == 'find_top' and parsed_struct.get('order_by'):
                    rank_item = next((item for item in found_items if item['label'] == 'RANKING_MODIFIER'), None)
                    if rank_item:
                        rank_token = doc.char_span(rank_item['start_char'], rank_item['end_char'])
                        if rank_token and len(rank_token) > 0 and rank_token[0].lemma_.lower() in ["least", "low", "small"]:
                             parsed_struct['order_by']['direction'] = 'ASC'
                # End manual fix

                # ---> Generate SQL and get errors <---
                generated_sql, parsing_errors_list = generate_sql_from_structure(parsed_struct)
                
                # Print the intermediate structure and the final SQL + Errors
                print("Parsed Structure:")
                print(json.dumps(parsed_struct, indent=2))
                
                if parsing_errors_list:
                    print("\nParsing Errors/Warnings Encountered:")
                    for err in parsing_errors_list:
                        print(f"- {err}")
                        
                print("\nGenerated SQL:")
                print(generated_sql)
                print("="*80)
            else:
                print(f"\n--- Skipping Query: '{query}' (Data not found from Step 5 run) ---")
else:
    print("\nSkipping Step 7 testing because prerequisite functions/data were not found.")


--- Testing SQL Generation (Step 7 - Revised with Error Propagation) ---

--- Generating SQL for Query: 'Top 10 most valuable American IT companies.' ---
Parsed Structure:
{
  "intent": "find_top",
  "select_cols": [
    "Security",
    "Marketcap"
  ],
  "filters": [
    {
      "column": "Country",
      "operator": "=",
      "value": "USA"
    },
    {
      "column": "Sector",
      "operator": "=",
      "value": "Information Technology"
    }
  ],
  "limit": 10,
  "order_by": {
    "column": "Marketcap",
    "direction": "DESC"
  },
  "distinct": false,
  "errors": []
}

Generated SQL:
SELECT "Security", "Marketcap" FROM "companies" WHERE "Country" = 'USA' AND "Sector" = 'Information Technology' ORDER BY "Marketcap" DESC LIMIT 10;

--- Generating SQL for Query: 'Show details for Apple Inc.' ---
Parsed Structure:
{
  "intent": "lookup_details",
  "select_cols": [
    "*"
  ],
  "filters": [
    {
      "column": "Security",
      "operator": "=",
      "value": "Apple Inc."
    

In [None]:
"""Step 8: Initial Rule-Based SQL Generation Testing (Execution)"""

import sqlite3
import pandas as pd
import json

# --- Ensure prerequisite functions and variables exist ---
# generate_sql_from_structure, parse_query_structure_refactored, 
# processed_docs, identified_items_store, test_queries, db_file,
# unique_sectors_orig, unique_industries_orig, unique_countries_orig

# Database file path
db_file = 'companies_database.db'

# --- Function to Execute SQL ---
def execute_sql_query(sql_query, db_path):
    """Executes a given SQL query against the SQLite database and returns results."""
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        # Use pandas to read results directly into a DataFrame
        results_df = pd.read_sql_query(sql_query, conn)
        return results_df, None # Return DataFrame and no error
    
    except sqlite3.Error as e:
        error_message = f"SQLite Error: {e}\nQuery: {sql_query}"
        print(error_message)
        return None, error_message # Return None and the error message
        
    except Exception as e:
        # Catch other potential errors (e.g., pandas issues)
        error_message = f"An unexpected error occurred: {e}\nQuery: {sql_query}"
        print(error_message)
        return None, error_message # Return None and the error message
        
    finally:
        if conn:
            conn.close()

# --- Testing Loop (Integrating Steps 6, 7, and 8) ---

if 'parse_query_structure_refactored' in locals() and 'generate_sql_from_structure' in locals() and \
   'processed_docs' in locals() and 'identified_items_store' in locals():
    print("\n--- Testing End-to-End Pipeline (Steps 6-8) ---")
    
    if 'unique_sectors_orig' not in locals(): 
         print("Error: Gazetteer lists not found. Please run Step 5 first.")
    else:
        for query in test_queries: 
            print(f"\n\n{'='*20} Processing Query: '{query}' {'='*20}")
            
            if query not in processed_docs or query not in identified_items_store:
                 print(f"--- Skipping Query: '{query}' (Data not found from Step 5 run) ---")
                 continue

            doc = processed_docs[query]
            found_items = identified_items_store[query]
                
            # --- Step 6: Parse Structure ---
            parsed_struct = parse_query_structure_refactored(
                doc, found_items, unique_sectors_orig, 
                unique_industries_orig, unique_countries_orig 
            )
            # Manual fix for ASC/DESC bug 
            if parsed_struct['intent'] == 'find_top' and parsed_struct.get('order_by'):
                rank_item = next((item for item in found_items if item['label'] == 'RANKING_MODIFIER'), None)
                if rank_item:
                    rank_token = doc.char_span(rank_item['start_char'], rank_item['end_char'])
                    # Use correct lemma check
                    if rank_token and len(rank_token) > 0 and rank_token[0].lemma_.lower() in ["least", "low", "small"]: 
                         parsed_struct['order_by']['direction'] = 'ASC'

            # --- Step 7: Generate SQL ---
            generated_sql, parsing_errors_list = generate_sql_from_structure(parsed_struct)
                
            # --- Display Parsing Info ---
            print("\n--- Step 6: Parsed Structure ---")
            print(json.dumps(parsed_struct, indent=2))
                
            if parsing_errors_list:
                print("\n--- Parsing Errors/Warnings ---")
                for err in parsing_errors_list: print(f"- {err}")
                        
            print("\n--- Step 7: Generated SQL ---")
            print(generated_sql)

            # --- Step 8: Execute SQL ---
            print("\n--- Step 8: SQL Execution Result ---")
            
            # Optional: Only execute if no parsing errors? For now, try executing all.
            # if not parsing_errors_list:
            
            results_df, execution_error = execute_sql_query(generated_sql, db_file)
            
            if execution_error:
                print(f"Execution Failed: {execution_error}")
            elif results_df is not None:
                if results_df.empty:
                    print("Query executed successfully, but returned no results.")
                else:
                    print("Query executed successfully. Results:")
                    # Display limited rows for potentially large results
                    print(results_df.to_string(index=False, max_rows=10)) 
            else:
                # This case shouldn't happen with current execute_sql_query logic, but good to have
                 print("Execution did not return results or an error.")
                 
            # else:
            #    print("SQL execution skipped due to parsing errors.")
                 
            print("-" * 80) # Separator for next query

else:
    print("\nSkipping Step 8 testing because prerequisite functions/data were not found.")


--- Testing End-to-End Pipeline (Steps 6-8) ---



--- Step 6: Parsed Structure ---
{
  "intent": "find_top",
  "select_cols": [
    "Security",
    "Marketcap"
  ],
  "filters": [
    {
      "column": "Country",
      "operator": "=",
      "value": "USA"
    },
    {
      "column": "Sector",
      "operator": "=",
      "value": "Information Technology"
    }
  ],
  "limit": 10,
  "order_by": {
    "column": "Marketcap",
    "direction": "DESC"
  },
  "distinct": false,
  "errors": []
}

--- Step 7: Generated SQL ---
SELECT "Security", "Marketcap" FROM "companies" WHERE "Country" = 'USA' AND "Sector" = 'Information Technology' ORDER BY "Marketcap" DESC LIMIT 10;

--- Step 8: SQL Execution Result ---
Query executed successfully. Results:
              Security    Marketcap
             Microsoft 3.033000e+12
            Apple Inc. 2.951000e+12
                Nvidia 1.522000e+12
              Broadcom 5.688700e+11
    Oracle Corporation 3.129100e+11
Advanced Micro Devices 2.854300e

In [None]:
# --- Final Input Cell (Example Structure) ---
# Add a new cell below this one with the following structure:

# ```python
# # --- Interactive Query Cell ---
# import pandas as pd
# import json
# import sqlite3
# # Make sure all necessary functions (parse_query..., generate_sql..., execute_sql...) 
# # and variables (nlp, matcher, gazetteer lists etc.) are defined in the cells above.
# 
# def run_full_pipeline(user_query):
#     print(f"Processing Query: '{user_query}'")
#     
#     # Step 4/5: Process and Extract Entities (Simplified - assumes nlp, matcher etc. are global)
#     doc = nlp(user_query)
#     phrase_matches = matcher(doc)
#     # Regenerate found_items (using logic from Step 5 test block)
#     found_items = []
#     pipeline_ents = {} 
#     for ent in doc.ents:
#         # ... (copy collection logic from Step 5 test block) ...
#         span_key = (ent.start_char, ent.end_char)
#         ent_id = ent.ent_id_ if ent.ent_id_ else None 
#         mapped_value = ent.text # Apply mapping logic here if needed
#         # ...
#         pipeline_ents[span_key] = { ... } # Populate dict
#     found_items.extend(pipeline_ents.values())
#     # ... (copy PhraseMatcher collection logic from Step 5 test block) ...
#     found_items.sort(key=lambda item: item['start_char'])
#     print("\nStep 5: Identified Items:\n", found_items) # Optional: show intermediate step
#
#     # Step 6: Parse Structure
#     parsed_struct = parse_query_structure_refactored(
#         doc, found_items, unique_sectors_orig, 
#         unique_industries_orig, unique_countries_orig 
#     )
#     # Manual fix for ASC/DESC bug 
#     if parsed_struct['intent'] == 'find_top' and parsed_struct.get('order_by'):
#         rank_item = next((item for item in found_items if item['label'] == 'RANKING_MODIFIER'), None)
#         if rank_item:
#             rank_token = doc.char_span(rank_item['start_char'], rank_item['end_char'])
#             if rank_token and len(rank_token) > 0 and rank_token[0].lemma_.lower() in ["least", "low", "small"]: 
#                  parsed_struct['order_by']['direction'] = 'ASC'
#     print("\nStep 6: Parsed Structure:\n", json.dumps(parsed_struct, indent=2))
#
#     # Step 7: Generate SQL
#     generated_sql, parsing_errors_list = generate_sql_from_structure(parsed_struct)
#     if parsing_errors_list:
#         print("\nParsing Errors/Warnings:")
#         for err in parsing_errors_list: print(f"- {err}")
#     print("\nStep 7: Generated SQL:\n", generated_sql)
#
#     # Step 8: Execute SQL
#     print("\nStep 8: Execution Result:")
#     # --- Decision Point: Execute or Not? ---
#     # Option 1: Always try to execute
#     execute = True 
#     # Option 2: Only execute if no parsing errors
#     # execute = not bool(parsing_errors_list) 
#     # Option 3: Execute only if errors are just warnings (e.g., 'Ignoring potential filter...')
#     # execute = all('Ignoring potential filter term' in err for err in parsing_errors_list)
#
#     if execute:
#         results_df, execution_error = execute_sql_query(generated_sql, db_file)
#         if execution_error: print(f"Execution Failed: {execution_error}")
#         elif results_df is not None:
#             if results_df.empty: print("Query executed successfully, but returned no results.")
#             else: print("Query executed successfully. Results:\n", results_df.to_string(index=False))
#         else: print("Execution did not return results or an error.")
#     else:
#         print("SQL execution skipped due to critical parsing errors.")
#
# # --- Get User Input and Run ---
# try:
#     while True:
#         my_query = input("Enter your query (or type 'quit'): ")
#         if my_query.lower() == 'quit':
#             break
#         if my_query:
#            run_full_pipeline(my_query)
#            print("\n" + "="*50 + "\n") # Separator
# except KeyboardInterrupt:
#     print("\nExiting.")
# ```