In [6]:
!python3 -m spacy download en_core_web_sm -q

^C
object address  : 0x1130f1ba0
object refcount : 2
object type     : 0x10d41f210
object type name: KeyboardInterrupt
object repr     : KeyboardInterrupt()
lost sys.stderr


In [1]:
"""Spacy & Numpy import & test."""

import spacy
import numpy as np

try:
    # Load the small English model
    nlp = spacy.load("en_core_web_sm")
    print("spaCy loaded successfully!")
    print(f"Using NumPy version: {np.__version__}") # Check NumPy version

    # Test it
    doc = nlp("This is a test sentence.")
    print("Processed sentence:", doc.text)
    for token in doc:
        print(token.text, token.pos_)

except Exception as e:
    print(f"An error occurred: {e}")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.4 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/Users/nathan/Library/Python/3.11/lib/python/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/nathan/Library/Python/3.11/lib/python/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/nathan/Library/Python/3.11/lib/python/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start(

spaCy loaded successfully!
Using NumPy version: 2.2.4
Processed sentence: This is a test sentence.
This PRON
is AUX
a DET
test NOUN
sentence NOUN
. PUNCT


In [2]:
"""Basic Spacy pipeline test."""

# 'en_core_web_sm' is small and fast, good for starting.
# 'en_core_web_md' or 'en_core_web_lg' are larger but more accurate.
try:
    nlp = spacy.load("en_core_web_sm")
    print("Loaded 'en_core_web_sm' spaCy model.")
# Basic error handling if the model isn't downloaded
except OSError:
    print("spaCy model 'en_core_web_sm' not found.")
    print("Please run: python -m spacy download en_core_web_sm")
    # Depending on the environment, you might need to restart the kernel after download
    # Or exit if the model is critical for subsequent steps
    nlp = None # Set nlp to None to avoid errors later if model loading failed

if nlp:
    # Example Queries (taken from Step 1)
    example_queries = [
        "Top 10 most valuable American IT companies.", # Filtered Ranking
        "Show details for Apple Inc.",                 # Company Info Lookup (using specific name from DB potential)
        "Every French company valued over 1B.",        # Threshold Filtering
        "List all sectors available.",                 # A potential simpler query
        "What is the market cap of Microsoft?",        # Specific detail lookup
    ]

    # Process each query using spaCy
    for query in example_queries:
        print(f"\n--- Analyzing Query: '{query}' ---")
        
        # Process the query with the loaded spaCy model
        doc = nlp(query)
        
        # Print the analysis for each token
        # Using a formatted table similar to the roadmap example
        print(f"{'Token':<15} | {'Lemma':<15} | {'POS':<7} | {'Dep':<10} | {'Detailed Tag':<7} | {'Entity Type'}")
        print("-" * 80)
        for token in doc:
            print(f"{token.text:<15} | {token.lemma_:<15} | {token.pos_:<7} | {token.dep_:<10} | {token.tag_:<12} | {token.ent_type_ if token.ent_type_ else '-'}")

        # Optional: Display Named Entities found by the base model (precursor to Step 5)
        print("\nBase Named Entities Found:")
        if doc.ents:
            for ent in doc.ents:
                print(f"- Entity: '{ent.text}', Label: {ent.label_} ({spacy.explain(ent.label_)})")
        else:
            print("- No named entities found by the base model.")
        
        print("="*80)

else:
    print("\nSkipping NLP analysis because the spaCy model could not be loaded.")

Loaded 'en_core_web_sm' spaCy model.

--- Analyzing Query: 'Top 10 most valuable American IT companies.' ---
Token           | Lemma           | POS     | Dep        | Detailed Tag | Entity Type
--------------------------------------------------------------------------------
Top             | top             | ADJ     | ROOT       | JJ           | -
10              | 10              | NUM     | nummod     | CD           | CARDINAL
most            | most            | ADV     | advmod     | RBS          | -
valuable        | valuable        | ADJ     | amod       | JJ           | -
American        | american        | ADJ     | amod       | JJ           | NORP
IT              | IT              | PROPN   | compound   | NNP          | -
companies       | company         | NOUN    | npadvmod   | NNS          | -
.               | .               | PUNCT   | punct      | .            | -

Base Named Entities Found:
- Entity: '10', Label: CARDINAL (Numerals that do not fall under another type)

In [3]:
# Force reinstall pandas to ensure it's compiled against the current numpy version
%pip uninstall pandas -y
%pip install pandas --no-cache-dir

# Also good practice to ensure numpy is reasonably up-to-date, though reinstalling pandas is key
%pip install --upgrade numpy --no-cache-dir

print("Reinstalled pandas and updated numpy. Please RESTART THE KERNEL now.")
print("After restarting, re-run the previous cells (database setup, Step 4) and then try Step 5 again.")

^C
object address  : 0x1077fba60
object refcount : 2
object type     : 0x106566210
object type name: KeyboardInterrupt
object repr     : KeyboardInterrupt()
lost sys.stderr
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip3 install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Reinstalled pandas and updated numpy. Please RESTART THE KERNEL now.
After restarting, re-run the previous cells (database se

In [5]:
"""Adding Gazetteer, PhraseMatcher and EntityRuler to improve identification of key pieces of information."""

from spacy.matcher import PhraseMatcher
from spacy.pipeline import EntityRuler
import sqlite3
import pandas as pd # Needed for reading from DB


# Configuration
db_file = 'companies_database.db'
table_name = 'companies'

# Reload the spaCy model if it wasn't loaded or if the kernel restarted
# Using the small model again for consistency
try:
    # nlp = spacy.load("en_core_web_sm") # Reload if necessary
    if 'nlp' not in locals() or nlp is None: # Check if nlp exists from previous cell
         nlp = spacy.load("en_core_web_sm")
         print("Reloaded 'en_core_web_sm' spaCy model for Step 5.")
    else:
        print("'en_core_web_sm' spaCy model already loaded.")
except OSError:
    print("spaCy model 'en_core_web_sm' not found. Cannot proceed with Step 5.")
    print("Please run: python -m spacy download en_core_web_sm")
    nlp = None

# Gazetteer Loading Function
def load_terms_from_db(db_path, table, column_name):
    """Loads unique, non-null terms from a specific column in the SQLite DB."""
    terms = set() # Use a set for automatic deduplication
    conn = None
    try:
        conn = sqlite3.connect(db_path)
        query = f"SELECT DISTINCT \"{column_name}\" FROM {table} WHERE \"{column_name}\" IS NOT NULL AND \"{column_name}\" != ''"
        # Using pandas for robust reading, handles potential issues better
        df = pd.read_sql_query(query, conn)
        # Ensure the column exists before trying to access it
        if column_name in df.columns:
             # Convert to lowercase and strip whitespace for consistency before adding
            terms.update(term.lower().strip() for term in df[column_name].astype(str))
        else:
             print(f"Warning: Column '{column_name}' not found in table '{table}'.")

    except sqlite3.Error as e:
        print(f"SQLite error loading terms for '{column_name}': {e}")
    except Exception as e:
        print(f"An unexpected error occurred loading terms for '{column_name}': {e}")
    finally:
        if conn:
            conn.close()
            
    # Remove any empty strings that might have crept in
    terms.discard('')
    print(f"Loaded {len(terms)} unique terms for '{column_name}'.")
    # print(f"Sample terms for {column_name}: {list(terms)[:5]}") # Optional: print samples
    return list(terms) # Return as a list

# Main Execution Block for Step 5
if nlp:
    # 1. Load Gazetteers from Database
    # A gazetteer is a precompiled list of named entities (such as companies, countries, 
    # industries, and financial terms) used to improve Named Entity Recognition (NER) 
    # and query parsing in your NLP-to-SQL system.
    print("\n--- Loading Gazetteers from Database ---")
    unique_companies = load_terms_from_db(db_file, table_name, 'Security')
    unique_sectors = load_terms_from_db(db_file, table_name, 'Sector')
    unique_countries = load_terms_from_db(db_file, table_name, 'Country')
    # Add Industries as well? Might be useful depending on query complexity
    unique_industries = load_terms_from_db(db_file, table_name, 'Industry')

    # Add common country variations or adjectives if needed
    # Example: Map "American" or "US" to "USA" if "USA" is the value in your DB
    country_mapping = {
        "american": "usa",
        "us": "usa",
        "u.s.": "usa",
        "u.s.a": "usa",
        "uk": "united kingdom",
        "u.k.": "united kingdom",
        "french": "france",
        "german": "germany",
        # [...]
    }
    # Combine DB countries with mapped variations
    all_country_terms = set(unique_countries)
    all_country_terms.update(country_mapping.keys())


    # 2. Setup PhraseMatcher
    # PhraseMatcher is used to match exact phrases in a given text using word tokens. 
    # It is faster and more efficient than regex for multi-word phrases.
    print("\n--- Setting up PhraseMatcher ---")
    matcher = PhraseMatcher(nlp.vocab, attr='LOWER') # Case-insensitive matching

    # Add patterns for each gazetteer list
    # Use nlp.pipe for efficiency with many terms
    company_patterns = list(nlp.pipe(unique_companies))
    sector_patterns = list(nlp.pipe(unique_sectors))
    # Use all_country_terms which includes variations
    country_patterns = list(nlp.pipe(all_country_terms)) 
    industry_patterns = list(nlp.pipe(unique_industries))

    # Add patterns to the matcher with specific labels
    matcher.add("COMPANY_NAME", company_patterns)
    matcher.add("SECTOR_TERM", sector_patterns)
    matcher.add("COUNTRY_TERM", country_patterns) # Includes variations now
    matcher.add("INDUSTRY_TERM", industry_patterns)
    print("PhraseMatcher patterns added.")


    # 3. Setup EntityRuler for Patterns
    # EntityRuler adds custom named entities (NER) based on token patterns and phrase matching.
    print("\n--- Setting up EntityRuler ---")
    # Try adding the ruler *before* NER to give custom patterns precedence
    if "entity_ruler" not in nlp.pipe_names:
        ruler = nlp.add_pipe("entity_ruler", before="ner")
        print("EntityRuler added to spaCy pipeline before NER.")
    else:
        ruler = nlp.get_pipe("entity_ruler")
        print("EntityRuler already exists in pipeline.")

    # Define patterns for EntityRuler
    # Using dictionaries for token attributes (see spaCy Matcher docs)
    entity_patterns = [
        # Monetary Values (simple B/M/T - can be expanded)
        {"label": "MONEY_VALUE", "pattern": [{"IS_DIGIT": True}, {"LOWER": {"IN": ["b", "bn", "billion"]}}]},
        {"label": "MONEY_VALUE", "pattern": [{"IS_DIGIT": True}, {"LOWER": {"IN": ["m", "mn", "million"]}}]},
        {"label": "MONEY_VALUE", "pattern": [{"IS_DIGIT": True}, {"LOWER": {"IN": ["t", "tn", "trillion"]}}]},
        # Handle "$1B" style - optional $ prefix
        {"label": "MONEY_VALUE", "pattern": [{"LOWER":"$", "OP": "?"}, {"IS_DIGIT": True}, {"LOWER": {"IN": ["b", "bn", "billion"]}}]},
        {"label": "MONEY_VALUE", "pattern": [{"LOWER":"$", "OP": "?"}, {"IS_DIGIT": True}, {"LOWER": {"IN": ["m", "mn", "million"]}}]},
        {"label": "MONEY_VALUE", "pattern": [{"LOWER":"$", "OP": "?"}, {"IS_DIGIT": True}, {"LOWER": {"IN": ["t", "tn", "trillion"]}}]},
        
        # Ranking / Top N Indicators
        {"label": "RANKING_MODIFIER", "pattern": [{"LOWER": "top"}]},
        {"label": "RANKING_MODIFIER", "pattern": [{"LOWER": {"IN": ["most", "highest", "largest", "biggest"]}}]},
        {"label": "RANKING_MODIFIER", "pattern": [{"LOWER": {"IN": ["least", "lowest", "smallest"]}}]}, # Handle bottom N?

        # Comparison Operators/Phrases (for thresholds like "over 1B")
        {"label": "COMPARISON_OP", "pattern": [{"LOWER": "over"}]},
        {"label": "COMPARISON_OP", "pattern": [{"LOWER": "above"}]},
        {"label": "COMPARISON_OP", "pattern": [{"LOWER": "more"}, {"LOWER": "than"}]},
        {"label": "COMPARISON_OP", "pattern": [{"LOWER": "greater"}, {"LOWER": "than"}]},
        {"label": "COMPARISON_OP", "pattern": [{"LOWER": ">"}]}, # Handle symbol
        {"label": "COMPARISON_OP", "pattern": [{"LOWER": "under"}]},
        {"label": "COMPARISON_OP", "pattern": [{"LOWER": "below"}]},
        {"label": "COMPARISON_OP", "pattern": [{"LOWER": "less"}, {"LOWER": "than"}]},
        {"label": "COMPARISON_OP", "pattern": [{"LOWER": "<"}]}, # Handle symbol

        # Keywords indicating value/market cap (can be expanded)
        {"label": "VALUE_KEYWORD", "pattern": [{"LOWER": "valued"}]},
        {"label": "VALUE_KEYWORD", "pattern": [{"LOWER": "value"}]},
        {"label": "VALUE_KEYWORD", "pattern": [{"LOWER": "marketcap"}]},
        {"label": "VALUE_KEYWORD", "pattern": [{"LOWER": "market"}, {"LOWER": "cap"}]}, # Multi-word
        {"label": "VALUE_KEYWORD", "pattern": [{"LOWER": "worth"}]},

        # Keywords for specific columns
        {"label": "COLUMN_SECTOR", "pattern": [{"LOWER": "sector"}]},
        {"label": "COLUMN_SECTOR", "pattern": [{"LOWER": "sectors"}]},
        {"label": "COLUMN_INDUSTRY", "pattern": [{"LOWER": "industry"}]},
        {"label": "COLUMN_INDUSTRY", "pattern": [{"LOWER": "industries"}]},
        {"label": "COLUMN_COUNTRY", "pattern": [{"LOWER": "country"}]},
        {"label": "COLUMN_COUNTRY", "pattern": [{"LOWER": "countries"}]},
        {"label": "COLUMN_FOUNDED", "pattern": [{"LOWER": "founded"}]},
         {"label": "COLUMN_STOCKPRICE", "pattern": [{"LOWER": "stock"}]}, # Simple match for stock price
         {"label": "COLUMN_STOCKPRICE", "pattern": [{"LOWER": "price"}]},
         {"label": "COLUMN_STOCKPRICE", "pattern": [{"LOWER": "stockprice"}]},
        
        # Add 'IT' specifically as a SECTOR_TERM if PhraseMatcher misses it due to tokenization
        # Overwrite potentially wrong base NER tags
        {"label": "SECTOR_TERM", "pattern": [{"TEXT": "IT"}], "id": "information_technology"}, # ID helps map later
        {"label": "SECTOR_TERM", "pattern": [{"LOWER": "info"}, {"LOWER": "tech"}], "id": "information_technology"},
        
    ]

    # Add patterns to the ruler
    # Note: If patterns conflict, the first one added usually wins for EntityRuler.
    # `overwrite_ents=True` allows ruler patterns to overwrite existing entities (from model or previous patterns)
    ruler.add_patterns(entity_patterns)
    # ruler.initialize(lambda: [], nlp=nlp, overwrite_ents=True) # Ensure it's initialized if adding patterns after loading nlp
    print(f"Added {len(entity_patterns)} patterns to EntityRuler.")

    # --- Process Example Queries with Enhanced NER ---
    # Use the same queries as before
    example_queries = [
        "Top 10 most valuable American IT companies.",
        "Show details for Apple Inc.",
        "Every French company valued over 1B.",
        "List all sectors available.",
        "What is the market cap of Microsoft?",
        "Which German companies are in the Health Care sector?", # New test case
        "Find companies worth more than 500 billion dollars", # New test case
        "Show financials for Tesla", # Test company name matching
        "list info tech companies in the US", # Test variations
    ]

    print("\n--- Analyzing Queries with Enhanced NER ---")
    for query in example_queries:
        print(f"\n--- Analyzing Query: '{query}' ---")

        # Process with the pipeline (now includes EntityRuler)
        doc = nlp(query)

        # Apply the PhraseMatcher
        phrase_matches = matcher(doc)

        # --- Collect and Display Entities/Matches ---
        # Store findings in a list for clarity, handling overlaps later if needed
        found_items = []

        # 1. Add Entities from the main pipeline (Base NER + EntityRuler)
        for ent in doc.ents:
            found_items.append({
                "text": ent.text,
                "label": ent.label_,
                "start_char": ent.start_char,
                "end_char": ent.end_char,
                "source": "Pipeline (NER/Ruler)"
            })

        # 2. Add Matches from PhraseMatcher
        # Be careful about adding duplicates if EntityRuler already caught it
        # Simple approach: add if the exact span isn't already found by the pipeline
        pipeline_spans = {(ent.start_char, ent.end_char) for ent in doc.ents}

        for match_id, start, end in phrase_matches:
            span = doc[start:end]
            span_chars = (span.start_char, span.end_char)
            
            # Check if this exact span was already captured
            if span_chars not in pipeline_spans:
                 # Map match_id (string name) back to a label
                label = nlp.vocab.strings[match_id]
                
                # Attempt to map country variations back to the DB value if needed
                term_text = span.text
                mapped_term = term_text # Default to original text
                if label == "COUNTRY_TERM" and span.text.lower() in country_mapping:
                    mapped_term = country_mapping[span.text.lower()] # Use the mapped value (e.g., "usa")
                    
                found_items.append({
                    "text": term_text, # Keep original text for context
                    "label": label,
                    "mapped_value": mapped_term, # Store the potentially mapped value
                    "start_char": span.start_char,
                    "end_char": span.end_char,
                    "source": "PhraseMatcher"
                })


        # Sort findings by start position for readability
        found_items.sort(key=lambda item: item['start_char'])

        print("\nIdentified Entities & Terms:")
        if found_items:
            for item in found_items:
                details = f"Text: '{item['text']}', Label: {item['label']}, Source: {item['source']}"
                if 'mapped_value' in item and item['mapped_value'] != item['text']:
                    details += f" (Mapped: '{item['mapped_value']}')"
                print(f"- {details}")
        else:
            print("- No specific entities or terms identified by custom rules or matchers.")
            # Optionally print base tokens again if nothing custom found
            # print("\nBasic Token Analysis:")
            # for token in doc:
            #     print(f"  {token.text:<10} {token.lemma_:<10} {token.pos_:<6} {token.dep_}")


        print("="*80)

else:
    print("\nSkipping Step 5 because the spaCy model could not be loaded.")

'en_core_web_sm' spaCy model already loaded.

--- Loading Gazetteers from Database ---
Loaded 491 unique terms for 'Security'.
Loaded 11 unique terms for 'Sector'.
Loaded 7 unique terms for 'Country'.
Loaded 124 unique terms for 'Industry'.

--- Setting up PhraseMatcher ---
PhraseMatcher patterns added.

--- Setting up EntityRuler ---
EntityRuler already exists in pipeline.
Added 35 patterns to EntityRuler.

--- Analyzing Queries with Enhanced NER ---

--- Analyzing Query: 'Top 10 most valuable American IT companies.' ---

Identified Entities & Terms:
- Text: 'Top', Label: RANKING_MODIFIER, Source: Pipeline (NER/Ruler)
- Text: '10', Label: CARDINAL, Source: Pipeline (NER/Ruler)
- Text: 'most', Label: RANKING_MODIFIER, Source: Pipeline (NER/Ruler)
- Text: 'American', Label: NORP, Source: Pipeline (NER/Ruler)
- Text: 'IT', Label: SECTOR_TERM, Source: Pipeline (NER/Ruler)

--- Analyzing Query: 'Show details for Apple Inc.' ---

Identified Entities & Terms:
- Text: 'Apple Inc.', Label: ORG