<a href="https://colab.research.google.com/github/HarinduR/FeatherFind/blob/Keyword-Bird-Finder/FeatherFinder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import spacy
from spacy.matcher import Matcher

In [4]:
# Load the spaCy pipeline named "en_core_web_sm" and keep it in the module-level `nlp`.
nlp = spacy.load("en_core_web_sm")

In [7]:
import spacy
from spacy.matcher import Matcher
from spacy.util import filter_spans

# Shared spaCy pipeline; extract_features_optimized() below uses it to parse text
# and to obtain the vocabulary for its Matcher.
nlp = spacy.load("en_core_web_sm")

# Maps a less common colour word to the canonical colour name it stands for
# (for example "azure" is reported as "blue").
COLOR_SYNONYMS = {
    "azure": "blue",
    "crimson": "red",
    "scarlet": "red",
    "emerald": "green",
    "ivory": "white",
    "charcoal": "black",
}

# Lower-case habitat words the feature extractor looks for.
HABITAT_TERMS = [
    "forest",
    "wetland",
    "desert",
    "water",
    "mountains",
    "grassland",
    "marsh",
    "coast",
]

# Lower-case geographic region names (may span more than one word).
REGION_TERMS = [
    "north america",
    "asia",
    "europe",
    "africa",
    "australia",
    "south america",
]

# Colour words recognised in addition to the keys and values of COLOR_SYNONYMS.
_BASIC_COLORS = frozenset({
    "black", "white", "gray", "grey", "brown", "red",
    "orange", "yellow", "green", "blue", "purple", "pink",
})

# Nouns whose adjectival modifiers are read as body/plumage colours.
_COLORED_BODY_PARTS = frozenset(
    {"feathers", "wings", "chest", "body", "plumage", "tail"}
)


def _known_colors():
    """Return every canonical colour word the extractor can recognise."""
    return _BASIC_COLORS | set(COLOR_SYNONYMS.values())


def _record_color(features, color):
    """Store *color* as the primary colour, or as the secondary one if primary is taken."""
    slots = features["color"]
    if color in (slots["primary"], slots["secondary"]):
        # The same word is seen by both the dependency pass and the matcher pass.
        return
    if slots["primary"] is None:
        slots["primary"] = color
    else:
        slots["secondary"] = color


def _build_feature_matcher(vocab, colors):
    """Build the rule-based Matcher holding the habitat/region/diet/beak-leg/colour patterns."""
    matcher = Matcher(vocab)

    # Habitat, e.g. "in forests", "near the marsh". The term is matched on its
    # surface form or on its lemma so that plurals are found as well.
    habitat_prefix = [
        {"LOWER": {"IN": ["in", "near", "around", "found"]}},
        {"POS": "DET", "OP": "?"},
    ]
    matcher.add("HABITAT", [
        habitat_prefix + [{"LOWER": {"IN": HABITAT_TERMS}}],
        habitat_prefix + [{"LEMMA": {"IN": HABITAT_TERMS}}],
    ])

    # Region recognised through named entities, e.g. "from india".
    matcher.add("REGION", [[
        {"LOWER": {"IN": ["in", "from", "found"]}},
        {"ENT_TYPE": "GPE"},
    ]])

    # Diet, e.g. "eats seeds", "feeds on fish" (the preposition is optional).
    matcher.add("DIET", [[
        {"LOWER": {"IN": ["eats", "feeds", "consumes", "diet"]}},
        {"POS": "ADP", "OP": "?"},
        {"POS": "NOUN"},
    ]])

    # Beak/leg attributes, e.g. "long beak", "short yellow legs".
    matcher.add("BEAK_LEG_ATTR", [[
        {"POS": {"IN": ["ADJ", "NUM"]}, "OP": "+"},
        {"LOWER": {"IN": ["beak", "legs", "bill"]}},
    ]])

    # Stand-alone colour words: canonical colours plus every synonym key.
    color_words = sorted(colors | set(COLOR_SYNONYMS))
    matcher.add("COLOR", [[{"LOWER": {"IN": color_words}}]])
    return matcher


def extract_features_optimized(text):
    """Extract bird attributes from a free-text description.

    The text is lower-cased, parsed with the module-level ``nlp`` pipeline and
    scanned in three passes: adjectival modifiers from the dependency parse,
    rule-based Matcher patterns, and a final lookup of region names and
    "<colour> and <colour>" pairs.

    Args:
        text: Description of a bird, e.g. "small blue bird found in forests".

    Returns:
        A dict with the keys ``size``, ``color`` (``primary``/``secondary``),
        ``habitat``, ``region``, ``diet``, ``beak`` and ``legs``
        (``length``/``color``), ``feathers``, ``common_name`` and
        ``scientific_name``. Values that were not found stay ``None``;
        ``feathers``, ``common_name`` and ``scientific_name`` are never
        filled by this function.
    """
    # Imported here so the block does not depend on an extra top-of-file import.
    from spacy.tokens import Span

    doc = nlp(text.lower())
    features = {
        "size": None,
        "color": {"primary": None, "secondary": None},
        "habitat": None,
        "region": None,
        "diet": None,
        "beak": {"length": None, "color": None},
        "legs": {"length": None, "color": None},
        "feathers": {"primary_color": None, "secondary_color": None},
        "common_name": None,
        "scientific_name": None
    }
    colors = _known_colors()

    # ================================================
    # Dependency parsing: adjectives modifying "bird" or a body part
    # ================================================
    for token in doc:
        if token.dep_ != "amod":
            continue
        head = token.head.text
        word = COLOR_SYNONYMS.get(token.text, token.text)
        if head in _COLORED_BODY_PARTS:
            # e.g. "red wings", "blue chest"
            _record_color(features, word)
        elif head == "bird":
            # "blue bird" is a colour, "small bird" is a size; without this
            # split "small blue bird" yields size="blue", colour="small".
            if word in colors:
                _record_color(features, word)
            else:
                features["size"] = token.text

    # ================================================
    # Matcher patterns
    # ================================================
    matcher = _build_feature_matcher(nlp.vocab, colors)
    # A plain doc[start:end] slice carries no label, so the match id has to be
    # attached explicitly for the dispatch below to see which rule fired.
    labelled = [
        Span(doc, start, end, label=match_id)
        for match_id, start, end in matcher(doc)
    ]

    for span in filter_spans(labelled):  # drop overlapping matches, longest wins
        label = span.label_
        last = span[-1]

        if label == "HABITAT":
            # Report the listed term ("forest") rather than its plural form.
            features["habitat"] = (
                last.lemma_ if last.lemma_ in HABITAT_TERMS else last.text
            )
        elif label == "REGION":
            features["region"] = last.text
        elif label == "DIET":
            features["diet"] = last.text
        elif label == "BEAK_LEG_ATTR":
            part = features["legs"] if last.text == "legs" else features["beak"]
            for modifier in span[:-1]:
                word = COLOR_SYNONYMS.get(modifier.text, modifier.text)
                if word in colors:
                    part["color"] = word
                else:
                    part["length"] = modifier.text
        elif label == "COLOR":
            _record_color(features, COLOR_SYNONYMS.get(span.text, span.text))

    # ================================================
    # Post-processing
    # ================================================
    tokens = [tok for tok in doc if not tok.is_space]

    # Known region names take precedence over the entity-based match; padding
    # with spaces keeps the comparison on whole tokens and lets multi-word
    # names such as "north america" match.
    padded = " " + " ".join(tok.text for tok in tokens) + " "
    for term in REGION_TERMS:
        if " " + term + " " in padded:
            features["region"] = term
            break

    # An explicit "<colour> and <colour>" pair fixes the primary/secondary order.
    for before, middle, after in zip(tokens, tokens[1:], tokens[2:]):
        if middle.text != "and":
            continue
        left = COLOR_SYNONYMS.get(before.text, before.text)
        right = COLOR_SYNONYMS.get(after.text, after.text)
        if left in colors and right in colors:
            features["color"]["primary"] = left
            features["color"]["secondary"] = right
            break

    return features



