<a href="https://colab.research.google.com/github/HarinduR/FeatherFind/blob/Keyword-Bird-Finder/FeatherFinder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
import spacy
from spacy.matcher import Matcher
from spacy.util import filter_spans
from spacy.tokens import Span

# Load the spaCy pipeline once at module level; extract_features_optimized()
# reuses this shared object on every call instead of reloading the model.
nlp = spacy.load("en_core_web_sm")

# Fancy colour words mapped onto the basic colour name stored in the result.
COLOR_SYNONYMS = dict(
    azure="blue",
    crimson="red",
    scarlet="red",
    emerald="green",
    ivory="white",
    charcoal="black",
)

# Vocabularies used by the extractor (all compared against lower-cased text).
HABITAT_TERMS = [
    "forest",
    "wetland",
    "desert",
    "water",
    "mountains",
    "grassland",
    "marsh",
    "coast",
]
SIZE_TERMS = [
    "small",
    "large",
    "tiny",
    "big",
    "medium",
    "giant",
]
CONTINENTS = [
    "asia",
    "europe",
    "africa",
    "america",
    "australia",
    "antarctica",
]
DIRECTIONS = [
    "north",
    "south",
    "east",
    "west",
]

def extract_features_optimized(text):
    """Extract structured bird attributes from a free-text description.

    The text is lower-cased and parsed with the module-level ``nlp`` pipeline,
    then mined in two passes:

    1. A dependency pass over adjectival modifiers (``amod``) of "bird" and
       of body-part nouns, which yields the overall size and the
       primary/secondary colours.
    2. A ``Matcher`` pass that yields habitat, region, diet and the
       size/colour of eyes, beak (or "bill") and legs.

    Args:
        text: Description of a bird, e.g.
            "small red bird with black beak seen in south asia".

    Returns:
        dict with the keys ``size``, ``color`` (``primary``/``secondary``),
        ``habitat``, ``region``, ``diet``, ``eyes``, ``beak``, ``legs`` (each
        a ``size``/``color`` dict) and ``feathers``. Values that were not
        found stay ``None``; ``feathers`` is never populated here.
    """
    doc = nlp(text.lower())
    features = {
        "size": None,
        "color": {"primary": None, "secondary": None},
        "habitat": None,
        "region": None,
        "diet": None,
        "eyes": {"size": None, "color": None},
        "beak": {"size": None, "color": None},
        "legs": {"size": None, "color": None},
        "feathers": None
    }

    # --- Dependency pass: overall size and colours ---
    body_parts = ("feathers", "wings", "chest", "body", "plumage", "tail")
    colors = features["color"]

    for token in doc:
        if token.dep_ != "amod":
            continue
        head_text = token.head.text
        is_size = token.text in SIZE_TERMS

        if head_text == "bird":
            if is_size:
                features["size"] = token.text
            elif not colors["primary"]:
                colors["primary"] = COLOR_SYNONYMS.get(token.text, token.text)
        elif head_text in body_parts:
            # A compound modifier on the body-part noun wins over the
            # adjective itself.
            compound = next(
                (c for c in token.head.children if c.dep_ == "compound"), None
            )
            if compound is not None:
                color = COLOR_SYNONYMS.get(compound.text, compound.text)
            elif is_size:
                # "large wings" describes size, not colour: do not record it.
                color = None
            else:
                color = COLOR_SYNONYMS.get(token.text, token.text)

            if color:
                slot = "secondary" if colors["primary"] else "primary"
                colors[slot] = color
        else:
            continue

        # Coordinated adjectives ("blue and green bird") hang off the first
        # adjective as `conj`; they provide the secondary colour.
        if not is_size:
            for child in token.children:
                if child.dep_ == "conj":
                    colors["secondary"] = COLOR_SYNONYMS.get(child.text, child.text)

    # --- Matcher pass: habitat, region, diet, body-part details ---
    matcher = Matcher(nlp.vocab)

    matcher.add("HABITAT", [
        [{"LOWER": {"IN": ["in", "near", "around", "found"]}},
         {"LOWER": {"IN": HABITAT_TERMS}}]
    ])

    matcher.add("REGION", [
        [
            {"LOWER": {"IN": ["in", "from", "found"]}},
            {"LOWER": {"IN": DIRECTIONS}, "OP": "*"},
            {"LOWER": {"IN": CONTINENTS}}
        ]
    ])

    matcher.add("DIET", [
        [{"LOWER": {"IN": ["eats", "feeds", "consumes", "diet"]}},
         {"POS": "NOUN"}]
    ])

    matcher.add("PHYSICAL_CHARACTERISTICS", [
        [
            {"POS": {"IN": ["ADV", "ADJ", "NUM"]}, "OP": "*"},
            {"POS": {"IN": ["ADJ", "NUM"]}, "OP": "+"},
            {"LOWER": {"IN": ["eyes", "beak", "bill", "legs"]}}
        ]
    ])

    # The pattern name is stored on each span so it survives filter_spans().
    if not Span.has_extension("match_label"):
        Span.set_extension("match_label", default=None)

    spans = []
    for match_id, start, end in matcher(doc):
        span = doc[start:end]
        span._.match_label = nlp.vocab.strings[match_id]
        spans.append(span)

    # "bill" is matched by the pattern but the result only has a "beak" slot;
    # without this alias features["bill"] raised KeyError.
    category_aliases = {"bill": "beak"}

    # filter_spans() removes overlapping matches, preferring the longest span.
    for span in filter_spans(spans):
        label = span._.match_label
        if label == "HABITAT":
            features["habitat"] = span[-1].text
        elif label == "REGION":
            # Drop the leading trigger word ("in", "from", "found").
            features["region"] = span[1:].text
        elif label == "DIET":
            features["diet"] = span[-1].text
        elif label == "PHYSICAL_CHARACTERISTICS":
            # The pattern guarantees at least one modifier before the noun.
            *modifiers, category = span.text.split()
            category = category_aliases.get(category, category)
            for word in modifiers:
                if word in SIZE_TERMS:
                    features[category]["size"] = word
                else:
                    features[category]["color"] = COLOR_SYNONYMS.get(word, word)

    return features

# Demo run: feed a multi-sentence description to the extractor and print
# the resulting feature dict.
example_text = (
    "giant bird with green body and azure feathers. "
    "black large beak. "
    "lives in water and eats crabs. "
    "this bird was seen in south asia. "
    "also it had  blue eyes "
)
# Shorter alternative input (coordinated colours); not used by the run below.
example_text2 = "blue and green bird"

features = extract_features_optimized(example_text)
print(features)


{'size': 'giant', 'color': {'primary': 'green', 'secondary': 'blue'}, 'habitat': 'water', 'region': 'south asia', 'diet': 'crabs', 'eyes': {'size': None, 'color': 'blue'}, 'beak': {'size': 'large', 'color': 'black'}, 'legs': {'size': None, 'color': None}, 'feathers': None}


Enhanced version of the code above, as suggested by ChatGPT

In [53]:
import spacy
from spacy.matcher import Matcher
from spacy.util import filter_spans
from spacy.tokens import Span
import logging

# Configure the root logger to emit DEBUG records as "LEVEL: message"; the
# extractor below reports every matched span and extracted value at DEBUG.
logging.basicConfig(level=logging.DEBUG, format='%(levelname)s: %(message)s')

# Load the spaCy pipeline once; extract_features_optimized() reuses it.
nlp = spacy.load("en_core_web_sm")

# Fancy colour words mapped onto the basic colour name stored in the result.
COLOR_SYNONYMS = dict(
    azure="blue",
    crimson="red",
    scarlet="red",
    emerald="green",
    ivory="white",
    charcoal="black",
    cerulean="blue",
    sapphire="blue",
    ruby="red",
    vermilion="red",
    olive="green",
)

# Vocabularies used by the extractor (all compared against lower-cased text).
HABITAT_TERMS = [
    "forest",
    "wetland",
    "desert",
    "water",
    "mountains",
    "grassland",
    "marsh",
    "coast",
]
SIZE_TERMS = [
    "small",
    "large",
    "tiny",
    "big",
    "medium",
    "giant",
]
CONTINENTS = [
    "asia",
    "europe",
    "africa",
    "america",
    "australia",
    "antarctica",
]
DIRECTIONS = [
    "north",
    "south",
    "east",
    "west",
]

def extract_features_optimized(text):
    """Extract structured bird attributes from a free-text description.

    The text is lower-cased and parsed with the module-level ``nlp`` pipeline,
    then mined in two passes:

    1. A dependency pass over adjectival modifiers (``amod``) of "bird" and
       of body-part nouns, which yields the overall size and the
       primary/secondary colours.
    2. A ``Matcher`` pass that yields habitat, region, diet and the
       size/colour of eyes, beak (or "bill"), legs and feathers.

    Every matched span and every extracted body-part value is logged at
    DEBUG level through the root logger.

    Args:
        text: Description of a bird, e.g.
            "small red bird with black beak seen in south asia".

    Returns:
        dict with the keys ``size``, ``color`` (``primary``/``secondary``),
        ``habitat``, ``region``, ``diet``, ``eyes``, ``beak``, ``legs`` (each
        a ``size``/``color`` dict) and ``feathers`` (``None`` until a
        feathers description is matched, then a ``size``/``color`` dict).
        Values that were not found stay ``None``.
    """
    doc = nlp(text.lower())
    features = {
        "size": None,
        "color": {"primary": None, "secondary": None},
        "habitat": None,
        "region": None,
        "diet": None,
        "eyes": {"size": None, "color": None},
        "beak": {"size": None, "color": None},
        "legs": {"size": None, "color": None},
        "feathers": None  # Filled lazily in the post-processing step.
    }

    # --- Dependency Parsing for Size and Color ---
    body_parts = ("feathers", "wings", "chest", "body", "plumage", "tail")
    colors = features["color"]

    for token in doc:
        if token.dep_ != "amod":
            continue
        head_text = token.head.text
        is_size = token.text in SIZE_TERMS

        if head_text == "bird":
            if is_size:
                features["size"] = token.text
            elif not colors["primary"]:
                colors["primary"] = COLOR_SYNONYMS.get(token.text, token.text)
        elif head_text in body_parts:
            # A compound modifier on the body-part noun wins over the
            # adjective itself.
            compound = next(
                (c for c in token.head.children if c.dep_ == "compound"), None
            )
            if compound is not None:
                color = COLOR_SYNONYMS.get(compound.text, compound.text)
            elif is_size:
                # "large wings" describes size, not colour: do not record it.
                color = None
            else:
                color = COLOR_SYNONYMS.get(token.text, token.text)

            if color:
                slot = "secondary" if colors["primary"] else "primary"
                colors[slot] = color
        else:
            continue

        # Coordinated adjectives ("blue and green bird") hang off the first
        # adjective as `conj`; they provide the secondary colour.
        if not is_size:
            for child in token.children:
                if child.dep_ == "conj":
                    colors["secondary"] = COLOR_SYNONYMS.get(child.text, child.text)

    # --- Matcher Setup ---
    matcher = Matcher(nlp.vocab)
    matcher.add("HABITAT", [
        [{"LOWER": {"IN": ["in", "near", "around", "found"]}},
         {"LOWER": {"IN": HABITAT_TERMS}}]
    ])
    matcher.add("REGION", [
        [
            {"LOWER": {"IN": ["in", "from", "found"]}},
            {"LOWER": {"IN": DIRECTIONS}, "OP": "*"},
            {"LOWER": {"IN": CONTINENTS}}
        ]
    ])
    matcher.add("DIET", [
        [{"LOWER": {"IN": ["eats", "feeds", "consumes", "diet"]}},
         {"POS": "NOUN"}]
    ])
    matcher.add("PHYSICAL_CHARACTERISTICS", [
        [
            {"POS": {"IN": ["ADV", "ADJ", "NUM"]}, "OP": "*"},
            {"POS": {"IN": ["ADJ", "NUM"]}, "OP": "+"},
            {"LOWER": {"IN": ["eyes", "beak", "bill", "legs", "feathers"]}}
        ]
    ])

    # The pattern name is stored on each span so it survives filter_spans().
    if not Span.has_extension("match_label"):
        Span.set_extension("match_label", default=None)

    spans = []
    for match_id, start, end in matcher(doc):
        label = nlp.vocab.strings[match_id]
        span = doc[start:end]
        span._.match_label = label
        spans.append(span)
        logging.debug("Matched span: '%s' with label '%s'", span.text, label)

    # "bill" is matched by the pattern but the result only has a "beak" slot;
    # without this alias a stray features["bill"] entry was created instead.
    category_aliases = {"bill": "beak"}

    # --- Post-Processing ---
    # filter_spans() removes overlapping matches, preferring the longest span.
    for span in filter_spans(spans):
        label = span._.match_label
        if label == "HABITAT":
            features["habitat"] = span[-1].text
        elif label == "REGION":
            # Drop the leading trigger word ("in", "from", "found").
            features["region"] = span[1:].text
        elif label == "DIET":
            features["diet"] = span[-1].text
        elif label == "PHYSICAL_CHARACTERISTICS":
            # The pattern guarantees at least one modifier before the noun.
            *modifiers, category = span.text.split()
            category = category_aliases.get(category, category)
            # "feathers" starts out as None and needs its dict created here.
            if features.get(category) is None:
                features[category] = {"size": None, "color": None}
            for word in modifiers:
                if word in SIZE_TERMS:
                    features[category]["size"] = word
                    logging.debug("Extracted %s size: %s", category, word)
                else:
                    color = COLOR_SYNONYMS.get(word, word)
                    features[category]["color"] = color
                    logging.debug("Extracted %s color: %s", category, color)

    return features

# --- Demo run ---
# One comma-separated description covering size, colours, beak, habitat,
# diet, region and eyes.
example_text = (
    "giant bird with green body and azure feathers, "
    "black large beak, "
    "lives in water and eats crabs, "
    "was seen in south asia, "
    "and it had blue and small eyes"
)
features = extract_features_optimized(example_text)
# Heading and dict go on separate lines, exactly as two print() calls would.
print("Extracted Features:", features, sep="\n")


Extracted Features:
{'size': 'giant', 'color': {'primary': 'green', 'secondary': 'blue'}, 'habitat': 'water', 'region': 'south asia', 'diet': 'crabs', 'eyes': {'size': 'small', 'color': None}, 'beak': {'size': 'large', 'color': 'black'}, 'legs': {'size': None, 'color': None}, 'feathers': {'size': None, 'color': 'blue'}}
