<a href="https://colab.research.google.com/github/HarinduR/FeatherFind/blob/Keyword-Bird-Finder/FeatherFinder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [40]:
import spacy
from spacy.matcher import Matcher
from spacy.util import filter_spans
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")

COLOR_SYNONYMS = {
    "azure": "blue", "crimson": "red", "scarlet": "red",
    "emerald": "green", "ivory": "white", "charcoal": "black"
}

HABITAT_TERMS = ["forest", "wetland", "desert", "water", "mountains", "grassland", "marsh", "coast"]
SIZE_TERMS = ["small", "large", "tiny", "big", "medium", "giant"]
CONTINENTS = ["asia", "europe", "africa", "america", "australia", "antarctica"]
DIRECTIONS = ["north", "south", "east", "west"]

def extract_features_optimized(text):
    doc = nlp(text.lower())
    features = {
        "size": None,
        "color": {"primary": None, "secondary": None},
        "habitat": None,
        "region": None,
        "diet": None,
        "eyes": {"size": None, "color": None},
        "beak": {"length": None, "color": None},
        "legs": {"length": None, "color": None},
        "feathers": {"primary_color": None, "secondary_color": None},
        "common_name": None,
        "scientific_name": None
    }

    for token in doc:
        if token.dep_ == "amod" and token.head.text == "bird" and token.text in SIZE_TERMS:
            features["size"] = token.text

        if token.dep_ == "amod" and token.head.text in ["feathers", "wings", "chest", "body", "plumage", "tail"]:
            compound_color = None
            for child in token.head.children:
                print(child)
                print(child.dep_)
                if child.dep_ == "compound":
                    compound_color = COLOR_SYNONYMS.get(child.text, child.text)
                    break

            color = compound_color if compound_color else COLOR_SYNONYMS.get(token.text, token.text)

            if not features["color"]["primary"]:
                features["color"]["primary"] = color
            else:
                features["color"]["secondary"] = color

        if token.dep_ == "amod" and token.head.text == "bird" and token.text not in SIZE_TERMS:
            color = COLOR_SYNONYMS.get(token.text, token.text)
            if not features["color"]["primary"]:
                features["color"]["primary"] = color

    matcher = Matcher(nlp.vocab)

    matcher.add("HABITAT", [
        [{"LOWER": {"IN": ["in", "near", "around", "found"]}},
         {"LOWER": {"IN": HABITAT_TERMS}}]
    ])

    matcher.add("REGION", [
        [
            {"LOWER": {"IN": ["in", "from", "found"]}},
            {"LOWER": {"IN": DIRECTIONS}, "OP": "*"},
            {"LOWER": {"IN": CONTINENTS}}
        ]
    ])

    matcher.add("DIET", [
        [{"LOWER": {"IN": ["eats", "feeds", "consumes", "diet"]}},
         {"POS": "NOUN"}]
    ])

    matcher.add("BEAK_LEG_ATTR", [
        [{"POS": {"IN": ["ADJ", "NUM"]}, "OP": "+"},
         {"LOWER": {"IN": ["beak", "legs", "bill"]}}]
    ])

    matches = matcher(doc)
    spans = []

    if not Span.has_extension("match_label"):
        Span.set_extension("match_label", default=None)

    for match_id, start, end in matches:
        label = nlp.vocab.strings[match_id]
        span = doc[start:end]
        span._.match_label = label
        spans.append(span)

    filtered_spans = filter_spans(spans)

    for span in filtered_spans:
        label = span._.match_label
        text = span.text
        print(text, label)
        if label == "HABITAT":
            features["habitat"] = span[-1].text
        elif label == "REGION":
            features["region"] = span[1:].text
        elif label == "DIET":
            features["diet"] = span[-1].text
        elif label == "BEAK_LEG_ATTR":
            if span[-1].text in ["beak", "bill"]:
                features["beak"]["length"] = span[-2].text
            elif span[-1].text == "legs":
                features["legs"]["length"] = span[-2].text
        elif label == "COLOR":
            color = COLOR_SYNONYMS.get(text, text)
            if not features["color"]["primary"]:
                features["color"]["primary"] = color
            else:
                features["color"]["secondary"] = color

    return features

example_text = "giant bird with bright crimson feathers with long legs lives in forest in north asia and eats worms. had some green feathers too"
features = extract_features_optimized(example_text)
print(features)


bright
amod
crimson
compound
some
det
green
amod
long legs BEAK_LEG_ATTR
in forest HABITAT
in north asia REGION
{'size': 'giant', 'color': {'primary': 'red', 'secondary': 'green'}, 'habitat': 'forest', 'region': 'north asia', 'diet': None, 'eyes': {'size': None, 'color': None}, 'beak': {'length': None, 'color': None}, 'legs': {'length': 'long', 'color': None}, 'feathers': {'primary_color': None, 'secondary_color': None}, 'common_name': None, 'scientific_name': None}
