<a href="https://colab.research.google.com/github/HarinduR/FeatherFind/blob/Keyword-Bird-Finder/FeatherFinder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import spacy
from spacy.matcher import Matcher
from spacy.util import filter_spans
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")

COLOR_SYNONYMS = {
    "azure": "blue", "crimson": "red", "scarlet": "red",
    "emerald": "green", "ivory": "white", "charcoal": "black"
}

HABITAT_TERMS = ["forest", "wetland", "desert", "water", "mountains", "grassland", "marsh", "coast"]
SIZE_TERMS = ["small", "large", "tiny", "big", "medium", "giant"]
CONTINENTS = ["asia", "europe", "africa", "america", "australia", "antarctica"]
DIRECTIONS = ["north", "south", "east", "west"]

def extract_features_optimized(text):
    doc = nlp(text.lower())
    features = {
        "size": None,
        "color": {"primary": None, "secondary": None},
        "habitat": None,
        "region": None,
        "diet": None,
        "eyes": {"size": None, "color": None},
        "beak": {"size": None, "color": None},
        "legs": {"size": None, "color": None},
        "feathers": None
    }

    for token in doc:
        if token.dep_ == "amod" and token.head.text == "bird" and token.text in SIZE_TERMS:
            features["size"] = token.text

        if token.dep_ == "amod" and token.head.text in ["feathers", "wings", "chest", "body", "plumage", "tail"]:
            compound_color = None
            for child in token.head.children:
                if child.dep_ == "compound":
                    compound_color = COLOR_SYNONYMS.get(child.text, child.text)
                    break

            color = compound_color if compound_color else COLOR_SYNONYMS.get(token.text, token.text)

            if not features["color"]["primary"]:
                features["color"]["primary"] = color
            else:
                features["color"]["secondary"] = color

        if token.dep_ == "amod" and token.head.text == "bird" and token.text not in SIZE_TERMS:
            color = COLOR_SYNONYMS.get(token.text, token.text)
            if not features["color"]["primary"]:
                features["color"]["primary"] = color

        if token.dep_ == "amod" and token.children != "" and token.head.text in ["feathers", "wings", "chest", "body", "plumage", "tail", "bird"] and token.text not in SIZE_TERMS:
          for child in token.children:
            if child.dep_ == "conj":
              color = COLOR_SYNONYMS.get(child.text, child.text)
              features['color']['secondary'] = color

    matcher = Matcher(nlp.vocab)

    matcher.add("HABITAT", [
        [{"LOWER": {"IN": ["in", "near", "around", "found"]}},
         {"LOWER": {"IN": HABITAT_TERMS}}]
    ])

    matcher.add("REGION", [
        [
            {"LOWER": {"IN": ["in", "from", "found"]}},
            {"LOWER": {"IN": DIRECTIONS}, "OP": "*"},
            {"LOWER": {"IN": CONTINENTS}}
        ]
    ])

    matcher.add("DIET", [
        [{"LOWER": {"IN": ["eats", "feeds", "consumes", "diet"]}},
         {"POS": "NOUN"}]
    ])

    matcher.add("PHYSICAL_CHARACTERISTICS", [
        [
            {"POS": {"IN": ["ADV", "ADJ", "NUM"]}, "OP": "*"},
            {"POS": {"IN": ["ADJ", "NUM"]}, "OP": "+"},
            {"LOWER": {"IN": ["eyes", "beak", "bill", "legs"]}}
        ]
    ])

    matches = matcher(doc)
    spans = []

    if not Span.has_extension("match_label"):
        Span.set_extension("match_label", default=None)

    for match_id, start, end in matches:
        label = nlp.vocab.strings[match_id]
        span = doc[start:end]
        span._.match_label = label
        spans.append(span)

    filtered_spans = filter_spans(spans)

    for span in filtered_spans:
        label = span._.match_label
        text = span.text
        if label == "HABITAT":
            features["habitat"] = span[-1].text
        elif label == "REGION":
            features["region"] = span[1:].text
        elif label == "DIET":
            features["diet"] = span[-1].text
        elif label == "PHYSICAL_CHARACTERISTICS":
            if (len(span) >= 2):
              words = text.split()
              category = words[-1]
              for word in words[0:-1]:
                if word in SIZE_TERMS:
                  features[category]["size"] = word
                else:
                  features[category]["color"] = COLOR_SYNONYMS.get(word, word)

    return features

example_text = "giant bird with green body and azure feathers. black large beak. lives in water and eats crabs. this bird was seen in south asia. also it had  blue eyes "
example_text2 = "blue and green bird"
features = extract_features_optimized(example_text)
print(features)


{'size': 'giant', 'color': {'primary': 'green', 'secondary': 'blue'}, 'habitat': 'water', 'region': 'south asia', 'diet': 'crabs', 'eyes': {'size': None, 'color': 'blue'}, 'beak': {'size': 'large', 'color': 'black'}, 'legs': {'size': None, 'color': None}, 'feathers': None}


Enhanced code given by chatgpt

In [12]:
import spacy
from spacy.matcher import Matcher
from spacy.util import filter_spans
from spacy.tokens import Span

class FeatureExtractor:

    def __init__(self):

        self.nlp = spacy.load("en_core_web_sm")

        self.COLOR_SYNONYMS = {
            "azure": "blue", "crimson": "red", "scarlet": "red",
            "emerald": "green", "ivory": "white", "charcoal": "black"
        }

        self.HABITAT_TERMS = ["forest", "wetland", "desert", "water", "mountains", "grassland", "marsh", "coast", "open field", "water body", "woodland"]
        self.SIZE_TERMS = ["small", "large", "tiny", "big", "medium", "giant"]
        self.CONTINENTS = ["asia", "europe", "africa", "america", "australia", "antarctica"]
        self.DIRECTIONS = ["north", "south", "east", "west"]
        self.BEAK_SHAPES = ["curved", "spare", "long", "short", "thick"]

    def extractFeatures(self, text):
        doc = self.nlp(text.lower())
        features = {
            "size": None,
            "color": {"primary": None, "secondary": None},
            "habitat": None,
            "region": None,
            "diet": None,
            "eyes": {"size": None, "color": None},
            "beak": {"size": None, "color": None},
            "legs": {"size": None, "color": None},
        }

        for token in doc:
            if token.dep_ == "amod" and token.head.text == "bird" and token.text in self.SIZE_TERMS:
                features["size"] = token.text

            if token.dep_ == "amod" and token.head.text in ["feathers", "wings", "chest", "body", "plumage", "tail"]:
                compound_color = None
                for child in token.head.children:
                    if child.dep_ == "compound":
                        compound_color = self.COLOR_SYNONYMS.get(child.text, child.text)
                        break

                color = compound_color if compound_color else self.COLOR_SYNONYMS.get(token.text, token.text)

                if not features["color"]["primary"]:
                    features["color"]["primary"] = color
                else:
                    features["color"]["secondary"] = color

            if token.dep_ == "amod" and token.head.text == "bird" and token.text not in self.SIZE_TERMS:
                color = self.COLOR_SYNONYMS.get(token.text, token.text)
                if not features["color"]["primary"]:
                    features["color"]["primary"] = color

            if token.dep_ == "amod" and token.children != "" and token.head.text in ["feathers", "wings", "chest", "body", "plumage", "tail", "bird"] and token.text not in self.SIZE_TERMS:
                for child in token.children:
                    if child.dep_ == "conj":
                        color = self.COLOR_SYNONYMS.get(child.text, child.text)
                        features['color']['secondary'] = color

        matcher = Matcher(self.nlp.vocab)  # Use self.nlp

        matcher.add("HABITAT", [
            [{"LOWER": {"IN": ["in", "near", "around", "found"]}},
            {"LOWER": {"IN": self.HABITAT_TERMS}}]
        ])

        matcher.add("REGION", [
            [
                {"LOWER": {"IN": ["in", "from", "found"]}},
                {"LOWER": {"IN": self.DIRECTIONS}, "OP": "*"},
                {"LOWER": {"IN": self.CONTINENTS}}
            ]
        ])

        matcher.add("DIET", [
            [{"LOWER": {"IN": ["eats", "feeds", "consumes", "diet"]}},
            {"POS": "NOUN"}]
        ])

        matcher.add("PHYSICAL_CHARACTERISTICS", [
            [
                {"POS": {"IN": ["ADV", "ADJ", "NUM"]}, "OP": "*"},
                {"POS": {"IN": ["ADJ", "NUM"]}, "OP": "+"},
                {"LOWER": {"IN": ["eyes", "beak", "bill", "legs"]}}
            ]
        ])

        matcher.add("BEAK_SHAPE", [
            [
                {"LOWER": {"IN": self.BEAK_SHAPES}},
                {"POS": {"IN": ["ADJ", "NUM"]}, "OP": "*"},
                {"LOWER": {"IN": ["beak", "bill"]}}
            ],
            [
                {"POS": {"IN": ["ADJ", "NUM"]}, "OP": "*"},
                {"LOWER": {"IN": self.BEAK_SHAPES}},
                {"LOWER": {"IN": ["beak", "bill"]}}
            ]
        ])

        matches = matcher(doc)
        spans = []

        if not Span.has_extension("match_label"):
            Span.set_extension("match_label", default=None)

        for match_id, start, end in matches:
            label = self.nlp.vocab.strings[match_id]
            span = doc[start:end]
            span._.match_label = label
            spans.append(span)

        filtered_spans = filter_spans(spans)

        for span in filtered_spans:
            label = span._.match_label
            text = span.text
            if label == "HABITAT":
                features["habitat"] = span[-1].text
            elif label == "REGION":
                features["region"] = span[1:].text
            elif label == "DIET":
                features["diet"] = span[-1].text
            elif label == "PHYSICAL_CHARACTERISTICS":
                if len(span) >= 2:
                    words = text.split()
                    category = words[-1]
                    for word in words[:-1]:
                        if word in self.SIZE_TERMS:
                            features[category]["size"] = word
                        elif word not in self.BEAK_SHAPES:
                            features[category]["color"] = self.COLOR_SYNONYMS.get(word, word)
            elif label == "BEAK_SHAPE":
                for word in text.split()[:-1]:
                    if word in self.BEAK_SHAPES:
                        features["beak"]["size"] = word
                    else:
                        features["beak"]["color"] = self.COLOR_SYNONYMS.get(word, word)
        return features

extractor = FeatureExtractor()

example_text = "i saw a blue bird with a spare beak"
features = extractor.extractFeatures(example_text)
print(features)




{'size': None, 'color': {'primary': 'blue', 'secondary': None}, 'habitat': None, 'region': None, 'diet': None, 'eyes': {'size': None, 'color': None}, 'beak': {'size': 'spare', 'color': None}, 'legs': {'size': None, 'color': None}}


In [None]:
#More enhancements

import spacy
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import Span
from spacy.util import filter_spans
import logging

logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

nlp = spacy.load("en_core_web_sm")

# Enhanced color synonyms and compound colors
COLOR_SYNONYMS = {
    "azure": "blue", "crimson": "red", "scarlet": "red", "emerald": "green",
    "ivory": "white", "charcoal": "black", "cerulean": "blue", "sapphire": "blue",
    "ruby": "red", "vermilion": "red", "olive": "green", "sky-blue": "blue",
    "black-tipped": "black", "yellowish-brown": "brown", "navy-blue": "blue"
}

HABITAT_TERMS = ["forest", "wetland", "desert", "water", "mountains", "grassland",
                "marsh", "coast", "rainforest", "tundra", "savanna"]
SIZE_TERMS = ["small", "large", "tiny", "big", "medium", "giant", "sized"]
CONTINENTS = ["asia", "europe", "africa", "america", "australia", "antarctica"]
DIRECTIONS = ["north", "south", "east", "west", "central", "southeast", "northwest"]
BODY_PARTS = {"eyes", "beak", "wings", "legs", "feathers", "plumage", "chest", "tail"}

def is_negated(token):
    """Check if a modifier is negated"""
    return any(child.dep_ == "neg" for child in token.head.children)

def extract_features(text):
    doc = nlp(text.lower())

    features = {
        "size": [],
        "color": {"primary": [], "secondary": []},
        "habitats": [],
        "regions": [],
        "diet": [],
        "eyes": {"size": [], "color": []},
        "beak": {"size": [], "color": []},
        "legs": {"size": [], "color": []},
        "feathers": {"color": []},
        "negations": []
    }

    # Dependency parsing for colors and sizes
    for token in doc:
        # Handle negations
        if token.dep_ == "neg":
            features["negations"].append(token.head.text)

        # Size detection
        if token.text in SIZE_TERMS and token.head.text == "bird" and not is_negated(token):
            features["size"].append(token.text)

        # Color detection with compound handling
        if token.dep_ == "amod" and token.head.text in ["feathers", "wings", "body", *BODY_PARTS]:
            if is_negated(token):
                continue

            color = None
            # Check for compound colors ("black-tipped")
            for child in token.children:
                if child.dep_ == "compound":
                    color = COLOR_SYNONYMS.get(f"{child.text}-{token.text}",
                            COLOR_SYNONYMS.get(child.text, child.text))
                    break

            color = color or COLOR_SYNONYMS.get(token.text, token.text)
            target = "primary" if not features["color"]["primary"] else "secondary"
            features["color"][target].append(color)

    # Initialize matchers
    matcher = Matcher(nlp.vocab)
    phrase_matcher = PhraseMatcher(nlp.vocab)

    # Habitat matching with PhraseMatcher
    habitat_patterns = [nlp.make_doc(text) for text in HABITAT_TERMS]
    phrase_matcher.add("HABITAT", habitat_patterns)

    # Region patterns
    matcher.add("REGION", [
        [{"LOWER": {"IN": ["in", "from", "found"]}},
         {"LOWER": {"IN": DIRECTIONS}, "OP": "*"},
         {"LOWER": {"IN": CONTINENTS}, "OP": "+"}]
    ])

    # Diet patterns
    matcher.add("DIET", [
        [ # Pattern 1: "eats fish and crustaceans"
            {"LOWER": {"IN": ["eats", "feeds", "diet"]}},
            {"LOWER": "on", "OP": "?"},  # Optional "on"
            {"POS": "NOUN", "OP": "+"},  # Require at least 1 noun
            {"LOWER": "and", "OP": "*"}, # Optional conjunctions
            {"POS": "NOUN", "OP": "*"}   # Additional nouns
        ]
    ])

    # Physical characteristics with measurement support
    matcher.add("PHYSICAL", [
        [{"ENT_TYPE": "QUANTITY", "OP": "?"},  # Capture measurements
         {"POS": {"IN": ["ADJ", "ADV"]}, "OP": "+"},
         {"LOWER": {"IN": list(BODY_PARTS)}}]
    ])

    # Compound colors
    matcher.add("COMPOUND_COLOR", [
        [{"LOWER": {"REGEX": "^(sky|navy|royal|black|white|yellowish)"}},
         {"LOWER": {"REGEX": "^(blue|brown|tipped|striped|edged)$"}}]
    ])

    # Run matchers
    matches = matcher(doc) + phrase_matcher(doc)
    spans = []
    for match_id, start, end in matches:
        label = nlp.vocab.strings[match_id]
        span = doc[start:end]
        Span.set_extension("match_label", default=label, force=True)
        span._.match_label = label
        spans.append(span)

    filtered_spans = filter_spans(spans)

    # Process matches
    for span in filtered_spans:
        label = span._.match_label
        text = span.text.lower()

        if label == "HABITAT":
            features["habitats"].append(span.text)
        elif label == "REGION":
            region = " ".join([t.text for t in span if t.text in CONTINENTS + DIRECTIONS])
            features["regions"].append(region)
        elif label == "DIET":
            diet_items = [t.text for t in span if t.pos_ in ["NOUN", "PROPN"]]
            features["diet"].extend(diet_items)
        elif label == "PHYSICAL":
            body_part = span[-1].text
            modifiers = [t.text for t in span[:-1]]

            for mod in modifiers:
                if mod in SIZE_TERMS:
                    features[body_part]["size"].append(mod)
                else:
                    color = COLOR_SYNONYMS.get(mod, mod)
                    features[body_part]["color"].append(color)
        elif label == "COMPOUND_COLOR":
            color = "-".join([t.text for t in span])
            normalized = COLOR_SYNONYMS.get(color, color.split("-")[0])
            features["color"]["secondary"].append(normalized)

    # Extract regions using NER
    for ent in doc.ents:
        if ent.label_ in ["GPE", "LOC"]:
            features["regions"].append(ent.text)

    # Post-processing to remove duplicates
    for key in features:
        if isinstance(features[key], dict):
            for subkey in features[key]:
                features[key][subkey] = list(set(features[key][subkey]))
        else:
            features[key] = list(set(features[key]))

    return features

# Test the enhanced version
test_text = (
    "giant bird with green body and sky-blue feathers, black-tipped large beak, "
    "lives in coastal wetlands and eats crabs, found in southeast asia but not in africa, "
    "with pale yellow eyes measuring 2.5 cm."
)

result = extract_features(test_text)
print("Enhanced Extraction Results:")
for category, data in result.items():
    print(f"{category.upper():<12} {data}")

Enhanced Extraction Results:
SIZE         ['giant']
COLOR        {'primary': ['green'], 'secondary': ['large', 'yellow', 'tipped', 'blue']}
HABITATS     []
REGIONS      ['southeast asia', 'africa']
DIET         ['crabs']
EYES         {'size': [], 'color': ['yellow', 'pale']}
BEAK         {'size': ['large'], 'color': []}
LEGS         {'size': [], 'color': []}
FEATHERS     {'color': ['blue']}
NEGATIONS    ['in']
