<a href="https://colab.research.google.com/github/HarinduR/FeatherFind/blob/Keyword-Bird-Finder/DSGP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# ----------------------
# Step 1: Load and Clean Data
# ----------------------
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the raw bird-feature table from the hard-coded /content path.
df = pd.read_csv("/content/bird_features_full_dataset.csv")

# Attribute columns that later cells turn into "Column=value" labels.
label_columns = [
    "Size", "Primary Color", "Secondary Color", "Habitat",
    "Region", "Diet", "Beak Size", "Beak Color",
    "Legs Size", "Legs Color", "Eyes Size", "Eyes Color",
]

# Replace every missing description or label with the "none" placeholder
# in a single pass over the text column plus all label columns.
text_and_label_columns = ["Description", *label_columns]
df[text_and_label_columns] = df[text_and_label_columns].fillna("none")

In [5]:
# ----------------------
# Step 2: Split and prepare text labels
# ----------------------

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)

# The free-text description column of each partition is what gets vectorized later.
train_text, test_text = train_df["Description"], test_df["Description"]

def get_labels(row, columns=None):
    """Build the list of "Column=value" label strings for one record.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        A single record that can be indexed by column name.
    columns : iterable of str, optional
        Columns to convert into labels. When omitted, the notebook-level
        ``label_columns`` list is used, which matches the original behavior.

    Returns
    -------
    list of str
        One ``"<column>=<value>"`` entry per column that carries a value,
        in the order the columns are given. Columns holding the ``"none"``
        placeholder or a missing value (NaN / None / pd.NA) are skipped.
    """
    selected = label_columns if columns is None else columns
    labels = []
    for col in selected:
        val = row[col]
        # Check for real missing values first: an unfilled NaN would otherwise
        # become a bogus "Col=nan" label, and pd.NA cannot be truth-tested.
        if pd.isna(val) or val == "none":
            continue
        labels.append(f"{col}={val}")
    return labels

# Convert every row of both partitions into its list of "Column=value" labels.
train_labels, test_labels = (
    frame.apply(get_labels, axis="columns") for frame in (train_df, test_df)
)

print(train_labels)

554     [Size=large, Primary Color=green, Secondary Co...
1012    [Size=small, Habitat=mountainous, Region=Europ...
481     [Size=large, Primary Color=black, Secondary Co...
432     [Size=small, Primary Color=yellow, Region=Afri...
626     [Size=medium, Primary Color=black, Habitat=wet...
                              ...                        
330     [Size=medium, Primary Color=brown, Habitat=des...
466     [Size=tiny, Primary Color=black, Diet=omnivoro...
121     [Size=small, Primary Color=blue, Region=Africa...
1044    [Size=giant, Primary Color=blue, Secondary Col...
860     [Size=medium, Primary Color=yellow, Habitat=de...
Length: 840, dtype: object


In [None]:
# ----------------------
# Step 3: Vectorize Text
# ----------------------
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF configuration: cap the vocabulary at 500 features, use word
# 1- to 3-grams, and drop scikit-learn's built-in English stop words.
# NOTE(review): the output saved with this cell lists feature indices above
# 500, so it appears to come from an earlier run with different settings;
# re-run the cell to refresh it.
tfidf_options = {
    "max_features": 500,
    "ngram_range": (1, 3),  # unigrams, bigrams and trigrams
    "stop_words": "english",
}
vectorizer = TfidfVectorizer(**tfidf_options)

# Fit the vocabulary and IDF weights on the training text only, then map
# the held-out text onto that same fitted vocabulary.
X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)


  (0, 968)	0.1316189770566134
  (0, 261)	0.10141027110076103
  (0, 228)	0.16800873954976436
  (0, 142)	0.1326800149136793
  (0, 53)	0.0796500507965805
  (0, 68)	0.04615311407117658
  (0, 0)	0.1524794980312016
  (0, 567)	0.17830750932781653
  (0, 200)	0.1524794980312016
  (0, 818)	0.1524794980312016
  (0, 371)	0.1524794980312016
  (0, 483)	0.1316189770566134
  (0, 976)	0.22357583372641365
  (0, 275)	0.23517477720259483
  (0, 143)	0.20841434805060782
  (0, 54)	0.1524794980312016
  (0, 69)	0.1524794980312016
  (0, 5)	0.24630515712050424
  (0, 570)	0.24630515712050424
  (0, 201)	0.1524794980312016
  (0, 819)	0.1524794980312016
  (0, 373)	0.2507040187211576
  (0, 144)	0.24228931111281168
  (0, 55)	0.1524794980312016
  (0, 72)	0.24630515712050424
  (0, 6)	0.24630515712050424
  (0, 571)	0.24630515712050424
  (0, 202)	0.1524794980312016
  (0, 821)	0.2507040187211576 
  (0, 53)	0.0761559227325748
  (0, 68)	0.04412844629627194
  (0, 854)	0.14631894223777123
  (0, 258)	0.18977899275550128
  (0, 8