<a href="https://colab.research.google.com/github/HarinduR/FeatherFind/blob/Keyword-Bird-Finder/DSGP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
# ----------------------
# Step 1: Load and Clean Data
# ----------------------
import pandas as pd
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv("/content/bird_features_full_dataset.csv")

# Handle missing values
label_columns = ["Size", "Primary Color", "Secondary Color", "Habitat",
                 "Region", "Diet", "Beak Size", "Beak Color", "Legs Size",
                 "Legs Color", "Eyes Size", "Eyes Color"]

# Fill missing descriptions and labels
df["Description"] = df["Description"].fillna("none")
df[label_columns] = df[label_columns].fillna("none")

In [27]:
# ----------------------
# Step 2: Spit and prepare text lables
# ----------------------

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_text = train_df["Description"]
test_text = test_df["Description"]

def get_labels(row):
    return [f"{col}={val}" for col in label_columns
            if (val := row[col]) != "none"]

train_labels = train_df.apply(get_labels, axis=1)
test_labels = test_df.apply(get_labels, axis=1)

print(train_labels)

554     [Size=large, Primary Color=green, Secondary Co...
1012    [Size=small, Habitat=mountainous, Region=Europ...
481     [Size=large, Primary Color=black, Secondary Co...
432     [Size=small, Primary Color=yellow, Region=Afri...
626     [Size=medium, Primary Color=black, Habitat=wet...
                              ...                        
330     [Size=medium, Primary Color=brown, Habitat=des...
466     [Size=tiny, Primary Color=black, Diet=omnivoro...
121     [Size=small, Primary Color=blue, Region=Africa...
1044    [Size=giant, Primary Color=blue, Secondary Col...
860     [Size=medium, Primary Color=yellow, Habitat=de...
Length: 840, dtype: object


In [28]:
# ----------------------
# Step 3: Vectorize Text
# ----------------------
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF features with enhanced settings
vectorizer = TfidfVectorizer(
    max_features=1000,
    ngram_range=(1, 3),  # Include bigrams and trigrams
    stop_words='english'
)

X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)

In [29]:

# ----------------------
# Step 4: Encode Labels
# ----------------------
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(train_labels)
y_test = mlb.transform(test_labels)


In [30]:
# ----------------------
# Step 5: Build Logistic Regression Model
# ----------------------
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

# Create classifier with balanced class weights
clf = MultiOutputClassifier(
    LogisticRegression(
        class_weight={0: 1, 1: 5},
        max_iter=1000,
        random_state=42
    )
)

# Train
clf.fit(X_train, y_train)


In [61]:
# ----------------------
# Step 6: Evaluate with Threshold Tuning
# ----------------------
import numpy as np
from sklearn.metrics import classification_report, f1_score

# Get probabilities
y_proba = np.array([estimator.predict_proba(X_test)[:, 1]
                   for estimator in clf.estimators_]).T

# Find optimal thresholds
thresholds = np.linspace(0.1, 0.9, 20)
best_thresholds = {}
for i in range(y_train.shape[1]):
    best_f1 = -1
    best_thresh = 0.5
    for thresh in thresholds:
        preds = (y_proba[:, i] >= thresh).astype(int)
        f1 = f1_score(y_test[:, i], preds, zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_thresh = thresh
    best_thresholds[i] = best_thresh

# Apply thresholds
y_pred = np.array([
    (y_proba[:, i] >= best_thresholds[i]).astype(int)
    for i in range(y_proba.shape[1])
]).T

# Generate report
print("Optimized Classification Report:")
cr1 = (classification_report(y_test, y_pred,
                            target_names=mlb.classes_,
                            zero_division=0))

print(cr1)


Optimized Classification Report:
                        precision    recall  f1-score   support

      Beak Color=black       1.00      0.28      0.44        32
       Beak Color=blue       0.53      0.43      0.48        23
       Beak Color=gray       0.17      0.62      0.27        37
      Beak Color=green       0.20      0.58      0.30        31
     Beak Color=orange       0.36      0.20      0.26        20
       Beak Color=rare       0.21      0.65      0.31        43
        Beak Color=red       0.41      0.26      0.32        27
      Beak Color=white       0.58      0.34      0.43        32
      Beak Size=curved       0.72      0.48      0.58        58
        Beak Size=rare       0.43      0.58      0.49        55
       Beak Size=short       0.84      0.37      0.52        43
       Beak Size=stout       0.74      0.52      0.61        33
        Beak Size=thin       0.21      0.74      0.32        53
           Diet=fruits       0.88      0.53      0.67        43
      

In [32]:
import pandas as pd

# Assuming df is your DataFrame and label_columns is the list of columns to analyze
rare_classes = {}

for col in label_columns:
    counts = df[col].value_counts()

    # Calculate the 10th percentile (you can adjust the quantile as needed)
    threshold = counts.quantile(0.15)

    # Identify rare classes based on the quantile threshold
    rare = counts[counts < threshold].index.tolist()
    rare_classes[col] = rare

print(rare_classes)

{'Size': [], 'Primary Color': ['green', 'brown'], 'Secondary Color': ['blue', 'brown'], 'Habitat': ['desert', 'marsh'], 'Region': ['Antarctica', 'South America'], 'Diet': ['small mammals', 'fish'], 'Beak Size': ['long'], 'Beak Color': ['brown', 'yellow'], 'Legs Size': ['medium'], 'Legs Color': ['black', 'white'], 'Eyes Size': ['medium'], 'Eyes Color': ['orange', 'gray']}


In [33]:
# Create modified dataset
df_modified = df.copy()

# Replace rare labels with "rare"
for col, labels in rare_classes.items():
    df_modified[col] = df_modified[col].apply(
        lambda x: "rare" if x in labels else x
    )

# Verify changes
print("\nBefore grouping:", df["Secondary Color"].value_counts())
print("After grouping:", df_modified["Secondary Color"].value_counts())


Before grouping: Secondary Color
none      578
purple     62
red        55
gray       53
orange     52
white      50
black      46
green      43
yellow     42
blue       38
brown      31
Name: count, dtype: int64
After grouping: Secondary Color
none      578
rare       69
purple     62
red        55
gray       53
orange     52
white      50
black      46
green      43
yellow     42
Name: count, dtype: int64


In [34]:
!pip install nlpaug



In [48]:
# ----------------------
# Step 7: Data Augmentation
# ----------------------
import nlpaug.augmenter.word as naw
import nltk

# Download the required NLTK resource
nltk.download('all')


[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_r

True

In [55]:

aug_synonym = naw.SynonymAug(aug_src='wordnet')
aug_random = naw.RandomWordAug(action="swap")

def augment_text(text, labels, n=2):
    preserved_words = []
    for label in labels:
        if "rare" in label:
            feature_value = label.split("=")[1]
            preserved_words.extend(feature_value.split("-"))

    augmented = []
    for _ in range(n):
        # Apply augmentations and ensure string output
        aug_text = aug_synonym.augment(text)
        if isinstance(aug_text, list):  # Handle list returns
            aug_text = aug_text[0]

        aug_text = aug_random.augment(aug_text)
        if isinstance(aug_text, list):  # Handle list returns
            aug_text = aug_text[0]

        # Preserve key features
        for word in preserved_words:
            if word not in aug_text.lower():
                aug_text += f" {word}"

        augmented.append(aug_text)
    return augmented

In [56]:
rare_samples = df_modified[
    df_modified[label_columns].apply(
        lambda x: x.str.contains("rare").any(), axis=1
    )
]

In [57]:
augmented_data = []
for _, row in rare_samples.iterrows():
    text = row["Description"]
    labels = [f"{col}={val}" for col in label_columns
             if (val := row[col]) != "none"]

    new_texts = augment_text(text, labels, n=1)
    for new_text in new_texts:
        new_row = row.copy()
        new_row["Description"] = new_text
        augmented_data.append(new_row)

In [58]:
# Create balanced dataset
balanced_df = pd.concat([df_modified, pd.DataFrame(augmented_data)], ignore_index=True)
print(f"Original: {len(df)} samples → Balanced: {len(balanced_df)} samples")

Original: 1050 samples → Balanced: 1918 samples


In [62]:
# Split balanced data
train_df, test_df = train_test_split(balanced_df, test_size=0.2, random_state=42)

# Re-vectorize text
X_train = vectorizer.fit_transform(train_df["Description"])
X_test = vectorizer.transform(test_df["Description"])

# Re-encode labels
y_train = mlb.fit_transform(train_df[label_columns].apply(
    lambda x: [f"{col}={val}" for col, val in x.items() if val != "none"],
    axis=1
))
y_test = mlb.transform(test_df[label_columns].apply(
    lambda x: [f"{col}={val}" for col, val in x.items() if val != "none"],
    axis=1
))

# Retrain and evaluate
clf.fit(X_train, y_train)
# Get probabilities
y_proba = np.array([estimator.predict_proba(X_test)[:, 1]
                   for estimator in clf.estimators_]).T

# Find optimal thresholds
thresholds = np.linspace(0.1, 0.9, 20)
best_thresholds = {}
for i in range(y_train.shape[1]):
    best_f1 = -1
    best_thresh = 0.5
    for thresh in thresholds:
        preds = (y_proba[:, i] >= thresh).astype(int)
        f1 = f1_score(y_test[:, i], preds, zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_thresh = thresh
    best_thresholds[i] = best_thresh

# Apply thresholds
y_pred = np.array([
    (y_proba[:, i] >= best_thresholds[i]).astype(int)
    for i in range(y_proba.shape[1])
]).T

cr2 = (classification_report(y_test, y_pred, target_names=mlb.classes_))
print(cr2)

                        precision    recall  f1-score   support

      Beak Color=black       1.00      0.28      0.44        32
       Beak Color=blue       0.53      0.43      0.48        23
       Beak Color=gray       0.17      0.62      0.27        37
      Beak Color=green       0.20      0.58      0.30        31
     Beak Color=orange       0.36      0.20      0.26        20
       Beak Color=rare       0.21      0.65      0.31        43
        Beak Color=red       0.41      0.26      0.32        27
      Beak Color=white       0.58      0.34      0.43        32
      Beak Size=curved       0.72      0.48      0.58        58
        Beak Size=rare       0.43      0.58      0.49        55
       Beak Size=short       0.84      0.37      0.52        43
       Beak Size=stout       0.74      0.52      0.61        33
        Beak Size=thin       0.21      0.74      0.32        53
           Diet=fruits       0.88      0.53      0.67        43
          Diet=insects       0.79      