In [1]:
# ----------------------
# Step 1: Load and Clean Data
# ----------------------
import pandas as pd
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv("/content/bird_features_full_dataset.csv")

# Handle missing values
label_columns = ["Size", "Primary Color", "Secondary Color", "Habitat",
                 "Region", "Diet", "Beak Size", "Beak Color", "Legs Size",
                 "Legs Color", "Eyes Size", "Eyes Color"]

# Fill missing descriptions and labels
df["Description"] = df["Description"].fillna("none")
df[label_columns] = df[label_columns].fillna("none")

FileNotFoundError: [Errno 2] No such file or directory: '/content/bird_features_full_dataset.csv'

In [3]:
# ----------------------
# Step 2: Spit and prepare text lables
# ----------------------

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_text = train_df["Description"]
test_text = test_df["Description"]

def get_labels(row):
    return [f"{col}={val}" for col in label_columns
            if (val := row[col]) != "none"]

train_labels = train_df.apply(get_labels, axis=1)
test_labels = test_df.apply(get_labels, axis=1)

print(train_labels)

554     [Size=large, Primary Color=green, Secondary Co...
1012    [Size=small, Habitat=mountainous, Region=Europ...
481     [Size=large, Primary Color=black, Secondary Co...
432     [Size=small, Primary Color=yellow, Region=Afri...
626     [Size=medium, Primary Color=black, Habitat=wet...
                              ...                        
330     [Size=medium, Primary Color=brown, Habitat=des...
466     [Size=tiny, Primary Color=black, Diet=omnivoro...
121     [Size=small, Primary Color=blue, Region=Africa...
1044    [Size=giant, Primary Color=blue, Secondary Col...
860     [Size=medium, Primary Color=yellow, Habitat=de...
Length: 840, dtype: object


In [4]:
# ----------------------
# Step 3: Vectorize Text
# ----------------------
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF features with enhanced settings
vectorizer = TfidfVectorizer(
    max_features=1000,
    ngram_range=(1, 3),  # Include bigrams and trigrams
    stop_words='english'
)

X_train = vectorizer.fit_transform(train_text)
X_test = vectorizer.transform(test_text)

In [5]:

# ----------------------
# Step 4: Encode Labels
# ----------------------
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
y_train = mlb.fit_transform(train_labels)
y_test = mlb.transform(test_labels)


In [6]:
# ----------------------
# Step 5: Build Logistic Regression Model
# ----------------------
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

# Create classifier with balanced class weights
clf = MultiOutputClassifier(
    LogisticRegression(
        class_weight={0: 1, 1: 5},
        max_iter=1000,
        random_state=42
    )
)

# Train
clf.fit(X_train, y_train)


In [7]:
# ----------------------
# Step 6: Evaluate with Threshold Tuning
# ----------------------
import numpy as np
from sklearn.metrics import classification_report, f1_score

# Get probabilities
y_proba = np.array([estimator.predict_proba(X_test)[:, 1]
                   for estimator in clf.estimators_]).T

# Find optimal thresholds
thresholds = np.linspace(0.1, 0.9, 20)
best_thresholds = {}
for i in range(y_train.shape[1]):
    best_f1 = -1
    best_thresh = 0.5
    for thresh in thresholds:
        preds = (y_proba[:, i] >= thresh).astype(int)
        f1 = f1_score(y_test[:, i], preds, zero_division=0)
        if f1 > best_f1:
            best_f1 = f1
            best_thresh = thresh
    best_thresholds[i] = best_thresh

# Apply thresholds
y_pred = np.array([
    (y_proba[:, i] >= best_thresholds[i]).astype(int)
    for i in range(y_proba.shape[1])
]).T

# Generate report
print("Optimized Classification Report:")
print(classification_report(y_test, y_pred,
                            target_names=mlb.classes_,
                            zero_division=0))


Optimized Classification Report:
                        precision    recall  f1-score   support

      Beak Color=black       0.86      0.38      0.52        16
       Beak Color=blue       0.67      0.43      0.52        14
      Beak Color=brown       0.09      0.25      0.14        12
       Beak Color=gray       0.50      0.20      0.29        15
      Beak Color=green       0.15      0.50      0.23        16
     Beak Color=orange       0.50      0.33      0.40         9
        Beak Color=red       0.16      0.45      0.23        11
      Beak Color=white       1.00      0.22      0.36        18
     Beak Color=yellow       0.30      0.38      0.33         8
      Beak Size=curved       0.38      0.40      0.39        25
        Beak Size=long       0.76      0.67      0.71        24
       Beak Size=short       0.92      0.42      0.58        26
       Beak Size=stout       0.70      0.48      0.57        29
        Beak Size=thin       0.19      0.61      0.29        28
      

In [8]:
import pandas as pd

# Assuming df is your DataFrame and label_columns is the list of columns to analyze
rare_classes = {}

for col in label_columns:
    counts = df[col].value_counts()

    # Calculate the 10th percentile (you can adjust the quantile as needed)
    threshold = counts.quantile(0.15)

    # Identify rare classes based on the quantile threshold
    rare = counts[counts < threshold].index.tolist()
    rare_classes[col] = rare

print(rare_classes)

{'Size': [], 'Primary Color': ['green', 'brown'], 'Secondary Color': ['blue', 'brown'], 'Habitat': ['desert', 'marsh'], 'Region': ['Antarctica', 'South America'], 'Diet': ['small mammals', 'fish'], 'Beak Size': ['long'], 'Beak Color': ['brown', 'yellow'], 'Legs Size': ['medium'], 'Legs Color': ['black', 'white'], 'Eyes Size': ['medium'], 'Eyes Color': ['orange', 'gray']}


In [None]:
# Create modified dataset
df_modified = df.copy()

# Replace rare labels with "rare"
for col, labels in rare_classes.items():
    df_modified[col] = df_modified[col].apply(
        lambda x: "rare" if x in labels else x
    )

# Verify changes
print("\nBefore grouping:", df["Secondary Color"].value_counts())
print("After grouping:", df_modified["Secondary Color"].value_counts())

In [None]:
import nlpaug.augmenter.word as naw

aug_synonym = naw.SynonymAug(aug_src='wordnet')
aug_context = naw.RandomWordAug(action="insert")

def augment_text(text, labels, n=2):
  preserved_words = []
  for label in labels:
    if "rare" in label:
      feature_value = label.split("=")[1]
      preserved_words.extend(feature_value.split("-"))

  augmented = []
  for _ in range(n):
    # Apply augmentations
    aug_text = aug_synonym.augment(text)
    aug_text = aug_context.augment(aug_text)

    print(aug_text)

In [None]:
rare_samples = df_modified[
    df_modified[label_columns].apply(
        lambda x: x.str.contains("rare").any(), axis=1
    )
]
augmented_data = []
for _, row in rare_samples.iterrows():
    text = row["Description"]
    labels = [f"{col}={val}" for col in label_columns
             if (val := row[col]) != "none"]

    # Generate augmented texts
    new_texts = augment_text(text, labels, n=2)
    break