## Load Image and Convert to Matrix

In [12]:
import os
import numpy as np
from PIL import Image

def load_split_data(base_path, split_name, target_size=(224, 224)):
    """Load one dataset split from disk as a flat, normalized feature matrix.

    The expected layout is ``base_path/split_name/<class_name>/<image files>``.
    Every readable image is converted to RGB, resized to ``target_size`` and
    flattened into one row.

    Args:
        base_path: Root directory that contains the split folders.
        split_name: Name of the split sub-directory (e.g. ``'train'``).
        target_size: ``(width, height)`` handed to ``Image.resize``.

    Returns:
        A tuple ``(X, y)``. ``X`` is a float array of shape
        ``(n_images, width * height * 3)`` scaled to [0, 1]; ``y`` is the
        integer class index of each row, where the index is the position of
        the class directory in the alphabetically sorted list of class
        directories. An empty split yields arrays with zero rows.

    Raises:
        OSError: If the split directory itself cannot be listed.
    """
    import warnings

    split_path = os.path.join(base_path, split_name)

    # Only directories are classes. Filtering *before* enumerate keeps a stray
    # file (README, .DS_Store, ...) from consuming a label index, which would
    # otherwise shift the labels of one split relative to another.
    class_names = sorted(
        entry for entry in os.listdir(split_path)
        if os.path.isdir(os.path.join(split_path, entry))
    )

    X = []
    y = []
    for label_idx, label_name in enumerate(class_names):
        class_dir = os.path.join(split_path, label_name)
        # os.listdir order is arbitrary; sort so the row order is reproducible.
        for img_name in sorted(os.listdir(class_dir)):
            img_path = os.path.join(class_dir, img_name)
            try:
                with Image.open(img_path) as img:
                    pixels = np.array(img.convert('RGB').resize(target_size))
            except Exception as exc:
                # Still best-effort, but no longer silent: a dataset that
                # loses many images should be noticed.
                warnings.warn(f"Skipping unreadable image {img_path}: {exc}")
                continue
            X.append(pixels)
            y.append(label_idx)  # Use numeric index for ML models

    if not X:
        # reshape(0, -1) cannot infer the feature width from zero elements.
        n_features = target_size[0] * target_size[1] * 3
        return np.empty((0, n_features), dtype=np.float64), np.empty(0, dtype=int)

    # Flatten to (N, width * height * 3) and normalize to [0, 1]
    return np.array(X).reshape(len(X), -1) / 255.0, np.array(y)

# Usage: read the three splits with identical preprocessing
base_dir = "./data/animal_subset"
(X_train, y_train), (X_val, y_val), (X_test, y_test) = [
    load_split_data(base_dir, split_name)
    for split_name in ('train', 'validation', 'test')
]

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

Train shape: (100, 150528), Test shape: (100, 150528)


## Training model

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Baselines: fit each model on the training features as loaded and score it
# with one-vs-rest ROC AUC on the validation features.

# 1. Logistic Regression
model_lr = LogisticRegression(random_state=1, max_iter=2000).fit(X_train, y_train)
y_prob_lr = model_lr.predict_proba(X_val)
auc_lr = roc_auc_score(y_true=y_val, y_score=y_prob_lr, multi_class='ovr')
print("Logistic Regression Model AUC:", auc_lr)

# 2. Random Forest
model_rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_prob_rf = model_rf.predict_proba(X_val)
auc_rf = roc_auc_score(y_true=y_val, y_score=y_prob_rf, multi_class='ovr')
print("Random Forest Model AUC:", auc_rf)

Logistic Regression Model AUC: 0.5632222222222222
Random Forest Model AUC: 0.5463333333333333


## Tuning

In [17]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# 1. Dimensionality Reduction
# PCA cannot extract more components than min(n_samples, n_features), so cap
# the requested 100 instead of raising when a smaller training split is used.
n_components = min(100, *X_train.shape)
print(f"Compressing data from {X_train.shape[1]:,} pixels to {n_components} components...")
pca = PCA(n_components=n_components, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_val_pca = pca.transform(X_val)  # reuse the projection fitted on train only
print("Compression complete!")

# 2. Logistic Regression Tuning
# Track the winner in code so the choice does not have to be read off stdout.
print("\n==== Logistic Regression Tuning ====")
best_lr_auc, best_lr_C = float("-inf"), None
for C in [0.01, 0.1, 1, 3, 5, 10]:
    model_lr = LogisticRegression(C=C, max_iter=2000, random_state=1)
    model_lr.fit(X_train_pca, y_train)

    y_prob = model_lr.predict_proba(X_val_pca)
    auc = roc_auc_score(y_val, y_prob, multi_class='ovr')
    print(f"C={C:<5} | AUC={auc:.4f}")

    # Strict '>' keeps the first configuration on ties.
    if auc > best_lr_auc:
        best_lr_auc, best_lr_C = auc, C
print(f"Best LR: C={best_lr_C} | AUC={best_lr_auc:.4f}")

# 3. Random Forest Tuning
print("\n==== Random Forest Tuning ====")
best_rf_auc, best_rf_params = float("-inf"), None
for n in [100, 200, 300]:
    for depth in [None, 5, 10, 20]:
        for leaf in [1, 3, 5]:
            model_rf = RandomForestClassifier(
                n_estimators=n,
                max_depth=depth,
                min_samples_leaf=leaf,
                random_state=42,
                n_jobs=-1
            )
            model_rf.fit(X_train_pca, y_train)

            y_prob = model_rf.predict_proba(X_val_pca)
            auc = roc_auc_score(y_val, y_prob, multi_class='ovr')
            print(f"trees={n:<3} depth={str(depth):<4} leaf={leaf:<2} | AUC={auc:.4f}")

            if auc > best_rf_auc:
                best_rf_auc = auc
                best_rf_params = {
                    "n_estimators": n,
                    "max_depth": depth,
                    "min_samples_leaf": leaf,
                }
print(f"Best RF: {best_rf_params} | AUC={best_rf_auc:.4f}")

Compressing data from 150,528 pixels to 100 components...
Compression complete!

==== Logistic Regression Tuning ====
C=0.01  | AUC=0.5477
C=0.1   | AUC=0.5461
C=1     | AUC=0.5471
C=3     | AUC=0.5480
C=5     | AUC=0.5483
C=10    | AUC=0.5511

==== Random Forest Tuning ====
trees=100 depth=None leaf=1  | AUC=0.5231
trees=100 depth=None leaf=3  | AUC=0.5400
trees=100 depth=None leaf=5  | AUC=0.5423
trees=100 depth=5    leaf=1  | AUC=0.5400
trees=100 depth=5    leaf=3  | AUC=0.5061
trees=100 depth=5    leaf=5  | AUC=0.5516
trees=100 depth=10   leaf=1  | AUC=0.5217
trees=100 depth=10   leaf=3  | AUC=0.5424
trees=100 depth=10   leaf=5  | AUC=0.5423
trees=100 depth=20   leaf=1  | AUC=0.5231
trees=100 depth=20   leaf=3  | AUC=0.5400
trees=100 depth=20   leaf=5  | AUC=0.5423
trees=200 depth=None leaf=1  | AUC=0.5243
trees=200 depth=None leaf=3  | AUC=0.5393
trees=200 depth=None leaf=5  | AUC=0.5283
trees=200 depth=5    leaf=1  | AUC=0.5284
trees=200 depth=5    leaf=3  | AUC=0.5249
trees=200 

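- Logistic Regression: C=10 gives the best validation AUC (0.5511) on the PCA-compressed features.
- Random Forest: trees=100, depth=5, leaf=5 gives the best validation AUC (0.5516) on the PCA-compressed features.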

## Train model with tuning

In [18]:
# Refit logistic regression with the C value chosen in the tuning step.
# NOTE(review): the tuning loop scored C on the PCA-compressed features
# (X_train_pca / X_val_pca), while this refit uses X_train / X_val directly;
# confirm the chosen C is meant to carry over.
model_lr = LogisticRegression(C=10, random_state=1, max_iter=2000).fit(X_train, y_train)
y_prob_lr = model_lr.predict_proba(X_val)
auc_lr = roc_auc_score(y_true=y_val, y_score=y_prob_lr, multi_class='ovr')
print("Logistic Regression Model AUC:", auc_lr)

Logistic Regression Model AUC: 0.5775555555555555


In [19]:
# 2. Random Forest, refit with the settings chosen in the tuning step.
# NOTE(review): those settings were scored on the PCA-compressed features
# (X_train_pca / X_val_pca), while this refit uses X_train / X_val directly;
# confirm they are meant to carry over.
model_rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    min_samples_leaf=5,
    random_state=42,
).fit(X_train, y_train)
y_prob_rf = model_rf.predict_proba(X_val)
auc_rf = roc_auc_score(y_true=y_val, y_score=y_prob_rf, multi_class='ovr')
print("Random Forest Model AUC:", auc_rf)

Random Forest Model AUC: 0.554111111111111


The validation AUC (not accuracy) is 0.58 with LR and 0.55 with RF, while the CNN scores 0.84, so we will choose the CNN for this 