In [1]:
import os
import cv2
from skimage.feature import hog, graycomatrix, graycoprops
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
import math
import re
from collections import defaultdict
from sklearn.preprocessing import StandardScaler

In [3]:
# Set the path to the folder containing your dataset
# Adjust this path to where you saved the images
dataset_folder = 'Cell_Images'

# We will store the loaded images in a list
cell_loaded_images = []
# It's also useful to store their filenames
cell_image_filenames = []

print(f"Starting to load images from: {dataset_folder}")

# Loop through every file in the folder
for filename in os.listdir(dataset_folder):
    
    # Check if the file is an image (e.g., .jpg, .png)
    if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp')):
        
        # Create the full path to the image
        image_path = os.path.join(dataset_folder, filename)
        
        # Load the image using OpenCV
        image = cv2.imread(image_path)
        
        if image is not None:
            # If loading was successful, add it to our list
            cell_loaded_images.append(image)
            cell_image_filenames.append(filename)
        else:
            print(f"Warning: Could not load {filename}")

print(f"---")
print(f"Success! Loaded {len(cell_loaded_images)} images.")
print(f"Example: The first image '{cell_image_filenames[0]}' has dimensions: {cell_loaded_images[0].shape}")

# ...existing code...
df = pd.read_csv("final_labels_manual.csv")

def normalize_id(s):
    return os.path.splitext(os.path.basename(str(s)).strip())[0].lower()

# build label map keyed by basename without extension (normalized)
label_map = { normalize_id(cid): lab for cid, lab in zip(df["cell_id"], df["label"]) }

y_labels = []
missing = []
for fname in cell_image_filenames:
    key = normalize_id(fname)
    if key in label_map:
        y_labels.append(int(label_map[key]))
    else:
        missing.append(fname)

if missing:
    raise KeyError(f"{len(missing)} filenames not found in label_map. Examples: {missing[:10]}")
# ...existing code...

Starting to load images from: Cell_Images
---
Success! Loaded 27584 images.
Example: The first image 'IMG_4171_63.jpg' has dimensions: (75, 100, 3)


In [4]:
def extract_all_features(cell):
    """
    Extracts a combined feature vector from a single 100x75 image cell.
    
    Features:
    1. HSV Stats (6 features)
    2. GLCM Properties (4 features)
    3. HOG Features (192 features)
    4. Edge Density (1 feature)
    5. Harris Corner Count (1 feature)
    6. FAST Keypoint Count (1 feature)
    
    Total Features: 6 + 4 + 192 + 1 + 1 + 1 = 205 features
    """
    
    # --- 0. Prerequisites ---
    # Create a grayscale version for features that need it
    gray_cell = cv2.cvtColor(cell, cv2.COLOR_BGR2GRAY)
    
    # Ensure grayscale is 8-bit unsigned int (required for GLCM)
    gray_cell_uint8 = cv2.convertScaleAbs(gray_cell)
    
    feature_vector = []

    try:
        # --- 1. HSV Stats (6 features) ---
        hsv_cell = cv2.cvtColor(cell, cv2.COLOR_BGR2HSV)
        h, s, v = cv2.split(hsv_cell)
        hsv_stats = [
            np.mean(h), np.std(h),
            np.mean(s), np.std(s),
            np.mean(v), np.std(v)
        ]
        feature_vector.extend(hsv_stats)
        
        # --- 2. GLCM Properties (4 features) ---
        # 
        # Calculate GLCM
        glcm = graycomatrix(gray_cell_uint8, 
                            distances=[1], 
                            angles=[0], 
                            levels=256, 
                            symmetric=True, 
                            normed=True)
        
        # Calculate properties
        glcm_feats = [
            graycoprops(glcm, 'contrast')[0, 0],
            graycoprops(glcm, 'homogeneity')[0, 0],
            graycoprops(glcm, 'energy')[0, 0],
            graycoprops(glcm, 'correlation')[0, 0]
        ]
        feature_vector.extend(glcm_feats)

        # --- 3. HOG Features (192 features) ---
        # 
        # orientations=8: 8 directions for gradients
        # pixels_per_cell=(16, 16): Each cell is 16x16 pixels
        # cells_per_block=(1, 1): Each block is 1x1 cells
        # (100/16 = 6.25 -> 6 cells wide, 75/16 = 4.68 -> 4 cells high)
        # Total blocks = 6 * 4 = 24
        # Total features = 24 blocks * 1x1 cells/block * 8 orientations = 192
        hog_features = hog(gray_cell, 
                           orientations=8, 
                           pixels_per_cell=(16, 16),
                           cells_per_block=(1, 1), 
                           visualize=False, 
                           block_norm='L2-Hys')
        feature_vector.extend(hog_features)

        # --- 4. Edge Density (1 feature) ---
        # Use standard Canny thresholds
        edges = cv2.Canny(cell, threshold1=100, threshold2=200)
        # Calculate mean (percentage of edge pixels)
        edge_density = np.mean(edges) / 255.0
        feature_vector.append(edge_density)

        # --- 5. Harris Corner Count (1 feature) ---
        # blockSize=2, ksize=3, k=0.04 are standard values
        dst = cv2.cornerHarris(gray_cell, blockSize=2, ksize=3, k=0.04)
        # Count corners by thresholding the response
        harris_corner_count = np.sum(dst > 0.01 * dst.max())
        feature_vector.append(harris_corner_count)

        # --- 6. FAST Keypoint Count (1 feature) ---
        fast = cv2.FastFeatureDetector_create()
        keypoints = fast.detect(gray_cell, None)
        fast_keypoint_count = len(keypoints)
        feature_vector.append(fast_keypoint_count)

    except Exception as e:
        print(f"Error extracting features from a cell: {e}")
        # Return None or a vector of zeros if extraction fails
        return None

    # Return the final, flat vector of 205 features
    return np.array(feature_vector)

In [None]:
# # filenames now like: "<image_id>_<cellno>" e.g. "1244_55.jpg" or "1244_55"
# pat = re.compile(r"(?P<imgid>.+?)_(?P<cell>\d+)(?:\..+)?$")

# groups = defaultdict(list)
# for img_arr, fname in zip(cell_loaded_images, cell_image_filenames):
#     m = pat.match(fname)
#     if not m:
#         # try basename fallback
#         m = pat.match(os.path.basename(fname))
#     if not m:
#         # skip files that don't follow the new naming convention
#         continue
#     imgid = m.group("imgid")
#     cell_no = int(m.group("cell"))  # numeric cell index
#     # label lookup using filename (try exact, then basename)
#     label = label_map.get(fname, label_map.get(os.path.basename(fname), np.nan))
#     groups[imgid].append((cell_no, img_arr, label, fname))

# # Build rows: only keep original images that have exactly 64 cells, sorted by cell number
# cell_images_rows = []
# label_rows = []
# for imgid, items in groups.items():
#     if len(items) != 64:
#         # skip incomplete images (or change this logic to pad/handle differently)
#         continue
#     # sort by numeric cell index (assumes cell indices 0..63 or 1..64)
#     items_sorted = sorted(items, key=lambda x: x[0])
#     imgs = [np.asarray(it[1], dtype=np.uint8) for it in items_sorted]
#     labs = [int(it[2]) if not pd.isna(it[2]) else 0 for it in items_sorted]
#     cell_images_rows.append(imgs)
#     label_rows.append(labs)

# # Final 2D arrays: each row corresponds to one original image (64 cells)
# cell_images_2d = np.array(cell_images_rows, dtype=object)  # shape (num_images, 64)
# y_labels_2d = np.array(label_rows, dtype=int)              # shape (num_images, 64)

# print("cell_images_2d.shape:", cell_images_2d.shape)
# print("y_labels_2d.shape:", y_labels_2d.shape)
# # ...existing code...

cell_images_2d.shape: (431, 64, 75, 100, 3)
y_labels_2d.shape: (431, 64)


In [None]:
# def rebuild_and_mark(cells, labels, grid_size=8, green_value=200, tint_alpha=0.6):
#     """
#     Apply a full green overlay to cells with label==1.
#     tint_alpha: 0.0 = no tint, 1.0 = full green
#     green_value: 0-255 intensity for green channel (BGR)
#     """
#     cells_list = [np.asarray(c, dtype=np.uint8) for c in list(cells)]
#     num_cells = len(cells_list)
#     assert num_cells == grid_size * grid_size, "Expected 64 cells"

#     h, w = cells_list[0].shape[:2]
#     channels = 3
#     H, W = h * grid_size, w * grid_size
#     final_image = np.zeros((H, W, channels), dtype=np.uint8)

#     # green tint image (BGR)
#     tint_img = np.zeros((h, w, 3), dtype=np.uint8)
#     tint_img[:] = (0, green_value, 0)

#     idx = 0
#     for row in range(grid_size):
#         for col in range(grid_size):
#             cell = cells_list[idx].copy()
#             # ensure 3-channel uint8
#             if cell.ndim == 2:
#                 cell = cv2.cvtColor(cell, cv2.COLOR_GRAY2BGR)
#             elif cell.shape[2] == 1:
#                 cell = cv2.cvtColor(cell, cv2.COLOR_GRAY2BGR)
#             cell = cell.astype(np.uint8)

#             if labels[idx] == 1:
#                 # full-cell green overlay
#                 cell = cv2.addWeighted(cell, 1.0 - tint_alpha, tint_img, tint_alpha, 0)

#             final_image[row*h:(row+1)*h, col*w:(col+1)*w] = cell
#             idx += 1

#     return final_image

In [None]:
# # ...existing code...
# # fix LABEL_OUTPUT_FOLDER stray quote
# LABEL_OUTPUT_FOLDER = 'Labeled_images'
# label_dir = os.path.join(LABEL_OUTPUT_FOLDER)
# os.makedirs(label_dir, exist_ok=True)
# for cells, labels in zip(cell_images_2d, y_labels_2d):
#     image = rebuild_and_mark(cells, labels)
#     cv2.imwrite(os.path.join(label_dir, f"labeled_image_{np.random.randint(1e6)}.png"), image)
# # ...existing code...

In [7]:

print("Starting feature extraction for all cells...")

# X_data will be your feature matrix (e.g., 12800 rows x 205 columns)
X_data = []
for image, filename in zip(cell_loaded_images,cell_image_filenames):
        
        # Extract the 205 features from the 100x75 cell
        features = extract_all_features(image)
        
        if features is not None:
            # Add the feature vector to our dataset
            X_data.append(features)

# Convert X_data to a NumPy array for efficiency
X_data = np.array(X_data)

print("---")
print("Feature extraction complete!")
print(f"Feature matrix 'X_data' shape: {X_data.shape}")

Starting feature extraction for all cells...
---
Feature extraction complete!
Feature matrix 'X_data' shape: (27584, 205)


In [None]:
print("\n--- Training Model ---")

# --- 1. Split Data into Training and Testing Sets ---
# We'll use 80% for training, 20% for testing
y_data = y_labels
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, 
    test_size=0.2, 
    random_state=42, 
    stratify=y_data  # Ensures both sets get a similar % of 0s and 1s
)

print(f"Original training data: {X_train.shape[0]} samples")
print(f"Test data: {X_test.shape[0]} samples")

print("Handling data imbalance with SMOTE...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print(f"New resampled training data: {X_train_resampled.shape[0]} samples")
print(f"Wildlife (1s) in new training set: {np.sum(y_train_resampled)}")
print(f"Background (0s) in new training set: {len(y_train_resampled) - np.sum(y_train_resampled)}")

# --- 3. Train the Random Forest Classifier ---
rf_model = RandomForestClassifier(n_estimators=100, 
                                random_state=42, 
                                n_jobs=-1)

rf_model.fit(X_train_resampled, y_train_resampled)
print("Draft model training complete.")

print("\n--- Draft Model Evaluation (on Test Set) ---")
y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=['Background (0)', 'Wildlife (1)']))

In [None]:


print("\n--- Training SVM Model ---")

y_data = y_labels
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, 
    test_size=0.2, 
    random_state=42, 
    stratify=y_data
)

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

svm_model = SVC(random_state=42)
svm_model.fit(X_train_resampled, y_train_resampled)

print("SVM model training complete.")

print("\n--- SVM Model Evaluation (on Test Set) ---")
y_pred_svm = svm_model.predict(X_test)
print(classification_report(y_test, y_pred_svm, target_names=['Background (0)', 'Wildlife (1)']))

print("SVM Confusion Matrix:")
cm_svm = confusion_matrix(y_test, y_pred_svm)
print(cm_svm)

In [8]:

y_data = y_labels
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, 
    test_size=0.2, 
    random_state=42, 
    stratify=y_data
)

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

xgb_model = xgb.XGBClassifier(n_estimators=300, 
                            random_state=42, 
                            n_jobs=-1, 
                            use_label_encoder=False, 
                            eval_metric='logloss')

xgb_model.fit(X_train_resampled, y_train_resampled)
print("XGBoost model training complete.")

print("\n--- XGBoost Model Evaluation (on Test Set) ---")
y_pred_xgb = xgb_model.predict(X_test)
print(classification_report(y_test, y_pred_xgb, target_names=['Background (0)', 'Wildlife (1)']))

print("XGBoost Confusion Matrix:")
cm_xgb = confusion_matrix(y_test, y_pred_xgb)
print(cm_xgb)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost model training complete.

--- XGBoost Model Evaluation (on Test Set) ---
                precision    recall  f1-score   support

Background (0)       0.90      0.91      0.90      4287
  Wildlife (1)       0.66      0.64      0.65      1230

      accuracy                           0.85      5517
     macro avg       0.78      0.77      0.78      5517
  weighted avg       0.84      0.85      0.85      5517

XGBoost Confusion Matrix:
[[3880  407]
 [ 441  789]]
