# Persyaratan Teknis:
- Gunakan minimal dua operator pengolahan citra dasar (misalnya,
Gaussian blur, edge detection) untuk meningkatkan kualitas data.
- Implementasikan minimal dua metode detektor atau deskriptor fitur lokal
(misalnya, SIFT, SURF, ORB).
- Buat sistem untuk menemukan korespondensi antara beberapa citra
(misalnya, menggunakan keypoints matching) dan menerapkannya dalam
pengenalan.
- Bangun sistem pengenalan berbasis fitur yang mampu mengklasifikasikan
atau mengidentifikasi objek dalam citra secara otomatis.

In [5]:
import matplotlib.pyplot as plt
from pathlib import Path
import cv2
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [6]:
DATASET_PATH = Path('../dataset/Data')

train_paths = pd.read_csv(DATASET_PATH / "train_paths.csv")
valid_paths = pd.read_csv(DATASET_PATH / "valid_paths.csv")
test_paths = pd.read_csv(DATASET_PATH / "test_paths.csv")

## Functions

In [7]:
def preprocess_image(image): # all good 
    # Apply Gaussian Blur
    blurred = cv2.GaussianBlur(image, (5, 5), 0)
    # Apply Edge Detection (Canny)
    edges = cv2.Canny(blurred, 100, 200)
    return edges

def load_images(annotation): # all good
    images = []
    # heights = []
    # widhts = []
    for path in annotation['path'].values:
        image = cv2.imread(path, flags=cv2.IMREAD_GRAYSCALE)
        image = cv2.resize(image, (256, 256))
        image = preprocess_image(image)
        images.append(image)
        # heights.append(np.shape(image)[0])
        # widhts.append(np.shape(image)[1])
    # print((np.min(heights) - np.max(heights)) // 2)
    # print((np.min(widhts) - np.max(widhts)) // 2)
    return np.array(images)

train_images = load_images(train_paths)
valid_images = load_images(valid_paths)
test_images = load_images(test_paths)

[ WARN:0@17.792] global loadsave.cpp:241 findDecoder imread_('dataset/Data/train/adenocarcinoma_left.lower.lobe_T2_N0_M0_Ib/000046 (5).png'): can't open/read file: check file path/integrity


error: OpenCV(4.10.0) /io/opencv/modules/imgproc/src/resize.cpp:4152: error: (-215:Assertion failed) !ssize.empty() in function 'resize'


In [None]:
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import cv2

# Step 1: Extract Features Using Both SIFT and ORB
def extract_combined_features(images):
    sift = cv2.SIFT_create()
    orb = cv2.ORB_create()
    combined_features = []
    
    for image in tqdm(images, desc="Extracting SIFT + ORB features"):
        # Extract SIFT features
        _, sift_descriptors = sift.detectAndCompute(image, None)
        sift_descriptors = sift_descriptors if sift_descriptors is not None else np.zeros((1, 128))

        # Extract ORB features
        _, orb_descriptors = orb.detectAndCompute(image, None)
        orb_descriptors = orb_descriptors if orb_descriptors is not None else np.zeros((1, 32))

        # Concatenate descriptors
        combined_descriptors = np.hstack([
            np.resize(sift_descriptors, (len(sift_descriptors), 128)),  # Ensure fixed length
            np.resize(orb_descriptors, (len(orb_descriptors), 32))     # Ensure fixed length
        ])
        combined_features.append(combined_descriptors)
    
    return combined_features

# Step 2: Create Bag of Visual Words (BoVW) for Combined Features
def create_bovw_features(features, n_clusters=100):
    # Combine all descriptors for clustering
    all_descriptors = np.vstack(features)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(all_descriptors)
    
    # Create histogram representation for each image
    bovw_features = []
    for descriptors in tqdm(features, desc="Creating BoVW features"):
        histogram = np.zeros(n_clusters)
        if descriptors is not None:
            cluster_labels = kmeans.predict(descriptors)
            for label in cluster_labels:
                histogram[label] += 1
        bovw_features.append(histogram)
    return np.array(bovw_features), kmeans

# Step 3: Prepare Data
train_features_combined = extract_combined_features(train_images)
valid_features_combined = extract_combined_features(valid_images)
test_features_combined = extract_combined_features(test_images)

train_bovw, kmeans = create_bovw_features(train_features_combined, n_clusters=150)
valid_bovw, _ = create_bovw_features(valid_features_combined, kmeans.n_clusters)
test_bovw, _ = create_bovw_features(test_features_combined, kmeans.n_clusters)

# Normalize features
scaler = StandardScaler()
train_bovw = scaler.fit_transform(train_bovw)
valid_bovw = scaler.transform(valid_bovw)
test_bovw = scaler.transform(test_bovw)

# Step 4: Train Classifier
labels = train_paths['label']  # Assuming the labels are in a column named 'label'
clf = RandomForestClassifier(n_estimators=200, random_state=42)
clf.fit(train_bovw, labels)

# Step 5: Evaluate
valid_preds = clf.predict(valid_bovw)
valid_labels = valid_paths['label']
print("Validation Classification Report:")
print(classification_report(valid_labels, valid_preds))

test_preds = clf.predict(test_bovw)
test_labels = test_paths['label']
print("Test Classification Report:")
print(classification_report(test_labels, test_preds))

# Step 6: Visualization
def plot_sample_predictions(images, labels, preds, num_samples=5):
    plt.figure(figsize=(15, 5))
    for i in range(num_samples):
        plt.subplot(1, num_samples, i + 1)
        plt.imshow(images[i], cmap='gray')
        plt.title(f"True: {labels[i]}\nPred: {preds[i]}")
        plt.axis('off')
    plt.show()

plot_sample_predictions(test_images, test_labels, test_preds)

In [55]:
# Step 1: Combine SIFT and ORB Features
def extract_combined_features(image, upper_limit=None):
    # SIFT Feature Extraction
    sift = cv2.SIFT_create(nfeatures=200)
    keypoints_sift, descriptors_sift = sift.detectAndCompute(image, None)

    # ORB Feature Extraction
    orb = cv2.ORB_create(nfeatures=200)
    keypoints_orb, descriptors_orb = orb.detectAndCompute(image, None)

    # Handle None descriptors
    if descriptors_sift is None:
        descriptors_sift = np.zeros((1, 128))
    if descriptors_orb is None:
        descriptors_orb = np.zeros((1, 32))

    if len(descriptors_sift) > 0:
        norms_sift = np.linalg.norm(descriptors_sift, axis=1, keepdims=True)
        descriptors_sift = descriptors_sift / (norms_sift + 1e-7)  # Avoid division by zero

    # Flatten and concatenate
    combined_features = np.concatenate(
        [descriptors_sift.flatten(), descriptors_orb.flatten()]
    )

    # Ensure fixed size (upper_limit) by padding or truncating
    if upper_limit == None:
        return combined_features
    elif len(combined_features) < upper_limit:
        combined_features = np.pad(combined_features, (0, upper_limit - len(combined_features)))
    else:
        combined_features = combined_features[:upper_limit]

    return combined_features

# Step 2: Extract Features from Dataset
def extract_features(images, upper_limit=400):
    features = []
    for image in images:
        combined_features = extract_combined_features(image, upper_limit)
        features.append(combined_features)
    return np.array(features)

In [None]:
import cv2
import numpy as np

# Example labels for training images
train_labels = train_paths['class'].values 
valid_labels = valid_paths['class'].values 
test_labels = test_paths['class'].values 

# Step 1: Prepare the matcher
def create_matcher(matcher_type="BF"):
    if matcher_type == "FLANN":
        index_params = dict(algorithm=1, trees=5)  # FLANN parameters
        search_params = dict(checks=50)  # Search parameters
        matcher = cv2.FlannBasedMatcher(index_params, search_params)
    elif matcher_type == "BF":
        matcher = cv2.BFMatcher(cv2.NORM_L2, crossCheck=False)  # BFMatcher with L2 norm
    else:
        raise ValueError("Invalid matcher type. Choose 'FLANN' or 'BF'.")
    return matcher

# Step 2: Classify an image
def classify_image(image, train_features, train_labels, matcher_type="BF", k=2):
    matcher = create_matcher(matcher_type)
    query_features = extract_combined_features(image).astype(np.float32)

    # Match query features with each set of train features
    best_label = None
    best_index = 0
    max_matches = 0

    for i, train_feature in tqdm(enumerate(train_features)):
        train_feature = train_feature.astype(np.float32)
        matches = matcher.match(query_features.reshape(-1, 1), train_feature.reshape(-1, 1))

        # Count the number of good matches
        good_matches = [m for m in matches if m.distance < 0.7 * max(m.distance for m in matches)]

        # Update best match
        if len(good_matches) > max_matches:
            max_matches = len(good_matches)
            best_index = i
            best_label = train_labels[i]

    return best_label, best_index

# Step 3: Testing Classification
# Assuming you already have `train_features` extracted from your training set
train_features = extract_features(train_images)
valid_features = extract_features(valid_images)
test_features = extract_features(test_images)

In [None]:
# from random import randint
# random_idx = randint(0, len(test_paths)-1)
# test_label = test_paths.iloc[random_idx, 1]
# test_image = test_images[random_idx]  # Replace with any test image
# predicted_label, match_img_idx = classify_image(test_image, train_features, train_labels, matcher_type="BF")
# print(f"True label: {test_label}")
# print(f"Predicted label: {predicted_label}")

0it [00:00, ?it/s]

Predicted label: large.cell.carcinoma


In [60]:
# Combine train and validation sets for final training
X_train = np.vstack((train_features, valid_features))
y_train = np.hstack((train_labels, valid_labels))

# Use test features for evaluation
X_test = test_features
y_test = test_labels

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Create the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on the test set
rf_predictions = rf_model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, rf_predictions))
print("Classification Report:\n", classification_report(y_test, rf_predictions))

Accuracy: 0.526984126984127
Classification Report:
                          precision    recall  f1-score   support

         adenocarcinoma       0.48      0.78      0.59       120
   large.cell.carcinoma       0.37      0.14      0.20        51
                 normal       0.82      0.87      0.85        54
squamous.cell.carcinoma       0.41      0.21      0.28        90

               accuracy                           0.53       315
              macro avg       0.52      0.50      0.48       315
           weighted avg       0.50      0.53      0.48       315



In [None]:
from sklearn.svm import SVC

svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)
svm_predictions = svm_model.predict(X_test)

print("SVM Accuracy:", accuracy_score(y_test, svm_predictions))
print("SVM Classification Report:\n", classification_report(y_test, svm_predictions))

SVM Accuracy: 0.43174603174603177
SVM Classification Report:
                          precision    recall  f1-score   support

         adenocarcinoma       0.46      0.53      0.49       120
   large.cell.carcinoma       0.12      0.10      0.11        51
                 normal       0.53      0.87      0.66        54
squamous.cell.carcinoma       0.42      0.23      0.30        90

               accuracy                           0.43       315
              macro avg       0.38      0.43      0.39       315
           weighted avg       0.41      0.43      0.40       315

