In [1]:
import os
import cv2
import numpy as np
import joblib
from skimage.feature import hog
from sklearn import metrics
from sklearn.cluster import MiniBatchKMeans
from sklearn.svm import SVC
from cuml import KNeighborsClassifier
from xgboost import XGBClassifier

In [2]:
# Names of the feature extraction methods used in this notebook. These strings are
# the values accepted as `feature_method` by process_and_save_data() and are embedded
# in the cache file names it writes.
# NOTE(review): this list is not referenced by any other code visible in the notebook.
feature_methods = ['hog', 'sift', 'edge', 'color_histogram']

In [3]:
# Feature extraction method
def extract_hog_features(image):
    """Compute a HOG (histogram of oriented gradients) descriptor for one image.

    Parameters
    ----------
    image : numpy.ndarray
        Either a 2-D single-channel image or a 3-D colour image, which is
        converted to grayscale with ``cv2.COLOR_BGR2GRAY``. Any dtype other
        than ``uint8`` is cast with ``astype(np.uint8)``.

    Returns
    -------
    numpy.ndarray
        The descriptor reshaped to ``(1, n_features)``.

    Raises
    ------
    ValueError
        If ``image`` is ``None`` or contains no elements.
    """
    if image is None or image.size == 0:
        raise ValueError("Input image is empty")
    # Collapse colour images to a single channel.
    if len(image.shape) == 3:
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Work on 8-bit data, as the other extractors in this notebook do.
    if image.dtype != np.uint8:
        image = image.astype(np.uint8)
    # `visualize` is left at its default (False): the rendered HOG image was
    # computed and thrown away before, which only cost time and memory.
    features = hog(image, orientations=8, pixels_per_cell=(16, 16),
                   cells_per_block=(1, 1))
    return features.reshape(1, -1)  # One row per image

def extract_sift_features(image):
    """Return the SIFT descriptors detected in one image.

    Colour (3-D) input is converted to grayscale with ``cv2.COLOR_BGR2GRAY``
    and anything that is not ``uint8`` is cast with ``astype(np.uint8)``
    before detection.

    Returns the descriptor array produced by ``detectAndCompute``; when that
    is ``None`` (no descriptors), a single all-zero row of shape ``(1, 128)``
    is returned so callers always get a 2-D array.

    Raises ``ValueError`` if ``image`` is ``None`` or has no elements.
    """
    if image is None or image.size == 0:
        raise ValueError("Input image is empty")

    # Bring the input to single-channel 8-bit form.
    is_colour = len(image.shape) == 3
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if is_colour else image
    gray = gray if gray.dtype == np.uint8 else gray.astype(np.uint8)

    detector = cv2.SIFT_create()
    _keypoints, descriptors = detector.detectAndCompute(gray, None)
    if descriptors is not None:
        return descriptors
    # Placeholder row so downstream stacking still works for this image.
    return np.zeros((1, 128))

def extract_edge_features(image):
    """Return the flattened Canny edge map of one image as a ``(1, N)`` row.

    Colour (3-D) input is converted to grayscale with ``cv2.COLOR_BGR2GRAY``
    and anything that is not ``uint8`` is cast with ``astype(np.uint8)``
    before ``cv2.Canny`` is run with thresholds 100 and 200.

    Raises ``ValueError`` if ``image`` is ``None`` or has no elements.
    """
    if image is None or image.size == 0:
        raise ValueError("Input image is empty")

    # Bring the input to single-channel 8-bit form.
    gray = image
    if gray.ndim == 3:
        gray = cv2.cvtColor(gray, cv2.COLOR_BGR2GRAY)
    if gray.dtype != np.uint8:
        gray = gray.astype(np.uint8)

    edge_map = cv2.Canny(gray, threshold1=100, threshold2=200)
    # One row holding every pixel of the edge map.
    return edge_map.flatten()[np.newaxis, :]

def extract_color_histogram_features(image, bins=32):
    """Build a per-channel intensity histogram feature for one image.

    Parameters
    ----------
    image : numpy.ndarray
        A 3-D image (one histogram per channel along the last axis) or a
        2-D single-channel image (one histogram).
    bins : int, optional
        Number of histogram bins per channel over the range ``[0, 256)``.

    Returns
    -------
    numpy.ndarray
        ``float64`` array of shape ``(1, bins * n_channels)`` holding the
        channel histograms, each passed through ``cv2.normalize`` with its
        default settings, concatenated in channel order.

    Raises
    ------
    ValueError
        If ``image`` is ``None`` or contains no elements (same check and
        message as the other extractors in this notebook).
    """
    if image is None or image.size == 0:
        raise ValueError("Input image is empty")

    # cv2.calcHist only accepts 8-bit, 16-bit unsigned or 32-bit float input;
    # convert anything else instead of letting OpenCV raise.
    if image.dtype not in (np.uint8, np.uint16, np.float32):
        image = image.astype(np.float32)

    # A 2-D array is a single-channel image: there is no shape[2] to read.
    n_channels = 1 if image.ndim == 2 else image.shape[2]

    channel_hists = []
    for i in range(n_channels):
        hist = cv2.calcHist([image], [i], None, [bins], [0, 256])
        channel_hists.append(cv2.normalize(hist, hist).flatten())

    # float64 keeps the dtype the function has always returned.
    histogram = np.concatenate(channel_hists).astype(np.float64)
    return histogram.reshape(1, -1)

def create_bow_features(all_features, n_clusters=100):
    """Turn per-image descriptor sets into bag-of-words histograms.

    Every descriptor from every image is stacked, cast to ``float32`` and
    used to fit a ``MiniBatchKMeans`` codebook (``random_state=0``). Each
    image is then described by the density-normalised histogram of the
    cluster indices assigned to its descriptors.

    Parameters
    ----------
    all_features : sequence of numpy.ndarray
        One 2-D descriptor array per image.
    n_clusters : int, optional
        Codebook size, which is also the length of each histogram.

    Returns
    -------
    numpy.ndarray
        Array with one histogram row per entry of ``all_features``.
    """
    # Fit the codebook on the pooled descriptors of all images.
    pooled = np.vstack(all_features).astype(np.float32)
    codebook = MiniBatchKMeans(n_clusters=n_clusters, random_state=0)
    codebook.fit(pooled)

    # One bin per cluster index: edges 0, 1, ..., n_clusters.
    bin_edges = np.arange(n_clusters + 1)

    def _word_histogram(descriptors):
        # Cast to float32 so prediction sees the dtype the codebook was fitted on.
        words = codebook.predict(descriptors.astype(np.float32))
        counts, _ = np.histogram(words, bins=bin_edges, density=True)
        return counts

    return np.array([_word_histogram(descriptors) for descriptors in all_features])

In [4]:
# Load images and labels function
def load_img(filename, lb_min=None, lb_max=None, sn_min=None, sn_max=None):
    """Load the images listed in an index file, optionally filtered.

    Each non-blank line of ``filename`` must be ``<image path> <integer label>``.
    The serial number of an image is the integer between the last ``_`` of
    its path and the following ``.``.

    Parameters
    ----------
    filename : str
        Path of the index file.
    lb_min, lb_max : int or None
        Inclusive bounds on the label; ``None`` disables that bound.
    sn_min, sn_max : int or None
        Inclusive bounds on the serial number; ``None`` disables that bound.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``float32`` array of the selected images, each resized to 256x256,
        and the ``int32`` array of their labels.

    Raises
    ------
    ValueError
        If a non-blank line cannot be parsed.
    FileNotFoundError
        If a selected image cannot be read by ``cv2.imread``.
    """
    imgs, labels = [], []

    with open(filename, 'r') as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue  # Tolerate blank lines (e.g. a trailing newline)

            try:
                # Split on the LAST whitespace run so tabs, repeated spaces
                # and spaces inside the path do not break parsing.
                fn, label_text = line.rsplit(None, 1)
                label = int(label_text)
                serial_number = int(fn.split('_')[-1].split('.')[0])
            except ValueError as exc:
                raise ValueError(
                    f"{filename}:{line_no}: cannot parse line {line!r}") from exc

            # Keep only entries whose serial number and label are in range.
            if (sn_min is None or sn_min <= serial_number) and \
               (sn_max is None or serial_number <= sn_max) and \
               (lb_min is None or lb_min <= label) and \
               (lb_max is None or label <= lb_max):

                img = cv2.imread(fn)
                # cv2.imread signals failure by returning None, not by raising.
                if img is None:
                    raise FileNotFoundError(
                        f"Could not read image {fn!r} listed in {filename}")
                imgs.append(cv2.resize(img, (256, 256)))
                labels.append(label)

    imgs = np.asarray(imgs, np.float32)
    labels = np.asarray(labels, np.int32)
    return imgs, labels

In [5]:
def process_and_save_data(feature_method):
    """Return train/test features and labels for one feature type, with caching.

    Reads the notebook-level globals ``lb_min``, ``lb_max``, ``sn_min`` and
    ``sn_max``. Results are cached as joblib files under ``L{lb_max}N{sn_max}/``;
    when all four cache files exist they are loaded instead of recomputed.

    Parameters
    ----------
    feature_method : str
        One of ``'hog'``, ``'sift'``, ``'edge'`` or ``'color_histogram'``.

    Returns
    -------
    tuple
        ``(train_features, test_features, train_labels, test_labels)``.

    Raises
    ------
    ValueError
        If ``feature_method`` is not supported. This is checked before any
        directory is created or image is loaded.
    """
    if feature_method not in ('hog', 'sift', 'edge', 'color_histogram'):
        raise ValueError(f"Unsupported feature extraction method: {feature_method}")

    # NOTE(review): the cache key uses only lb_max and sn_max, so changing
    # lb_min or sn_min silently reuses a stale cache.
    out_dir = f'L{lb_max}N{sn_max}'
    train_features_path = f'{out_dir}/train_features_{feature_method}.joblib'
    test_features_path = f'{out_dir}/test_features_{feature_method}.joblib'
    train_labels_path = f'{out_dir}/train_labels.joblib'
    test_labels_path = f'{out_dir}/test_labels.joblib'

    os.makedirs(out_dir, exist_ok=True)

    # The load branch needs the label files too, so require all four.
    cache_paths = (train_features_path, test_features_path,
                   train_labels_path, test_labels_path)
    if all(os.path.exists(path) for path in cache_paths):
        return (joblib.load(train_features_path),
                joblib.load(test_features_path),
                joblib.load(train_labels_path),
                joblib.load(test_labels_path))

    # Only the training list is filtered by serial number.
    train_imgs, train_labels = load_img('train.txt', lb_min, lb_max, sn_min, sn_max)
    test_imgs, test_labels = load_img('test.txt', lb_min, lb_max)

    extractors = {
        'hog': extract_hog_features,
        'sift': extract_sift_features,
        'edge': extract_edge_features,
        'color_histogram': extract_color_histogram_features,
    }
    extract = extractors[feature_method]
    train_features = [extract(image) for image in train_imgs]
    test_features = [extract(image) for image in test_imgs]

    if feature_method == 'sift':
        # SIFT yields a variable number of descriptors per image, so encode
        # each image as a fixed-length bag-of-words histogram.
        # NOTE(review): the codebook is fitted on train AND test descriptors,
        # which leaks test data into the features; consider fitting on train only.
        n_train = len(train_features)
        bow_features = create_bow_features(train_features + test_features, n_clusters=100)
        train_features, test_features = np.split(bow_features, [n_train])
    else:
        # Each extractor returned a (1, n) row; stack them into (n_images, n).
        train_features = np.array(train_features)
        test_features = np.array(test_features)
        train_features = train_features.reshape(train_features.shape[0], -1)
        test_features = test_features.reshape(test_features.shape[0], -1)

    # Persist so later runs with the same settings skip extraction.
    joblib.dump(train_features, train_features_path)
    joblib.dump(test_features, test_features_path)
    joblib.dump(train_labels, train_labels_path)
    joblib.dump(test_labels, test_labels_path)

    return train_features, test_features, train_labels, test_labels

In [6]:
def train_and_evaluate_models(feature_method):
    """Fit (or load) every model in the global ``models`` dict and log its scores.

    Features and labels come from ``process_and_save_data(feature_method)``.
    Reads the notebook-level globals ``models``, ``lb_max`` and ``sn_max``.

    For each model:
    * a saved model under ``L{lb_max}N{sn_max}/`` is loaded if present,
      otherwise the model is fitted and saved there;
    * weighted F1 and accuracy on the test set are printed and written to a
      per-model metrics file;
    * two rows are appended to ``performances.txt`` and one row per test
      sample is appended to ``predictions_detailed.txt``.
    """
    dataset = process_and_save_data(feature_method)
    train_features, test_features, train_labels, test_labels = dataset

    # The estimators are handed float32 arrays.
    train_features = np.array(train_features, dtype=np.float32)
    test_features = np.array(test_features, dtype=np.float32)

    for name, model in models.items():
        model_filename = f"L{lb_max}N{sn_max}/{feature_method}_{name}_model.joblib"
        metrics_filename = f"L{lb_max}N{sn_max}/{feature_method}_{name}_metrics.txt"

        # Train only when no saved model exists for this configuration.
        if not os.path.exists(model_filename):
            print(f"Training {name} model with {feature_method} features.")
            model.fit(train_features, train_labels)
            joblib.dump(model, model_filename)
        else:
            print(f"Loading saved {name} model trained with {feature_method} features.")
            model = joblib.load(model_filename)

        # Score on the held-out test set.
        pred_labels = model.predict(test_features)
        f1 = metrics.f1_score(test_labels, pred_labels, average='weighted')
        accuracy = metrics.accuracy_score(test_labels, pred_labels)
        print(f"{feature_method} - {name} - F1 Score: {f1:.4f}, Accuracy: {accuracy:.4f}")

        # Per-model metrics file (overwritten on each run).
        with open(metrics_filename, 'w') as metrics_file:
            metrics_file.writelines([
                f"F1 Score: {f1:.4f}\n",
                f"Accuracy: {accuracy:.4f}\n",
            ])

        # Running log across all experiments (appended).
        with open('performances.txt', 'a') as perf_file:
            perf_file.writelines([
                f"{feature_method}, {name}, {sn_max}, {lb_max}, F1-score, {f1:.4f}\n",
                f"{feature_method}, {name}, {sn_max}, {lb_max}, Accuracy, {accuracy:.4f}\n",
            ])

        # 1 where the prediction matches the true label, 0 otherwise.
        comparison = (pred_labels == test_labels).astype(int)

        # Long-format per-sample log (appended).
        with open('predictions_detailed.txt', 'a') as detail_file:
            detail_file.writelines(
                f"{feature_method}, {name}, {sn_max}, {lb_max}, {actual}, {predicted}, {correct}\n"
                for actual, predicted, correct in zip(test_labels, pred_labels, comparison)
            )

In [7]:
# Experiment: labels 0-50, training serial numbers 0-100, three classifiers,
# all four feature types. These globals are read by process_and_save_data()
# and train_and_evaluate_models().
sn_min, sn_max = 0, 100
lb_min, lb_max = 0, 50
models = {
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'SVM': SVC(kernel='linear'),
    'XGBoost': XGBClassifier(tree_method='hist', device='cuda'),
}
for feature_method in ('sift', 'color_histogram', 'edge', 'hog'):
    train_and_evaluate_models(feature_method)

Loading saved KNN model trained with sift features.
sift - KNN - F1 Score: 0.0392, Accuracy: 0.0392
Loading saved SVM model trained with sift features.
sift - SVM - F1 Score: 0.0023, Accuracy: 0.0196
Loading saved XGBoost model trained with sift features.


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




sift - XGBoost - F1 Score: 0.0392, Accuracy: 0.0588
Loading saved KNN model trained with color_histogram features.
color_histogram - KNN - F1 Score: 0.0261, Accuracy: 0.0392
Loading saved SVM model trained with color_histogram features.
color_histogram - SVM - F1 Score: 0.0928, Accuracy: 0.1176
Loading saved XGBoost model trained with color_histogram features.
color_histogram - XGBoost - F1 Score: 0.0588, Accuracy: 0.0784
Loading saved KNN model trained with edge features.


  return _core.array(a, dtype, False, order, blocking=blocking)


edge - KNN - F1 Score: 0.0008, Accuracy: 0.0196
Loading saved SVM model trained with edge features.
edge - SVM - F1 Score: 0.0361, Accuracy: 0.0784
Loading saved XGBoost model trained with edge features.
edge - XGBoost - F1 Score: 0.0079, Accuracy: 0.0392
Loading saved KNN model trained with hog features.
hog - KNN - F1 Score: 0.0025, Accuracy: 0.0196
Loading saved SVM model trained with hog features.
hog - SVM - F1 Score: 0.0732, Accuracy: 0.0980
Loading saved XGBoost model trained with hog features.
hog - XGBoost - F1 Score: 0.0425, Accuracy: 0.0588


In [8]:
# Experiment: labels 0-100, training serial numbers 0-100. From here on only
# KNN and XGBoost are evaluated, on SIFT and colour-histogram features.
sn_min, sn_max = 0, 100
lb_min, lb_max = 0, 100
models = {
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'XGBoost': XGBClassifier(tree_method='hist', device='cuda'),
}
for feature_method in ('sift', 'color_histogram'):
    train_and_evaluate_models(feature_method)

Loading saved KNN model trained with sift features.
sift - KNN - F1 Score: 0.0020, Accuracy: 0.0099
Loading saved XGBoost model trained with sift features.
sift - XGBoost - F1 Score: 0.0617, Accuracy: 0.0891
Loading saved KNN model trained with color_histogram features.
color_histogram - KNN - F1 Score: 0.0066, Accuracy: 0.0099
Loading saved XGBoost model trained with color_histogram features.
color_histogram - XGBoost - F1 Score: 0.0528, Accuracy: 0.0693


In [9]:
# Experiment: labels 0-150, training serial numbers 0-100.
# `models` is not set here; the dict left by an earlier cell is used.
sn_min, sn_max = 0, 100
lb_min, lb_max = 0, 150
for feature_method in ('sift', 'color_histogram'):
    train_and_evaluate_models(feature_method)

Loading saved KNN model trained with sift features.
sift - KNN - F1 Score: 0.0052, Accuracy: 0.0132
Loading saved XGBoost model trained with sift features.
sift - XGBoost - F1 Score: 0.0453, Accuracy: 0.0596
Loading saved KNN model trained with color_histogram features.
color_histogram - KNN - F1 Score: 0.0044, Accuracy: 0.0066
Loading saved XGBoost model trained with color_histogram features.
color_histogram - XGBoost - F1 Score: 0.0648, Accuracy: 0.0795


In [10]:
# Experiment: labels 0-200, training serial numbers 0-100.
# `models` is not set here; the dict left by an earlier cell is used.
sn_min, sn_max = 0, 100
lb_min, lb_max = 0, 200
for feature_method in ('sift', 'color_histogram'):
    train_and_evaluate_models(feature_method)

Loading saved KNN model trained with sift features.
sift - KNN - F1 Score: 0.0192, Accuracy: 0.0250
Loading saved XGBoost model trained with sift features.
sift - XGBoost - F1 Score: 0.0278, Accuracy: 0.0400
Loading saved KNN model trained with color_histogram features.
color_histogram - KNN - F1 Score: 0.0083, Accuracy: 0.0100
Loading saved XGBoost model trained with color_histogram features.
color_histogram - XGBoost - F1 Score: 0.0353, Accuracy: 0.0500


In [11]:
# Experiment: labels 0-50, training serial numbers 0-400.
# `models` is not set here; the dict left by an earlier cell is used.
sn_min, sn_max = 0, 400
lb_min, lb_max = 0, 50
for feature_method in ('sift', 'color_histogram'):
    train_and_evaluate_models(feature_method)

Loading saved KNN model trained with sift features.
sift - KNN - F1 Score: 0.0157, Accuracy: 0.0392
Loading saved XGBoost model trained with sift features.
sift - XGBoost - F1 Score: 0.1059, Accuracy: 0.1373
Loading saved KNN model trained with color_histogram features.
color_histogram - KNN - F1 Score: 0.0327, Accuracy: 0.0392
Loading saved XGBoost model trained with color_histogram features.
color_histogram - XGBoost - F1 Score: 0.1307, Accuracy: 0.1961


In [12]:
# Experiment: labels 0-50, training serial numbers 0-200.
# `models` is not set here; the dict left by an earlier cell is used.
sn_min, sn_max = 0, 200
lb_min, lb_max = 0, 50
for feature_method in ('sift', 'color_histogram'):
    train_and_evaluate_models(feature_method)

Loading saved KNN model trained with sift features.
sift - KNN - F1 Score: 0.0131, Accuracy: 0.0196
Loading saved XGBoost model trained with sift features.
sift - XGBoost - F1 Score: 0.0327, Accuracy: 0.0392
Loading saved KNN model trained with color_histogram features.
color_histogram - KNN - F1 Score: 0.0523, Accuracy: 0.0588
Loading saved XGBoost model trained with color_histogram features.
color_histogram - XGBoost - F1 Score: 0.0392, Accuracy: 0.0588


In [13]:
# Experiment: labels 0-50, training serial numbers 0-50.
# `models` is not set here; the dict left by an earlier cell is used.
sn_min, sn_max = 0, 50
lb_min, lb_max = 0, 50
for feature_method in ('sift', 'color_histogram'):
    train_and_evaluate_models(feature_method)

Loading saved KNN model trained with sift features.
sift - KNN - F1 Score: 0.0065, Accuracy: 0.0196
Loading saved XGBoost model trained with sift features.
sift - XGBoost - F1 Score: 0.1046, Accuracy: 0.1176
Loading saved KNN model trained with color_histogram features.
color_histogram - KNN - F1 Score: 0.0314, Accuracy: 0.0588
Loading saved XGBoost model trained with color_histogram features.
color_histogram - XGBoost - F1 Score: 0.0405, Accuracy: 0.0588
