In [1]:
import cv2
import numpy as np
import os
import pandas as pd
from skimage.feature import local_binary_pattern

In [2]:
def segment_gemstone(image, iterations=5, rect_margin=10):
    """
    Remove background from an image using the GrabCut algorithm.

    GrabCut is initialised with a rectangle inset from every image border by
    a margin; pixels outside the rectangle are treated as background and the
    algorithm decides which pixels inside it belong to the foreground.

    Parameters:
        image (numpy.ndarray): Input BGR image.
        iterations (int): Number of iterations for the GrabCut algorithm.
        rect_margin (int): Requested inset (in pixels) of the initial
            rectangle from each border. It is reduced automatically for small
            images so the rectangle always keeps a positive width and height.

    Returns:
        segmented (numpy.ndarray): The segmented image with background removed (background set to black).

    Raises:
        ValueError: If no rectangle with at least a one-pixel border fits,
            i.e. the image is smaller than 3 pixels on a side or
            ``rect_margin`` is below 1.
    """
    height, width = image.shape[:2]

    # A fixed margin produces a zero/negative-sized rectangle on small images,
    # which GrabCut rejects. Clamp it so width/height stay >= 1.
    margin = min(int(rect_margin), (min(height, width) - 1) // 2)
    if margin < 1:
        raise ValueError(
            f"Cannot place a GrabCut rectangle in a {width}x{height} image "
            f"with rect_margin={rect_margin}; need a margin of at least 1 pixel "
            "and an image of at least 3x3 pixels."
        )
    rect = (margin, margin, width - 2 * margin, height - 2 * margin)

    # Label mask filled in by GrabCut, plus the model buffers it requires
    mask = np.zeros((height, width), np.uint8)
    bgdModel = np.zeros((1, 65), np.float64)
    fgdModel = np.zeros((1, 65), np.float64)

    # Apply GrabCut algorithm
    cv2.grabCut(image, mask, rect, bgdModel, fgdModel, iterations, cv2.GC_INIT_WITH_RECT)

    # Sure and probable foreground become 1, everything else 0
    mask2 = np.where((mask == cv2.GC_FGD) | (mask == cv2.GC_PR_FGD), 1, 0).astype('uint8')

    # Multiply the original image with the mask to black out the background
    segmented = image * mask2[:, :, np.newaxis]
    return segmented

In [3]:
def extract_color_features(image, bins=32):
    """
    Describe the colour distribution of an image with per-channel HSV histograms.

    Parameters:
        image (numpy.ndarray): Input BGR image.
        bins (int): Number of bins per channel for the histogram.

    Returns:
        features (numpy.ndarray): 1-D vector of length ``3 * bins`` holding the
            normalized H, S and V histograms, in that order.
    """
    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

    # (channel index, exclusive upper bound): hue is binned over [0, 180),
    # saturation and value over [0, 256).
    channel_ranges = ((0, 180), (1, 256), (2, 256))

    normalized_histograms = []
    for channel, upper_bound in channel_ranges:
        histogram = cv2.calcHist([hsv_image], [channel], None, [bins], [0, upper_bound])
        # Normalize in place, then flatten the (bins, 1) column to 1-D
        normalized_histograms.append(cv2.normalize(histogram, histogram).flatten())

    return np.concatenate(normalized_histograms)

In [4]:
def extract_texture_features(image, radius=1, n_points=8):
    """
    Extract texture features using Local Binary Patterns (LBP).

    Parameters:
        image (numpy.ndarray): Input BGR image.
        radius (int): Radius for LBP.
        n_points (int): Number of points to consider for LBP.

    Returns:
        hist (numpy.ndarray): Normalized histogram of LBP values. Its length is
            always ``n_points + 2``, independent of the image content.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Compute LBP using the 'uniform' method
    lbp = local_binary_pattern(gray, n_points, radius, method='uniform')

    # Uniform LBP codes lie in 0..n_points+1 (n_points+1 uniform patterns plus
    # one shared code for all non-uniform ones). Use that fixed bin count
    # rather than lbp.max() + 1, which shrinks the vector for images that
    # happen to lack the highest code and breaks feature alignment.
    n_bins = n_points + 2
    hist, _ = np.histogram(lbp, bins=n_bins, range=(0, n_bins), density=True)

    return hist

In [5]:
def extract_shape_features(image):
    """
    Describe the outline of the largest object in the image with Hu Moments.

    The image is converted to grayscale, binarized with Otsu's threshold, and
    the external contour with the largest area is used for the moments.

    Parameters:
        image (numpy.ndarray): Input BGR image.

    Returns:
        huMoments (numpy.ndarray): Hu Moments vector (7 values) as shape
            descriptors, or seven zeros when no contour is found.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    otsu_flags = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    _, binary = cv2.threshold(grayscale, 0, 255, otsu_flags)

    # Only outer contours are needed for the shape description
    found, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if not found:
        # Nothing to describe: fall back to a zero vector
        return np.zeros(7)

    largest = max(found, key=cv2.contourArea)
    return cv2.HuMoments(cv2.moments(largest)).flatten()

In [11]:
def process_dataset(main_folder):
    """
    Process the dataset to extract features from gemstone images and create a structured DataFrame.

    Every subfolder of ``main_folder`` is treated as one gemstone class. Each
    readable image in it is segmented and turned into a single feature vector
    (color, then texture, then shape features). Folders and files are visited
    in sorted order so the row order is reproducible.

    Parameters:
        main_folder (str): Path to the main folder containing gemstone subfolders.

    Returns:
        df (pandas.DataFrame): One row per image with float columns
            ``feature_0`` .. ``feature_{n-1}`` followed by a string
            ``class_label`` column. If no image could be read, an empty
            DataFrame containing only ``class_label`` is returned.

    Raises:
        ValueError: If an image yields a feature vector whose length differs
            from the previous ones, since the rows could not be aligned.
    """
    feature_rows = []
    labels = []
    expected_length = None

    for gemstone_class in sorted(os.listdir(main_folder)):
        class_folder = os.path.join(main_folder, gemstone_class)
        if not os.path.isdir(class_folder):
            continue  # Ignore stray files next to the class folders

        for img_name in sorted(os.listdir(class_folder)):
            img_path = os.path.join(class_folder, img_name)
            image = cv2.imread(img_path)
            if image is None:
                continue  # Skip invalid images

            # 1. Segment the gemstone from the background
            segmented = segment_gemstone(image)

            # 2. Extract features and combine them into one numeric vector
            combined_features = np.concatenate([
                np.ravel(extract_color_features(segmented)),
                np.ravel(extract_texture_features(segmented)),
                np.ravel(extract_shape_features(segmented)),
            ]).astype(np.float64)

            # Fail loudly instead of building a table with misaligned columns
            if expected_length is None:
                expected_length = combined_features.size
            elif combined_features.size != expected_length:
                raise ValueError(
                    f"Feature vector for {img_path!r} has length "
                    f"{combined_features.size}, expected {expected_length}."
                )

            # Keep the label apart from the numbers: concatenating a string
            # onto the array would turn every feature value into a string.
            feature_rows.append(combined_features)
            labels.append(str(gemstone_class))

    if not feature_rows:
        # No readable image: return an empty table rather than failing
        return pd.DataFrame({"class_label": pd.Series(dtype=str)})

    feature_columns = [f"feature_{i}" for i in range(expected_length)]
    df = pd.DataFrame(np.vstack(feature_rows), columns=feature_columns)
    df["class_label"] = labels  # Label stays the last column

    return df


In [13]:
# Root folder of the dataset; process_dataset expects one subfolder per gemstone class.
main_folder_path = "GemstoneDataset"
# Segment every image and collect its features and class label in one DataFrame.
df_features = process_dataset(main_folder_path)

In [18]:
# Preview the first rows of the extracted feature table.
df_features.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_104,feature_105,feature_106,feature_107,feature_108,feature_109,feature_110,feature_111,feature_112,class_label
0,0.909498691558838,0.0,0.0,0.0,0.0,0.0,0.0,3.738177838386037e-05,7.476355676772073e-05,0.0001495271135354,...,0.5139358024691358,0.0512395061728395,0.1674226530417223,0.0001654208993648,0.0006351571420113,4.519627279994888e-06,-2.3668302379902255e-10,-5.49186103496597e-08,-5.1190653886492456e-11,Alexandrite
1,0.7840370535850525,0.0,0.0,0.0026001010555773,0.0,0.0115938931703567,0.0,0.0,0.0052428268827497,0.0,...,0.407964632697281,0.0918082057733976,0.1652551736720344,0.0007840294596064,1.1932710061146096e-06,1.7806392684348444e-08,2.2967742554026386e-15,-1.3212206242342023e-10,1.2090632442722517e-15,Alexandrite
2,0.9700371026992798,0.0007631977787241,0.0011447966098785,0.0011447966098785,0.0011447966098785,0.0161797925829887,0.0007250378839671,0.0002798391797114,0.0028111117426306,0.000114479662443,...,0.678786469449958,0.0593892483120032,0.1619161159080352,1.680865590947662e-05,0.0002004495086442,2.3021385811510957e-07,-1.5516703914853081e-12,-3.9284238084803824e-10,1.949297515859703e-13,Alexandrite
3,0.8103930354118347,0.0009468788630329,0.0004508947022259,0.0008566999458707,0.0006763420533388,0.0032915312331169,0.0013977735070511,0.0007214315119199,0.0026602786965668,0.0019388472428545,...,0.4219587751697843,0.0588188569839946,0.2330466557645847,0.018589248145221,0.0009348944017801,0.0001165659369013,-3.661310561260433e-08,-1.5695415451905667e-05,-1.1841290328958834e-08,Alexandrite
4,0.953279435634613,0.0007722325972281,0.0015235940227285,0.0014401094522327,0.0015862075379118,0.0106025449931621,0.002066243905574,0.0006365701556205,0.0036524515599012,0.0009496373822912,...,0.618849543342413,0.0433584361480531,0.1753019983651038,0.0009934874675716,0.0013222746047281,4.42298325524319e-05,1.0669171134476236e-08,1.394015616758353e-06,-7.61396331963071e-10,Alexandrite


In [85]:
# (rows, columns) of the feature table: one row per image, features plus the label column.
df_features.shape

(4400, 114)

In [20]:
# Persist the feature table as CSV. The target needs its own file name:
# "GemstoneDataset" is the image directory passed to process_dataset, so
# writing to that exact path fails when the directory exists.
df_features.to_csv('GemstoneDataset.csv', index=False)