In [1]:
import numpy as np
import os
import pandas as pd
from PIL import Image
import cv2
from skimage.feature import local_binary_pattern

import tensorflow as tf

# Source folders that the preprocessing cell scans for .jpg images
train_path, test_path = "training data", "test data"

# Destination folders the preprocessed images are written to
train_output, test_output = "processed training", "processed test"

# (width, height) handed to PIL's resize for every image
image_size = (500, 500)

In [2]:
# Function to preprocess a single image
def preprocess_image(path, size):
    """Load an image from disk and return it as a normalised float array.

    Parameters
    ----------
    path : str
        Location of the image file to open.
    size : tuple of int
        Target ``(width, height)`` passed to ``PIL.Image.Image.resize``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(height, width, 3)`` with values scaled to ``[0, 1]``.
    """
    # The context manager guarantees the file handle is released even if
    # decoding or resizing raises.
    with Image.open(path) as img:
        # Force three RGB channels: grayscale or CMYK JPEGs would otherwise
        # produce arrays that the RGB->gray conversion downstream rejects.
        img = img.convert("RGB").resize(size)
        img_array = np.array(img)  # image to numpy array
    # scale 8-bit pixel values to the [0, 1] range
    return img_array / 255.0

# Random augmentation pipeline (a Keras Sequential model applied to image batches)
AUGMENTATION_SEED = 42  # fixed seed so reruns can reproduce the same random transforms
data_augmentation = tf.keras.Sequential([
    # mirror each image left-right and/or top-bottom at random
    tf.keras.layers.RandomFlip("horizontal_and_vertical", seed=AUGMENTATION_SEED),
    # random zoom with height and width factors of 0.2 each
    tf.keras.layers.RandomZoom(0.2, 0.2, seed=AUGMENTATION_SEED),
])

# Function for image enhancement
def enhance_image(img):
    """Turn an RGB image into a contrast-equalised, smoothed grayscale image.

    ``img`` is handed straight to OpenCV's RGB->gray conversion; the callers
    in this notebook pass a ``uint8`` array. The single-channel result of
    grayscale conversion, histogram equalisation and Gaussian blurring is
    returned.
    """
    # Innermost call first: grayscale -> histogram equalisation -> 5x5
    # Gaussian blur for noise reduction (sigma argument 0).
    return cv2.GaussianBlur(
        cv2.equalizeHist(cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)),
        (5, 5),
        0,
    )


def process_directory(src_dir, dst_dir, augment):
    """Preprocess every ``.jpg`` in ``src_dir`` and save the result in ``dst_dir``.

    Each image is resized and normalised by ``preprocess_image``, optionally
    passed through ``data_augmentation``, converted back to ``uint8``,
    enhanced by ``enhance_image`` and saved under its original file name.

    Parameters
    ----------
    src_dir : str
        Folder containing the input images.
    dst_dir : str
        Folder the processed images are written to (created if missing).
    augment : bool
        Apply the random augmentation pipeline when True.
    """
    # Image.save does not create missing folders, so make sure the target exists
    os.makedirs(dst_dir, exist_ok=True)

    # sorted() gives a stable processing order; os.listdir order is arbitrary
    for file in sorted(os.listdir(src_dir)):
        if not file.endswith('.jpg'):
            continue

        processed_image = preprocess_image(os.path.join(src_dir, file), image_size)

        if augment:
            # expanding dimensions to get a batch size of 1
            image_batch = np.expand_dims(processed_image, axis=0)
            augmented_batch = data_augmentation(image_batch, training=True)
            # squeezing the batch axis away again
            processed_image = tf.squeeze(augmented_batch, axis=0).numpy()

        # back to 8-bit: round first (astype alone truncates, which can shift
        # a pixel value down by one) and clip to the valid range
        image_uint8 = np.clip(np.rint(processed_image * 255), 0, 255).astype('uint8')

        # enhance image and save it under the same file name
        enhanced_img = enhance_image(image_uint8)
        Image.fromarray(enhanced_img).save(os.path.join(dst_dir, file))


# processing training images (with random augmentation)
process_directory(train_path, train_output, augment=True)

# processing test images: no random flips/zooms, so the evaluation data stays
# deterministic and representative of the original images
process_directory(test_path, test_output, augment=False)

In [3]:
# function for segmenting image
def segment_image(img_path):
    """Binarise an image with Otsu's threshold and compute its Sobel edge magnitude.

    Parameters
    ----------
    img_path : str
        Path of the image file to read.

    Returns
    -------
    bin_img : numpy.ndarray
        Inverted binary mask (values 0 or 255) produced by Otsu thresholding.
    sobel_combined : numpy.ndarray
        Float64 gradient magnitude of ``bin_img``.

    Raises
    ------
    OSError
        If OpenCV cannot read or decode ``img_path``.
    """
    img = cv2.imread(img_path)
    # cv2.imread signals failure by returning None instead of raising
    if img is None:
        raise OSError(f"could not read image: {img_path!r}")

    # cv2.imread returns channels in BGR order, so BGR2GRAY is the matching conversion
    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # with THRESH_OTSU the threshold is chosen automatically; the 0 is a placeholder
    _, bin_img = cv2.threshold(gray_img, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # horizontal and vertical 3x3 Sobel derivatives of the mask, combined into a magnitude
    sobel_x = cv2.Sobel(bin_img, cv2.CV_64F, 1, 0, ksize=3)
    sobel_y = cv2.Sobel(bin_img, cv2.CV_64F, 0, 1, ksize=3)
    sobel_combined = cv2.magnitude(sobel_x, sobel_y)

    return bin_img, sobel_combined

# function for extracting features
def extract_features(bin_img, edges):
    """Build the feature vector for one segmented image.

    Layout of the returned 1-D float array: 9 LBP histogram counts, the
    Hu moments of ``bin_img``, then the mean and the standard deviation
    of ``edges``.
    """
    # Feature 1: Local Binary Pattern histogram (raw counts, not normalised)
    lbp_codes = local_binary_pattern(bin_img, P=8, R=1, method="uniform")
    # NOTE(review): edges 0..9 give 9 bins and np.histogram closes the last
    # bin on both sides, so codes 8 and 9 are counted together in the final
    # bin -- confirm this merge is intended before changing the bin count
    # (the DataFrame columns assume 9 LBP values).
    bin_edges = np.arange(0, 10)
    lbp_hist = np.histogram(lbp_codes.ravel(), bins=bin_edges, range=(0, 9))[0].astype("float")

    # Feature 2: Hu Moments of the binary mask
    hu_moments = cv2.HuMoments(cv2.moments(bin_img)).flatten()

    # Features 3 and 4: mean and standard deviation of the Sobel edge map
    edge_stats = [np.mean(edges), np.std(edges)]

    # Concatenate everything into a single flat feature vector
    return np.hstack([lbp_hist, hu_moments, *edge_stats])

def collect_features(image_dir):
    """Segment every ``.jpg`` in ``image_dir`` and return its feature vectors.

    Parameters
    ----------
    image_dir : str
        Folder containing the preprocessed images.

    Returns
    -------
    list of numpy.ndarray
        One feature vector per ``.jpg`` file, in sorted file-name order.
    """
    feature_rows = []
    # The rows carry no file name, so a stable (sorted) order is the only way
    # to map a row back to its image; os.listdir order is arbitrary.
    for num, file in enumerate(sorted(os.listdir(image_dir)), start=1):
        if file.endswith('.jpg'):
            bin_img, edge = segment_image(os.path.join(image_dir, file))
            feature_rows.append(extract_features(bin_img, edge))
        # progress report every 200 directory entries
        if num % 200 == 0:
            print(num)
    return feature_rows


training_features_lst = collect_features(train_output)
test_features_lst = collect_features(test_output)

200
400
600
800
1000
1200
1400
1600
1800
2000
2200
2400
200
400
600
800
1000


In [4]:
# Name every entry of the feature vector:
# 9 LBP histogram bins, 7 Hu moments, then the two Sobel edge statistics
columns = (
    [f'lbp_{i}' for i in range(9)]
    + [f'hu_moment{i}' for i in range(7)]
    + ['mean_edge', 'std_dev_edge']
)

# One DataFrame row per processed image, for each of the two feature lists
train_df, test_df = (
    pd.DataFrame(feature_rows, columns=columns)
    for feature_rows in (training_features_lst, test_features_lst)
)

train_df

Unnamed: 0,lbp_0,lbp_1,lbp_2,lbp_3,lbp_4,lbp_5,lbp_6,lbp_7,lbp_8,hu_moment0,hu_moment1,hu_moment2,hu_moment3,hu_moment4,hu_moment5,hu_moment6,mean_edge,std_dev_edge
0,3.0,52.0,0.0,781.0,0.0,1325.0,0.0,839.0,247000.0,0.000669,3.201483e-08,2.348844e-11,3.784496e-13,-8.891085e-25,-4.785441e-17,6.947177e-25,18.834079,129.656612
1,20.0,326.0,0.0,3456.0,0.0,6896.0,0.0,3339.0,235963.0,0.000849,2.564127e-09,1.140597e-10,7.953073e-14,-1.925967e-26,3.498419e-18,2.387592e-25,89.703222,272.274655
2,10.0,63.0,0.0,1007.0,0.0,1647.0,0.0,985.0,246288.0,0.000648,6.512954e-10,4.247212e-12,6.382182e-14,-3.139389e-26,-3.077280e-19,-1.088722e-26,24.528624,148.309928
3,6.0,107.0,0.0,1308.0,0.0,4009.0,0.0,1230.0,243340.0,0.001693,1.591157e-08,1.662653e-12,2.445360e-11,1.556692e-22,2.422890e-15,8.922484e-24,31.933168,168.110947
4,58.0,549.0,0.0,3718.0,0.0,7408.0,0.0,3447.0,234820.0,0.001088,4.496830e-09,5.357170e-11,7.049863e-11,-2.607757e-21,-3.656995e-15,-3.459794e-21,97.202895,279.566519
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2589,30.0,449.0,0.0,5908.0,0.0,10749.0,0.0,5861.0,227003.0,0.001272,5.088607e-08,1.495056e-11,4.185765e-12,1.176664e-23,-5.479233e-16,-3.095125e-23,146.116914,335.373611
2590,78.0,941.0,0.0,8493.0,0.0,13520.0,0.0,8076.0,218892.0,0.001306,4.463930e-09,2.718837e-11,1.971129e-11,3.514490e-22,6.216895e-16,2.910437e-22,203.354691,377.371966
2591,20.0,265.0,0.0,6580.0,0.0,14372.0,0.0,7053.0,221710.0,0.001453,2.697392e-08,1.182427e-11,3.125650e-11,5.458397e-22,4.959958e-15,2.512614e-22,177.274429,364.957789
2592,56.0,653.0,0.0,6170.0,0.0,10470.0,0.0,5679.0,226972.0,0.001491,6.998716e-09,2.124743e-11,2.449372e-11,-3.621635e-22,-2.343994e-16,-4.255170e-22,150.670265,338.024983


In [5]:
# saving features to CSV (index=False leaves the row index out of the file)
for frame, csv_name in (
    (train_df, "training_features.csv"),
    (test_df, "test_features.csv"),
):
    frame.to_csv(csv_name, index=False)