In [2]:
import os
import shutil
import cv2
import pandas as pd
import numpy as np
import h5py
import seaborn as sns
import mahotas as mt

from matplotlib import pyplot as plt

# Split Dataset to Train & Testing (Run 2 times) change fromName with augmented_1 & augmented 2

In [None]:
import os
import shutil

fromName = 'augmented_2'
toName = 'augmented_2_split'
classes = os.listdir('./' + fromName)
names = classes
for name in names:
    if not os.path.exists("./" + toName + "/train/" + name):
        os.makedirs("./" + toName + "/train/" + name)
        os.makedirs("./" + toName + "/test/" + name)
ii = 0
for classe in classes:
    # Construct the directory path for the current class
    class_dir = './' + fromName + '/' + classe

    # Check if the path is a directory
    if not os.path.isdir(class_dir):
        # If not, skip to the next class
        continue

    tempDic = os.listdir(class_dir)
    tempLength = int(0.8 * len(tempDic))
    src = './' + fromName + '/' + classe
    dist1 = './' + toName + '/train/' + classe
    dist2 = './' + toName + '/test/' + classe
    ii += 1
    for i in range(len(tempDic)):
        if tempDic[i] == '.DS_Store':
            continue  # Skip .DS_Store files
        if i < tempLength:
            shutil.copy(src + '/' + tempDic[i], dist1 + '/' + tempDic[i])
        else:
            shutil.copy(src + '/' + tempDic[i], dist2 + '/' + tempDic[i])


# Load Dataset

In [4]:
PATH_DATASET_AUG_1_TRAIN = "./augmented_1_split/train"
PATH_DATASET_AUG_1_TEST = "./augmented_1_split/test"
PATH_DATASET_AUG_2_TRAIN = "./augmented_2_split/train"
PATH_DATASET_AUG_2_TEST = "./augmented_2_split/test"

# BGR to RGB Conversion

In [6]:
# Converting each image to RGB from BGR format

def rgb_bgr(image):
    rgb_img = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    return rgb_img

# RGB to HSV(Hue Saturation Value) Conversion

In [7]:
# Conversion to HSV image format from RGB

def bgr_hsv(rgb_img):
    hsv_img = cv2.cvtColor(rgb_img, cv2.COLOR_RGB2HSV)
    return hsv_img

# Image segmentation

In [8]:
# for extraction of green and brown color

def img_segmentation(rgb_img,hsv_img):
    lower_green = np.array([25,0,20])
    upper_green = np.array([100,255,255])
    healthy_mask = cv2.inRange(hsv_img, lower_green, upper_green)
    result = cv2.bitwise_and(rgb_img,rgb_img, mask=healthy_mask)
    lower_brown = np.array([10,0,10])
    upper_brown = np.array([30,255,255])
    disease_mask = cv2.inRange(hsv_img, lower_brown, upper_brown)
    disease_result = cv2.bitwise_and(rgb_img, rgb_img, mask=disease_mask)
    final_mask = healthy_mask + disease_mask
    final_result = cv2.bitwise_and(rgb_img, rgb_img, mask=final_mask)
    return final_result

# Feature Descriptor:

### 1. Hu Moments

In [9]:
# feature-descriptor-1: Hu Moments
def fd_hu_moments(image):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    feature = cv2.HuMoments(cv2.moments(image)).flatten()
    return feature

### 2. Haralick Texture

In [10]:
# feature-descriptor-2: Haralick Texture
def fd_haralick(image):
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    haralick = mt.features.haralick(gray).mean(axis=0)
    return haralick

### 3. Histogram

In [11]:
# feature-descriptor-3: Color Histogram
def fd_histogram(image, mask=None):
    image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    hist  = cv2.calcHist([image], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
    cv2.normalize(hist, hist)
    return hist.flatten()

# Loading up the training dataset

In [22]:
# get the training labels
train_labels_1 = []
train_temp_1 = os.listdir(PATH_DATASET_AUG_1_TRAIN)
for i in range(len(train_temp_1)):
    if train_temp_1[i] == '.DS_Store':
        continue  # Skip .DS_Store files
    train_labels_1.append(train_temp_1[i])
    
train_labels_2 = []
train_temp_2 = os.listdir(PATH_DATASET_AUG_2_TRAIN)
for i in range(len(train_temp_2)):
    if train_temp_2[i] == '.DS_Store':
        continue  # Skip .DS_Store files
    train_labels_2.append(train_temp_1[i])
    
# sort the training labels
train_labels_1.sort()
print(train_labels_1)

train_labels_2.sort()
print(train_labels_2)

['Apple___Apple_scab', 'Apple___Black_rot', 'Apple___Cedar_apple_rust', 'Apple___healthy', 'Blueberry___healthy', 'Cherry_(including_sour)___Powdery_mildew', 'Cherry_(including_sour)___healthy', 'Corn_(maize)___Cercospora_leaf_spot Gray_leaf_spot', 'Corn_(maize)___Common_rust_', 'Corn_(maize)___Northern_Leaf_Blight', 'Corn_(maize)___healthy', 'Grape___Black_rot', 'Grape___Esca_(Black_Measles)', 'Grape___Leaf_blight_(Isariopsis_Leaf_Spot)', 'Grape___healthy', 'Orange___Haunglongbing_(Citrus_greening)', 'Peach___Bacterial_spot', 'Peach___healthy', 'Pepper,_bell___Bacterial_spot', 'Pepper,_bell___healthy', 'Potato___Early_blight', 'Potato___Late_blight', 'Potato___healthy', 'Raspberry___healthy', 'Soybean___healthy', 'Squash___Powdery_mildew', 'Strawberry___Leaf_scorch', 'Strawberry___healthy', 'Tomato___Bacterial_spot', 'Tomato___Early_blight', 'Tomato___Late_blight', 'Tomato___Leaf_Mold', 'Tomato___Septoria_leaf_spot', 'Tomato___Spider_mites Two-spotted_spider_mite', 'Tomato___Target_Sp

# Generating the Features and Label Embeddings from the dataset

In [28]:
fixed_size = (500, 500)

# Combine both label sets and prepare separate storage
train_label_sets = [train_labels_1, train_labels_2]

# Separate storage for labels and global features 
# Augmented 1
labels_1 = []
global_features_1 = []

# Augmented 2
labels_2 = []
global_features_2 = []

# Loop over both training label sets
for i, train_labels in enumerate(train_label_sets):
    print(f"[STATUS] Starting Global Feature Extraction for label set {i+1}...")

    # Use separate storage based on index
    if i == 0:
        current_sets = train_labels_1
        current_labels = labels_1
        current_global_features = global_features_1
        dataset_path = PATH_DATASET_AUG_1_TRAIN
    else:
        current_sets = train_labels_2
        current_labels = labels_2
        current_global_features = global_features_2
        dataset_path = PATH_DATASET_AUG_2_TRAIN

    # Loop through the training data sub-folders
    for training_name in train_labels:
        # Join the training data path and each species training folder
        img_dir_path = os.path.join(dataset_path, training_name)

        # Get the current training label
        current_label = training_name

        # Loop over the images in each sub-folder
        for img in os.listdir(img_dir_path):
            # Get the image file name
            file = os.path.join(img_dir_path, img)

            # Read the image and resize it to a fixed size
            image = cv2.imread(file)
            image = cv2.resize(image, fixed_size)

            # Running Function Bit By Bit
            RGB_BGR = rgb_bgr(image)
            BGR_HSV = bgr_hsv(RGB_BGR)
            IMG_SEGMENT = img_segmentation(RGB_BGR, BGR_HSV)

            # Call for Global Feature Descriptors
            fv_hu_moments = fd_hu_moments(IMG_SEGMENT)
            fv_haralick = fd_haralick(IMG_SEGMENT)
            fv_histogram = fd_histogram(IMG_SEGMENT)

            # Concatenate global features
            global_feature = np.hstack([fv_histogram, fv_haralick, fv_hu_moments])

            # Update the list of labels and feature vectors
            current_labels.append(current_label)
            current_global_features.append(global_feature)

        print("[STATUS] Processed folder: {}".format(current_label))

print("[STATUS] Completed Global Feature Extraction for both label sets...")

# Final Results
print("Label set 1 has {} labels and {} global features.".format(len(labels_1), len(global_features_1)))
print("Label set 2 has {} labels and {} global features.".format(len(labels_2), len(global_features_2)))


[STATUS] Starting Global Feature Extraction for label set 1...
[STATUS] Processed folder: Apple___Apple_scab
[STATUS] Processed folder: Apple___Black_rot
[STATUS] Processed folder: Apple___Cedar_apple_rust
[STATUS] Processed folder: Apple___healthy
[STATUS] Processed folder: Blueberry___healthy
[STATUS] Processed folder: Cherry_(including_sour)___Powdery_mildew
[STATUS] Processed folder: Cherry_(including_sour)___healthy
[STATUS] Processed folder: Corn_(maize)___Cercospora_leaf_spot Gray_leaf_spot
[STATUS] Processed folder: Corn_(maize)___Common_rust_
[STATUS] Processed folder: Corn_(maize)___Northern_Leaf_Blight
[STATUS] Processed folder: Corn_(maize)___healthy
[STATUS] Processed folder: Grape___Black_rot
[STATUS] Processed folder: Grape___Esca_(Black_Measles)
[STATUS] Processed folder: Grape___Leaf_blight_(Isariopsis_Leaf_Spot)
[STATUS] Processed folder: Grape___healthy
[STATUS] Processed folder: Orange___Haunglongbing_(Citrus_greening)
[STATUS] Processed folder: Peach___Bacterial_sp

In [29]:
# get the overall feature vector size
print("[STATUS] feature Augmented 1 vector size {}".format(np.array(global_features_1).shape))
print("[STATUS] feature Augmented 2 vector size {}".format(np.array(global_features_2).shape))

[STATUS] feature Augmented 1 vector size (31539, 532)
[STATUS] feature Augmented 2 vector size (31539, 532)


In [30]:
# get the overall training label size
# print(labels)
print("[STATUS] training Labels Augmented 1 {}".format(np.array(labels_1).shape))
print("[STATUS] training Labels Augmented 2 {}".format(np.array(labels_2).shape))

[STATUS] training Labels Augmented 1 (31539,)
[STATUS] training Labels Augmented 2 (31539,)


# Encoding the labels

| Label                                  | Encoded Value |
|----------------------------------------|---------------|
| Apple___Apple_scab                     | 0             |
| Apple___Black_rot                      | 1             |
| Apple___Cedar_apple_rust               | 2             |
| Apple___healthy                        | 3             |
| Blueberry___healthy                    | 4             |
| Cherry_(including_sour)___healthy      | 5             |
| Cherry_(including_sour)___Powdery_mildew | 6           |
| Corn_(maize)___Cercospora_leaf_spot_Gray_leaf_spot | 7 |
| Corn_(maize)___Common_rust_            | 8             |
| Corn_(maize)___healthy                 | 9             |
| Corn_(maize)___Northern_Leaf_Blight    | 10            |
| Grape___Black_rot                      | 11            |
| Grape___Esca_(Black_Measles)           | 12            |
| Grape___healthy                        | 13            |
| Grape___Leaf_blight_(Isariopsis_Leaf_Spot) | 14         |
| Orange___Haunglongbing_(Citrus_greening) | 15          |
| Peach___Bacterial_spot                 | 16            |
| Peach___healthy                        | 17            |
| Pepper,_bell___Bacterial_spot          | 18            |
| Pepper,_bell___healthy                 | 19            |
| Potato___Early_blight                  | 20            |
| Potato___healthy                       | 21            |
| Potato___Late_blight                   | 22            |
| Raspberry___healthy                    | 23            |
| Soybean___healthy                      | 24            |
| Squash___Powdery_mildew                | 25            |
| Strawberry___healthy                   | 26            |
| Strawberry___Leaf_scorch               | 27            |
| Tomato___Bacterial_spot                | 28            |
| Tomato___Early_blight                  | 29            |
| Tomato___healthy                       | 30            |
| Tomato___Late_blight                   | 31            |
| Tomato___Leaf_Mold                     | 32            |
| Tomato___Septoria_leaf_spot            | 33            |
| Tomato___Spider_mites_Two-spotted_spider_mite | 34     |
| Tomato___Target_Spot                   | 35            |
| Tomato___Tomato_mosaic_virus           | 36            |
| Tomato___Tomato_Yellow_Leaf_Curl_Virus | 37            |


In [31]:
from sklearn.preprocessing import LabelEncoder
label_sets = [labels_1, labels_2]
# encode the target labels
for i in enumerate(label_sets):
    print(f"[STATUS] Starting Encode for label set {i+1}...")

    # Use separate storage based on index
    if i == 0:
        targetNames_1 = np.unique(labels_1)
        le_1 = LabelEncoder()
        target_1 = le_1.fit_transform(labels_1)
        print(targetNames_1)
        print("[STATUS] training labels 1 encoded...")
    else:
        targetNames_2 = np.unique(labels_2)
        le_2 = LabelEncoder()
        target_2 = le_2.fit_transform(labels_2)
        print(targetNames_2)
        print("[STATUS] training labels 2 encoded...")


TypeError: can only concatenate tuple (not "int") to tuple

# Feature Scaling using MinMaxScaler

In [None]:
# scale features in the range (0-1)
from sklearn.preprocessing import MinMaxScaler

global_features_sets = [global_features_1, global_features_2]
for i in enumerate(train_label_sets):
    print(f"[STATUS] Starting feature vector Normalization for global feature {i+1}...")

    # Use separate storage based on index
    if i == 0:
        scaler_1 = MinMaxScaler(feature_range=(0, 1))
        rescaled_features_1 = scaler_1.fit_transform(global_features_1)
        print("[STATUS] feature vector 1 normalized...")
        rescaled_features_1
    else:
        scaler_2 = MinMaxScaler(feature_range=(0, 1))
        rescaled_features_2 = scaler_2.fit_transform(global_features_2)
        print("[STATUS] feature vector 2 normalized...")
        rescaled_features_2

In [None]:
# Augmented 1
print("[STATUS] target labels 1: {}".format(target_1))
print("[STATUS] target labels 1 shape: {}".format(target_1.shape))

# Augmented 2
print("[STATUS] target labels 2: {}".format(target_2))
print("[STATUS] target labels 2 shape: {}".format(target_2.shape))

# Saving the Features and Labels Embeddings in h5py format 

In [None]:
# Augmented 1
# save the feature vector 1 using HDF5
h5f_data = h5py.File('../embeddings/features/features_1.h5', "w")
h5f_data.create_dataset("dataset_1", data=np.array(rescaled_features_1))

# save the label vector 1 using HDF5
h5f_label = h5py.File('../embeddings/labels/labels_1.h5', "w")
h5f_label.create_dataset("dataset_1", data=np.array(target_1))

# Augmented 2
# save the feature vector 2 using HDF5
h5f_data = h5py.File('../embeddings/features/features_2.h5', "w")
h5f_data.create_dataset("dataset_2", data=np.array(rescaled_features_2))

# save the label vector 2 using HDF5
h5f_label = h5py.File('../embeddings/labels/labels_2.h5', "w")
h5f_label.create_dataset("dataset_2", data=np.array(target_2))

In [None]:
h5f_data.close()
h5f_label.close()

# Evaluating the different models and calculating the accuracy


## 1. Loading the Features and Labels Embeddings from the h5py format


### Augmented 1

In [None]:
# training
import h5py
import numpy as np
import os
import cv2
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.svm import SVC


num_trees = 100
test_size = 0.20
seed = 9
scoring = "accuracy"

# get the training labels
train_labels_1 = os.listdir(PATH_DATASET_AUG_1_TRAIN)


# sort the training labels
train_labels_1.sort()

# create all the machine learning models
models = []
models.append(("SVM", SVC(random_state=seed)))

# variables to hold the results and names
# Augmented 1
results_1 = []
names_1 = []

# import the feature vector and trained labels
# Augemented 1
h5f_data_1 = h5py.File('../embeddings/features/features_1.h5', "r")
h5f_label_1 = h5py.File('../embeddings/labels/labels_1.h5', "r")


global_features_string_1 = h5f_data_1["dataset_1"]
global_labels_string_1 = h5f_label_1["dataset_1"]

global_features_1 = np.array(global_features_string_1)
global_labels_1 = np.array(global_labels_string_1)

h5f_data_1.close()
h5f_label_1.close()

# verify the shape of the feature vector and labels
print("[STATUS] features shape: {}".format(global_features_1.shape))
print("[STATUS] labels shape: {}".format(global_labels_1.shape))

print("[STATUS] training started...")
print(global_labels_1, len(global_labels_1), len(global_features_1))

### Augmented 2

In [None]:
# get the training labels
train_labels_2 = os.listdir(PATH_DATASET_AUG_2_TRAIN)


# sort the training labels
train_labels_2.sort()

# create all the machine learning models
models = []
models.append(("SVM", SVC(random_state=seed)))

# variables to hold the results and names
# Augmented 1
results_2 = []
names_2 = []

# import the feature vector and trained labels
# Augemented 1
h5f_data_2 = h5py.File('../embeddings/features/features_2.h5', "r")
h5f_label_2 = h5py.File('../embeddings/labels/labels_2.h5', "r")


global_features_string_2 = h5f_data_2["dataset_2"]
global_labels_string_2 = h5f_label_2["dataset_2"]

global_features_2 = np.array(global_features_string_2)
global_labels_2 = np.array(global_labels_string_2)

h5f_data_2.close()
h5f_label_2.close()

# verify the shape of the feature vector and labels
print("[STATUS] features shape: {}".format(global_features_2.shape))
print("[STATUS] labels shape: {}".format(global_labels_2.shape))

print("[STATUS] training started...")
print(global_labels_2, len(global_labels_2), len(global_features_2))

### Spliting Dataset

#### Augmented 1

In [None]:
# split the training and testing data
(
    trainDataGlobal_1,
    testDataGlobal_1,
    trainLabelsGlobal_1,
    testLabelsGlobal_1,
) = train_test_split(np.array(global_features_1), np.array(global_labels_1), test_size=test_size, random_state=seed)

print("[STATUS] splitted train and test data...")
print("Train data  : {}".format(trainDataGlobal_1.shape))
print("Test data   : {}".format(testDataGlobal_1.shape))

#### Augmented 2

In [None]:
# split the training and testing data
(
    trainDataGlobal_2,
    testDataGlobal_2,
    trainLabelsGlobal_2,
    testLabelsGlobal_2,
) = train_test_split(np.array(global_features_2), np.array(global_labels_2), test_size=test_size, random_state=seed)

print("[STATUS] splitted train and test data...")
print("Train data  : {}".format(trainDataGlobal_2.shape))
print("Test data   : {}".format(testDataGlobal_2.shape))

In [None]:
trainDataGlobal_1 
trainDataGlobal_2 

#### Model Evaluation using Augmented 1

In [None]:
# 10-fold cross validation
for name_1, model in models:
    kfold = KFold(n_splits=10)
    cv_results_1 = cross_val_score(
        model, trainDataGlobal_1, trainLabelsGlobal_1, cv=kfold, scoring=scoring
    )
    results_1.append(cv_results_1)
    names_1.append(name_1)
    msg = "%s: %f (%f)" % (name_1, cv_results_1.mean(), cv_results_1.std())
    print(msg)

#### Model Evaluation using Augmented 2

In [None]:
# 10-fold cross validation
for name_2, model in models:
    kfold = KFold(n_splits=10)
    cv_results_2 = cross_val_score(
        model, trainDataGlobal_2, trainLabelsGlobal_2, cv=kfold, scoring=scoring
    )
    results_2.append(cv_results_2)
    names_2.append(name_2)
    msg = "%s: %f (%f)" % (name_2, cv_results_2.mean(), cv_results_2.std())
    print(msg)

#### Plotting

In [None]:
# boxplot algorithm
fig = plt.figure()
fig.suptitle("SVM with Augmented 1")
ax = fig.add_subplot(2)
plt.boxplot(results_1)
ax.set_xticklabels(names_1)


# boxplot algorithm
fig = plt.figure()
fig.suptitle("SVM with Augmented 2")
ax = fig.add_subplot(2)
plt.boxplot(results_2)
ax.set_xticklabels(names_2)
plt.show()

## Verifying the accuracy for the SVM Model

In [None]:
svm = SVC(random_state=seed)

#### Augmented 1

In [None]:
svm.fit(trainDataGlobal_1, trainLabelsGlobal_1)
len(trainDataGlobal_1), len(trainLabelsGlobal_1)

In [None]:
y_predict_1 = svm.predict(testDataGlobal_1)
testLabelsGlobal_1

#### Augmented 2

In [None]:
svm.fit(trainDataGlobal_2, trainLabelsGlobal_2)
len(trainDataGlobal_2), len(trainLabelsGlobal_2)

In [None]:
y_predict_2 = svm.predict(testDataGlobal_2)
testLabelsGlobal_2

## Confusion Matrix

In [None]:
def plot_confusion_matrix(y_true, y_pred, labels, title):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(12, 8))
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix - {title}', fontsize=16)
    plt.show()
    
plot_confusion_matrix(testLabelsGlobal_1, y_predict_1, labels_1, 'Augmented 1')
plot_confusion_matrix(testLabelsGlobal_2, y_predict_2, labels_2, 'Augmented 2')

In [None]:
print(classification_report(testLabelsGlobal_1, y_predict_1))

print(classification_report(testLabelsGlobal_2, y_predict_2))

In [None]:
from sklearn.metrics import accuracy_score
print(f"Accuracy SVM augmented 1 : {accuracy_score(testLabelsGlobal_1, y_predict_1)}")

print(f"Accuracy SVM augmented 2 : {accuracy_score(testLabelsGlobal_2, y_predict_2)}")