In [1]:
import numpy as np
from datasets import load_dataset
from datasets import concatenate_datasets
from collections import Counter

# Load the tomato leaf disease dataset (it ships with "train" and "validation" splits).
ds = load_dataset("wellCh4n/tomato-leaf-disease-image")

# Remap the raw label ids onto a contiguous 0..K-1 range. The mapping is built
# from the labels of every split, so a label that only occurs outside "train"
# cannot raise a KeyError inside remap_label.
original_labels = sorted({label for split in ds.values() for label in split["label"]})
label_mapping = {old: new for new, old in enumerate(original_labels)}

def remap_label(example):
    """Replace the example's raw label with its contiguous index.

    Args:
        example: One dataset row; its "label" entry must be a key of label_mapping.

    Returns:
        The same row with "label" overwritten by the remapped index.
    """
    example["label"] = label_mapping[example["label"]]
    return example

ds = ds.map(remap_label)

# Merge the published train/validation splits so the data can be re-split from scratch.
ds_full = concatenate_datasets([ds["train"], ds["validation"]])

# Stratified 70/15/15 split: hold out 30% first, then cut the held-out part in half.
train_test = ds_full.train_test_split(
    test_size=0.3,
    stratify_by_column="label",
    seed=0,
)
dev_test = train_test["test"].train_test_split(
    test_size=0.5,
    stratify_by_column="label",
    seed=0,
)

# Final splits used by every later cell.
ds_split = dict(
    train=train_test["train"],
    validation=dev_test["train"],
    test=dev_test["test"],
)


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/224M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/56.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14218 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3569 [00:00<?, ? examples/s]

Map:   0%|          | 0/14218 [00:00<?, ? examples/s]

Map:   0%|          | 0/3569 [00:00<?, ? examples/s]

<h3>Displaying the Class Distribution</h3>

In [2]:
def print_distribution(name, dataset):
    """Print the per-class sample counts and percentages of one split.

    Args:
        name: Text used in the header line (e.g. "Train").
        dataset: Object whose ``["label"]`` item yields the class labels.
    """
    labels = dataset["label"]
    total = len(labels)
    per_class = Counter(labels)

    print(f"\n{name} set distribution:")
    # Classes are listed in ascending label order.
    for label in sorted(per_class):
        count = per_class[label]
        print(f"  Class {label}: {count} samples ({count/total * 100:.2f}%)")
    print(f"  Total: {total} samples")

# Report the class balance of each of the three splits.
for title, split_name in (("Train", "train"), ("Validation", "validation"), ("Test", "test")):
    print_distribution(title, ds_split[split_name])



Train set distribution:
  Class 0: 1114 samples (8.95%)
  Class 1: 666 samples (5.35%)
  Class 2: 983 samples (7.90%)
  Class 3: 1336 samples (10.73%)
  Class 4: 700 samples (5.62%)
  Class 5: 1489 samples (11.96%)
  Class 6: 1239 samples (9.95%)
  Class 7: 3750 samples (30.12%)
  Class 8: 1173 samples (9.42%)
  Total: 12450 samples

Validation set distribution:
  Class 0: 239 samples (8.96%)
  Class 1: 143 samples (5.36%)
  Class 2: 211 samples (7.91%)
  Class 3: 286 samples (10.72%)
  Class 4: 150 samples (5.62%)
  Class 5: 319 samples (11.96%)
  Class 6: 266 samples (9.97%)
  Class 7: 803 samples (30.10%)
  Class 8: 251 samples (9.41%)
  Total: 2668 samples

Test set distribution:
  Class 0: 238 samples (8.92%)
  Class 1: 143 samples (5.36%)
  Class 2: 210 samples (7.87%)
  Class 3: 287 samples (10.75%)
  Class 4: 150 samples (5.62%)
  Class 5: 319 samples (11.95%)
  Class 6: 266 samples (9.97%)
  Class 7: 804 samples (30.12%)
  Class 8: 252 samples (9.44%)
  Total: 2669 samples


<h3>Extracting features for Machine Learning</h3>

In [3]:
import cv2
from skimage.feature import hog, local_binary_pattern
from skimage.color import rgb2gray
from skimage.transform import resize

# Target (rows, cols) every image is resized to before feature extraction.
RESIZE_SHAPE = (128, 128)

# HOG parameters (passed straight to skimage.feature.hog in extract_features)
HOG_ORIENTATIONS = 9
HOG_PIXELS_PER_CELL = (8, 8)
HOG_CELLS_PER_BLOCK = (2, 2)

# LBP parameters (passed to skimage.feature.local_binary_pattern)
LBP_RADIUS = 3
# 8 sampling points per unit of radius, i.e. 24 points for radius 3.
LBP_POINTS = 8 * LBP_RADIUS

def _hog_descriptor(gray):
    """Return the HOG feature vector of a uint8 grayscale image."""
    return hog(
        gray,
        orientations=HOG_ORIENTATIONS,
        pixels_per_cell=HOG_PIXELS_PER_CELL,
        cells_per_block=HOG_CELLS_PER_BLOCK,
        visualize=False,
    )


def _lbp_histogram(gray):
    """Return the normalized histogram of uniform LBP codes of a grayscale image."""
    codes = local_binary_pattern(gray, LBP_POINTS, LBP_RADIUS, method="uniform")
    # LBP_POINTS + 3 integer edges -> LBP_POINTS + 2 unit-width bins.
    counts, _ = np.histogram(codes.ravel(), bins=np.arange(0, LBP_POINTS + 3))
    counts = counts.astype("float")
    counts /= (counts.sum() + 1e-7)  # epsilon guards against division by zero
    return counts


def _hsv_histogram(rgb):
    """Return the concatenated, jointly normalized H, S and V histograms."""
    hsv = cv2.cvtColor(rgb, cv2.COLOR_RGB2HSV)

    # (number of bins, value range) for the H, S and V channel respectively.
    channel_specs = ((180, (0, 180)), (256, (0, 256)), (256, (0, 256)))

    values = []
    for channel, (bins, rng) in enumerate(channel_specs):
        channel_hist = cv2.calcHist([hsv], [channel], None, [bins], rng)
        values.extend(channel_hist.flatten())

    hist = np.array(values, dtype=np.float32)
    hist /= (np.sum(hist) + 1e-7)
    return hist


def extract_features(image):
    """Build the hand-crafted feature vector of one RGB image.

    The image is resized to RESIZE_SHAPE and converted to uint8, then three
    descriptors are computed and concatenated in this order: HOG (on the
    grayscale image), LBP histogram (on the grayscale image) and HSV color
    histogram (on the resized RGB image).

    Args:
        image: RGB image array (callers pass ``np.array(pil_image.convert("RGB"))``).

    Returns:
        1-D numpy array holding the concatenated features.
    """
    # The resized image is multiplied by 255 and cast back to 8-bit.
    rgb = (resize(image, RESIZE_SHAPE, anti_aliasing=True) * 255).astype(np.uint8)

    # Grayscale copy shared by the HOG and LBP descriptors.
    gray = (rgb2gray(rgb) * 255).astype(np.uint8)

    return np.concatenate([
        _hog_descriptor(gray),
        _lbp_histogram(gray),
        _hsv_histogram(rgb),
    ])

In [4]:
from tqdm import tqdm

def process_split(dataset_split):
    """Extract the feature matrix and label vector of one dataset split.

    Args:
        dataset_split: Iterable of rows with an "image" entry (converted to RGB
            via ``.convert("RGB")``) and a "label" entry.

    Returns:
        Tuple ``(X, y)``: float32 array of feature vectors (one row per item)
        and int64 array of labels.
    """
    feature_rows, targets = [], []

    progress = tqdm(dataset_split, desc="Extracting features", ncols=100)
    for record in progress:
        rgb = np.array(record["image"].convert("RGB"))
        feature_rows.append(extract_features(rgb))
        targets.append(record["label"])

    return (
        np.array(feature_rows, dtype=np.float32),
        np.array(targets, dtype=np.int64),
    )


# Extract features split by split; each call shows its own progress bar and is
# followed by a shape printout, so the statement order matters for the output.
X_train, y_train = process_split(ds_split["train"])
print("Train features shape:", X_train.shape)

# The validation split is reported as "Dev" in the printed output.
X_validation, y_validation = process_split(ds_split["validation"])
print("Dev features shape:", X_validation.shape)

X_test, y_test = process_split(ds_split["test"])
print("Test features shape:", X_test.shape)


Extracting features: 100%|████████████████████████████████████| 12450/12450 [04:05<00:00, 50.72it/s]


Train features shape: (12450, 8818)


Extracting features: 100%|██████████████████████████████████████| 2668/2668 [00:51<00:00, 51.60it/s]


Dev features shape: (2668, 8818)


Extracting features: 100%|██████████████████████████████████████| 2669/2669 [00:51<00:00, 51.33it/s]

Test features shape: (2669, 8818)





In [5]:
from sklearn.preprocessing import StandardScaler

# Fit the standardizer on the training features only, then apply the same
# transformation to every split.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_validation_scaled, X_test_scaled = (
    scaler.transform(features) for features in (X_train, X_validation, X_test)
)

<h3>Training Machine Learning Models</h3>

In [6]:
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
from xgboost import XGBClassifier

# Gradient-boosted trees trained on the standardized hand-crafted features.
final_model = XGBClassifier(
    objective="multi:softmax",
    # Derive the class count from the training labels rather than hard-coding
    # it: the dataset has 9 classes, so a literal 4 here would be wrong.
    num_class=len(np.unique(y_train)),
    tree_method="hist",   # fast and stable
    learning_rate=0.05,
    n_estimators=400,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    gamma=0.1,
    min_child_weight=3,
    eval_metric="mlogloss",
    # Row/column subsampling is stochastic; pin the seed like the other models do.
    random_state=0,
)

final_model.fit(X_train_scaled, y_train)

KeyboardInterrupt: 

<h4>Random Forest</h4>

In [None]:
def evaluate(model, Xv, yv):
    """Return the accuracy of ``model``'s predictions on ``Xv`` against labels ``yv``."""
    return accuracy_score(yv, model.predict(Xv))

# Hyper-parameter grid explored for the random forest.
rf_param_grid = dict(
    n_estimators=[50, 100],
    max_depth=[10, 20],
    min_samples_split=[2, 5],
)

best_rf_acc = -1
best_rf_params = None

# Manual grid search: fit on the training split, score on the validation split,
# and keep the first combination that reaches the highest accuracy.
for params in tqdm(ParameterGrid(rf_param_grid)):
    rf = RandomForestClassifier(class_weight='balanced', random_state=0, **params)
    rf.fit(X_train_scaled, y_train)
    acc = evaluate(rf, X_validation_scaled, y_validation)

    if acc > best_rf_acc:
        best_rf_acc, best_rf_params = acc, params

print("Best RF Params:", best_rf_params)
print("Validation Accuracy:", best_rf_acc)

# Retrain the best configuration on train + validation.
# NOTE(review): the seed here (34) differs from the one used in the search (0).
rf_final = RandomForestClassifier(class_weight='balanced', random_state=34, **best_rf_params)
rf_final.fit(
    np.concatenate([X_train_scaled, X_validation_scaled]),
    np.concatenate([y_train, y_validation]),
)

<h4>SVM</h4>

In [None]:

# RBF-kernel SVM with fixed hyper-parameters (no validation search),
# trained on train + validation.
svm_final = SVC(kernel="rbf", C=10, gamma="scale", class_weight="balanced", random_state=0)

svm_final.fit(
    np.concatenate([X_train_scaled, X_validation_scaled]),
    np.concatenate([y_train, y_validation]),
)

<h4>K-Nearest Neighbor (KNN)</h4>

In [None]:
# Hyper-parameter grid explored for k-nearest neighbours.
knn_param_grid = dict(
    n_neighbors=[3, 5, 7, 9],
    weights=["uniform", "distance"],
    p=[1, 2],
)

best_knn_acc = -1
best_knn_params = None

# Manual grid search scored on the validation split; the first combination
# that reaches the highest accuracy wins.
for params in tqdm(ParameterGrid(knn_param_grid)):
    knn = KNeighborsClassifier(**params)
    knn.fit(X_train_scaled, y_train)
    acc = evaluate(knn, X_validation_scaled, y_validation)

    if acc > best_knn_acc:
        best_knn_acc, best_knn_params = acc, params

print("Best KNN Params:", best_knn_params)
print("Validation Accuracy:", best_knn_acc)

# Retrain the best configuration on train + validation.
knn_final = KNeighborsClassifier(**best_knn_params)
knn_final.fit(
    np.concatenate([X_train_scaled, X_validation_scaled]),
    np.concatenate([y_train, y_validation]),
)


<h3>Testing the Models</h3>

In [None]:
def final_results(model, Xtest, ytest, name):
    """Print test metrics for a fitted model and plot its confusion matrix.

    Args:
        model: Fitted estimator exposing ``predict``.
        Xtest: Test features passed to ``model.predict``.
        ytest: Ground-truth test labels.
        name: Model name used in the printed header and the plot title.
    """
    preds = model.predict(Xtest)
    print(f"{name}:")
    print(f"Accuracy: {(accuracy_score(ytest, preds) * 100):.2f}%")
    print("\nClassification Report:\n", classification_report(ytest, preds))
    cm = confusion_matrix(ytest, preds)

    # Plot Confusion Matrix (rows are true labels, columns are predictions)
    plt.figure(figsize=(7,6))
    sns.heatmap(cm, annot=False, cmap="Blues")
    plt.title(f"{name} Confusion Matrix")
    # Label the axes so the figure is readable on its own, like the CNN plot.
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.show()

In [None]:
# The XGBoost model was fitted on standardized features, so it is evaluated on
# the standardized test matrix.
final_results(final_model, X_test_scaled, y_test, "XGboost")

In [None]:
# Evaluate the remaining classical models on the standardized test features.
for fitted_model, model_name in (
    (rf_final, "Random Forest"),
    (svm_final, "SVM"),
    (knn_final, "KNN"),
):
    final_results(fitted_model, X_test_scaled, y_test, model_name)


<h3>Processing Images for Deep Learning</h3>

In [None]:
import tensorflow as tf

def hf_to_tfds(hf_split):
    """Wrap a Hugging Face split as a ``tf.data.Dataset`` of (image, label) pairs.

    Args:
        hf_split: Iterable of rows with a PIL "image" and an integer "label".

    Returns:
        Unbatched dataset yielding float32 images with 3 channels (height and
        width are left unspecified) and int32 scalar labels.
    """
    def gen():
        for item in hf_split:
            # Force 3 channels so grayscale/RGBA/palette images match the
            # declared (None, None, 3) shape, as the feature pipeline does.
            img = item["image"].convert("RGB")
            img = tf.keras.preprocessing.image.img_to_array(img)
            yield img, item["label"]

    # output_signature replaces the deprecated output_types/output_shapes pair.
    return tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            tf.TensorSpec(shape=(None, None, 3), dtype=tf.float32),
            tf.TensorSpec(shape=(), dtype=tf.int32),
        ),
    )

# Wrap each split as a tf.data pipeline of raw (image, label) pairs.
train_ds_raw, val_ds_raw, test_ds_raw = (
    hf_to_tfds(ds_split[split_name])
    for split_name in ("train", "validation", "test")
)


In [None]:
IMG_SIZE = 128  # side length in pixels of the square images fed to the CNN

# Resize images and rescale pixel values for the CNN
def preprocess(image, label):
    """Resize an image to (IMG_SIZE, IMG_SIZE) and divide its values by 255.

    Args:
        image: Image tensor to resize.
        label: Label, returned unchanged.

    Returns:
        Tuple of the resized, rescaled image and the original label.
    """
    image = tf.image.resize(image, (IMG_SIZE, IMG_SIZE))
    # presumably maps 0-255 pixel values onto [0, 1] — confirm the input range
    image = image / 255
    return image, label

# Augmentation: random flip, rotation, zoom and contrast applied per image.
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomFlip("horizontal"),
    tf.keras.layers.RandomRotation(0.15),
    tf.keras.layers.RandomZoom(0.1),
    tf.keras.layers.RandomContrast(0.15),
], name="data_augmentation")

def augment(image, label):
    """Apply the random augmentation pipeline to one image.

    Args:
        image: Preprocessed image tensor.
        label: Label, returned unchanged.

    Returns:
        Tuple of the augmented image and the original label.
    """
    # The layers run inside Dataset.map, outside of model.fit, so request
    # training behaviour explicitly instead of relying on the default.
    return data_augmentation(image, training=True), label

BATCH_SIZE = 32

# Train dataset: shuffle -> resize/rescale -> augment -> batch -> prefetch
train_ds = train_ds_raw.shuffle(2000)
train_ds = train_ds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
train_ds = train_ds.map(augment, num_parallel_calls=tf.data.AUTOTUNE)
train_ds = train_ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# Validation dataset: no shuffling and no augmentation
val_ds = val_ds_raw.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
val_ds = val_ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

# Test dataset: same pipeline as validation
test_ds = test_ds_raw.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
test_ds = test_ds.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)


In [None]:
# Calculating the class weights
def compute_class_weights(labels):
    """Compute inverse-frequency class weights.

    Each class gets ``total / (number_of_classes * class_count)``, so rarer
    classes receive larger weights.

    Args:
        labels: Iterable of hashable class labels.

    Returns:
        Dict mapping each distinct label to its weight (empty for empty input).
    """
    frequency = Counter(labels)
    n_samples = sum(frequency.values())
    n_classes = len(frequency)

    weights = {}
    for cls, count in frequency.items():
        weights[cls] = n_samples / (n_classes * count)
    return weights

# Read the label column directly: iterating the dataset row by row would decode
# every image just to fetch its label.
labels_list = list(ds_split["train"]["label"])
class_weights = compute_class_weights(labels_list)

<h3>Building and training the CNN</h3>

In [None]:
def create_cnn_model(input_shape=(IMG_SIZE, IMG_SIZE, 3), num_classes=9):
    """Build the (uncompiled) convolutional classifier.

    Architecture: four Conv2D(3x3, same padding, ReLU) + MaxPooling2D(2x2)
    stages with 32, 64, 128 and 256 filters, global average pooling, dropout,
    a 256-unit ReLU dense layer, dropout, and a softmax output layer.

    Args:
        input_shape: Shape of one input image (height, width, channels).
        num_classes: Number of units in the softmax output layer.

    Returns:
        A ``tf.keras.Model`` mapping images to class probabilities.
    """
    layers = tf.keras.layers

    inputs = tf.keras.Input(shape=input_shape)

    # Convolutional feature extractor: the filter count doubles at every stage.
    x = inputs
    for filters in (32, 64, 128, 256):
        x = layers.Conv2D(filters, (3, 3), padding="same", activation="relu")(x)
        x = layers.MaxPooling2D((2, 2))(x)

    # Classification head.
    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dropout(0.5)(x)
    x = layers.Dense(256, activation="relu")(x)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(num_classes, activation="softmax")(x)

    return tf.keras.Model(inputs=inputs, outputs=outputs)

# Build the network and show its layer summary.
model = create_cnn_model(input_shape=(IMG_SIZE, IMG_SIZE, 3))
model.summary()

# Integer labels -> sparse categorical cross-entropy.
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=["accuracy"],
)

# Stop once val_loss has not improved for 5 epochs and roll back to the best weights.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=5, restore_best_weights=True
)

# Save the model to disk whenever val_loss improves.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    "best_epoch.h5", monitor="val_loss", save_best_only=True, verbose=1
)

EPOCHS = 50

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    class_weight=class_weights,
    callbacks=[early_stop, checkpoint],
)

<h3>Testing the CNN</h3>

In [None]:

# Collect ground-truth labels and predicted classes batch by batch, so both
# come from the same pass over the test pipeline.
y_true = []
y_pred = []

for images, labels in test_ds:
    # verbose=0: predict is called once per batch, and a progress bar per
    # batch would flood the cell output.
    preds = model.predict(images, verbose=0)
    preds = np.argmax(preds, axis=1)

    y_true.extend(labels.numpy())
    y_pred.extend(preds)

y_true = np.array(y_true)
y_pred = np.array(y_pred)

acc = accuracy_score(y_true, y_pred)
print(f"Test Accuracy: {acc * 100:.2f}%")

print("\nCNN Classification Report:")
print(classification_report(y_true, y_pred))

# Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_true, y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=False, cmap='Blues')
plt.title("CNN Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()
