Loads a dataset of images of four different kinds of seeds stored in PNG format, trains a random forest classifier (and, optionally, a kNN classifier) on the data, and tests the models.

In [9]:
import os
import numpy as np
from skimage.io import imread
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import pickle
from skimage.transform import resize
from sklearn.metrics import confusion_matrix
import tkinter as tk
from tkinter import filedialog

# Notebook-wide state that load_image() and prepare_data() read and mutate.
directory = "train"  # root folder holding one sub-folder per seed class
data, labels, failed_files = [], [], []  # three independent accumulators
output_files = [
    "training_data.npy",
    "training_labels.npy",
]

Load and process the data:

In [10]:
# Exceptions treated as "this file could not be read", in reporting precedence order.
_READ_ERRORS = (EOFError, OSError, ValueError)


def load_image(file, i):
    """Load one PNG image and turn it into a flat feature vector.

    Parameters
    ----------
    file : str
        Path of the image file to read.
    i : int
        Class index that is returned unchanged as the image's label.

    Returns
    -------
    tuple or None
        ``(features, label)`` where ``features`` is the image resized to
        150x150x3 and flattened to a 1D array, and ``label`` is ``i``.
        ``None`` when the file name does not end in ``.png`` or when reading
        or resizing raises ``EOFError``, ``OSError`` or ``ValueError``; in
        both cases the path is appended to the global ``failed_files`` list.
    """
    try:
        if not file.lower().endswith(".png"):
            failed_files.append(file)
            return None
        print(f"Loading {file}")

        # load the image, resize it to a uniform size and flatten it to a 1D array
        # NOTE(review): the target shape hard-codes 3 channels regardless of
        # what imread returns - confirm that inputs are RGB (no alpha channel,
        # no grayscale), otherwise the channel axis is resampled as well.
        img = imread(file, as_gray=False)
        resized_img = resize(img, (150, 150, 3)).flatten()
        return resized_img, i

    except _READ_ERRORS as error:
        # Report the first matching category so the log text is the same as
        # with one handler per exception type.
        category = next(cls.__name__ for cls in _READ_ERRORS if isinstance(error, cls))
        print(f"{category} reading {file}")
        failed_files.append(file)
        return None
    
    
def prepare_data():
    """Load every class folder under ``directory`` and save the arrays to disk.

    Reads the module-level ``directory`` and ``output_files`` settings and
    refills the module-level ``data``, ``labels`` and ``failed_files`` lists.
    The images of class ``i`` are expected in the sub-folder named after the
    ``i``-th entry of ``class_names``. The collected samples are written with
    ``np.save`` to ``output_files[0]`` (features) and ``output_files[1]``
    (labels).
    """
    # Start from empty accumulators: without this, re-running the cell would
    # append a second copy of every sample, and the duplicates could end up on
    # both sides of a later train/validation split.
    data.clear()
    labels.clear()
    failed_files.clear()

    # load the images and labels from the directory; the position in this
    # tuple is the numeric label of the class
    class_names = ("kaura", "ohra", "ruis", "vehna")
    for i, class_name in enumerate(class_names):
        path = os.path.join(directory, class_name)
        # os.listdir gives no ordering guarantee; sort so the saved arrays are
        # identical from run to run
        for entry in sorted(os.listdir(path)):
            file = os.path.join(path, entry)
            result = load_image(file, i)
            if result is not None:
                img, label = result
                data.append(img)
                labels.append(label)
                print(f"Loaded {file}")

    if not failed_files:
        print("All files loaded successfully")
    else:
        print(f"Failed to load files {failed_files}")

    # save the data and labels as numpy arrays for classification
    data_np = np.array(data)
    labels_np = np.array(labels)
    print("Saving data...")
    np.save(output_files[0], data_np)
    np.save(output_files[1], labels_np)
    print(f"Data saved successfully in {output_files[0]} and {output_files[1]}")


prepare_data()


Loading train\kaura\seed_003_001.png
Loaded train\kaura\seed_003_001.png
Loading train\kaura\seed_003_002.png
Loaded train\kaura\seed_003_002.png
Loading train\kaura\seed_003_003.png
Loaded train\kaura\seed_003_003.png
Loading train\kaura\seed_003_004.png
Loaded train\kaura\seed_003_004.png
Loading train\kaura\seed_003_007.png
Loaded train\kaura\seed_003_007.png
Loading train\kaura\seed_003_008.png
Loaded train\kaura\seed_003_008.png
Loading train\kaura\seed_003_009.png
Loaded train\kaura\seed_003_009.png
Loading train\kaura\seed_003_011.png
Loaded train\kaura\seed_003_011.png
Loading train\kaura\seed_003_012.png
Loaded train\kaura\seed_003_012.png
Loading train\kaura\seed_003_013.png
Loaded train\kaura\seed_003_013.png
Loading train\kaura\seed_003_014.png
Loaded train\kaura\seed_003_014.png
Loading train\kaura\seed_003_015.png
Loaded train\kaura\seed_003_015.png
Loading train\kaura\seed_003_016.png
Loaded train\kaura\seed_003_016.png
Loading train\kaura\seed_003_017.png
Loaded train\k

Split the data into training and validation sets:

In [11]:
def split_data(random_state=None):
    """Split the saved training arrays into training and validation sets.

    Parameters
    ----------
    random_state : int or None, optional
        Seed forwarded to ``train_test_split``. The default ``None`` keeps the
        split different on every run; pass an integer for a reproducible split.

    Returns
    -------
    tuple
        ``(x_train, x_validation, y_train, y_validation)`` with 20 % of the
        samples in the validation set, stratified by label.
    """
    features = np.load("training_data.npy")
    # Load the labels once and reuse them both as the target and for stratification.
    targets = np.load("training_labels.npy")

    x_train, x_validation, y_train, y_validation = train_test_split(
        features,
        targets,
        test_size=0.2,
        shuffle=True,
        stratify=targets,
        random_state=random_state,
    )

    # The models expect a 2-dimensional (samples, features) array; collapse
    # any trailing dimensions so that holds whatever shape was saved.
    x_train = x_train.reshape(x_train.shape[0], -1)
    x_validation = x_validation.reshape(x_validation.shape[0], -1)

    return x_train, x_validation, y_train, y_validation

print("Splitting data...")
x_train, x_validation, y_train, y_validation = split_data()
print("Data split successfully")

Splitting data...
Data split successfully


Train a random forest classifier:

In [12]:
def train_models(x_train, y_train, x_validation, y_validation):
    """Fit a random forest on the training split and pickle it to disk.

    Parameters
    ----------
    x_train, y_train
        Training samples and their labels, passed to ``fit``.
    x_validation, y_validation
        Accepted so existing calls keep working; not used by this function.

    The fitted model is written to ``random_forest_model.sav``.
    """
    print("Training random forest model #1...")
    random_forest_model = RandomForestClassifier(n_estimators=100, max_depth=10, n_jobs=-1)
    random_forest_model.fit(x_train, y_train)
    # The context manager closes the file even if pickling fails, so the model
    # file is never left open or half-flushed.
    with open("random_forest_model.sav", "wb") as model_file:
        pickle.dump(random_forest_model, model_file)
    print("Random forest model trained successfully and saved in random_forest_model.sav")

    # print("Training kNN model...")
    # knn_3_model = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
    # knn_3_model.fit(x_train, y_train)
    # with open("knn_3_model.sav", "wb") as model_file:
    #     pickle.dump(knn_3_model, model_file)
    # print("kNN model trained successfully and saved in knn_3_model.sav")


train_models(x_train, y_train, x_validation, y_validation)

Training random forest model #1...
Random forest model trained successfully and saved in random_forest_model.sav


Evaluate the models on the validation data:

In [13]:
def evaluate_models(x_validation, y_validation):
    """Print the accuracy of the saved random forest on the validation split.

    Parameters
    ----------
    x_validation, y_validation
        Validation samples and their labels, passed to the model's ``score``.
    """
    # NOTE: unpickling can execute arbitrary code - only load model files that
    # were written by this notebook.
    with open("random_forest_model.sav", "rb") as model_file:
        random_forest_model = pickle.load(model_file)
    print("Random forest accuracy: ", random_forest_model.score(x_validation, y_validation))

evaluate_models(x_validation, y_validation)

Random forest accuracy:  0.7579250720461095


Load the test data:

In [14]:
# Point the shared loader at the holdout folder and give it empty accumulators.
directory = "holdout"
data, labels, failed_files = [], [], []
output_files = ["holdout_data.npy", "holdout_labels.npy"]

prepare_data()

# Read back the two arrays that prepare_data() has just written.
x_test = np.load(output_files[0])
y_test = np.load(output_files[1])
# Collapse everything after the sample axis: the models expect a 2-dimensional array.
x_test = x_test.reshape(len(x_test), -1)

Loading holdout\kaura\seed_004_001.png
Loaded holdout\kaura\seed_004_001.png
Loading holdout\kaura\seed_004_002.png
Loaded holdout\kaura\seed_004_002.png
Loading holdout\kaura\seed_004_003.png
Loaded holdout\kaura\seed_004_003.png
Loading holdout\kaura\seed_004_004.png
Loaded holdout\kaura\seed_004_004.png
Loading holdout\kaura\seed_004_005.png
Loaded holdout\kaura\seed_004_005.png
Loading holdout\kaura\seed_004_007.png
Loaded holdout\kaura\seed_004_007.png
Loading holdout\kaura\seed_004_008.png
Loaded holdout\kaura\seed_004_008.png
Loading holdout\kaura\seed_004_009.png
Loaded holdout\kaura\seed_004_009.png
Loading holdout\kaura\seed_004_011.png
Loaded holdout\kaura\seed_004_011.png
Loading holdout\kaura\seed_004_012.png
Loaded holdout\kaura\seed_004_012.png
Loading holdout\kaura\seed_004_013.png
Loaded holdout\kaura\seed_004_013.png
Loading holdout\kaura\seed_004_014.png
Loaded holdout\kaura\seed_004_014.png
Loading holdout\kaura\seed_004_015.png
Loaded holdout\kaura\seed_004_015.png

Evaluate the models on the test data:

In [15]:
# NOTE: unpickling can execute arbitrary code - only load model files that
# were written by this notebook.
# The context manager makes sure the model file is closed again after loading.
with open("random_forest_model.sav", "rb") as model_file:
    model = pickle.load(model_file)
print("Random forest accuracy: ", model.score(x_test, y_test))

# model = pickle.load(open("knn_3_model.sav", 'rb'))
# print("kNN accuracy: ", model.score(x_test, y_test))

# confusion matrix (rows: true class, columns: predicted class):

y_pred = model.predict(x_test)
cm = confusion_matrix(y_test, y_pred, labels=[0, 1, 2, 3])

# Convert to percentages of each true class. A class without any test samples
# has a row sum of 0; its row is reported as 0 % instead of dividing by zero.
cm_sum = np.sum(cm, axis=1, keepdims=True)
cm_perc = np.divide(cm, cm_sum, out=np.zeros(cm.shape, dtype=float), where=cm_sum != 0) * 100

print(cm_perc)

Random forest accuracy:  0.7914167051222889
[[79.96389892 14.62093863  3.06859206  2.3465704 ]
 [19.06542056 73.08411215  2.99065421  4.85981308]
 [ 7.83972125  2.43902439 78.91986063 10.80139373]
 [ 4.36507937  3.37301587  7.34126984 84.92063492]]


Test image files manually:

In [16]:
# Class index -> name printed for the user.
class_names = {0: "kaura", 1: "ohra", 2: "ruis", 3: "vehnä"}

# Create one hidden Tk root for the whole session instead of a new, never
# destroyed one per iteration.
root = tk.Tk()
root.withdraw()

try:
    while True:

        # open a file browser to select a file from your storage

        file_path = filedialog.askopenfilename(parent=root)

        # An empty result means the dialog was cancelled or closed. Checking it
        # explicitly keeps unrelated TypeErrors from being mistaken for that.
        if not file_path:
            print("File browser closed")
            break

        # read the image (the label argument is irrelevant here and is discarded)

        result = load_image(file_path, 0)
        if result is None:
            # Not a .png file or an unreadable one: ask again instead of quitting.
            print(f"Could not load {file_path}; choose a readable .png image")
            continue
        img = result[0]

        # predict the class

        prediction = model.predict([img])
        predicted_class = int(prediction[0])
        if predicted_class in class_names:
            print(f"Prediction: {class_names[predicted_class]}")

finally:
    # Release the Tk resources however the loop ends.
    root.destroy()


Loading C:/Users/mirok/Documents/Code/GitHub/kandi/seed_demo/holdout/ruis/seed_010_005.png
Prediction: ruis
Loading C:/Users/mirok/Documents/Code/GitHub/kandi/seed_demo/holdout/ruis/seed_010_009.png
Prediction: ruis
Loading C:/Users/mirok/Documents/Code/GitHub/kandi/seed_demo/holdout/ruis/seed_010_019.png
Prediction: ruis
Loading C:/Users/mirok/Documents/Code/GitHub/kandi/seed_demo/holdout/ruis/seed_024_018.png
Prediction: ruis
Loading C:/Users/mirok/Documents/Code/GitHub/kandi/seed_demo/holdout/ruis/seed_024_018.png
Prediction: ruis
Loading C:/Users/mirok/Documents/Code/GitHub/kandi/seed_demo/holdout/ruis/seed_024_034.png
Prediction: kaura
Loading C:/Users/mirok/Documents/Code/GitHub/kandi/seed_demo/holdout/kaura/seed_004_017.png
Prediction: kaura
Loading C:/Users/mirok/Documents/Code/GitHub/kandi/seed_demo/holdout/kaura/seed_008_028.png
Prediction: kaura
Loading C:/Users/mirok/Documents/Code/GitHub/kandi/seed_demo/holdout/kaura/seed_014_018.png
Prediction: kaura
Loading C:/Users/miro