Loads a dataset of images of 4 different kinds of seeds (the loading code below accepts only `.png` files), trains a random forest classifier (a kNN classifier is also included, currently commented out) on the data, and tests the models.

In [None]:
import os
import numpy as np
from skimage.io import imread
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import pickle
from skimage.transform import resize
from sklearn.metrics import confusion_matrix
import tkinter as tk
from tkinter import filedialog

Load and process the data:

In [None]:
from img2vec_pytorch import Img2Vec
from PIL import Image

# Shared feature extractor, constructed once with the library's default
# arguments and reused by load_image for every image.
img2vec = Img2Vec()

def load_image(file, i):
    """Compute the feature vector of a single image file.

    Args:
        file: Path of the image. It must end in ``.png`` (case-insensitive).
        i: Class label to pair with the features; it is returned unchanged.

    Returns:
        A ``(features, label)`` tuple where ``features`` is the value returned
        by ``img2vec.get_vec`` for the image and ``label`` is ``i``.

    Raises:
        ValueError: If ``file`` does not have a ``.png`` extension.
    """
    if not file.lower().endswith(".png"):
        raise ValueError('Invalid file name')

    print(f"Loading {file}")

    # Image.open keeps the file handle open until the image is closed, so use
    # it as a context manager to avoid leaking one handle per image.
    with Image.open(file) as img:
        # PNG files may be palette, grayscale or RGBA; the extractor is fed a
        # three-channel RGB image so every file is processed the same way.
        features = img2vec.get_vec(img.convert("RGB"))
    return features, i


def prepare_data(directory, data_file, labels_file,
                 class_names=("kaura", "ohra", "ruis", "vehna")):
    """Build the feature and label arrays from a directory tree and cache them.

    Every file found in ``directory/<class_name>/`` is passed to
    ``load_image``; the label of a file is the index of its class in
    ``class_names``. Files that fail to load are reported and skipped.

    Args:
        directory: Root directory containing one sub-directory per class.
        data_file: Path the feature array is saved to with ``np.save``.
        labels_file: Path the label array is saved to with ``np.save``.
        class_names: Sub-directory names, in label order.

    Returns:
        A ``(data, labels)`` tuple of numpy arrays.

    Raises:
        ValueError: If not a single image could be loaded. Nothing is saved in
            that case, so an empty cache cannot shadow the real data later.
    """
    failed_files = []
    data = []
    labels = []

    # load the images and labels from the directory

    for class_index, class_name in enumerate(class_names):
        path = os.path.join(directory, class_name)
        # os.listdir returns entries in arbitrary order; sorting keeps the
        # cached arrays reproducible between runs and machines.
        for entry in sorted(os.listdir(path)):
            file = os.path.join(path, entry)
            try:
                img, label = load_image(file, class_index)
            except Exception as e:
                # Best-effort loading: one unreadable file must not abort the
                # whole data set, so record it and move on.
                print(f"Error reading {file}: {e}")
                failed_files.append(file)
                continue

            data.append(img)
            labels.append(label)
            print(f"Loaded {file}")

    if not failed_files:
        print("All files loaded successfully")
    else:
        print(f"Failed to load files {failed_files}")

    if not data:
        raise ValueError(
            f"No images could be loaded from {directory}; nothing was cached"
        )

    # save the data and labels as numpy arrays for classification

    data_np = np.array(data)
    labels_np = np.array(labels)
    print("Saving data...")
    np.save(data_file, data_np)
    np.save(labels_file, labels_np)
    print(f"Data saved successfully in {data_file} and {labels_file}")
    return data_np, labels_np


def load_cached(directory, data_file, labels_file):
    """Return the cached feature and label arrays, rebuilding them if needed.

    Args:
        directory: Image directory handed to ``prepare_data`` when the cache
            cannot be read.
        data_file: Path of the cached feature array (``.npy``).
        labels_file: Path of the cached label array (``.npy``).

    Returns:
        A ``(data, labels)`` tuple of numpy arrays, either loaded from the two
        cache files or freshly produced by ``prepare_data``.
    """
    try:
        data, labels = np.load(data_file), np.load(labels_file)
    except (OSError, ValueError, EOFError):
        # Only a missing or unreadable cache triggers a rebuild. A bare
        # ``except`` would also swallow KeyboardInterrupt and real bugs.
        return prepare_data(directory, data_file, labels_file)

    print('Successfully loaded cached data')
    return data, labels

Split the data into training and validation sets:

In [None]:
def split_data(directory='train', data_file='training_data_f.npy',
               labels_file='training_labels_f.npy', test_size=0.2,
               random_state=None):
    """Load the training data and split it into training and validation sets.

    Args:
        directory: Image directory used when the cache has to be rebuilt.
        data_file: Path of the cached feature array.
        labels_file: Path of the cached label array.
        test_size: Share of the samples put into the validation set.
        random_state: Seed forwarded to ``train_test_split``; pass an int to
            get the same split on every run. ``None`` keeps it random.

    Returns:
        ``(x_train, x_validation, y_train, y_validation)``.
    """
    x, y = load_cached(directory, data_file, labels_file)

    # stratify=y keeps the class proportions equal in both parts.
    return train_test_split(
        x,
        y,
        test_size=test_size,
        shuffle=True,
        stratify=y,
        random_state=random_state,
    )

# Build the training/validation split once at module level; the cells below
# read these four variables.
print("Splitting data...")
x_train, x_validation, y_train, y_validation = split_data()
print("Data split successfully")

Train a random forest classifier:

In [None]:
def train_models(x_train, y_train, x_validation, y_validation):
    """Train the random forest classifier and pickle it to disk.

    Args:
        x_train: Training feature vectors.
        y_train: Training labels.
        x_validation: Not used here; kept so existing callers keep working.
        y_validation: Not used here; kept so existing callers keep working.
    """
    model_path = "random_forest_model_f.sav"

    print("Training random forest model #1...")
    random_forest_model = RandomForestClassifier(n_estimators=100, max_depth=10, n_jobs=-1)
    random_forest_model.fit(x_train, y_train)
    with open(model_path, 'wb') as f:
        pickle.dump(random_forest_model, f)
    # Report the path that was actually written (the message used to name a
    # different file than the one created above).
    print(f"Random forest model trained successfully and saved in {model_path}")

    # The kNN model is currently disabled:
    # print("Training kNN model...")
    # knn_3_model = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
    # knn_3_model.fit(x_train, y_train)
    # with open("knn_3_model.sav", 'wb') as f:
    #     pickle.dump(knn_3_model, f)
    # print("kNN model trained successfully and saved in knn_3_model.sav")


# Fit and save the model. The validation arrays are passed along, but
# train_models does not read them.
train_models(x_train, y_train, x_validation, y_validation)

Evaluate the models on the validation data:

In [None]:
def evaluate_models(x_validation, y_validation):
    """Print the accuracy of the pickled random forest on the given samples.

    Args:
        x_validation: Feature vectors to score the model on.
        y_validation: True labels matching ``x_validation``.
    """
    model_path = "random_forest_model_f.sav"
    with open(model_path, 'rb') as model_file:
        forest = pickle.load(model_file)

    accuracy = forest.score(x_validation, y_validation)
    print("Random forest accuracy: ", accuracy)

# Report accuracy on the validation part produced by split_data.
evaluate_models(x_validation, y_validation)

Load the test data:

In [None]:
# Load (or build and cache) the holdout set from the 'holdout' directory.
x_test, y_test = load_cached('holdout', 'holdout_data_f.npy', 'holdout_labels_f.npy')
x_test = x_test.reshape(x_test.shape[0], -1) # ensure one flat row per sample; the features come from img2vec.get_vec, so this is presumably already the case (TODO confirm) and the reshape is only a safeguard

Evaluate the models on the test data:

In [None]:
# Evaluate the saved random forest on the holdout data.
# NOTE: pickle can execute arbitrary code while loading; only unpickle model
# files that this notebook created itself.
with open("random_forest_model_f.sav", 'rb') as f:
    model = pickle.load(f)
print("Random forest accuracy: ", model.score(x_test, y_test))

# model = pickle.load(open("knn_3_model.sav", 'rb'))
# print("kNN accuracy: ", model.score(x_test, y_test))

# Confusion matrix: rows are the true classes, columns the predicted classes.

y_pred = model.predict(x_test)
cm = confusion_matrix(y_test, y_pred, labels=[0, 1, 2, 3])

# Convert every row to percentages of that true class. A class that does not
# occur in y_test has a row sum of 0; leave such a row at 0 instead of
# dividing by zero (which would give NaN and a runtime warning).
cm_sum = np.sum(cm, axis=1, keepdims=True)
cm_perc = np.divide(
    cm,
    cm_sum,
    out=np.zeros(cm.shape, dtype=float),
    where=cm_sum != 0,
) * 100

print(cm_perc)

Test image files manually:

In [None]:
# Interactive check: pick image files one by one and print the predicted class.
# The loop ends when the file dialog is cancelled.

# Text printed for each predicted class index.
class_names = ["kaura", "ohra", "ruis", "vehnä"]

# One hidden Tk root is enough for all dialogs; creating a new one on every
# iteration leaks a window each time.
root = tk.Tk()
root.withdraw()

try:
    while True:

        # open a file browser to select a file from your storage

        file_path = filedialog.askopenfilename()

        # A cancelled dialog yields an empty value, which is the signal to stop.
        if not file_path:
            print("File browser closed")
            break

        # read the image

        try:
            img = load_image(file_path, 0)[0]
        except Exception as e:
            # A wrong or unreadable file should not end the session.
            print(f"Error reading {file_path}: {e}")
            continue

        # predict the class

        prediction = model.predict([img])
        predicted_index = int(prediction[0])

        if 0 <= predicted_index < len(class_names):
            print(f"Prediction: {class_names[predicted_index]}")
finally:
    root.destroy()
