In [1]:
# This is a test run to see if we can create an image classifier using Sci-kit Learn instead of Tensorflow.
# Following tutorial found here:
#    https://kapernikov.com/tutorial-image-classification-with-scikit-learn/

# Import dependencies
import matplotlib.pyplot as plt
import numpy as np
import os

# Had to look this one up. Apparently it's for quick pipelining?
import joblib

from skimage.io import imread
from skimage.transform import resize
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from collections import Counter

In [None]:
# Define a function to grab the train/test data and store them as numpy arrays... in a pickle file. 
# I'm not sure I fully understand the purpose of a pickle file.

def resize_images(path,pickle_name,include,width=150,height=150):
    '''
    The goal: load in all the images from the path, resize them, and write them as arrays to a dictionary with labels.
    Save that dictionary to a pickle file.
    I picked width 150 just because the Tensorflow classifier used that sizing.
    
    The parameters:
    path: the file path where the images are stored. 
        Hopefully I can make this work with a link to the github repository at some point.
    pickle_name: a string that becomes part of the file name for the new pickle file
    width: target width of the image in pixels, default 150
    height: target width of the image in pixels, default 150
    include: set[str] = a set containing strings. These strings should be the names of the subdirectories in that path location.
    '''
    
    # Make a dictionary, since I've got to write one to a file eventually.
    data = dict()
    data['label']=[]
    data['filename']=[]
    data['data']=[]
    
    # Make up a file name for the upcoming pickle file.
    pickle_file_name = f"{pickle_name}_{width}x{height}px.pkl"
    
    # Read all the images in the path 
    for subdirectory in os.listdir(path):
        # Have to start by making it through all the subdirectories in the path.
        # I intend to use this by setting my paths to:
        #    "images/train" and "images/test"
        # so all the subdirectories will be the particular types of plants.
        if subdirectory in include:
            current_path = os.path.join(path,subdirectory)
            
            for image_file in os.listdir(current_path):
                # I happen to know that all the images in the dataset are PNG files, so I'm not going to check for file type.
                # Nicely resize the image using the tools imported from SKLearn
                image = imread(os.path.join(current_path,image_file))
                resized_image = resize(image,(width,height))
                
                # It's time to toss the info in our dictionary.
                data['label'].append(subdirectory) # since our plant label is in the folder name
                data['filename'].append(image_file)
                data['data'].append(resized_image)
    
    # Throw that data in a pickle file.
    joblib.dump(data,pickle_file_name)

In [None]:
# It's time to get resizing. First up, the training set.
# Let's find the labeled set first.
train_path = "images/train"
# Store the names of all the subdirectories in there. 
# This will become our include list, since I don't want to exclude any plant types.
train_subdir = os.listdir(train_path)

In [None]:
# Let's try using that function... I'll leave the default values for height and width.
resize_images(path=train_path,pickle_name="training_set",include=train_subdir)

In [2]:
# Check to see if it worked.
check = joblib.load("training_set_150x150px.pkl")

print(f"Number of samples: {len(check['data'])}")
print(f"Keys: {check.keys()}")
print(f"Labels: {np.unique(check['label'])}")

Counter(check['label'])

Number of samples: 4750
Keys: dict_keys(['label', 'filename', 'data'])
Labels: ['Black-grass' 'Charlock' 'Cleavers' 'Common Chickweed' 'Common wheat'
 'Fat Hen' 'Loose Silky-bent' 'Maize' 'Scentless Mayweed'
 'Shepherds Purse' 'Small-flowered Cranesbill' 'Sugar beet']


Counter({'Black-grass': 263,
         'Charlock': 390,
         'Cleavers': 287,
         'Common Chickweed': 611,
         'Common wheat': 221,
         'Fat Hen': 475,
         'Loose Silky-bent': 654,
         'Maize': 221,
         'Scentless Mayweed': 516,
         'Shepherds Purse': 231,
         'Small-flowered Cranesbill': 496,
         'Sugar beet': 385})

In [3]:
data = np.array(check['data'])

  data = np.array(check['data'])


ValueError: could not broadcast input array from shape (150,150,3) into shape (150,150)

In [4]:
# I'm leaving this error here because... it's an educational experience.
shapes = [i.shape for i in check['data']]

In [5]:
# I expect only to have one shape, so then just one element in this set.
set(shapes)

{(150, 150, 3), (150, 150, 4)}

In [6]:
# Okay, so my data shapes are not consistent... I don't love that
# But I also wonder if I can get away with just resizing every entry.
data = [np.resize(i,(150,150,4)).flatten() for i in check['data']]

In [7]:
# Check my array shapes again.
shapes = [i.shape for i in data]
set(shapes)

{(90000,)}

In [8]:
labels = np.array(check['label'])

In [9]:
# check the length of each of those arrays
len(data)

4750

In [10]:
len(labels)

4750

In [11]:
# They match up, nice. 
# The data now has to be split into training and testing sets.
x_train, x_test, y_train, y_test = train_test_split(data,labels,stratify=labels)

In [13]:
# This tutorial uses a few different transformations.
# This other one just goes straight to a support vector classification. I want to see what happens there.
classifier = SVC()

# Try out a whole bunch of parameters and see what happens.
parameters = [{'gamma': [0.01, 0.001], 'C': [1, 10], 'kernel':['poly','rbf']}]

In [None]:
# Train the classifier (or rather, the classifiers)
grid_search = GridSearchCV(classifier, parameters)

grid_search.fit(x_train, y_train)