In [1]:
# This is a test run to see if we can create an image classifier using Sci-kit Learn instead of Tensorflow.
# Following tutorials found here:
#    https://kapernikov.com/tutorial-image-classification-with-scikit-learn/
#    https://www.youtube.com/watch?v=il8dMDlXrIE

# Import dependencies
import matplotlib.pyplot as plt
import numpy as np
import os

# Had to look this one up. Apparently it's for quick pipelining?
import joblib

from skimage.io import imread
from skimage.transform import resize
from sklearn.model_selection import train_test_split
from collections import Counter

In [2]:
# Define a function to grab the train/test data and store them as numpy arrays... in a pickle file. 
# I'm not sure I fully understand the purpose of a pickle file.

def resize_images(path,pickle_name,include,width=150,height=150):
    '''
    The goal: load in all the images from the path, resize them, and write them as arrays to a dictionary with labels.
    Save that dictionary to a pickle file.
    I picked width 150 just because the Tensorflow classifier used that sizing.
    
    The parameters:
    path: the file path where the images are stored. 
        Hopefully I can make this work with a link to the github repository at some point.
    pickle_name: a string that becomes part of the file name for the new pickle file
    width: target width of the image in pixels, default 150
    height: target width of the image in pixels, default 150
    include: set[str] = a set containing strings. These strings should be the names of the subdirectories in that path location.
    '''
    
    # Make a dictionary, since I've got to write one to a file eventually.
    data = dict()
    data['label']=[]
    data['filename']=[]
    data['data']=[]
    
    # Make up a file name for the upcoming pickle file.
    pickle_file_name = f"{pickle_name}_{width}x{height}px.pkl"
    
    # Read all the images in the path 
    for subdirectory in os.listdir(path):
        # Have to start by making it through all the subdirectories in the path.
        # I intend to use this by setting my paths to:
        #    "images/train" and "images/test"
        # so all the subdirectories will be the particular types of plants.
        if subdirectory in include:
            current_path = os.path.join(path,subdirectory)
            
            for image_file in os.listdir(current_path):
                # I happen to know that all the images in the dataset are PNG files, so I'm not going to check for file type.
                # Nicely resize the image using the tools imported from SKLearn
                image = imread(os.path.join(current_path,image_file))
                resized_image = resize(image,(width,height))
                
                # It's time to toss the info in our dictionary.
                data['label'].append(subdirectory) # since our plant label is in the folder name
                data['filename'].append(image_file)
                data['data'].append(resized_image)
    
    # Throw that data in a pickle file.
    joblib.dump(data,pickle_file_name)

In [3]:
# It's time to get resizing. First up, the training set.
# Let's find the labeled set first.
train_path = "images/train"
# Store the names of all the subdirectories in there. 
# This will become our include list, since I don't want to exclude any plant types.
train_subdir = os.listdir(train_path)

In [4]:
# Let's try using that function... I'll leave the default values for height and width.
resize_images(path=train_path,pickle_name="training_set",include=train_subdir)

In [5]:
# Check to see if it worked.
check = joblib.load("training_set_150x150px.pkl")

print(f"Number of samples: {len(check['data'])}")
print(f"Keys: {check.keys()}")
print(f"Labels: {np.unique(check['label'])}")

Counter(check['label'])

Number of samples: 4750
Keys: dict_keys(['label', 'filename', 'data'])
Labels: ['Black-grass' 'Charlock' 'Cleavers' 'Common Chickweed' 'Common wheat'
 'Fat Hen' 'Loose Silky-bent' 'Maize' 'Scentless Mayweed'
 'Shepherds Purse' 'Small-flowered Cranesbill' 'Sugar beet']


Counter({'Black-grass': 263,
         'Charlock': 390,
         'Cleavers': 287,
         'Common Chickweed': 611,
         'Common wheat': 221,
         'Fat Hen': 475,
         'Loose Silky-bent': 654,
         'Maize': 221,
         'Scentless Mayweed': 516,
         'Shepherds Purse': 231,
         'Small-flowered Cranesbill': 496,
         'Sugar beet': 385})

In [13]:
# That worked. I happen to know there are 4750 images in the labeled dataset.
# Let's grab the data and labels, then we'll have to split them up into training and testing.
data = np.array(check['data'])
labels = np.array(check['label'])

  data = np.array(check['data'])


ValueError: could not broadcast input array from shape (150,150,3) into shape (150,150)

In [17]:
# I'm leaving this error here instead of trying to fix it and go back because... it's an educational experience.
# I don't know why the data array is shaped this way, I'm just gonna try to work around it.
# Let's try another transformation - flattening. Vectors are a lot easier to work with.
data = np.array([i.flatten() for i in check['data']],dtype=object)

In [19]:
labels = np.array(check['label'])

In [20]:
# It's time to pray... or split into training and testing sets.
x_train,x_test,y_train,y_test = train_test_split(data,labels,stratify=labels)

In [21]:
# Set up a classifier and a whole bunch of parameter options, for curiosity's sake.

from sklearn.svm import SVC

classifier = SVC()

parameters = [{'C':[1,10,100],'kernel':['linear','poly','rbf','sigmoid'],'gamma':[0.01,0.001,0.0001]}]

In [22]:
from sklearn.model_selection import GridSearchCV

grid_search = GridSearchCV(classifier, parameters)

In [26]:
# Fit the model(s).
grid_search.fit(x_train,y_train)

180 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
180 fits failed with the following error:
TypeError: only size-1 arrays can be converted to Python scalars

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "D:\Anaconda\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "D:\Anaconda\lib\site-packages\sklearn\svm\_base.py", line 190, in fit
    X, y = self._validate_data(
  File "D:\Anaconda\lib\site-packages\sklearn\base.py", line 581, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "D:\Anaconda\lib\site-packages\sklearn\utils\v

ValueError: setting an array element with a sequence.