In [1]:
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt

In [2]:
# Progress bar for pandas: registering tqdm with pandas is what makes the
# `.progress_apply` calls used in the image-loading cells available.
from tqdm.notebook import tqdm
tqdm.pandas()

  from pandas import Panel


### Read labels & Images

In [3]:
# Load the label table (train) and the submission template (test) from the
# working directory. Later cells read the `image_id` column from both tables
# and the four class columns from the training table.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [4]:
# Notebook-wide configuration.
# NOTE(review): EPOCHS is not referenced by any cell visible here — confirm it is still needed.
EPOCHS = 20
# Upper bound on how many rows of each table have their images loaded.
SAMPLE_LEN = 1821
# Directory prefix that load_image prepends to "<image_id>.jpg".
IMAGE_PATH = "images/"

In [5]:
def load_image(image_id):
    """Read one JPEG from IMAGE_PATH and return it in RGB channel order.

    Parameters
    ----------
    image_id : str
        File name without extension; ".jpg" is appended.

    Returns
    -------
    numpy.ndarray
        The decoded image converted from OpenCV's BGR order to RGB.

    Raises
    ------
    FileNotFoundError
        If the file does not exist or cannot be decoded.
    """
    file_path = IMAGE_PATH + image_id + ".jpg"
    image = cv2.imread(file_path)
    # cv2.imread reports failure by returning None instead of raising, which
    # would otherwise surface as an opaque error inside cvtColor.
    if image is None:
        raise FileNotFoundError("Could not read image: " + file_path)
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Load the first SAMPLE_LEN training images as RGB arrays, with a progress bar.
train_images = train_data["image_id"].iloc[:SAMPLE_LEN].progress_apply(load_image)

HBox(children=(IntProgress(value=0, max=1821), HTML(value='')))




In [6]:
# Load the first SAMPLE_LEN test images as RGB arrays, with a progress bar.
test_images = test_data["image_id"].iloc[:SAMPLE_LEN].progress_apply(load_image)

HBox(children=(IntProgress(value=0, max=1821), HTML(value='')))




#### Convert one-hot labels to integer label encoding

In [7]:
# One-hot label columns, in the order that defines the integer class codes
# used from here on: healthy = 1, multiple_diseases = 2, rust = 3, scab = 4.
encoded_labels = train_data[['healthy', 'multiple_diseases', 'rust', 'scab']]

In [8]:
# Turn the one-hot label columns back into a single integer code per row.
# The encoder is fitted on the codes 1..4 only so that inverse_transform can
# map the i-th one-hot column back to code i + 1.
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
# NOTE: `labels` is reused further down for the submission data frames.
labels = [[1],[2],[3],[4]]
encoder = enc.fit(labels)
# Later cells call .ravel() on this result to obtain a flat label vector.
decoded_labels = encoder.inverse_transform(encoded_labels)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [9]:
# Class distribution before oversampling: one row per class code, [code, count].
unique, counts = np.unique(decoded_labels, return_counts=True)
print(np.column_stack((unique, counts)))

[[  1. 516.]
 [  2.  91.]
 [  3. 622.]
 [  4. 592.]]


### Preprocessing

#### Histogram of Gradients

In [10]:
### Configure the HOG descriptor (all geometry is set explicitly here).
# A 16x16 detection window holds exactly one 16x16 block made of four 8x8
# cells with 9 orientation bins each, i.e. 36 values per window position.
winSize = (16,16)
blockSize = (16,16)
blockStride = (8,8)
cellSize = (8,8)
nbins = 9
hog = cv2.HOGDescriptor(winSize,blockSize,blockStride,cellSize,nbins)
# Step between consecutive window positions when hog.compute scans an image.
winStride = (8,8)

In [11]:
# train: HOG features on 136x136 inputs.
# With a 16x16 window and stride 8 there are 16 x 16 = 256 window positions,
# each yielding 36 values, hence the flat length 9216.
hog_tr = [hog.compute(cv2.resize(im,(136,136),interpolation=cv2.INTER_AREA),winStride).reshape(9216) for im in train_images]

In [12]:
# train: HOG features on 200x200 inputs.
# 24 x 24 = 576 window positions x 36 values each = 20736.
hog_tr2 = [hog.compute(cv2.resize(im,(200,200),interpolation=cv2.INTER_AREA),winStride).reshape(20736) for im in train_images]

In [13]:
# test: same 136x136 HOG pipeline as the training features (flat length 9216).
hog_te = [hog.compute(cv2.resize(im,(136,136),interpolation=cv2.INTER_AREA),winStride).reshape(9216) for im in test_images]

In [14]:
# test: same 200x200 HOG pipeline as the training features (flat length 20736).
hog_te2 = [hog.compute(cv2.resize(im,(200,200),interpolation=cv2.INTER_AREA),winStride).reshape(20736) for im in test_images]

#### Resizing (interpolation method, inter area)

In [15]:
# Downscale every training image to img_size x img_size using INTER_AREA interpolation.
img_size = 100
ptrain_images = [
    cv2.resize(img, (img_size, img_size), interpolation=cv2.INTER_AREA)
    for img in train_images
]

In [16]:
# Downscale every test image to img_size x img_size using INTER_AREA interpolation.
ptest_images = [
    cv2.resize(img, (img_size, img_size), interpolation=cv2.INTER_AREA)
    for img in test_images
]

#### Extracting and vectorizing blue channel

In [17]:
# Images are stored in RGB order (see load_image), so channel index 2 is blue.
# Each image contributes one flat row of img_size**2 pixel values.
blue_train = np.array([img[:, :, 2].reshape(img_size**2) for img in ptrain_images])
blue_train.shape

(1821, 10000)

In [18]:
# Images are stored in RGB order (see load_image), so channel index 2 is blue.
# Each image contributes one flat row of img_size**2 pixel values.
blue_test = np.array([img[:, :, 2].reshape(img_size**2) for img in ptest_images])
blue_test.shape

(1821, 10000)

#### Vectorizing all channels

In [19]:
# Flatten each resized RGB training image into one float32 row of
# img_size * img_size * 3 values (row-major pixel order).
X_train = np.asarray(ptrain_images, dtype=np.float32).reshape(
    len(ptrain_images), img_size*img_size*3
)
X_train.shape

(1821, 30000)

In [20]:
# Flatten each resized RGB test image into one float32 row of
# img_size * img_size * 3 values (row-major pixel order).
X_test = np.asarray(ptest_images, dtype=np.float32).reshape(
    len(ptest_images), img_size*img_size*3
)
X_test.shape

(1821, 30000)

#### Over-sampling

In [21]:
from imblearn.over_sampling import SMOTE 

Using TensorFlow backend.


In [22]:
# Oversample the all-channel training matrix with SMOTE.
# NOTE: X_train / y_train are overwritten by their resampled versions, so the
# original (imbalanced) matrix is no longer available after this cell runs.
y_train = decoded_labels.ravel()
sm = SMOTE(random_state = 0) 
X_train, y_train = sm.fit_resample(X_train,y_train)
X_train.shape, y_train.shape

((2488, 30000), (2488,))

In [23]:
# Oversample the blue-channel training matrix with SMOTE.
# NOTE: blue_train is overwritten by its resampled version; y_tr_blue holds the
# matching resampled labels.
y_tr_blue = decoded_labels.ravel()
sm_blue = SMOTE(random_state = 0) 
blue_train, y_tr_blue = sm_blue.fit_resample(blue_train, y_tr_blue)
blue_train.shape, y_tr_blue.shape

((2488, 10000), (2488,))

In [24]:
# Oversample the HOG features computed on 136x136 inputs (vector length 9216).
# hog_tr goes in as a list of vectors and is replaced by the resampled result,
# which exposes .shape (see the last line).
y_tr_hog = decoded_labels.ravel()
sm_hog = SMOTE(random_state = 0) 
hog_tr, y_tr_hog = sm_hog.fit_resample(hog_tr, y_tr_hog)
hog_tr.shape, y_tr_hog.shape

((2488, 9216), (2488,))

In [25]:
# Oversample the HOG features computed on 200x200 inputs (vector length 20736).
# NOTE: this reuses the `sm_hog` sampler created in the previous cell, so that
# cell must have been run first.
y_tr_hog2 = decoded_labels.ravel()
hog_tr2, y_tr_hog2 = sm_hog.fit_resample(hog_tr2, y_tr_hog2)
hog_tr2.shape, y_tr_hog2.shape

((2488, 20736), (2488,))

In [26]:
# Class distribution after oversampling: one row per class code, [code, count].
unique, counts = np.unique(y_train, return_counts=True)
print(np.column_stack((unique, counts)))

[[  1. 622.]
 [  2. 622.]
 [  3. 622.]
 [  4. 622.]]


### KNN (blue channel only)  Kaggle Score = 0.50302

In [27]:
# from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

In [28]:
# helper function for printing out grid search results
def print_grid_search_metrics(gs):
    """Print the best score and the best hyperparameters of a fitted search.

    Parameters
    ----------
    gs : object
        A fitted search object exposing ``best_score_`` and ``best_params_``
        (for example a fitted ``GridSearchCV``).

    Returns
    -------
    None
        The report is written to standard output, one parameter per line,
        sorted by parameter name.
    """
    print("Best score: " + str(gs.best_score_))
    print("Best parameters set:")
    best_parameters = gs.best_params_
    # Iterate over the search's own best_params_ instead of the notebook-level
    # `parameters` dict, so the report always matches the search passed in and
    # does not depend on which cell last assigned that global.
    for param_name in sorted(best_parameters):
        print(param_name + ':' + str(best_parameters[param_name]))

In [29]:
# 5-fold cross-validated grid search over n_neighbors for KNN on the
# blue-channel features.
# NOTE(review): blue_train was oversampled before this search, so synthetic
# samples may land in both the training and validation folds — treat the CV
# score with caution.
parameters = {
    'n_neighbors' : [3, 5, 10, 20],
}
Grid_KNNb = GridSearchCV(neighbors.KNeighborsClassifier(), parameters, cv=5)
Grid_KNNb.fit(blue_train, y_tr_blue)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'n_neighbors': [3, 5, 10, 20]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [30]:
# Report the best cross-validated score and the hyperparameters that achieved
# it for the blue-channel KNN search.
print_grid_search_metrics(Grid_KNNb)

Best score: 0.43086816720257237
Best parameters set:
n_neighbors:3


In [31]:
# Predict class codes (1-4) for the blue-channel test features with the
# grid search fitted above.
Knnb_pred = Grid_KNNb.predict(blue_test)

In [32]:
# Distribution of predicted class codes: one row per code, [code, count].
unique, counts = np.unique(Knnb_pred, return_counts=True)
print(np.column_stack((unique, counts)).astype(np.int32))

[[   1 1192]
 [   2   99]
 [   3  162]
 [   4  368]]


In [33]:
# output csv files for submission
# Predictions are class codes 1-4 (healthy, multiple_diseases, rust, scab).
# Reindexing on all four codes guarantees four columns even if some class is
# never predicted; without it the column rename below would fail. The final
# cast writes 0/1 integers regardless of the dtype get_dummies returns.
labels = (
    pd.get_dummies(pd.Series(Knnb_pred).astype(int))
    .reindex(columns=[1, 2, 3, 4], fill_value=0)
    .astype(int)
)
labels.columns=['healthy', 'multiple_diseases', 'rust', 'scab']
knnb_result = test_data.join(labels)
knnb_result.to_csv("knnb_result.csv", index=False)

In [34]:
# NOTE(review): redundant — the previous cell already assigns these column
# names to `labels`; this cell can likely be deleted.
labels.columns=['healthy', 'multiple_diseases', 'rust', 'scab']

### KNN (all channels) Kaggle score = 0.51046

In [35]:
# 5-fold cross-validated grid search over n_neighbors for KNN on the
# all-channel features.
# NOTE(review): `knn` is not used by the search below (a fresh
# KNeighborsClassifier is passed instead) — confirm it can be removed.
# NOTE(review): X_train was oversampled before this search, so synthetic
# samples may land in both the training and validation folds.
knn = neighbors.KNeighborsClassifier(n_neighbors=4)
parameters = {
    'n_neighbors' : [3, 5, 10, 20],
}
Grid_KNN = GridSearchCV(neighbors.KNeighborsClassifier(), parameters, cv=5)
Grid_KNN.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'n_neighbors': [3, 5, 10, 20]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [36]:
# Report the best cross-validated score and the hyperparameters that achieved
# it for the all-channel KNN search.
print_grid_search_metrics(Grid_KNN)

Best score: 0.40836012861736337
Best parameters set:
n_neighbors:3


In [37]:
# Predict class codes (1-4) for the all-channel test features with the
# grid search fitted above.
Knn_pred = Grid_KNN.predict(X_test)

In [38]:
# Distribution of predicted class codes: one row per code, [code, count].
unique, counts = np.unique(Knn_pred, return_counts=True)
print(np.column_stack((unique, counts)).astype(np.int32))

[[   1  601]
 [   2 1040]
 [   3   23]
 [   4  157]]


In [39]:
# output csv files for submission
# Predictions are class codes 1-4 (healthy, multiple_diseases, rust, scab).
# Reindexing on all four codes guarantees four columns even if some class is
# never predicted; without it the column rename below would fail. The final
# cast writes 0/1 integers regardless of the dtype get_dummies returns.
labels = (
    pd.get_dummies(pd.Series(Knn_pred).astype(int))
    .reindex(columns=[1, 2, 3, 4], fill_value=0)
    .astype(int)
)
labels.columns=['healthy', 'multiple_diseases', 'rust', 'scab']
Knn_result = test_data.join(labels)
Knn_result.to_csv("Knn_result.csv", index=False)

### KNN with HOG (vec_len 9216)  Kaggle score = 0.54315

In [40]:
# 5-fold cross-validated grid search over n_neighbors for KNN on the
# 9216-length HOG features.
# NOTE(review): `knn` is not used by the search below (a fresh
# KNeighborsClassifier is passed instead) — confirm it can be removed.
# NOTE(review): hog_tr was oversampled before this search, so synthetic
# samples may land in both the training and validation folds.
knn = neighbors.KNeighborsClassifier(n_neighbors=4)
parameters = {
    'n_neighbors' : [3, 5, 10, 20],
}
Grid_Khog = GridSearchCV(neighbors.KNeighborsClassifier(), parameters, cv=5)
Grid_Khog.fit(hog_tr, y_tr_hog)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'n_neighbors': [3, 5, 10, 20]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [41]:
# Report the best cross-validated score and the hyperparameters that achieved
# it for the KNN search on 9216-length HOG features.
print_grid_search_metrics(Grid_Khog)

Best score: 0.3842443729903537
Best parameters set:
n_neighbors:3


In [42]:
# Predict class codes for the 9216-length HOG test features (the submission
# CSV is written in a later cell, not here).
Khog_pred = Grid_Khog.predict(hog_te)

In [43]:
# Distribution of predicted class codes: one row per code, [code, count].
unique, counts = np.unique(Khog_pred, return_counts=True)
print(np.column_stack((unique, counts)).astype(np.int32))

[[   1  471]
 [   2 1239]
 [   4  111]]


In [44]:
# output csv files for submission
# Predictions are class codes 1-4 (healthy, multiple_diseases, rust, scab).
# This model never predicted class 3 on the test set, so the one-hot frame is
# reindexed on all four codes: any class that is absent from the predictions
# gets an all-zero column. This replaces appending a fake "3" prediction and
# dropping it again, which only worked when class 3 was the one missing.
labels = (
    pd.get_dummies(pd.Series(Khog_pred).astype(int))
    .reindex(columns=[1, 2, 3, 4], fill_value=0)
    .astype(int)
)
labels.columns=['healthy', 'multiple_diseases', 'rust', 'scab']
Khog_result = test_data.join(labels)
Khog_result.to_csv("Khog_result.csv", index=False)

### KNN with HOG (vec_len 20736)  Kaggle score = 0.52084

In [45]:
# 5-fold cross-validated grid search over n_neighbors for KNN on the
# 20736-length HOG features.
# NOTE(review): `knn` is not used by the search below (a fresh
# KNeighborsClassifier is passed instead) — confirm it can be removed.
# NOTE(review): hog_tr2 was oversampled before this search, so synthetic
# samples may land in both the training and validation folds.
knn = neighbors.KNeighborsClassifier(n_neighbors=4)
parameters = {
    'n_neighbors' : [3, 5, 10, 20],
}
Grid_Khog2 = GridSearchCV(neighbors.KNeighborsClassifier(), parameters, cv=5)
Grid_Khog2.fit(hog_tr2, y_tr_hog2)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='warn', n_jobs=None,
             param_grid={'n_neighbors': [3, 5, 10, 20]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [46]:
# Report the best cross-validated score and the hyperparameters that achieved
# it for the KNN search on 20736-length HOG features.
print_grid_search_metrics(Grid_Khog2)

Best score: 0.39308681672025725
Best parameters set:
n_neighbors:3


In [47]:
# Predict class codes for the 20736-length HOG test features (the submission
# CSV is written in a later cell, not here).
Khog_pred2 = Grid_Khog2.predict(hog_te2)

In [48]:
# Distribution of predicted class codes: one row per code, [code, count].
unique, counts = np.unique(Khog_pred2, return_counts=True)
print(np.column_stack((unique, counts)).astype(np.int32))

[[   1  535]
 [   2 1221]
 [   4   65]]


In [49]:
# output csv files for submission
# Predictions are class codes 1-4 (healthy, multiple_diseases, rust, scab).
# This model never predicted class 3 on the test set, so the one-hot frame is
# reindexed on all four codes: any class that is absent from the predictions
# gets an all-zero column. This replaces appending a fake "3" prediction and
# dropping it again, which only worked when class 3 was the one missing.
labels = (
    pd.get_dummies(pd.Series(Khog_pred2).astype(int))
    .reindex(columns=[1, 2, 3, 4], fill_value=0)
    .astype(int)
)
labels.columns=['healthy', 'multiple_diseases', 'rust', 'scab']
Khog_result2 = test_data.join(labels)
Khog_result2.to_csv("Khog_result2.csv", index=False)

---