### SVM

In this part, we train an SVM model on all three RGB channels and, for comparison, on the blue channel alone.

In [1]:
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE 
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# progress bar for pandas
from tqdm.notebook import tqdm
tqdm.pandas()

Using TensorFlow backend.
  from pandas import Panel


### Read labels & Images

In [2]:
# Training table: provides image_id and the four class columns read in later cells.
train_data = pd.read_csv('plant-pathology-2020-fgvc7/train.csv')
# Test table: provides the image_id values whose images are loaded and predicted below.
test_data = pd.read_csv('plant-pathology-2020-fgvc7/test.csv')

In [3]:
# NOTE(review): EPOCHS is not referenced by any cell shown in this notebook — confirm it can be removed.
EPOCHS = 20
# Upper bound on the number of rows taken from train.csv / test.csv when loading images.
SAMPLE_LEN = 1821
# Directory holding the <image_id>.jpg files. The trailing slash is required because
# load_image builds the path by plain string concatenation.
IMAGE_PATH = "plant-pathology-2020-fgvc7/images/"

In [4]:
def load_image(image_id):
    """Read one JPEG from IMAGE_PATH and return it in RGB channel order.

    Parameters
    ----------
    image_id : str
        File name without the ".jpg" extension.

    Returns
    -------
    numpy.ndarray
        The decoded image with channels reordered from OpenCV's BGR to RGB.

    Raises
    ------
    FileNotFoundError
        If OpenCV cannot read the file. cv2.imread signals this by returning
        None rather than raising, which would otherwise surface as an opaque
        error inside cv2.cvtColor.
    """
    file_path = IMAGE_PATH + image_id + ".jpg"
    image = cv2.imread(file_path)
    if image is None:
        # Report the offending path instead of failing later in cvtColor.
        raise FileNotFoundError("Could not read image: " + file_path)
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Load the first SAMPLE_LEN training images; progress_apply shows a progress bar.
train_images = train_data["image_id"][:SAMPLE_LEN].progress_apply(load_image)

HBox(children=(FloatProgress(value=0.0, max=1821.0), HTML(value='')))




In [5]:
# Load the test images the same way (first SAMPLE_LEN rows of test.csv).
test_images = test_data["image_id"][:SAMPLE_LEN].progress_apply(load_image)

HBox(children=(FloatProgress(value=0.0, max=1821.0), HTML(value='')))




#### Convert one-hot labels to label encoding

In [6]:
encoded_labels = train_data[['healthy', 'multiple_diseases', 'rust', 'scab']]
# The column order above fixes the integer codes produced in the next cell:
# healthy = 1, multiple_diseases = 2, rust = 3, scab = 4

In [7]:
# Fit a one-hot encoder on the four class codes 1..4, then run it backwards
# to collapse each one-hot row of encoded_labels into its single class code.
labels = [[code] for code in range(1, 5)]
enc = OneHotEncoder()
encoder = enc.fit(labels)
decoded_labels = encoder.inverse_transform(encoded_labels)

### Preprocessing images

#### Resizing (interpolation method: INTER_AREA)

In [8]:
# Shrink every training image to img_size x img_size using INTER_AREA interpolation.
img_size = 100
ptrain_images = [
    cv2.resize(img, (img_size, img_size), interpolation=cv2.INTER_AREA)
    for img in train_images
]

In [9]:
# Shrink every test image to img_size x img_size using INTER_AREA interpolation.
ptest_images = [
    cv2.resize(img, (img_size, img_size), interpolation=cv2.INTER_AREA)
    for img in test_images
]

### Using all channels
#### Vectorizing the channels

In [10]:
# Flatten each resized RGB training image into one float32 row of
# img_size * img_size * 3 values.
n_features = img_size * img_size * 3
X_train = np.empty((len(ptrain_images), n_features), dtype=np.float32)
for row, img in enumerate(ptrain_images):
    X_train[row] = img.reshape(n_features)
X_train.shape

(1821, 30000)

In [11]:
# Flatten each resized RGB test image into one float32 row of
# img_size * img_size * 3 values.
n_features = img_size * img_size * 3
X_test = np.empty((len(ptest_images), n_features), dtype=np.float32)
for row, img in enumerate(ptest_images):
    X_test[row] = img.reshape(n_features)
X_test.shape

(1821, 30000)

#### Over-sampling (SMOTE)

In [12]:
# Balance the classes with SMOTE; the counts printed in the next cell show
# 622 rows per class afterwards.
# NOTE(review): X_train is rebound in place while y_train is rebuilt from
# decoded_labels, so re-running this cell without first re-running the
# vectorizing cell pairs the already-resampled X_train with original-length labels.
# NOTE(review): the resampled data is what GridSearchCV later splits, so
# synthetic rows can land in validation folds; consider resampling inside each
# fold (e.g. an imblearn Pipeline) — confirm whether the reported score is inflated.
y_train = decoded_labels.ravel()
sm = SMOTE(random_state = 0) 
X_train, y_train = sm.fit_resample(X_train,y_train)
X_train.shape, y_train.shape

((2488, 30000), (2488,))

In [13]:
# Show how many rows each class code has after over-sampling (one row per class).
class_ids, class_counts = np.unique(y_train, return_counts=True)
print(np.column_stack((class_ids, class_counts)))

[[  1 622]
 [  2 622]
 [  3 622]
 [  4 622]]


#### SVM model

In [15]:
# helper function for printing out grid search results 
def print_grid_search_metrics(gs):
    """Print the best cross-validation score and parameters of a fitted search.

    Parameters
    ----------
    gs : object
        A fitted search object exposing ``best_score_`` and ``best_params_``
        (for example the GridSearchCV instances used in this notebook).

    Returns
    -------
    None
        The results are written to standard output, parameters sorted by name.
    """
    print ("Best score: " + str(gs.best_score_))
    print ("Best parameters set:")
    best_parameters = gs.best_params_
    # Iterate over the search's own result rather than a notebook-global
    # parameter grid, so the helper works for any search and any cell order.
    for param_name in sorted(best_parameters):
        print(param_name + ':' + str(best_parameters[param_name]))

In [16]:
# Grid search over the SVM regularisation strength C and the kernel type.
# Three folds are used to keep the search time down.
parameters = {
    'C': [0.01, 0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf'],
}
Grid_SVM = GridSearchCV(
    estimator=SVC(),
    param_grid=parameters,
    cv=3,
    n_jobs=-1,
)
Grid_SVM.fit(X_train, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 10],
                         'kernel': ['linear', 'poly', 'rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [17]:
# Report the best cross-validation score and parameters of the all-channel search.
print_grid_search_metrics(Grid_SVM)

Best score: 0.6471139079841682
Best parameters set:
C:10
kernel:rbf


In [None]:
# Predict the test set with the all-channel model and write the submission CSV.
pred_test = Grid_SVM.predict(X_test)
class_names = ['healthy', 'multiple_diseases', 'rust', 'scab']
# get_dummies only creates a column for class codes that actually occur in the
# predictions. Reindexing to all four codes (1..4) guarantees four columns, so
# the rename below cannot fail when a class is never predicted. dtype=int keeps
# the CSV as 0/1 regardless of the pandas version's default dummy dtype.
labels = pd.get_dummies(pred_test, dtype=int).reindex(columns=[1, 2, 3, 4], fill_value=0)
labels.columns = class_names
# Join onto a copy of the non-class columns and keep the result in its own
# variable: test_data itself is left untouched, so this cell can be re-run and
# later cells can join their own predictions without overlapping column names.
base_columns = [col for col in test_data.columns if col not in class_names]
SVM_result = test_data[base_columns].join(labels)
SVM_result.to_csv("test_data_svm.csv", index=False)

### Only use blue channel

#### Extracting and vectorizing blue channel

In [18]:
# Images are in RGB order (converted when loaded), so channel index 2 is blue.
# Each blue plane is flattened into one row of img_size**2 values.
blue_train = np.array(
    [img[:, :, 2].reshape(img_size**2) for img in ptrain_images]
)

In [19]:
# Sanity check: one row per training image, img_size**2 blue-channel values per row.
blue_train.shape

(1821, 10000)

In [20]:
# Same extraction for the test set: channel index 2 (blue in RGB order),
# flattened into one row of img_size**2 values per image.
blue_test = np.array(
    [img[:, :, 2].reshape(img_size**2) for img in ptest_images]
)

In [21]:
# Sanity check: one row per test image, img_size**2 blue-channel values per row.
blue_test.shape

(1821, 10000)

#### Over-sampling (SMOTE)

In [22]:
# Balance the classes of the blue-channel features with SMOTE; the counts
# printed in the next cell show 622 rows per class afterwards.
# NOTE(review): blue_train is rebound in place while y_tr_blue is rebuilt from
# decoded_labels, so re-running this cell without first re-running the
# blue-channel extraction cell pairs resampled features with original-length labels.
# NOTE(review): the resampled data is what GridSearchCV later splits, so
# synthetic rows can land in validation folds — confirm whether the reported
# score is inflated.
y_tr_blue = decoded_labels.ravel()
sm_blue = SMOTE(random_state = 0) 
blue_train, y_tr_blue = sm_blue.fit_resample(blue_train, y_tr_blue)
blue_train.shape, y_tr_blue.shape

((2488, 10000), (2488,))

In [23]:
# Show how many rows each class code has after over-sampling (one row per class).
class_ids, class_counts = np.unique(y_tr_blue, return_counts=True)
print(np.column_stack((class_ids, class_counts)))

[[  1 622]
 [  2 622]
 [  3 622]
 [  4 622]]


#### SVM model

In [25]:
# Grid search over C and kernel type for the blue-channel features,
# again with three cross-validation folds.
parameters = {
    'C': [0.01, 0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf'],
}
Grid_SVMb = GridSearchCV(
    estimator=SVC(),
    param_grid=parameters,
    cv=3,
    n_jobs=-1,
)
Grid_SVMb.fit(blue_train, y_tr_blue)

GridSearchCV(cv=3, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 10],
                         'kernel': ['linear', 'poly', 'rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [26]:
# Report the best cross-validation score and parameters of the blue-channel search.
print_grid_search_metrics(Grid_SVMb)

Best score: 0.5474297673201854
Best parameters set:
C:10
kernel:rbf


In [None]:
# Predict the test set with the blue-channel model and write the submission CSV.
SVMb_pred = Grid_SVMb.predict(blue_test)
class_names = ['healthy', 'multiple_diseases', 'rust', 'scab']
# get_dummies only creates a column for class codes that actually occur in the
# predictions. Reindexing to all four codes (1..4) guarantees four columns, so
# the rename below cannot fail when a class is never predicted. dtype=int keeps
# the CSV as 0/1 regardless of the pandas version's default dummy dtype.
labels = pd.get_dummies(SVMb_pred, dtype=int).reindex(columns=[1, 2, 3, 4], fill_value=0)
labels.columns = class_names
# Leave out any class columns already present on test_data (for example from
# an earlier prediction cell) so the join cannot fail on overlapping names.
base_columns = [col for col in test_data.columns if col not in class_names]
SVMb_result = test_data[base_columns].join(labels)
SVMb_result.to_csv("SVMb_result.csv", index=False)