### XGBoost

In this part, we run XGBoost model with not only whole RGB channels, but also run it on single blue channel.

In [29]:
import pandas as pd
import numpy as np
import cv2
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE 
from sklearn.preprocessing import OneHotEncoder
import xgboost
from sklearn.model_selection import GridSearchCV

# progress bar for pandas
from tqdm.notebook import tqdm
tqdm.pandas()

### Read labels & Images

In [3]:
train_data = pd.read_csv('plant-pathology-2020-fgvc7/train.csv')
test_data = pd.read_csv('plant-pathology-2020-fgvc7/test.csv')

In [4]:
EPOCHS = 20
SAMPLE_LEN = 1821
IMAGE_PATH = "plant-pathology-2020-fgvc7/images/"

In [5]:
def load_image(image_id):
    file_path = image_id + ".jpg"
    image = cv2.imread(IMAGE_PATH + file_path)
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

train_images = train_data["image_id"][:SAMPLE_LEN].progress_apply(load_image)

HBox(children=(FloatProgress(value=0.0, max=1821.0), HTML(value='')))




In [6]:
test_images = test_data["image_id"][:SAMPLE_LEN].progress_apply(load_image)

HBox(children=(FloatProgress(value=0.0, max=1821.0), HTML(value='')))




#### Covert labels to label encoding

In [14]:
encoded_labels = train_data[['healthy', 'multiple_diseases', 'rust', 'scab']]
# rust = 3, scab = 4, healthy = 1, multiple_diseases = 2

In [16]:
enc = OneHotEncoder()
labels = [[1],[2],[3],[4]]
encoder = enc.fit(labels)
decoded_labels = encoder.inverse_transform(encoded_labels)

### Preprocessing images
#### Resizing (interpolation method, inter area)

In [7]:
# inner area
img_size = 100
ptrain_images = []
for image in train_images:
    image=cv2.resize(image,(img_size,img_size),interpolation=cv2.INTER_AREA)
    ptrain_images.append(image)

In [8]:
# inner area
ptest_images = []
for image in test_images:
    image=cv2.resize(image,(img_size,img_size),interpolation=cv2.INTER_AREA)
    ptest_images.append(image)

### Using all channels

#### Vectorizing the channels

In [9]:
X_train = np.ndarray(shape=(len(ptrain_images), img_size*img_size*3), dtype=np.float32)
for i, image in enumerate(ptrain_images):
    X_train[i,] = image.reshape(img_size*img_size*3)
X_train.shape    

(1821, 30000)

In [10]:
X_test = np.ndarray(shape=(len(ptest_images), img_size*img_size*3), dtype=np.float32)
for i, image in enumerate(ptest_images):
    X_test[i,] = image.reshape(img_size*img_size*3)
X_test.shape    

(1821, 30000)

#### over-sampling

In [17]:
y_train = decoded_labels.ravel()
sm = SMOTE(random_state = 0) 
X_train, y_train = sm.fit_resample(X_train,y_train)
X_train.shape, y_train.shape

((2488, 30000), (2488,))

In [18]:
unique, counts = np.unique(y_train, return_counts=True)
print (np.asarray((unique, counts)).T)

[[  1 622]
 [  2 622]
 [  3 622]
 [  4 622]]


#### XGBoost model

In [27]:
# helper function for printing out grid search results 
def print_grid_search_metrics(gs):
    print ("Best score: " + str(gs.best_score_))
    print ("Best parameters set:")
    best_parameters = gs.best_params_
    for param_name in sorted(parameters.keys()):
        print(param_name + ':' + str(best_parameters[param_name]))

In [30]:
# CV with Grid Search
# In order to save time, we use cv=3 in this grid search
parameters = {
    'objective' : ['multi:softmax'],
    'num_classes': [4],
    'max_depth': [3,5,7],
    'eta' : [0.05,0.15],
}
Grid_XGB = GridSearchCV(xgboost.XGBClassifier(), parameters, cv=3)
Grid_XGB.fit(X_train, y_train)

Parameters: { num_classes } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { num_classes } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { num_classes } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { num_classes } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  

GridSearchCV(cv=3, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estim...
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=None, tree_method=None,
                                     validate_parameters=None, 

In [31]:
print_grid_search_metrics(Grid_XGB)

Best score: 0.657968423755335
Best parameters set:
eta:0.15
max_depth:5
num_classes:4
objective:multi:softmax


In [32]:
# predict and output csv files for submission
pred_test = Grid_XGB.predict(X_test)
labels = pd.get_dummies(pred_test)
labels.columns=['healthy', 'multiple_diseases', 'rust', 'scab']
test_data = test_data.join(labels)
test_data.to_csv("test_data_xgb.csv", index=False)

### Only use blue channel

#### Extracting and vectorizing blue channel

In [19]:
blue_train = [(ptrain_images[idx][:,:,2].reshape(img_size**2)) for idx in range(len(ptrain_images))]
blue_train = np.array(blue_train)

In [20]:
blue_train.shape

(1821, 10000)

In [21]:
blue_test = [(ptest_images[idx][:,:,2].reshape(img_size**2)) for idx in range(len(ptest_images))]
blue_test = np.array(blue_test)

In [22]:
blue_test.shape

(1821, 10000)

#### over-sampling

In [23]:
y_tr_blue = decoded_labels.ravel()
sm_blue = SMOTE(random_state = 0) 
blue_train, y_tr_blue = sm_blue.fit_resample(blue_train, y_tr_blue)
blue_train.shape, y_tr_blue.shape

((2488, 10000), (2488,))

In [24]:
unique, counts = np.unique(y_tr_blue, return_counts=True)
print (np.asarray((unique, counts)).T)

[[  1 622]
 [  2 622]
 [  3 622]
 [  4 622]]


#### XGBoost model

In [33]:
# CV with Grid Search
parameters = {
    'objective' : ['multi:softmax'],
    'num_classes': [4],
    'max_depth': [3,5,7],
    'eta' : [0.05,0.15],
}
Grid_XGBb = GridSearchCV(xgboost.XGBClassifier(), parameters, cv=3,n_jobs=-1)
Grid_XGBb.fit(blue_train, y_tr_blue)

Parameters: { num_classes } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




GridSearchCV(cv=3, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estim...
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=None, tree_method=None,
                                     validate_parameters=None, 

In [34]:
print_grid_search_metrics(Grid_XGBb)

Best score: 0.553861283493443
Best parameters set:
eta:0.15
max_depth:7
num_classes:4
objective:multi:softmax


In [None]:
# predict and output csv files for submission
XGBb_pred = Grid_XGBb.predict(blue_test)
labels = pd.get_dummies(XGBb_pred)
labels.columns=['healthy', 'multiple_diseases', 'rust', 'scab']
SVMb_result = test_data.join(labels)
SVMb_result.to_csv("XGBb_result.csv", index=False)