In [1]:
import pandas as pd
import numpy as np
import cv2

In [2]:
# progress bar for pandas
from tqdm.notebook import tqdm
tqdm.pandas()

  from pandas import Panel


### Read labels & Images

In [3]:
train_data = pd.read_csv('plant-pathology-2020-fgvc7/train.csv')
test_data = pd.read_csv('plant-pathology-2020-fgvc7/test.csv')

In [4]:
EPOCHS = 20
SAMPLE_LEN = 1821
IMAGE_PATH = "plant-pathology-2020-fgvc7/images/"

In [5]:
def load_image(image_id):
    file_path = image_id + ".jpg"
    image = cv2.imread(IMAGE_PATH + file_path)
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

train_images = train_data["image_id"][:SAMPLE_LEN].progress_apply(load_image)

HBox(children=(FloatProgress(value=0.0, max=1821.0), HTML(value='')))




In [6]:
test_images = test_data["image_id"][:SAMPLE_LEN].progress_apply(load_image)

HBox(children=(FloatProgress(value=0.0, max=1821.0), HTML(value='')))




#### Covert labels to label encoding

In [7]:
encoded_labels = train_data[['healthy', 'multiple_diseases', 'rust', 'scab']]
# rust = 3, scab = 4, healthy = 1, multiple_diseases = 2

In [8]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
labels = [[1],[2],[3],[4]]
encoder = enc.fit(labels)
decoded_labels = encoder.inverse_transform(encoded_labels)

#### Resizing (interpolation method, inter area)

In [9]:
# inner area
img_size = 100
ptrain_images = []
for image in train_images:
    image=cv2.resize(image,(img_size,img_size),interpolation=cv2.INTER_AREA)
    ptrain_images.append(image)

In [10]:
# inner area
ptest_images = []
for image in test_images:
    image=cv2.resize(image,(img_size,img_size),interpolation=cv2.INTER_AREA)
    ptest_images.append(image)

In [11]:
X_train = np.ndarray(shape=(len(ptrain_images), img_size*img_size*3), dtype=np.float32)
for i, image in enumerate(ptrain_images):
    X_train[i,] = image.reshape(img_size*img_size*3)
X_train.shape    

(1821, 30000)

In [12]:
X_test = np.ndarray(shape=(len(ptest_images), img_size*img_size*3), dtype=np.float32)
for i, image in enumerate(ptest_images):
    X_test[i,] = image.reshape(img_size*img_size*3)
X_test.shape    

(1821, 30000)

In [13]:
from imblearn.over_sampling import SMOTE 
y_train = decoded_labels.ravel()
sm = SMOTE(random_state = 0) 
X_train, y_train = sm.fit_resample(X_train,y_train)
X_train.shape,y_train.shape

Using TensorFlow backend.


((2488, 30000), (2488,))

### Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

In [52]:
# helper function for printing out grid search results 
def print_grid_search_metrics(gs):
    print ("Best score: " + str(gs.best_score_))
    print ("Best parameters set:")
    best_parameters = gs.best_params_
    for param_name in sorted(parameters.keys()):
        print(param_name + ':' + str(best_parameters[param_name]))

In [54]:
# CV with Grid Search
parameters = {
    'n_estimators' : [100, 150, 200],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [50, 100, 200, 500]    
}
Grid_RF = GridSearchCV(RandomForestClassifier(oob_score=True), parameters, cv=5)
Grid_RF.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=True, random_state=None,
                                 

In [55]:
print_grid_search_metrics(Grid_RF)

Best score: 0.6636065388313819
Best parameters set:
max_depth:500
max_features:sqrt
n_estimators:200


In [56]:
# predict and output csv files for submission
pred_test = Grid_RF.predict(X_test)
labels = pd.get_dummies(pred_test)
labels.columns=['healthy', 'multiple_diseases', 'rust', 'scab']
test_data = test_data.join(labels)
test_data.to_csv("test_data_rf.csv", index=False)

### Kaggle Score: 0.566

### XGBoost

In [15]:
import xgboost
# helper function for printing out grid search results 
def print_grid_search_metrics(gs):
    print ("Best score: " + str(gs.best_score_))
    print ("Best parameters set:")
    best_parameters = gs.best_params_
    for param_name in sorted(parameters.keys()):
        print(param_name + ':' + str(best_parameters[param_name]))

In [17]:
# CV with Grid Search
parameters = {
    'objective' : ['multi:softmax'],
    'num_classes': [4],
    'max_depth': [3,5,7],
    'eta' : [0.05,0.15],
    #'min_child_weigiht' : [1,5,10]
}
Grid_XGB = GridSearchCV(xgboost.XGBClassifier(), parameters, cv=3)
Grid_XGB.fit(X_train, y_train)

Parameters: { num_classes } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { num_classes } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { num_classes } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { num_classes } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  

GridSearchCV(cv=3, error_score=nan,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estim...
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight=None,
                                     subsample=None, tree_method=None,
                                     validate_parameters=None, 

In [18]:
print_grid_search_metrics(Grid_XGB)

Best score: 0.657968423755335
Best parameters set:
eta:0.15
max_depth:5
num_classes:4
objective:multi:softmax


In [19]:
# predict and output csv files for submission
pred_test = Grid_XGB.predict(X_test)
labels = pd.get_dummies(pred_test)
labels.columns=['healthy', 'multiple_diseases', 'rust', 'scab']
test_data = test_data.join(labels)
test_data.to_csv("test_data_xgb.csv", index=False)

### Kaggle Score: 0.557