In [17]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, classification_report
from sklearn.preprocessing import label_binarize
from sklearn import metrics
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid
import numpy as np
import pandas as pd
import os
import time
import cv2

## Check data

In [18]:
train_dir = "../data/train"

CATEGORIES = ['Black-grass', 'Charlock', 'Cleavers', 'Common-Chickweed', 'Common-wheat', 'Fat-Hen', 'Loose-Silky-bent',
              'Maize', 'Scentless-Mayweed', 'Shepherds-Purse', 'Small-flowered-Cranesbill', 'Sugar-beet']

# check numbers of categories
NumCatergories = len(CATEGORIES)
print("Numbers of categories:", NumCatergories)

# check numbers of images
for category in CATEGORIES:
    print('{}: {} images'.format(category, len(os.listdir(os.path.join(train_dir, category)))))

Numbers of categories: 12
Black-grass: 35 images
Charlock: 35 images
Cleavers: 35 images
Common-Chickweed: 35 images
Common-wheat: 35 images
Fat-Hen: 35 images
Loose-Silky-bent: 35 images
Maize: 35 images
Scentless-Mayweed: 35 images
Shepherds-Purse: 35 images
Small-flowered-Cranesbill: 35 images
Sugar-beet: 35 images


In [19]:
# creat a dataframe including filename_path, catagory and id
train = []
for category_id, category in enumerate(CATEGORIES):
    for file in os.listdir(os.path.join(train_dir, category)):
        train.append(['../data/train/{}/{}'.format(category, file), category, category_id])
        
train = pd.DataFrame(train, columns=['file', 'category','category_id'])
print("The shape of train: ", train.shape)
train.head(2)

The shape of train:  (420, 3)


Unnamed: 0,file,category,category_id
0,../data/train/Black-grass/4a3b96198.png,Black-grass,0
1,../data/train/Black-grass/0d28c429b.png,Black-grass,0


## Sample traning data

In [20]:
# concat and sample traning data
SAMPLE_PER_CATEGORY = min([len(os.listdir(os.path.join(train_dir, category))) for category in CATEGORIES])
train = pd.concat([train[train['category'] == c][:SAMPLE_PER_CATEGORY] for c in CATEGORIES])
train = train.sample(frac=1)   ##shuffle
train.index = np.arange(len(train))
print(train.shape)
train.head(2)

(420, 3)


Unnamed: 0,file,category,category_id
0,../data/train/Scentless-Mayweed/0d58d5433.png,Scentless-Mayweed,8
1,../data/train/Charlock/0d5f555a3.png,Charlock,1


## Define pre-processing function

In [21]:
def extract_HSV_histogram(image, bins=(32, 32, 32)):
    """extract a 3D color histogram from the HSV color space using
    the supplied number of `bins` per channel"""
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    Hist = []
    for i in range(3):
        hist = cv2.calcHist([hsv],[i],None,[128],[0,256])
        Hist = np.append(Hist, hist) 

    # return the flattened histogram as the feature vector
    return Hist.flatten()


def create_mask_for_plant(image):
    """create a mask for the plants image"""
    image_hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

    sensitivity = 35
    lower_hsv = np.array([60 - sensitivity, 100, 50])
    upper_hsv = np.array([60 + sensitivity, 255, 255])

    mask = cv2.inRange(image_hsv, lower_hsv, upper_hsv)
    kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (11,11))
    mask = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel)
    return mask


def segment_plant(image):
    """segement out the plants images using the mask"""
    mask = create_mask_for_plant(image)
    output = cv2.bitwise_and(image, image, mask = mask)
    return output


def sharpen_image(image):
    """sharpen image"""
    image_blurred = cv2.GaussianBlur(image, (0, 0), 3)
    image_sharp = cv2.addWeighted(image, 1.5, image_blurred, -0.5, 0)
    return image_sharp

## Load images

In [22]:
# initialize the raw pixel intensities matrix, the features matrix,
# and labels list

HSVImages = []
his_mask_HSV = []
labels = []

# loop over the input images
for i, row in train.iterrows():
    # load the image and extract the class label
    # our images were named as labels.image_number.format
    image = cv2.imread(row['file'])
    # get the labels from the name of the images by extract the string before "."
    label = row['category_id']

    # extract raw pixel intensity "features"
    #followed by a color histogram to characterize the color distribution of the pixels
    # in the image
    
    resize_image = cv2.resize(image, (128, 128))

    hsvimage = cv2.cvtColor(resize_image, cv2.COLOR_BGR2HSV).flatten()

    
    image_mask = create_mask_for_plant(resize_image)
    image_mask_sharpen = sharpen_image(segment_plant(resize_image))
    his_mask_hsv = extract_HSV_histogram(image_mask_sharpen)


    # add the messages we got to the raw images, features, and labels matricies
    HSVImages.append(hsvimage)
    his_mask_HSV.append(his_mask_hsv)
    labels.append(label)    

    # show an update every 200 images until the last image
    if i > 0 and ((i + 1)% 200 == 0 or i == train.shape[0]-1):
        print("[INFO] processed {}/{}".format(i+1, train.shape[0]))

[INFO] processed 200/420
[INFO] processed 400/420
[INFO] processed 420/420


In [23]:
# show some information on the memory consumed by the raw images
# matrix and features matrix
HSVImages = np.array(HSVImages)
his_mask_HSV = np.array(his_mask_HSV)
labels = np.array(labels)

In [24]:
# partition the data into training and testing splits, using 85%
# of the data for training and the remaining 15% for testing
(trainVI, testVI, trainVL, testVL) = train_test_split(
    HSVImages, labels, test_size=0.15, random_state=27)
(trainHMVI, testHMVI, trainHMVL, testHMVL) = train_test_split(
    his_mask_HSV, labels, test_size=0.15, random_state=27)

## Random forest

In [25]:
## Random Forest
from sklearn.ensemble import RandomForestClassifier
print("[INFO] evaluating raw pixel accuracy...")
model = RandomForestClassifier(n_estimators =90, max_depth =3, random_state=27)
model.fit(trainVI, trainVL) # 训练数据集
acc = model.score(testVI, testVL)
print("[INFO] HSVImages accuracy: {:.2f}%".format(acc * 100))

[INFO] evaluating raw pixel accuracy...
[INFO] HSVImages accuracy: 34.92%


In [26]:
## Random Forest
print("[INFO] evaluating histogram accuracy...")
model = RandomForestClassifier(n_estimators =80, max_depth =3, random_state=27)
model.fit(trainHMVI, trainHMVL) # 训练数据集
acc = model.score(testHMVI, testHMVL)
print("[INFO] his_mask_HSV histogram accuracy: {:.2f}%".format(acc * 100))

[INFO] evaluating histogram accuracy...
[INFO] his_mask_HSV histogram accuracy: 63.49%


## Tunning

### default

In [27]:
rf0 = RandomForestClassifier(oob_score=True, random_state=27)
rf0.fit(trainHMVI, trainHMVL)
print ("[INFO] his_mask_HSV histogram oob_scor: {:.2f}%".format(rf0.oob_score_* 100))
acc = rf0.score(testHMVI, testHMVL)
print("[INFO] his_mask_HSV histogram accuracy: {:.2f}%".format(acc * 100))

[INFO] his_mask_HSV histogram oob_scor: 50.42%
[INFO] his_mask_HSV histogram accuracy: 58.73%


  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


In [28]:
print('''RandomForestClassifier(
    ["n_estimators='warn'", 
     "criterion='gini'", 
     'max_depth=None', 
     'min_samples_split=2', 
     'min_samples_leaf=1', 
     'min_weight_fraction_leaf=0.0', 
     "max_features='auto'", 
     'max_leaf_nodes=None', 
     'min_impurity_decrease=0.0', 
     'min_impurity_split=None', 
     'bootstrap=True', 
     'oob_score=False', 
     'n_jobs=None', 
     'random_state=None', 
     'verbose=0', 
     'warm_start=False', 
     'class_weight=None'],
)''')

RandomForestClassifier(
    ["n_estimators='warn'", 
     "criterion='gini'", 
     'max_depth=None', 
     'min_samples_split=2', 
     'min_samples_leaf=1', 
     'min_weight_fraction_leaf=0.0', 
     "max_features='auto'", 
     'max_leaf_nodes=None', 
     'min_impurity_decrease=0.0', 
     'min_impurity_split=None', 
     'bootstrap=True', 
     'oob_score=False', 
     'n_jobs=None', 
     'random_state=None', 
     'verbose=0', 
     'warm_start=False', 
     'class_weight=None'],
)


In [29]:
rf1 = RandomForestClassifier(n_estimators= 180, max_depth=5, min_samples_split=2, max_features=9,
                                  min_samples_leaf=10 ,oob_score=True, random_state=27)
rf1.fit(trainHMVI, trainHMVL)
print ("[INFO] his_mask_HSV histogram oob_scor: {:.2f}%".format(rf1.oob_score_* 100))
acc = rf1.score(testHMVI, testHMVL)
print("[INFO] his_mask_HSV histogram accuracy: {:.2f}%".format(acc * 100))

[INFO] his_mask_HSV histogram oob_scor: 61.90%
[INFO] his_mask_HSV histogram accuracy: 69.84%


In [30]:
testHMVL = label_binarize(testHMVL, np.arange(12))
testHMVL[0:4]

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]])

In [31]:
y_predprob = rf0.predict_proba(testHMVI)
print ("AUC Score (Train): %f" % metrics.roc_auc_score(testHMVL, y_predprob))

AUC Score (Train): 0.886485


In [32]:
y_predprob = rf1.predict_proba(testHMVI)
print ("AUC Score (Train): %f" % metrics.roc_auc_score(testHMVL, y_predprob))

AUC Score (Train): 0.918994


### gridsearchcv

In [33]:
## HLS Mask Histograms KNN Model evaluation
from sklearn.metrics import roc_curve, auc 
from sklearn.preprocessing import label_binarize
from sklearn.metrics import classification_report
from sklearn import metrics

In [34]:
from sklearn.model_selection import GridSearchCV

In [35]:
testHMVL = label_binarize(testHMVL, np.arange(12))

In [36]:
param_test1 = {'n_estimators':range(100,200,10)}
gsearch1 = GridSearchCV(estimator = RandomForestClassifier(oob_score=True,random_state=27), 
                       param_grid = param_test1, scoring='roc_auc',cv=5)
gsearch1.fit(testHMVI, testHMVL)
gsearch1.best_params_, gsearch1.best_score_

ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [None]:
param_test2 = {'max_depth':range(3,14,2), 'min_samples_split':range(2,50,2)}
gsearch2 = GridSearchCV(estimator = RandomForestClassifier(n_estimators= 190,oob_score=True, random_state=27),
   param_grid = param_test2, scoring='roc_auc', cv=5)
gsearch2.fit(testHMVI, testHMVL)
gsearch2.best_params_, gsearch2.best_score_

In [None]:
param_test3 = {'min_samples_split':range(2,50,2), 'min_samples_leaf':range(10,60,10)}
gsearch3 = GridSearchCV(estimator = RandomForestClassifier(n_estimators= 190, max_depth=13, oob_score=True, random_state=27),
   param_grid = param_test3, scoring='roc_auc', cv=5)
gsearch3.fit(testHMVI, testHMVL)
gsearch3.best_params_, gsearch3.best_score_

In [None]:
param_test4 = {'max_features':range(3,11,2)}
gsearch4 = GridSearchCV(estimator = RandomForestClassifier(n_estimators= 180, max_depth=5, min_samples_split=2,
                                  min_samples_leaf=10 ,oob_score=True, random_state=27),
   param_grid = param_test4, scoring='roc_auc',iid=False, cv=5)
gsearch4.fit(testHMVI, testHMVL)
gsearch4.best_params_, gsearch4.best_score_

## Report

In [None]:
## Random Forest
print("[INFO] evaluating histogram accuracy...")
model = RandomForestClassifier(random_state=27)
model.fit(trainHMVI, trainHMVL) # 训练数据集
acc = model.score(testHMVI, testHMVL)
print("[INFO] his_mask_HSV histogram accuracy: {:.2f}%".format(acc * 100))

In [None]:
print(classification_report(testHMVL, model.predict(testHMVI)))

In [None]:
y_score = model.predict_proba(testHMVI)
y_score[0:4]

In [None]:
testHMVL = label_binarize(testHMVL, np.arange(12))

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(testHMVL.ravel(),y_score.ravel())
auc = metrics.auc(fpr, tpr)
#FPR就是横坐标,TPR就是纵坐标
plt.plot(fpr, tpr, c = 'r', lw = 2, alpha = 0.7, label = u'AUC=%.3f' % auc)
plt.plot((0, 1), (0, 1), c = '#808080', lw = 1, ls = '--', alpha = 0.7)
plt.xlim((-0.01, 1.02))
plt.ylim((-0.01, 1.02))
plt.xticks(np.arange(0, 1.1, 0.1))
plt.yticks(np.arange(0, 1.1, 0.1))
plt.xlabel('False Positive Rate', fontsize=13)
plt.ylabel('True Positive Rate', fontsize=13)
plt.grid(b=True, ls=':')
plt.legend(loc='lower right', fancybox=True, framealpha=0.8, fontsize=12)
plt.title(u'RF ROC curve', fontsize=17)
plt.show() 