In [5]:
from skimage import io
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import os
import glob
import xgboost as xgb
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss

%matplotlib inline

path1 = '/home/gs/DataScientist/planet'
trainPath = '/train-tif'
testPath = '/test-tif'

VERBOSE_INTERVAL = 5000

NUM_BINS = 64
MAX_PIX_VAL = 65535


In [6]:
# definitions

def getImageHistograms (filePath):
    try:
        img = io.imread(filePath)
        r, g, b, nir = img[:, :, 0], img[:, :, 1], img[:, :, 2], img[:, :, 3]
        hr, bins = np.histogram(r,NUM_BINS,[0, MAX_PIX_VAL])
        hg, bins = np.histogram(g,NUM_BINS,[0, MAX_PIX_VAL])
        hb, bins = np.histogram(b,NUM_BINS,[0, MAX_PIX_VAL])
        hnir, bins = np.histogram(nir,NUM_BINS,[0, MAX_PIX_VAL])
    except:
        print ('  error reading file {}'.format(filePath))
        hr = np.zeros(NUM_BINS)
        hg = np.zeros(NUM_BINS)
        hb = np.zeros(NUM_BINS)
        hnir = np.zeros(NUM_BINS)
        
    return hr, hg, hb, hnir

def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=2000):
    br = 0
    param = {}
    param['objective'] = 'binary:logistic'
    param['eta'] = 0.01
    param['max_depth'] = 8
    param['silent'] = 1
    param['eval_metric'] = "logloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.8
    param['colsample_bytree'] = 0.8
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20, verbose_eval = 50)
        br = model.best_iteration
        #print ('best iteration for DICT: {}'.format(br))
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds, verbose_eval = 50)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model, br



In [4]:
# read Y_train

print ('read train y...')

try:
    Y_train = pd.read_csv(path1+'/train.csv')
except:
    path1 = '/home/ec2-user/DataScientist/planet'
    Y_train = pd.read_csv(path1+'/train.csv')

print ('Y_train lines read: {}'.format(len(Y_train)))

flatten = lambda l: [item for sublist in l for item in sublist]
labels = list(set(flatten([l.split(' ') for l in Y_train['tags'].values])))
label_map = {l: i for i, l in enumerate(labels)}
inv_label_map = {i: l for l, i in label_map.items()}
#print(label_map)
#print
#print(inv_label_map)

Y_trainDict = {}
for i, row in Y_train.iterrows():
    name = row['image_name']
    tags = row['tags']
    targets = np.zeros(17)
    for t in tags.split(' '):
        targets[label_map[t]] = 1 
    Y_trainDict[name] = targets

#print (Y_trainDict['train_0'])
#print (Y_trainDict['train_1'])
#print (Y_trainDict['train_2'])


# read train images

X_train = [] # arrays

Y_trainAll = []
X_train_id = []
lines = 0

print('Read train images')
path = os.path.join(path1+trainPath, '*.tif')
print (path)
files = glob.glob(path)
for fl in files:
    lines += 1
    if lines % VERBOSE_INTERVAL == 0:
        print ('  files read: {}'.format(lines))
    flbase = os.path.basename(fl)
    a,b,c,d = getImageHistograms(fl)
    r = np.concatenate((a, b, c, d), axis = 0)
    X_train.append(r)
    name = flbase.replace('.tif', '')
    X_train_id.append(name)
    Y_trainAll.append(Y_trainDict[name])


Y_trainAll = pd.DataFrame(Y_trainAll)

X_train = pd.DataFrame(X_train)
print ('X_train shape   : {}'.format(X_train.shape))
print ('Y_trainAll shape: {}'.format(Y_trainAll.shape))


X_test = [] # arrays
X_test_id = []
lines = 0 

print('Read test images')
path = os.path.join(path1+testPath, '*.tif')
print (path)
files = glob.glob(path)
for fl in files:
    lines += 1
    if lines % VERBOSE_INTERVAL == 0:
        print ('  files read: {}'.format(lines))
    flbase = os.path.basename(fl)
    a,b,c,d = getImageHistograms(fl)
    r = np.concatenate((a, b, c, d), axis = 0)
    X_test.append(r)
    name = flbase.replace('.tif', '')
    X_test_id.append(name)

X_test = pd.DataFrame(X_test)
print (X_test.shape)



Y_train lines read: 40479
Read train images
/home/gs/DataScientist/planet/train-tif/*.tif
  files read: 5000




  files read: 10000
  files read: 15000
  files read: 20000
  error reading file /home/gs/DataScientist/planet/train-tif/train_28173.tif
  error reading file /home/gs/DataScientist/planet/train-tif/train_18772.tif
  files read: 25000
  files read: 30000
  files read: 35000
  error reading file /home/gs/DataScientist/planet/train-tif/train_5023.tif
  files read: 40000
X_train shape   : (40479, 256)
Y_trainAll shape: (40479, 17)
Read test images
/home/gs/DataScientist/planet/test-tif/*.tif
  files read: 5000
  files read: 10000
  files read: 15000
  files read: 20000
  error reading file /home/gs/DataScientist/planet/test-tif/test_17393.tif
  files read: 25000
  files read: 30000
  files read: 35000
  files read: 40000
(40669, 256)


In [None]:

# save data frames of histograms for later use


In [7]:

# xgb cross validation

# save number of rounds!
brDict = {}

for i in range(0,17):
    print ('target: {} {}'.format(i, inv_label_map[i]))
    Y_train = Y_trainAll.ix[:,i]

    kf = model_selection.KFold(n_splits=3, shuffle=True, random_state=2016)
    for dev_index, val_index in kf.split(range(X_train.shape[0])):
        dev_X, val_X = X_train.iloc[dev_index], X_train.iloc[val_index]
        dev_y, val_y = Y_train.iloc[dev_index], Y_train.iloc[val_index]
        preds, model, br = runXGB(dev_X, dev_y, val_X, val_y)
        brDict[i] = br

print ('bestRounds:')
print (brDict)

0
[0]	train-logloss:0.683392	test-logloss:0.683414
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 20 rounds.
[50]	train-logloss:0.363098	test-logloss:0.364062
[100]	train-logloss:0.211254	test-logloss:0.213039
[150]	train-logloss:0.129911	test-logloss:0.132961
[200]	train-logloss:0.083853	test-logloss:0.088359
[250]	train-logloss:0.056667	test-logloss:0.063031
[300]	train-logloss:0.040168	test-logloss:0.048575
[350]	train-logloss:0.029672	test-logloss:0.040321
[400]	train-logloss:0.022921	test-logloss:0.035718
[450]	train-logloss:0.018424	test-logloss:0.033244
[500]	train-logloss:0.015272	test-logloss:0.03196
[550]	train-logloss:0.013041	test-logloss:0.031326
[600]	train-logloss:0.011361	test-logloss:0.03113
Stopping. Best iteration:
[620]	train-logloss:0.010813	test-logloss:0.031104

[0]	train-logloss:0.68344	test-logloss:0.683397
Multiple eval metrics have been passed: 'test-logloss' will be us

In [9]:
# xgboost predict




predsDF = pd.DataFrame()
for i in range(0,17):
    print ('predicting feature ' + str(i))
    Y_train = Y_trainAll.ix[:,i]
    #print (Y_train.shape)
    preds, model, br = runXGB(X_train, Y_train, X_test, num_rounds=int(brDict[i]*1.33))
    predsDF[i] = preds

print (predsDF.shape)

predicting feature 0
predicting feature 1
predicting feature 2
predicting feature 3
predicting feature 4
predicting feature 5
predicting feature 6
predicting feature 7
predicting feature 8
predicting feature 9
predicting feature 10
predicting feature 11
predicting feature 12
predicting feature 13
predicting feature 14
predicting feature 15
predicting feature 16
(40669, 17)


In [None]:
print (predsDF.head())
print (X_train_id[0:5])

In [10]:
# predsDF to prediction file

def mapf (arr):
    res = ''
    for i in range(0,17):
        if arr[i] > 0.5:
            res += inv_label_map[i] + ' '
    res = res.rstrip()
    return res

print(predsDF.shape)

textResults = []

for i, row in predsDF.iterrows():
    #print (i)
    #print (list(row))
    textResults.append ( mapf( list (row)))
    
print (textResults[0:5])

res = pd.DataFrame()
res['image_name'] = X_test_id
res['tags'] = textResults

print (res.head())

res.to_csv('SUB_21.csv', index=False)




(40669, 17)
['clear primary', 'clear primary', 'clear primary water', 'clear primary water', 'primary partly_cloudy']
   image_name                   tags
0  test_34045          clear primary
1  test_32937          clear primary
2  test_10537    clear primary water
3  test_30945    clear primary water
4  test_13470  primary partly_cloudy
