In [1]:
from skimage import io
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import os
import glob
import xgboost as xgb
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss
import scipy
from scipy import stats

%matplotlib inline

path1 = '/home/gs/DataScientist/planet'
trainPath = '/train-tif'
testPath = '/test-tif'

VERBOSE_INTERVAL = 2500

NUM_BINS = 64
MAX_PIX_VAL = 65535




In [2]:
# definitions

def getImageData (filePath):
    stats = []
    try:
        img = io.imread(filePath)
        r, g, b, nir = img[:, :, 0], img[:, :, 1], img[:, :, 2], img[:, :, 3]
        if not 0 in (nir+r):
            n = (nir-r) / (nir+r)
        else:
            n = np.zeros(256)
                
        hr, bins = np.histogram(r,NUM_BINS,[0, MAX_PIX_VAL])
        hg, bins = np.histogram(g,NUM_BINS,[0, MAX_PIX_VAL])
        hb, bins = np.histogram(b,NUM_BINS,[0, MAX_PIX_VAL])
        hnir, bins = np.histogram(nir,NUM_BINS,[0, MAX_PIX_VAL])
        ndvi, bins = np.histogram(n,NUM_BINS,[-1, +1])

        # stats
        stats.append( np.mean(r) )
        stats.append( np.mean(g) )
        stats.append( np.mean(b) )
        stats.append( np.mean(nir) )
        stats.append( np.std(r) )
        stats.append( np.std(g) )
        stats.append( np.std(b) )
        stats.append( np.std(nir) )
        stats.append( np.min(r) )
        stats.append( np.min(g) )
        stats.append( np.min(b) )
        stats.append( np.min(nir) )
        stats.append( np.max(r) )
        stats.append( np.max(g) )
        stats.append( np.max(b) )
        stats.append( np.max(nir) )

        stats.append( scipy.stats.kurtosis(r.ravel()) )
        stats.append( scipy.stats.kurtosis(g.ravel()) )
        stats.append( scipy.stats.kurtosis(b.ravel()) )
        stats.append( scipy.stats.kurtosis(nir.ravel()) )
        stats.append( scipy.stats.skew(r.ravel()) )
        stats.append( scipy.stats.skew(g.ravel()) )
        stats.append( scipy.stats.skew(b.ravel()) )
        stats.append( scipy.stats.skew(nir.ravel()) )
        
        # to array
        stats = np.array(stats)
        
    except Exception as  e:
        print ('{}  error reading file {}'.format(e, filePath))
        hr = np.zeros(NUM_BINS)
        hg = np.zeros(NUM_BINS)
        hb = np.zeros(NUM_BINS)
        hnir = np.zeros(NUM_BINS)
        ndvi = np.zeros(NUM_BINS)
        stats = np.zeros(24)

    r = np.concatenate((hr, hg, hb, hnir, ndvi, stats), axis = 0)
    return r

def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=2000):
    br = 0
    param = {}
    param['objective'] = 'binary:logistic'
    param['eta'] = 0.01
    param['max_depth'] = 8
    param['silent'] = 1
    param['eval_metric'] = "logloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.8
    param['colsample_bytree'] = 0.8
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20, verbose_eval = 50)
        br = model.best_iteration
        #print ('best iteration for DICT: {}'.format(br))
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds, verbose_eval = 50)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model, br



In [None]:
print (getImageData('/home/gs/DataScientist/planet/train-tif/train_1.tif'))

In [3]:
# read Y_train

print ('read train y...')

try:
    Y_train = pd.read_csv(path1+'/train.csv')
except:
    path1 = '/home/ec2-user/DataScientist/planet'
    Y_train = pd.read_csv(path1+'/train.csv')

print ('Y_train lines read: {}'.format(len(Y_train)))

flatten = lambda l: [item for sublist in l for item in sublist]
labels = list(set(flatten([l.split(' ') for l in Y_train['tags'].values])))
label_map = {l: i for i, l in enumerate(labels)}
inv_label_map = {i: l for l, i in label_map.items()}
#print(label_map)
#print
#print(inv_label_map)

Y_trainDict = {}
for i, row in Y_train.iterrows():
    name = row['image_name']
    tags = row['tags']
    targets = np.zeros(17)
    for t in tags.split(' '):
        targets[label_map[t]] = 1 
    Y_trainDict[name] = targets

#print (Y_trainDict['train_0'])
#print (Y_trainDict['train_1'])
#print (Y_trainDict['train_2'])


# read train images

X_train = [] # arrays

Y_trainAll = []
X_train_id = []
lines = 0

print('Read train images')
path = os.path.join(path1+trainPath, '*.tif')
print (path)
files = glob.glob(path)
for fl in files:
    lines += 1
    if lines % VERBOSE_INTERVAL == 0:
        print ('  files read: {}'.format(lines))
    flbase = os.path.basename(fl)
    r = getImageData(fl)
    X_train.append(r)
    name = flbase.replace('.tif', '')
    X_train_id.append(name)
    Y_trainAll.append(Y_trainDict[name])


Y_trainAll = pd.DataFrame(Y_trainAll)

X_train = pd.DataFrame(X_train)
print ('X_train shape   : {}'.format(X_train.shape))
print ('Y_trainAll shape: {}'.format(Y_trainAll.shape))


X_test = [] # arrays
X_test_id = []
lines = 0 

print('Read test images')
path = os.path.join(path1+testPath, '*.tif')
print (path)
files = glob.glob(path)
for fl in files:
    lines += 1
    if lines % VERBOSE_INTERVAL == 0:
        print ('  files read: {}'.format(lines))
    flbase = os.path.basename(fl)
    r = getImageData(fl)
    X_test.append(r)
    name = flbase.replace('.tif', '')
    X_test_id.append(name)

X_test = pd.DataFrame(X_test)
print (X_test.shape)



read train y...
Y_train lines read: 40479
Read train images
/home/gs/DataScientist/planet/train-tif/*.tif
  files read: 2500
  files read: 5000




  files read: 7500
  files read: 10000
  files read: 12500
  files read: 15000
  files read: 17500
  files read: 20000
invalid offset  error reading file /home/gs/DataScientist/planet/train-tif/train_28173.tif
  files read: 22500
invalid offset  error reading file /home/gs/DataScientist/planet/train-tif/train_18772.tif
  files read: 25000
  files read: 27500
  files read: 30000
  files read: 32500
  files read: 35000
invalid TIFF file  error reading file /home/gs/DataScientist/planet/train-tif/train_5023.tif
  files read: 37500
  files read: 40000
X_train shape   : (40479, 344)
Y_trainAll shape: (40479, 17)
Read test images
/home/gs/DataScientist/planet/test-tif/*.tif
  files read: 2500
  files read: 5000
  files read: 7500
  files read: 10000
  files read: 12500
  files read: 15000
  files read: 17500
  files read: 20000
  files read: 22500
too many indices for array  error reading file /home/gs/DataScientist/planet/test-tif/test_17393.tif
  files read: 25000
  files read: 27500
  fil

In [None]:

# save data frames for later use

# Y_trainAll
# X_train
# X_test

import pickle
pickle.dump( Y_trainAll, open( "SAVE_yTrainAll.p", "wb" ) )
pickle.dump( X_train, open( "SAVE_xTrain.p", "wb" ) )
pickle.dump( X_test, open( "SAVE_xTest.p", "wb" ) )
pickle.dump( Y_trainDict, open( "SAVE_yTrainDict.p", "wb" ) )
pickle.dump( label_map, open( "SAVE_labelMap.p", "wb" ) )
pickle.dump( inv_label_map, open( "SAVE_invLabelMap.p", "wb" ) )



In [None]:
# load data
import pickle
Y_trainAll = pickle.load( open( "SAVE_yTrainAll.p", "rb" ) )
X_train = pickle.load( open( "SAVE_xTrain.p", "rb" ) )
X_test = pickle.load( open( "SAVE_xTest.p", "rb" ) )
Y_trainDict = pickle.load( open( "SAVE_yTrainDict.p", "rb" ) )
label_map = pickle.load( open( "SAVE_labelMap.p", "rb" ) )
inv_label_map = pickle.load( open( "SAVE_invLabelMap.p", "rb" ) )




In [4]:

# xgb cross validation

# save number of rounds!
brDict = {}

for i in range(0,17):
    print ('target: {} {}'.format(i, inv_label_map[i]))
    Y_train = Y_trainAll.ix[:,i]

    kf = model_selection.KFold(n_splits=3, shuffle=True, random_state=2016)
    for dev_index, val_index in kf.split(range(X_train.shape[0])):
        dev_X, val_X = X_train.iloc[dev_index], X_train.iloc[val_index]
        dev_y, val_y = Y_train.iloc[dev_index], Y_train.iloc[val_index]
        preds, model, br = runXGB(dev_X, dev_y, val_X, val_y)
        brDict[i] = br

print ('bestRounds:')
print (brDict)

target: 0 slash_burn
[0]	train-logloss:0.683392	test-logloss:0.683414
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 20 rounds.
[50]	train-logloss:0.363071	test-logloss:0.364071
[100]	train-logloss:0.210861	test-logloss:0.212984
[150]	train-logloss:0.129377	test-logloss:0.132788
[200]	train-logloss:0.083144	test-logloss:0.088174
[250]	train-logloss:0.055791	test-logloss:0.062815
[300]	train-logloss:0.039081	test-logloss:0.048299
[350]	train-logloss:0.028395	test-logloss:0.039935
[400]	train-logloss:0.021515	test-logloss:0.035241
[450]	train-logloss:0.016828	test-logloss:0.03263
[500]	train-logloss:0.013608	test-logloss:0.031289
[550]	train-logloss:0.011333	test-logloss:0.030652
[600]	train-logloss:0.00961	test-logloss:0.030437
Stopping. Best iteration:
[620]	train-logloss:0.009053	test-logloss:0.030397

[0]	train-logloss:0.68344	test-logloss:0.683397
Multiple eval metrics have been passed: 'test-

In [5]:
# xgboost predict




predsDF = pd.DataFrame()
for i in range(0,17):
    print ('predicting feature ' + str(i))
    Y_train = Y_trainAll.ix[:,i]
    #print (Y_train.shape)
    preds, model, br = runXGB(X_train, Y_train, X_test, num_rounds=int(brDict[i]*1.33))
    predsDF[i] = preds

print (predsDF.shape)

predicting feature 0
predicting feature 1
predicting feature 2
predicting feature 3
predicting feature 4
predicting feature 5
predicting feature 6
predicting feature 7
predicting feature 8
predicting feature 9
predicting feature 10
predicting feature 11
predicting feature 12
predicting feature 13
predicting feature 14
predicting feature 15
predicting feature 16
(40669, 17)


In [None]:
i = xgb.importance(model=model)
print (i)

In [None]:
print (predsDF.head())
print (X_train_id[0:5])

In [None]:
# predsDF to prediction file

def mapf (arr):
    res = ''
    for i in range(0,17):
        if arr[i] > 0.5:
            res += inv_label_map[i] + ' '
    res = res.rstrip()
    return res

print(predsDF.shape)

textResults = []

for i, row in predsDF.iterrows():
    #print (i)
    #print (list(row))
    textResults.append ( mapf( list (row)))
    
print (textResults[0:5])

res = pd.DataFrame()
res['image_name'] = X_test_id
res['tags'] = textResults

print (res.head())

res.to_csv('SUB_25_0.01eta.csv', index=False)




In [8]:
predsDF.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,id
0,0.001509,0.972671,0.047312,0.999827,8.5e-05,6.5e-05,0.042768,0.000434,0.438457,0.011927,2.1e-05,0.01792,0.005873,0.003358,0.719696,0.074135,0.029849,test_34045
1,0.00176,0.989465,0.00257,0.999623,0.00023,0.000114,0.059147,0.001212,0.020107,0.004467,0.000177,0.001416,0.001669,0.001612,0.052388,0.010477,0.001279,test_32937
2,0.006104,0.973876,0.005513,0.998994,0.000235,0.000189,0.639454,0.007449,0.048404,0.010324,7.3e-05,0.013641,0.004402,0.001366,0.074483,0.047072,0.003743,test_10537
3,0.012243,0.935445,0.001582,0.995544,0.000567,0.000693,0.828803,0.008132,0.146062,0.035113,0.001794,0.067513,0.020096,0.000648,0.143409,0.116787,0.005036,test_30945
4,0.002334,0.01853,0.005153,0.998468,0.000225,0.000387,0.094218,0.026948,0.027737,0.964892,0.000227,0.013253,0.009426,0.003657,0.038879,0.036904,0.002558,test_13470


In [7]:
# save RAW

predsDF['id'] = X_test_id
predsDF.to_csv('RAW_25.csv', index=False)
