In [16]:
from skimage import io
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
import os
import glob
import xgboost as xgb
from sklearn import model_selection, preprocessing, ensemble
from sklearn.metrics import log_loss

%matplotlib inline

path1 = '/home/gs/DataScientist/planet'
trainPath = '/train-tif-sample'
testPath = '/test-tif'

VERBOSE_INTERVAL = 50

NUM_BINS = 32
MAX_PIX_VAL = 65535


In [13]:
# definitions

def getImageHistograms (filePath):
    img = io.imread(filePath)
    r, g, b, nir = img[:, :, 0], img[:, :, 1], img[:, :, 2], img[:, :, 3]
    hr, bins = np.histogram(r,NUM_BINS,[0, MAX_PIX_VAL])
    hg, bins = np.histogram(g,NUM_BINS,[0, MAX_PIX_VAL])
    hb, bins = np.histogram(b,NUM_BINS,[0, MAX_PIX_VAL])
    hnir, bins = np.histogram(nir,NUM_BINS,[0, MAX_PIX_VAL])
    
    return hr, hg, hb, hnir

def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=0, num_rounds=2000):
    param = {}
    param['objective'] = 'binary:logistic'
    param['eta'] = 0.02
    param['max_depth'] = 6
    param['silent'] = 1
    param['eval_metric'] = "logloss"
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20, verbose_eval = 50)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds, verbose_eval = 50)

    pred_test_y = model.predict(xgtest)
    return pred_test_y, model



In [7]:
# read Y_train

try:
    Y_train = pd.read_csv(path1+'/train.csv')
except:
    path1 = '/home/ec2-user/DataScientist/planet'
    Y_train = pd.read_csv(path1+'/train.csv')

print ('Y_train lines read: {}'.format(len(Y_train)))

flatten = lambda l: [item for sublist in l for item in sublist]
labels = list(set(flatten([l.split(' ') for l in Y_train['tags'].values])))
label_map = {l: i for i, l in enumerate(labels)}
inv_label_map = {i: l for l, i in label_map.items()}
#print(label_map)
#print
#print(inv_label_map)

Y_trainDict = {}
for i, row in Y_train.iterrows():
    name = row['image_name']
    tags = row['tags']
    targets = np.zeros(17)
    for t in tags.split(' '):
        targets[label_map[t]] = 1 
    Y_trainDict[name] = targets

#print (Y_trainDict['train_0'])
#print (Y_trainDict['train_1'])
#print (Y_trainDict['train_2'])


Y_train lines read: 40479


In [17]:
# read train images

X_train = [] # arrays
Y_trainAll = []
X_train_id = []
lines = 0

print('Read train images')
path = os.path.join(path1+trainPath, '*.tif')
print (path)
files = glob.glob(path)
for fl in files:
    lines += 1
    if lines % VERBOSE_INTERVAL == 0:
        print ('  lines read: {}'.format(lines))
    flbase = os.path.basename(fl)
    a,b,c,d = getImageHistograms(fl)
    r = np.concatenate((a, b, c, d), axis = 0)
    X_train.append(r)
    name = flbase.replace('.tif', '')
    X_train_id.append(name)
    Y_trainAll.append(Y_trainDict[name])


Y_trainAll = pd.DataFrame(Y_trainAll)

X_train = pd.DataFrame(X_train)
print ('X_train shape   : {}'.format(X_train.shape))
print ('Y_trainAll shape: {}'.format(Y_trainAll.shape))


Read train images
/home/gs/DataScientist/planet/train-tif-sample/*.tif
  lines read: 50
  lines read: 100
X_train shape   : (100, 128)
Y_trainAll shape: (100, 17)


In [18]:
# train / test / eval
# to be deleted

X_train, X_test = X_train[:80], X_train[80:]
Y_trainAll, Y_testAll = Y_trainAll[:80], Y_trainAll[80:]
print (X_train.shape)
print (X_test.shape)
print (X_test.shape)
print (Y_trainAll.shape)


(80, 128)
(20, 128)
(20, 128)
(80, 17)


In [20]:
# xgboost predict




predsDF = pd.DataFrame()
for i in range(0,17):
    print ('feature ' + str(i))
    Y_train = Y_trainAll.ix[:,i]
    #print (Y_train.shape)
    preds, model = runXGB(X_train, Y_train, X_test, num_rounds=220)
    predsDF[i] = preds

print (predsDF.shape)

feature 0
feature 1
feature 2
feature 3
feature 4
feature 5
feature 6
feature 7
feature 8
feature 9
feature 10
feature 11
feature 12
feature 13
feature 14
feature 15
feature 16
(20, 17)


In [21]:
# predsDF to prediction file

def mapf (arr):
    res = ''
    for i in range(0,17):
        if arr[i] > 0.5:
            res += inv_label_map[i] + ' '
    res = res.rstrip()
    return res

print(predsDF.shape)

textResults = []

for i, row in predsDF.iterrows():
    #print (i)
    #print (list(row))
    textResults.append ( mapf( list (row)))
    
print (textResults[0:5])
    
# TODO / zip?

# get test ids

# write sub file



(20, 17)
['clear primary', 'clear primary water', 'clear primary', 'clear primary agriculture', 'clear primary']


In [None]:

# xgb cross validation

for i in range(0,17):
    print (i)
    Y_train = Y_trainAll.ix[:,i]

    kf = model_selection.KFold(n_splits=3, shuffle=True, random_state=2016)
    for dev_index, val_index in kf.split(range(X_train.shape[0])):
        dev_X, val_X = X_train.iloc[dev_index], X_train.iloc[val_index]
        dev_y, val_y = Y_train.iloc[dev_index], Y_train.iloc[val_index]
        preds, model = runXGB(dev_X, dev_y, val_X, val_y)
        #cv_scores.append(log_loss(val_y, preds))
        #print(cv_scores)
