In [2]:
import numpy as np
np.random.seed(1984)

import os
import glob
import cv2
import datetime
import pandas as pd
import time
import warnings
warnings.filterwarnings("ignore")

from skimage import io
from skimage import transform
import skimage

from sklearn.cross_validation import KFold
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D, AveragePooling2D
from keras.optimizers import SGD, Adagrad
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
from keras.constraints import maxnorm
from sklearn.metrics import log_loss
from keras import __version__ as keras_version
from collections import Counter

import keras as k
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D

Using TensorFlow backend.


In [None]:
# Dataset root, and the sub-directories (appended to path1) that are globbed for *.tif images.
path1 = '/home/gs/DataScientist/planet'
trainPath, testPath = '/train-tif-64', '/test-tif-64'

# Image geometry used as the network input_shape: (PIC_SIZE, PIC_SIZE, PIC_DEPTH).
PIC_SIZE, PIC_DEPTH = 64, 4



In [None]:
# Read the training labels (train.csv) and build one multi-hot target vector per image name.

try:
    Y_train = pd.read_csv(path1 + '/train.csv')
except (IOError, OSError):
    # The CSV could not be opened under the default root: fall back to the EC2 layout.
    # Only file-access errors trigger the fallback, so any other failure (e.g. a malformed
    # CSV) still surfaces instead of being hidden by a bare `except:`.
    path1 = '/home/ec2-user/DataScientist/planet'
    Y_train = pd.read_csv(path1 + '/train.csv')

print (Y_train[0:5])

flatten = lambda l: [item for sublist in l for item in sublist]
# Sort the tag vocabulary so every run assigns the same index to the same tag.
# Iterating a bare set of strings may give a different order in each Python process
# (string hashing can be randomized), which would silently permute the target columns.
labels = sorted(set(flatten([l.split(' ') for l in Y_train['tags'].values])))
label_map = {l: i for i, l in enumerate(labels)}
inv_label_map = {i: l for l, i in label_map.items()}
print(label_map)
# Blank separator line; a bare `print` is only an expression in Python 3 and prints nothing.
print('')
print(inv_label_map)

# image_name -> float vector with 1 at the index of every tag the image carries.
# The vector length follows the vocabulary instead of a hard-coded 17.
Y_trainDict = {}
for name, tags in zip(Y_train['image_name'].values, Y_train['tags'].values):
    targets = np.zeros(len(labels))
    for t in tags.split(' '):
        targets[label_map[t]] = 1
    Y_trainDict[name] = targets

print (Y_trainDict['train_0'])
print (Y_trainDict['train_1'])
print (Y_trainDict['train_2'])


In [1]:
def get_im_cv2(path):
    """Read the image file at ``path`` and return whatever ``skimage.io.imread`` yields.

    Despite the name, no cv2 call is involved; the image is returned unmodified.
    """
    return io.imread(path)

def load_train():
    """Load every training image together with its id and its target vector.

    Globs ``*.tif`` under ``path1 + trainPath``, reads each file with ``get_im_cv2`` and
    looks the target up in ``Y_trainDict`` under the file name without its extension.

    Returns:
        tuple: ``(X_train, X_train_id, Y_train)`` - three parallel lists holding the images,
        the image names and the target vectors, in sorted file-path order.

    Raises:
        KeyError: if an image name has no entry in ``Y_trainDict``.
    """
    X_train = []
    Y_train = []
    X_train_id = []
    start_time = time.time()

    print('Read train images')
    path = os.path.join(path1+trainPath, '*.tif')
    print (path)
    # glob returns matches in arbitrary order. Sorting makes the sample order - and with it any
    # positional train/validation split taken from these lists - the same on every run.
    files = sorted(glob.glob(path))
    for fl in files:
        # Strip only the trailing extension (str.replace would remove every '.tif' occurrence).
        name = os.path.splitext(os.path.basename(fl))[0]
        X_train.append(get_im_cv2(fl))
        X_train_id.append(name)
        Y_train.append(Y_trainDict[name])

    print('Read train data time: {} seconds'.format(round(time.time() - start_time, 2)))
    return X_train,X_train_id, Y_train

def load_test():
    """Load every test image together with its id.

    Globs ``*.tif`` under ``path1 + testPath`` and reads each file with ``get_im_cv2``.

    Returns:
        tuple: ``(X_test, X_test_id)`` - two parallel lists holding the images and the image
        names (file name without extension), in sorted file-path order.
    """
    X_test = []
    X_test_id = []
    start_time = time.time()

    print('Read test images')
    path = os.path.join(path1+testPath, '*.tif')
    print (path)
    # glob returns matches in arbitrary order; sort so the prediction rows come out in the
    # same order on every run.
    files = sorted(glob.glob(path))
    for fl in files:
        # Strip only the trailing extension (str.replace would remove every '.tif' occurrence).
        name = os.path.splitext(os.path.basename(fl))[0]
        X_test.append(get_im_cv2(fl))
        X_test_id.append(name)

    print('Read test data time: {} seconds'.format(round(time.time() - start_time, 2)))
    return X_test, X_test_id

def mapf (arr, threshold=0.2, label_names=None):
    """Turn one vector of per-label scores into a space-separated tag string.

    Args:
        arr: sequence of scores, one per label index.
        threshold: a label is emitted when its score is strictly greater than this value.
            Defaults to 0.2, the cut-off that used to be hard-coded.
        label_names: mapping from label index to tag name. Defaults to the module-level
            ``inv_label_map``.

    Returns:
        str: the names of all labels above the threshold, in index order, joined by single
        spaces; the empty string when no label qualifies.
    """
    if label_names is None:
        label_names = inv_label_map
    # Walk the scores actually supplied rather than assuming exactly 17 labels.
    return ' '.join(label_names[i] for i, score in enumerate(arr) if score > threshold)
 

In [None]:
# Load the training set and print quick sanity checks: counts, first ids, one image shape
# and the first two target vectors.
# NOTE: Y_train is rebound here from the label DataFrame to the list of target vectors.
X_train, X_train_id, Y_train = load_train()
print (len(X_train), len(X_train_id))
print(X_train_id[:5])
print(X_train[0].shape)
print(Y_train[0])
print(Y_train[1])

# Same checks for the test set, which comes without targets.
X_test, X_test_id = load_test()
print (len(X_test), len(X_test_id))
print(X_test_id[:5])
print(X_test[0].shape)


In [None]:
# Targets only hold 0/1 flags, so uint8 is sufficient.
Y_train = np.array(Y_train, np.uint8)
# Scale the pixels in float32 and only then store them as float16.
# Casting the raw values to float16 *before* dividing loses precision (float16 cannot represent
# every integer above 2048) and turns values beyond the float16 maximum of 65504 into inf,
# which the division by 65535 cannot undo.
# NOTE(review): the 65535 divisor presumes 16-bit pixel data - confirm against the TIFF files.
# NOTE: this cell rebinds X_train / Y_train; re-running it without reloading rescales twice.
X_train = (np.array(X_train, np.float32) / 65535.).astype(np.float16)

print(X_train.shape)
print(Y_train.shape)

In [None]:
# train / use larger split value for predictions, more epochs

from sklearn.metrics import fbeta_score

# The first `split` samples train the network; everything after them is held out for validation.
split = 35000
x_train, x_valid = X_train[:split], X_train[split:]
y_train, y_valid = Y_train[:split], Y_train[split:]

# One sigmoid output per target column instead of a hard-coded 17.
n_labels = y_train.shape[1]

model = Sequential()
# Block 1: two 3x3 convolutions with 64 filters, 2x2 max-pooling, dropout.
model.add(Conv2D(64, kernel_size=(3, 3),
                 activation='relu',
                 input_shape=(PIC_SIZE, PIC_SIZE, PIC_DEPTH)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

# Block 2: one 3x3 convolution with 128 filters, 2x2 max-pooling, dropout.
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

# Classifier head: independent sigmoid per label (multi-label output).
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(n_labels, activation='sigmoid'))

model.compile(loss='binary_crossentropy', # We NEED binary here, since categorical_crossentropy l1 norms the output before calculating loss.
              optimizer='adam',
              metrics=['accuracy'])

model.fit(x_train, y_train,
          batch_size=128,
          epochs=25,
          verbose=1,
          validation_data=(x_valid, y_valid))

# Score the hold-out predictions with the same 0.2 cut-off that mapf applies when the
# submission strings are built; scoring at 0.5 reported the F2 of a different decision rule
# than the one actually submitted.
valid_threshold = 0.2
p_valid = model.predict(x_valid, batch_size=128)
print(fbeta_score(y_valid, p_valid > valid_threshold, beta=2, average='macro'))

In [None]:
# predict
# Scale the pixels in float32 and only then store them as float16: casting the raw values to
# float16 first loses precision and turns values beyond the float16 maximum of 65504 into inf.
# NOTE: this cell rebinds X_test; re-running it without reloading rescales the images twice.
X_test = (np.array(X_test, np.float32) / 65535.).astype(np.float16)
preds = model.predict(X_test, batch_size=128)


In [None]:
# X_test_id, preds -> submission
# One tag string per prediction row, in the same order as preds.
predsText = [mapf(scores) for scores in preds]
print (predsText[:5])

In [None]:
# Submission file layout (header line, then one line per test image):
#image_name,tags
#test_0,primary clear agriculture road water

subName = path1 + '/SUB_04_64pix_xxep.csv'
# `with` closes the file even if a write fails part-way through.
with open(subName, 'w') as f:
    f.write('image_name,tags\n')
    # X_test_id and predsText are parallel lists (one entry per test image).
    for image_id, tags in zip(X_test_id, predsText):
        f.write(image_id + ',' + tags + '\n')


In [None]:
# Dump the unthresholded scores: one column per label index plus a trailing 'id' column.
# The file name is relative, so it is written to the current working directory.
print (type(preds))
raw = pd.DataFrame(data=preds)
raw['id'] = X_test_id
print (raw.head(5))
raw.to_csv('RAW_04_64px_xxep.csv', index=False)