In [1]:
import numpy as np
np.random.seed(1984)

import os
import glob
import cv2
import datetime
import pandas as pd
import time
import warnings
warnings.filterwarnings("ignore")

from sklearn.cross_validation import KFold
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D, AveragePooling2D
from keras.optimizers import SGD, Adagrad
from keras.callbacks import EarlyStopping
from keras.utils import np_utils
from keras.constraints import maxnorm
from sklearn.metrics import log_loss
from keras import __version__ as keras_version
from collections import Counter

import keras as k
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)


In [3]:
path1 = '/home/gs/DataScientist/planet'
trainPath = '/train-jpg'
testPath = '/test-jpg'

PIC_SIZE = 32



In [4]:
# read Y_train

try:
    Y_train = pd.read_csv(path1+'/train.csv')
except:
    path1 = '/home/ec2-user/DataScientist/planet'
    Y_train = pd.read_csv(path1+'/train.csv')

print (Y_train[0:5])

flatten = lambda l: [item for sublist in l for item in sublist]
labels = list(set(flatten([l.split(' ') for l in Y_train['tags'].values])))
label_map = {l: i for i, l in enumerate(labels)}
inv_label_map = {i: l for l, i in label_map.items()}
print(label_map)
print
print(inv_label_map)

Y_trainDict = {}
for i, row in Y_train.iterrows():
    name = row['image_name']
    tags = row['tags']
    targets = np.zeros(17)
    for t in tags.split(' '):
        targets[label_map[t]] = 1 
    Y_trainDict[name] = targets

print (Y_trainDict['train_0'])
print (Y_trainDict['train_1'])
print (Y_trainDict['train_2'])


  image_name                                       tags
0    train_0                               haze primary
1    train_1            agriculture clear primary water
2    train_2                              clear primary
3    train_3                              clear primary
4    train_4  agriculture clear habitation primary road
{'selective_logging': 16, 'cultivation': 8, 'clear': 1, 'habitation': 11, 'conventional_mine': 5, 'cloudy': 4, 'primary': 3, 'water': 6, 'haze': 7, 'slash_burn': 0, 'partly_cloudy': 9, 'artisinal_mine': 10, 'blooming': 2, 'bare_ground': 12, 'blow_down': 13, 'agriculture': 14, 'road': 15}

{0: 'slash_burn', 1: 'clear', 2: 'blooming', 3: 'primary', 4: 'cloudy', 5: 'conventional_mine', 6: 'water', 7: 'haze', 8: 'cultivation', 9: 'partly_cloudy', 10: 'artisinal_mine', 11: 'habitation', 12: 'bare_ground', 13: 'blow_down', 14: 'agriculture', 15: 'road', 16: 'selective_logging'}
[ 0.  0.  0.  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 0.  1.  0.  1

In [5]:
def get_im_cv2(path):
    img = cv2.imread(path)
    resized = cv2.resize(img, (PIC_SIZE, PIC_SIZE), cv2.INTER_LINEAR)
    return resized

def load_train():
    X_train = []
    Y_train = []
    X_train_id = []
    start_time = time.time()

    print('Read train images')
    path = os.path.join(path1+trainPath, '*.jpg')
    print (path)
    files = glob.glob(path)
    for fl in files:
        flbase = os.path.basename(fl)
        img = get_im_cv2(fl)
        X_train.append(img)
        name = flbase.replace('.jpg', '')
        X_train_id.append(name)
        Y_train.append(Y_trainDict[name])

    print('Read train data time: {} seconds'.format(round(time.time() - start_time, 2)))
    return X_train,X_train_id, Y_train

def load_test():
    X_test = []
    X_test_id = []
    start_time = time.time()

    print('Read test images')
    path = os.path.join(path1+testPath, '*.jpg')
    print (path)
    files = glob.glob(path)
    for fl in files:
        flbase = os.path.basename(fl)
        img = get_im_cv2(fl)
        X_test.append(img)
        name = flbase.replace('.jpg', '')
        X_test_id.append(name)

    print('Read test data time: {} seconds'.format(round(time.time() - start_time, 2)))
    return X_test, X_test_id

def mapf (arr):
    res = ''
    for i in range(0,17):
        if arr[i] > 0.5:
            res += inv_label_map[i] + ' '
    res = res.rstrip()
    return res
 

In [6]:
X_train, X_train_id, Y_train = load_train()
print (len(X_train), len(X_train_id))
print(X_train_id[0:5])
print(X_train[0].shape)
print(Y_train[0])
print(Y_train[1])

X_test, X_test_id = load_test()
print (len(X_test), len(X_test_id))
print(X_test_id[0:5])
print(X_test[0].shape)


Read train images
/home/ec2-user/DataScientist/planet/train-jpg/*.jpg
Read train data time: 42.49 seconds
(40479, 40479)
['train_235', 'train_8857', 'train_40321', 'train_16924', 'train_13411']
(128, 128, 3)
[ 0.  1.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
[ 0.  1.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]
Read test images
/home/ec2-user/DataScientist/planet/test-jpg/*.jpg
Read test data time: 42.52 seconds
(40669, 40669)
['test_2265', 'test_10037', 'test_37861', 'test_4787', 'test_29331']
(128, 128, 3)


In [7]:
Y_train = np.array(Y_train, np.uint8)
X_train = np.array(X_train, np.float16) / 255.

print(X_train.shape)
print(Y_train.shape)

(40479, 128, 128, 3)
(40479, 17)


In [9]:
split = 35000
x_train, x_valid, y_train, y_valid = X_train[:split], X_train[split:], Y_train[:split], Y_train[split:]

model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3),
                 activation='relu',
                 input_shape=(PIC_SIZE, PIC_SIZE, 3)))

model.add(Conv2D(32, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Conv2D(32, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(17, activation='sigmoid'))

model.compile(loss='binary_crossentropy', # We NEED binary here, since categorical_crossentropy l1 norms the output before calculating loss.
              optimizer='adam',
              metrics=['accuracy'])
              
model.fit(x_train, y_train,
          batch_size=128,
          epochs=10,
          verbose=1,
          validation_data=(x_valid, y_valid))
          
from sklearn.metrics import fbeta_score

p_valid = model.predict(x_valid, batch_size=128)
#print(y_valid)
#print(p_valid)
print(fbeta_score(y_valid, np.array(p_valid) > 0.5, beta=2, average='macro'))

Train on 35000 samples, validate on 5479 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.307892938125


In [None]:
X_test = np.array(X_test, np.float16) / 255.
preds = model.predict(X_test, batch_size=128)


In [None]:
# X_test_id, preds -> submission
predsText = []
for e in preds:
    predsText.append(mapf(e))
print (predsText[0:5])

In [None]:
#image_name,tags
#test_0,primary clear agriculture road water

subName = path1 + '/SUB_03_32.csv'
f = open(subName, 'w')
f.write('image_name,tags\n')
for i in range(0,len(predsText)):
    f.write(X_test_id[i]+','+predsText[i]+'\n')
f.close()
