In [1]:
#Import necessary libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.metrics import fbeta_score
from tqdm import tqdm
import cv2
from PIL import Image
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras import optimizers

from sklearn.model_selection import train_test_split
from sklearn.metrics import fbeta_score

In [36]:
#Load train and test CSVs
df_train = pd.read_csv('../input/planets-dataset/planet/planet/train_classes.csv')
df_test = pd.read_csv('../input/planets-dataset/planet/planet/sample_submission.csv')

In [3]:
labels = df_train['tags'].apply(lambda x: x.split(' '))
from collections import Counter, defaultdict
counts = defaultdict(int) #dictionary containing each individual label
for l in labels:
    for l2 in l:
        counts[l2] += 1
tag_list=list(counts.keys()) 
y=list(counts.values())
#Create a dictionary assigning a numerical value to each label
label_map = {i:j for j, i in enumerate(tag_list)}
tag_list

['haze',
 'primary',
 'agriculture',
 'clear',
 'water',
 'habitation',
 'road',
 'cultivation',
 'slash_burn',
 'cloudy',
 'partly_cloudy',
 'conventional_mine',
 'bare_ground',
 'artisinal_mine',
 'blooming',
 'selective_logging',
 'blow_down']

In [4]:
# One hot encode the training labels. Convert the images into pixels and resize them
X_train, Y_train = [], []
for img, label in tqdm(df_train.values, miniters = 1000):
  target = np.zeros(17)
  for tag in label.split(' '):
    target[label_map[tag]]=1
  X_train.append(cv2.resize(cv2.imread('../input/planets-dataset/planet/planet/train-jpg/{}.jpg'.format(img)), (64,64)))
  Y_train.append(target)

100%|██████████| 40479/40479 [02:12<00:00, 304.70it/s]


In [5]:
#convert the test images to pixels and resize them as well
X_test=[]
for img, label in tqdm(df_test[:40669].values, miniters = 1000):
  X_test.append(cv2.resize(cv2.imread('../input/planets-dataset/planet/planet/test-jpg/{}.jpg'.format(img)), (64,64)))
for img, label in tqdm(df_test[40669:].values, miniters = 1000):
  X_test.append(cv2.resize(cv2.imread('../input/planets-dataset/test-jpg-additional/test-jpg-additional/{}.jpg'.format(img)), (64,64)))

100%|██████████| 40669/40669 [02:33<00:00, 265.70it/s]
100%|██████████| 20522/20522 [01:15<00:00, 270.21it/s]


In [6]:
#Change lists to numpy arrays and normalize
X = np.array(X_train, np.float16)/255
y = np.array(Y_train, np.uint8)
X_test = np.array(X_test, np.float16)/255

In [7]:
np.save('./X',X)
np.save('./y',y)
np.save('./X_test',X_test)

In [None]:
# X = np.load('./X.npy')
# y = np.load('./y.npy')
# X_test = np.load('./X_test.npy')

In [8]:
x_train, x_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, shuffle = True, random_state = 1)

print(x_train.shape, y_train.shape, x_val.shape, y_val.shape)

(32383, 64, 64, 3) (32383, 17) (8096, 64, 64, 3) (8096, 17)


In [9]:
del(X,y)

In [10]:
from keras_preprocessing.image import ImageDataGenerator

In [11]:
datagen = ImageDataGenerator(rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,                             
    horizontal_flip=True)

In [12]:
traingen = datagen.flow(x_train,
                       y_train,
                       batch_size=64)


In [13]:
from keras import backend as K


def fbeta(y_true, y_pred, threshold_shift=0):
    beta = 2

    # just in case of hipster activation at the final layer
    y_pred = K.clip(y_pred, 0, 1)

    # shifting the prediction threshold from .5 if needed
    y_pred_bin = K.round(y_pred + threshold_shift)

    tp = K.sum(K.round(y_true * y_pred_bin)) + K.epsilon()
    fp = K.sum(K.round(K.clip(y_pred_bin - y_true, 0, 1)))
    fn = K.sum(K.round(K.clip(y_true - y_pred, 0, 1)))

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)

    beta_squared = beta ** 2
    return (beta_squared + 1) * (precision * recall) / (beta_squared * precision + recall + K.epsilon())


In [14]:
from keras.applications.vgg19 import VGG19

In [15]:
base_model = VGG19(include_top=False,
                  weights='imagenet',
                  input_shape=(64,64,3))

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg19/vgg19_weights_tf_dim_ordering_tf_kernels_notop.h5


In [17]:
#kfold_weights_path = os.path.join('', 'weights_kfold_' + '.h5')
model = Sequential()
model.add(BatchNormalization(input_shape=(64, 64,3)))
model.add(base_model)
# model.add(Conv2D(32, kernel_size=(3, 3),padding='same', activation='relu'))
# model.add(Conv2D(32, (3, 3), activation='relu'))
# model.add(MaxPooling2D(pool_size=(2, 2)))
# model.add(Dropout(0.25))

# model.add(Conv2D(64, kernel_size=(3, 3),padding='same', activation='relu'))
# model.add(Conv2D(64, (3, 3), activation='relu'))
# model.add(MaxPooling2D(pool_size=(2, 2)))
# model.add(Dropout(0.25))
        
# model.add(Conv2D(128, kernel_size=(3, 3),padding='same', activation='relu'))
# model.add(Conv2D(128, (3, 3), activation='relu'))
# model.add(MaxPooling2D(pool_size=(2, 2)))
# model.add(Dropout(0.25))
        
# model.add(Conv2D(256, kernel_size=(3, 3),padding='same', activation='relu'))
# model.add(Conv2D(256, (3, 3), activation='relu'))
# model.add(MaxPooling2D(pool_size=(2, 2)))
# model.add(Dropout(0.25))
        
model.add(Flatten())
# model.add(Dense(512, activation='relu'))
# model.add(BatchNormalization())
# model.add(Dropout(0.5))
model.add(Dense(17, activation='sigmoid'))



In [18]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization (BatchNo (None, 64, 64, 3)         12        
_________________________________________________________________
vgg19 (Functional)           (None, 2, 2, 512)         20024384  
_________________________________________________________________
flatten (Flatten)            (None, 2048)              0         
_________________________________________________________________
dense (Dense)                (None, 17)                34833     
Total params: 20,059,229
Trainable params: 20,059,223
Non-trainable params: 6
_________________________________________________________________


In [19]:
gc.enable()

In [20]:
#Try a combination of epoch lengths and learning rates
#batch_size=128
epochs = 20
learn_rate = 0.0001
opt  = optimizers.Adam(lr=learn_rate)
model.compile(loss='binary_crossentropy',optimizer=opt,metrics=[fbeta])
callbacks = [EarlyStopping(monitor='val_loss', patience=2, verbose=0),
            ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=1, cooldown=0, min_lr=1e-7, verbose=1),
            ModelCheckpoint(filepath='./weights_best.hdf5', verbose=1, save_best_only=True, 
                             save_weights_only=True, mode='auto')]

model.fit(traingen, 
          validation_data=(x_val, y_val),
          verbose=1, 
          epochs=epochs,
          callbacks=callbacks)
        


Epoch 1/20
Epoch 00001: val_loss improved from inf to 0.11871, saving model to ./weights_best.hdf5
Epoch 2/20
Epoch 00002: val_loss improved from 0.11871 to 0.11355, saving model to ./weights_best.hdf5
Epoch 3/20
Epoch 00003: val_loss improved from 0.11355 to 0.10807, saving model to ./weights_best.hdf5
Epoch 4/20
Epoch 00004: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-06.

Epoch 00004: val_loss did not improve from 0.10807
Epoch 5/20
Epoch 00005: val_loss improved from 0.10807 to 0.10029, saving model to ./weights_best.hdf5
Epoch 6/20
Epoch 00006: ReduceLROnPlateau reducing learning rate to 9.999999747378752e-07.

Epoch 00006: val_loss did not improve from 0.10029
Epoch 7/20
Epoch 00007: val_loss improved from 0.10029 to 0.09910, saving model to ./weights_best.hdf5
Epoch 8/20
Epoch 00008: ReduceLROnPlateau reducing learning rate to 1e-07.

Epoch 00008: val_loss did not improve from 0.09910
Epoch 9/20
Epoch 00009: ReduceLROnPlateau reducing learning rate to 1e-07.



<tensorflow.python.keras.callbacks.History at 0x7f2eaa3aa850>

In [21]:
#from keras.models import load_weights
model.load_weights('./weights_best.hdf5')

In [22]:
yfull_test = []
yfull_train = []

p_val = model.predict(x_val, batch_size = 32, verbose=2)
print(fbeta_score(y_val, np.array(p_val) > 0.2, beta=2, average='samples')) #Check the model performance on the validation set

# p_train = model.predict(x_train, batch_size =128, verbose=2) #save the training predictions
# yfull_train.append(p_train)
        
# p_test = model.predict(x_test, batch_size = 128, verbose=2) #save the test predictions
# yfull_test.append(p_test)

# result = np.array(yfull_test[0])
# # for i in range(1, nfolds):
# #     result += np.array(yfull_test[i])
# # result /= nfolds
# result = pd.DataFrame(result, columns = labels)
# result

253/253 - 3s
0.9134671275322496


In [23]:
predictions = model.predict(X_test)

In [25]:
tags = np.array(tag_list)

In [27]:
thres = 0.2
test_labels=[]
for i in tqdm(range(predictions.shape[0]),miniters=1000):
    label = predictions[i,:] > thres
    
    test_labels.append(list(tags[label]))

100%|██████████| 61191/61191 [00:00<00:00, 111491.21it/s]


In [37]:
test_labels

[['primary', 'clear'],
 ['primary', 'clear'],
 ['primary', 'partly_cloudy'],
 ['primary', 'agriculture', 'clear', 'cultivation', 'partly_cloudy'],
 ['primary', 'cloudy', 'partly_cloudy'],
 ['primary', 'clear'],
 ['haze',
  'primary',
  'agriculture',
  'clear',
  'habitation',
  'road',
  'cultivation'],
 ['primary', 'clear', 'habitation', 'road'],
 ['primary', 'clear'],
 ['haze', 'primary', 'agriculture', 'clear', 'cultivation'],
 ['primary', 'partly_cloudy'],
 ['primary', 'agriculture', 'clear', 'cultivation'],
 ['cloudy'],
 ['primary', 'agriculture', 'clear', 'road', 'cultivation'],
 ['primary', 'agriculture', 'clear', 'cultivation'],
 ['primary', 'agriculture', 'clear'],
 ['primary', 'agriculture', 'clear', 'habitation', 'road', 'cultivation'],
 ['primary', 'partly_cloudy'],
 ['primary', 'clear'],
 ['primary', 'clear', 'water', 'road', 'selective_logging'],
 ['primary', 'agriculture', 'clear', 'water', 'road'],
 ['primary', 'agriculture', 'clear', 'water'],
 ['primary', 'agricultur

In [38]:
df_test.head()

Unnamed: 0,image_name,tags
0,test_0,primary clear agriculture road water
1,test_1,primary clear agriculture road water
2,test_2,primary clear agriculture road water
3,test_3,primary clear agriculture road water
4,test_4,primary clear agriculture road water


In [39]:
df_test['tags'] = test_labels

In [40]:
df_test.head()

Unnamed: 0,image_name,tags
0,test_0,"[primary, clear]"
1,test_1,"[primary, clear]"
2,test_2,"[primary, partly_cloudy]"
3,test_3,"[primary, agriculture, clear, cultivation, par..."
4,test_4,"[primary, cloudy, partly_cloudy]"


In [41]:
df_test['tags'] = df_test['tags'].str.join(' ')

In [44]:
df_test.shape

(61191, 2)

In [43]:
df_test.to_csv('submission.csv',index=False)