In [1]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.preprocessing import image
from keras.layers import BatchNormalization
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import tensorflow as tf
from keras.preprocessing.image import ImageDataGenerator
import gc

In [2]:
# Load the training label table into a DataFrame.
# Later cells read its 'image_name' and 'tags' columns.

train_data = pd.read_csv('../input/planet-understanding-the-amazon-from-space/train_v2.csv/train_v2.csv')

In [3]:
# Preview the first rows of the raw label table.
train_data.head()

In [4]:
# Extracting the unique labels from the train dataset

tag_set = set()
def add_tag(tags):
    """Add every whitespace-separated tag in `tags` to the global `tag_set`."""
    tag_set.update(tags.split())

# Plain loop: add_tag is called only for its side effect on tag_set.
for tag_string in train_data['tags']:
    add_tag(tag_string)

# Sort before freezing the order. Iteration order of a set of strings can
# change from one kernel start to the next (string hashing is randomised per
# process), and tag_list fixes both the label-column order and the order used
# to decode predictions, so it must be reproducible.
tag_list = sorted(tag_set)
print(tag_list)

In [5]:
# One 0/1 indicator column per tag: 1 when the tag appears among the
# whitespace-separated entries of the row's 'tags' string, otherwise 0.

for tag in tag_list:
    train_data[tag] = [int(tag in row_tags.split()) for row_tags in train_data['tags']]
train_data.head()

In [6]:
# Reducing the shape of the train dataset due to memory issues on the kaggle notebook
# An explicit copy makes train_df independent of train_data, so deleting
# train_data later cannot leave train_df referring to the parent's buffers,
# and later writes to train_df cannot trigger chained-assignment warnings.

train_df = train_data.iloc[:6000].copy()

In [7]:
# Confirm the size of the reduced training frame.
train_df.shape

In [8]:
# Directory holding the training JPEGs; image paths are built as
# image_directory + <image_name> + '.jpg', so the trailing slash is required.
# NOTE(review): the test-set cells read from '../input/planets-dataset/...';
# confirm this differently-rooted path is the intended training image location.

image_directory = '../input/train-jpg/train-jpg/'

In [None]:
# Side length (pixels) every training image is resized to.
size = 200

# Preallocate the final float32 array and fill it in place. Building a Python
# list of per-image arrays and then calling np.array() on it keeps two full
# copies of the data alive at the peak, which matters when memory is tight.
# float32 is the default dtype of img_to_array (Keras default floatx).
x_dataset = np.empty((train_df.shape[0], size, size, 3), dtype=np.float32)

# Iterate over the column values instead of train_df['image_name'][i]: the
# latter is a label lookup and only works while the index is exactly 0..n-1.
for i, name in enumerate(tqdm(train_df['image_name'])):
    # target_size is (height, width); the channel count is not part of it.
    img = image.load_img(image_directory + name + '.jpg', target_size=(size, size))
    # Scale pixel values from [0, 255] to [0, 1].
    x_dataset[i] = image.img_to_array(img) / 255.0

# Same buffer under the name used by the following cells (no extra copy).
x_train = x_dataset

In [None]:
# Clearing the memory: drop the full label table (only train_df is used from
# here on) and ask the garbage collector to reclaim unreachable objects.
# NOTE: this cell is not re-runnable on its own; a second run raises NameError
# because train_data no longer exists.

del train_data

gc.collect()

In [None]:
# Check the shape of the stacked image array.
x_train.shape

In [None]:
# Defining the features
# x keeps a reference to the full image array; a later cell rebinds the name
# x_train to the training split returned by train_test_split.

x = x_train

In [None]:
# Display the image at index 24 as a visual sanity check of the loading step.
plt.imshow(x_train[24])

In [None]:
# Defining the tag labels
# Select the label columns explicitly, in tag_list order. tag_list is also
# used to decode predictions, so label columns and model outputs stay aligned,
# and any other column added to train_df cannot silently become a label.

y = train_df[tag_list].to_numpy()

In [None]:
# Splitting into train and test sets (80% / 20%, fixed random_state for a
# repeatable split). Note that this rebinds x_train from the full image array
# to the training portion only; the full array remains reachable as x.

x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=20, test_size=0.2)

In [None]:
def fbeta(ytrue, ypred, beta = 2, epsilon = 1e-4):
    """Per-sample F-beta score for multi-label predictions.

    Predictions are binarised at a 0.5 threshold, then true positives, false
    positives and false negatives are counted along axis 1 (one count per row).

    Args:
        ytrue: ground-truth label tensor; cast to float32.
        ypred: predicted score tensor; cast to float32 and thresholded at 0.5.
        beta: weight of recall relative to precision (default 2).
        epsilon: small constant added to each denominator to avoid division by zero.

    Returns:
        A tensor holding one F-beta value per row of the inputs.
    """
    labels = tf.cast(ytrue, tf.float32)
    scores = tf.cast(ypred, tf.float32)
    hard_preds = tf.cast(tf.greater(scores, tf.constant(0.5)), tf.float32)

    hits = tf.reduce_sum(labels * hard_preds, axis = 1)
    false_alarms = tf.reduce_sum(hard_preds, axis = 1) - hits
    misses = tf.reduce_sum(labels, axis = 1) - hits

    precision = hits / (hits + false_alarms + epsilon)
    recall = hits / (hits + misses + epsilon)

    weight = beta**2
    numerator = (1 + weight) * precision * recall
    denominator = weight * precision + recall + epsilon
    return numerator / denominator

In [None]:
def multi_label_acc(ytrue, ypred, epsilon = 1e-4):
    """Per-sample multi-label accuracy: (TP + TN) / (TP + TN + FP + FN + epsilon).

    Predictions are binarised at a 0.5 threshold and every count is taken
    along axis 1, so the result has one value per row of the inputs.

    Args:
        ytrue: ground-truth label tensor; cast to float32.
        ypred: predicted score tensor; cast to float32 and thresholded at 0.5.
        epsilon: small constant added to the denominator.

    Returns:
        A tensor holding one accuracy value per row.
    """
    labels = tf.cast(ytrue, tf.float32)
    hard_preds = tf.cast(tf.greater(tf.cast(ypred, tf.float32), tf.constant(0.5)), tf.float32)

    hits = tf.reduce_sum(labels * hard_preds, axis = 1)
    false_alarms = tf.reduce_sum(hard_preds, axis = 1) - hits
    misses = tf.reduce_sum(labels, axis = 1) - hits

    # A true negative is a position where neither the label nor the
    # prediction is set: not(a) and not(b) == not(a or b).
    either_set = tf.logical_or(tf.cast(labels, tf.bool), tf.cast(hard_preds, tf.bool))
    correct_rejections = tf.reduce_sum(tf.cast(tf.logical_not(either_set), tf.float32), axis = 1)

    correct = hits + correct_rejections
    return correct / (correct + false_alarms + misses + epsilon)

In [None]:
def build_model(input_shape=(200, 200, 3), num_classes=17):
    """Build and compile the CNN used for multi-label tag prediction.

    Architecture: three convolutional stages with 16, 64 and 64 filters
    (5x5 conv + ReLU -> 2x2 max-pool -> batch norm -> 20% dropout), then
    Flatten, Dense(128) and Dense(64) ReLU layers each followed by 50%
    dropout, and a sigmoid output layer with one unit per label.

    Args:
        input_shape: shape of a single input image passed to the first
            Conv2D layer. Defaults to (200, 200, 3), the previously
            hard-coded value.
        num_classes: number of units in the sigmoid output layer. Defaults
            to 17, the previously hard-coded value.

    Returns:
        A compiled Sequential model (Adam optimizer, binary cross-entropy
        loss, metrics multi_label_acc and fbeta).
    """
    model = Sequential()

    # The three conv stages differ only in filter count; the first one also
    # declares the input shape.
    for stage, n_filters in enumerate((16, 64, 64)):
        first_layer_kwargs = {'input_shape': input_shape} if stage == 0 else {}
        model.add(Conv2D(filters=n_filters, kernel_size=(5,5), activation='relu', **first_layer_kwargs))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(BatchNormalization())
        model.add(Dropout(0.2))

    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    # Sigmoid (not softmax): each label is an independent yes/no decision.
    model.add(Dense(num_classes, activation='sigmoid'))

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics = [multi_label_acc, fbeta])

    return model

In [None]:
# Build the compiled CNN and print its layer-by-layer summary.
model = build_model()

model.summary()

In [None]:
# Fit the model on the training split; the held-out split is passed as
# validation data and is evaluated at the end of each epoch.
# NOTE: the same x_test / y_test are used again by model.evaluate in the next
# cell, so the reported "test" metrics are not from an untouched set.

history = model.fit(x_train, y_train, epochs=20, validation_data=(x_test, y_test), batch_size=64)

In [None]:
# Evaluating for the metrics
# model.evaluate returns the loss followed by the metrics in the order given
# to compile(): multi_label_acc, then fbeta.
# The F-beta value is stored as fbeta_score, not fbeta: unpacking into `fbeta`
# would overwrite the metric function of that name with a float, so a later
# re-run of build_model() would try to compile a float as a metric.

test_loss, mla, fbeta_score = model.evaluate(x_test, y_test)

print(f'Test loss: {test_loss}')
print(f'Fbeta score: {fbeta_score}')
print(f'Multilabel accuracy: {mla}')

In [None]:
# Load the sample submission; its 'image_name' column is used below as the
# list of test images to predict.

sample_submission_df = pd.read_csv('../input/planets-dataset/planet/planet/sample_submission.csv')
sample_submission_df.head()

In [None]:
# Image names for the first part of the test set: the first 40669 rows of the
# sample submission.
# NOTE(review): presumably 40669 is the number of images stored under
# 'test-jpg', with the remaining rows living in 'test-jpg-additional' —
# confirm against the directory contents.

test1 = sample_submission_df.iloc[:40669]['image_name']

In [None]:
# Turn the image-name Series into a one-column DataFrame, as required by
# flow_from_dataframe further down.

test_df1 = test1.to_frame(name='image_name')

In [None]:
# Turn each image name into a file name by appending the '.jpg' extension.
# Re-running this cell appends the extension a second time.

test_df1['image_name'] = test_df1['image_name'].map('{}.jpg'.format)

test_df1.head()

In [None]:
# Generating images from the test-jpg directory

# Inference only needs pixel rescaling, matching the /255 scaling applied to
# the training images. validation_split has been dropped: it only takes effect
# when a `subset` is requested from the flow, which never happens here, so it
# was a misleading no-op.
image_gen = ImageDataGenerator(rescale = 1/255)

# class_mode=None yields images only (no labels); shuffle=False keeps the
# prediction order aligned with the generator's `filenames`.
test1_gen = image_gen.flow_from_dataframe(dataframe = test_df1,
                                         directory = '../input/planets-dataset/planet/planet/test-jpg', x_col = 'image_name', y_col = None,
                                         target_size = (200,200), class_mode = None, shuffle = False, batch_size = 128)

In [None]:
# Making predictions
# Number of batches needed to cover every sample; ceil so that a final,
# partially filled batch is still included.

step_test1_size = int(np.ceil(test1_gen.samples / test1_gen.batch_size))

pred1 = model.predict(test1_gen, steps = step_test1_size, verbose = 1)

In [None]:
# File names in the order the generator yielded them.
test1_file_names = test1_gen.filenames

# For every prediction row keep the tags whose score exceeds 0.5 and join
# them into one space-separated string.
pred1_tags = pd.Series([' '.join(np.array(tag_list)[row_scores > 0.5]) for row_scores in pred1])

In [None]:
# Pair each test file name with its predicted tag string.
pred1_result = pd.DataFrame({'image_name': test1_file_names, 'tags': pred1_tags})

pred1_result.head()

In [None]:
# Image names for the second part of the test set: every row of the sample
# submission from position 40669 onward (the complement of test1).
test2 = sample_submission_df.iloc[40669:]['image_name']

In [None]:
# Turn the image-name Series into a one-column DataFrame for flow_from_dataframe.
test_df2 = test2.to_frame(name='image_name')

In [None]:
# Turn each image name into a file name by appending the '.jpg' extension.
# Re-running this cell appends the extension a second time.
test_df2['image_name'] = test_df2['image_name'].map('{}.jpg'.format)

test_df2.head()

In [None]:
# Stream the second part of the test set from the 'test-jpg-additional'
# directory, reusing image_gen. class_mode=None yields images only, and
# shuffle=False keeps predictions aligned with the generator's `filenames`.
test2_gen = image_gen.flow_from_dataframe(dataframe = test_df2,
                                         directory = '../input/planets-dataset/test-jpg-additional/test-jpg-additional', x_col = 'image_name', y_col = None,
                                         target_size = (200,200), class_mode = None, shuffle = False, batch_size = 128)

In [None]:
# Number of batches needed to cover every sample; ceil so that a final,
# partially filled batch is still included.
step_test2_size = int(np.ceil(test2_gen.samples / test2_gen.batch_size))

pred2 = model.predict(test2_gen, steps = step_test2_size, verbose = 1)

In [None]:
# File names in the order the generator yielded them.
test2_file_names = test2_gen.filenames

# For every prediction row keep the tags whose score exceeds 0.5 and join
# them into one space-separated string.
pred2_tags = pd.Series([' '.join(np.array(tag_list)[row_scores > 0.5]) for row_scores in pred2])

In [None]:
# Pair each test file name with its predicted tag string.
pred2_result = pd.DataFrame({'image_name': test2_file_names, 'tags': pred2_tags})

pred2_result.head()

In [None]:
# Stack both prediction tables and renumber the rows 0..n-1, discarding the
# per-table indices.
pred_result = pd.concat([pred1_result, pred2_result]).reset_index(drop=True)

pred_result.head(2)

In [None]:
# Check the row/column count of the combined prediction table.
pred_result.shape

In [None]:
# Remove the '.jpg' extension so names match the submission format.
# Only a trailing '.jpg' is stripped, so re-running this cell is harmless;
# blindly slicing off the last four characters would eat into the image name
# on every additional run.
pred_result['image_name'] = pred_result['image_name'].str.replace(r'\.jpg$', '', regex=True)

In [None]:
# Exporting the prediction result as a csv file, ready for submission.
# index=False leaves the DataFrame's row index out of the file.

pred_result.to_csv('submission.csv', index=False)