### Importing Libraries

In [1]:
#import the necessary libraries

import os # navigating through the folders
import numpy as np 
import pandas as pd 
import matplotlib.image as mpimg
from skimage import io
from scipy import ndimage
import cv2
import gc
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Flatten, BatchNormalization, Conv2D, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, History, ReduceLROnPlateau


In [2]:
# Run a full garbage-collection pass before any data is loaded.
# The call is the cell's last expression, so its return value is echoed as the cell output.
gc.collect() 

43

In [3]:
# Load the two CSV files shipped with the dataset:
#   * sample_submission - the template listing every image that must be predicted
#   * train_classes     - one row per training image with its space-separated tags
sample_submission = pd.read_csv("../input/planets-dataset/planet/planet/sample_submission.csv")
train_classes = pd.read_csv("../input/planets-dataset/planet/planet/train_classes.csv")

In [4]:
# Second garbage-collection pass, run right after the CSV files were read.
gc.collect()

0

In [5]:
# Directories holding the JPEG images.
# The test set is split across two folders: `test-jpg` and `test-jpg-additional`.
train_path = '../input/planets-dataset/planet/planet/train-jpg'
test_path = '../input/planets-dataset/planet/planet/test-jpg'
test_additional_path = '../input/planets-dataset/test-jpg-additional/test-jpg-additional'

In [6]:
def flatten(l):
    """Concatenate an iterable of iterables into one flat list."""
    return [element for inner in l for element in inner]


# Split every row's tag string on single spaces, then collapse to the distinct tags
tags_per_image = [tag_string.split(' ') for tag_string in train_classes['tags'].values]
labels = list(set(flatten(tags_per_image)))

# Map each label name to its position in `labels`
label_map = dict(zip(labels, range(len(labels))))

In [7]:
from itertools import chain

# Every tag occurrence across the training set, in row order
tag_occurrences = chain.from_iterable(
    tag_string.split(" ") for tag_string in train_classes['tags'].values
)
labels_list = list(tag_occurrences)

# Distinct tags; the one-hot columns created later iterate over this set
labels_set = set(labels_list)
print("There is {} unique labels including {}".format(len(labels_set), labels_set))

There is 17 unique labels including {'agriculture', 'blooming', 'road', 'primary', 'water', 'habitation', 'slash_burn', 'partly_cloudy', 'haze', 'cloudy', 'conventional_mine', 'selective_logging', 'blow_down', 'bare_ground', 'artisinal_mine', 'cultivation', 'clear'}


In [8]:
# Convert the space-separated `tags` column into one 0/1 indicator column per label.
# The tag strings are split once up front instead of once per label per row.
# Columns are added in `labels_set` iteration order; later cells read them positionally.
tag_lists = train_classes['tags'].apply(str.split)
for tag in labels_set:
    # `tag=tag` pins the current label for this lambda
    train_classes[tag] = tag_lists.apply(lambda tags, tag=tag: 1 if tag in tags else 0)

# Append the '.jpg' extension to 'image_name' so the names match the files on disk.
# The guard makes the cell safe to re-run: names that already end in '.jpg' are left alone
# (previously a second run produced 'train_0.jpg.jpg').
train_classes['image_name'] = train_classes['image_name'].apply(
    lambda name: name if name.endswith('.jpg') else '{}.jpg'.format(name)
)
train_classes.head()

Unnamed: 0,image_name,tags,agriculture,blooming,road,primary,water,habitation,slash_burn,partly_cloudy,haze,cloudy,conventional_mine,selective_logging,blow_down,bare_ground,artisinal_mine,cultivation,clear
0,train_0.jpg,haze primary,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
1,train_1.jpg,agriculture clear primary water,1,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1
2,train_2.jpg,clear primary,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
3,train_3.jpg,clear primary,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
4,train_4.jpg,agriculture clear habitation primary road,1,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,1


### Reading in the training set

In [9]:

# Every column after 'image_name' and 'tags' is a 0/1 label indicator
y_col = list(train_classes.columns[2:])

# One generator object serves both subsets: pixels are scaled by 1/255 and
# 20% of the rows are held out for validation.
train_image_gen = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1/255,
    validation_split=0.2,
)

# Arguments shared by the training and validation flows
flow_options = dict(
    dataframe=train_classes,
    directory=train_path,
    x_col='image_name',
    y_col=y_col,
    target_size=(224,224),
    class_mode='raw',
    seed=0,
    batch_size=128,
)

# creating the training set
training_set = train_image_gen.flow_from_dataframe(subset='training', **flow_options)

# creating the validation set
validation_set = train_image_gen.flow_from_dataframe(subset='validation', **flow_options)

Found 32384 validated image filenames.
Found 8095 validated image filenames.


In [10]:
# Use the backend that belongs to tensorflow.keras (the package the model is built with)
# rather than the standalone `keras` package.
from tensorflow.keras import backend as K


def fbeta_score_K(y_true, y_pred):
    """Soft, batch-level F-beta score with beta = 2 (beta squared = 4).

    The predictions are NOT thresholded: the raw values in `y_pred` are multiplied
    with `y_true` and summed over the whole tensor, so the result is a smooth
    approximation computed over all entries of the batch at once.

    Args:
        y_true: tensor of ground-truth indicators (same shape as `y_pred`).
        y_pred: tensor of predicted scores.

    Returns:
        A scalar tensor holding the F-beta value.
    """
    beta_squared = 4

    # Align dtypes so the product below also works when labels arrive as integers
    y_true = K.cast(y_true, y_pred.dtype)

    # epsilon keeps the ratios below finite when there are no true positives
    tp = K.sum(y_true * y_pred) + K.epsilon()
    fp = K.sum(y_pred) - tp
    fn = K.sum(y_true) - tp

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)

    result = (beta_squared + 1) * (precision * recall) / (beta_squared * precision + recall + K.epsilon())

    return result

### Using VGG16 as the pre-trained base model

In [11]:
import tensorflow.keras as keras
from tensorflow.keras.applications.vgg16 import VGG16

# VGG16 convolutional base with ImageNet weights and without its original classifier head
vgg_model = VGG16(
    weights='imagenet',
    include_top=False,
    input_shape=(224, 224, 3),
)
vgg_model.summary()

# New head on top of the base: flatten -> 1000-unit ReLU layer -> 17 sigmoid outputs
flat_features = Flatten()(vgg_model.output)
hidden = Dense(1000, activation='relu')(flat_features)
predictions = Dense(17, activation='sigmoid')(hidden)

model = Model(inputs=vgg_model.input, outputs=predictions)

2021-10-19 00:35:02.978446: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-19 00:35:03.066116: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-19 00:35:03.066808: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2021-10-19 00:35:03.068907: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)    

### Training the model

In [12]:
# Stop once the validation loss has not improved for 2 epochs. When early stopping
# triggers, `restore_best_weights=True` rolls the model back to the weights of the
# best epoch; without it the model kept for prediction is the one from the last
# (worse) epoch.
callbacks = [EarlyStopping(monitor='val_loss',
                           patience=2,
                           verbose=0,
                           restore_best_weights=True)]

# Number of batches needed to see every sample once (the last batch may be partial)
step_train_size = int(np.ceil(training_set.samples / training_set.batch_size))
step_val_size = int(np.ceil(validation_set.samples / validation_set.batch_size))

# Each of the 17 outputs is scored on its own (sigmoid + binary cross-entropy)
model.compile(loss='binary_crossentropy', optimizer="Adam", metrics=['accuracy', fbeta_score_K])

# `model.fit` returns the History object holding the per-epoch metrics
history = model.fit(x=training_set, steps_per_epoch=step_train_size, validation_data=validation_set, validation_steps=step_val_size,
         epochs=20, callbacks=callbacks)

2021-10-19 00:35:08.629256: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/20


2021-10-19 00:35:11.710974: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20


In [13]:
# Work on a copy of the submission template and add the '.jpg' extension so the
# names match the files on disk.
sample_copy = sample_submission.copy()
sample_copy['image_name'] = sample_copy['image_name'].apply(lambda name: '{}.jpg'.format(name))

# The test images live in two folders. The first 40669 rows of the template are
# handled here; the remaining rows are handled separately further down.
test1 = sample_copy.iloc[:40669][['image_name']].reset_index(drop=True)
test1.head()

Unnamed: 0,image_name
0,test_0.jpg
1,test_1.jpg
2,test_2.jpg
3,test_3.jpg
4,test_4.jpg


### Reading in the first test data

In [14]:
# Test images get the same 1/255 rescaling as the training images
test_image_gen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1/255)

# Unlabelled, unshuffled stream over the first 40669 images of the submission template
test_gen = test_image_gen.flow_from_dataframe(
    dataframe=test1,
    directory=test_path,
    x_col='image_name',
    y_col=None,
    class_mode=None,
    shuffle=False,
    target_size=(224,224),
    batch_size=128,
)

# Number of batches needed to cover every test image once
step_test_size1 = int(np.ceil(test_gen.samples / test_gen.batch_size))

Found 40669 validated image filenames.


In [15]:
# Reset the generator before predicting, then score the first 40669 images
# of the sample submission dataframe.
test_gen.reset()
pred = model.predict(test_gen, steps=step_test_size1, verbose=1)



In [16]:
# Label names used to decode the predictions.
# They MUST be in the same order as the target columns the model was trained on
# (`y_col`, passed to flow_from_dataframe). Rebuilding the list from a fresh `set`
# gives no ordering guarantee and could silently attach the wrong name to each output.
unique_labels = list(y_col)

# Sanity check: the label columns cover exactly the tags present in the training data
tags_in_data = {tag for tags in train_classes['tags'] for tag in tags.split()}
assert set(unique_labels) == tags_in_data, \
    "label columns do not match the tags present in train_classes"

print(unique_labels)

['agriculture', 'blooming', 'road', 'primary', 'water', 'habitation', 'slash_burn', 'partly_cloudy', 'haze', 'cloudy', 'conventional_mine', 'selective_logging', 'blow_down', 'bare_ground', 'artisinal_mine', 'cultivation', 'clear']


In [17]:
# Image names of the first 40669 test images, taken from the generator
test_file_names1 = test_gen.filenames

# For each image keep the labels whose predicted score is above 0.5 and
# join them into one space-separated tag string.
label_names = np.array(unique_labels)
pred_tags = pd.Series([' '.join(label_names[scores > 0.5]) for scores in pred])

# Pair every image name with its predicted tags
result1 = pd.DataFrame({'image_name': test_file_names1, 'tags': pred_tags})
print(result1.head())



   image_name                       tags
0  test_0.jpg              primary clear
1  test_1.jpg              primary clear
2  test_2.jpg      primary partly_cloudy
3  test_3.jpg  primary cultivation clear
4  test_4.jpg      primary partly_cloudy


In [18]:
# Rows after the first 40669 of the submission template; their image data is
# generated from `test_additional_path`.
test_additional = sample_copy.iloc[40669:][['image_name']].reset_index(drop=True)
test_additional.head()

Unnamed: 0,image_name
0,file_0.jpg
1,file_1.jpg
2,file_10.jpg
3,file_100.jpg
4,file_1000.jpg


### Reading in the additional testing files

In [19]:
# Same 1/255 rescaling for the remaining images of the submission template
test_image_gen2 = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1/255)

# Unlabelled, unshuffled stream over the additional test folder
test_gen2 = test_image_gen2.flow_from_dataframe(
    dataframe=test_additional,
    directory=test_additional_path,
    x_col='image_name',
    y_col=None,
    class_mode=None,
    shuffle=False,
    target_size=(224,224),
    batch_size=128,
)

# Number of batches needed to cover every remaining image once
step_test_size2 = int(np.ceil(test_gen2.samples / test_gen2.batch_size))

# Reset the generator before predicting, then score the remaining images
test_gen2.reset()
pred2 = model.predict(test_gen2, steps=step_test_size2, verbose=1)

Found 20522 validated image filenames.


In [20]:
# Image names of the remaining test images, taken from the generator
test_file_names2 = test_gen2.filenames

# Keep the labels scoring above 0.5 for each image and join them into a tag string
label_names2 = np.array(unique_labels)
pred_tags2 = pd.Series([' '.join(label_names2[scores > 0.5]) for scores in pred2])

# Pair every image name with its predicted tags
result2 = pd.DataFrame({'image_name': test_file_names2, 'tags': pred_tags2})

In [21]:
# joining the result of the test data and the additional files
final_result = pd.concat([result1, result2]) 
final_result = final_result.reset_index().drop('index', axis=1) # reseting the index of the dataframe so it 
                                                                # matches that of sample submis

In [22]:
# Remove the '.jpg' extension again so the names match the submission format.
# Only names that actually end in '.jpg' are trimmed, which makes the cell safe to
# re-run (an unconditional `[:-4]` would cut four more characters on every run).
final_result['image_name'] = final_result['image_name'].apply(
    lambda name: name[:-4] if name.endswith('.jpg') else name
)
final_result.head()

Unnamed: 0,image_name,tags
0,test_0,primary clear
1,test_1,primary clear
2,test_2,primary partly_cloudy
3,test_3,primary cultivation clear
4,test_4,primary partly_cloudy


### Final Submission

In [23]:
# Write the predictions to 'second_submission.csv'; index=False leaves the row index out of the file.
final_result.to_csv('second_submission.csv', index=False) # saving the predictions