In [21]:
import numpy as np
import pandas as pd
train_classes = pd.read_csv('/content/train_classes.csv')
train_classes.head()

Unnamed: 0,image_name,tags
0,train_0,haze primary
1,train_1,agriculture clear primary water
2,train_2,clear primary
3,train_3,clear primary
4,train_4,agriculture clear habitation primary road


In [22]:
labels = set()
def splitting_tags(tags):
    for tag in tags.split():
        labels.add(tag)
train_classes1 = train_classes.copy()
train_classes1['tags'].apply(splitting_tags)
labels = list(labels)
print(labels)


assert len(train_classes1['image_name'].unique()) == train_classes1.shape[0]

for tag in labels:
    train_classes1[tag] = train_classes1['tags'].apply(lambda x: 1 if tag in x.split() else 0)
    

train_classes1['image_name'] = train_classes1['image_name'].apply(lambda x: '{}.jpg'.format(x))
train_classes1.head()

['selective_logging', 'agriculture', 'conventional_mine', 'haze', 'partly_cloudy', 'cultivation', 'habitation', 'bare_ground', 'road', 'slash_burn', 'blow_down', 'cloudy', 'artisinal_mine', 'clear', 'primary', 'blooming', 'water']


Unnamed: 0,image_name,tags,selective_logging,agriculture,conventional_mine,haze,partly_cloudy,cultivation,habitation,bare_ground,road,slash_burn,blow_down,cloudy,artisinal_mine,clear,primary,blooming,water
0,train_0.jpg,haze primary,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
1,train_1.jpg,agriculture clear primary water,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1
2,train_2.jpg,clear primary,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
3,train_3.jpg,clear primary,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0
4,train_4.jpg,agriculture clear habitation primary road,0,1,0,0,0,0,1,0,1,0,0,0,0,1,1,0,0


In [20]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Conv2D, MaxPooling2D
from tensorflow.keras.layers import Dropout, Flatten
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from tensorflow.keras.preprocessing.image import ImageDataGenerator


columns = list(train_classes1.columns[2:]) 
columns

['selective_logging',
 'agriculture',
 'conventional_mine',
 'haze',
 'partly_cloudy',
 'cultivation',
 'habitation',
 'bare_ground',
 'road',
 'slash_burn',
 'blow_down',
 'cloudy',
 'artisinal_mine',
 'clear',
 'primary',
 'blooming',
 'water']

In [23]:
def fbeta(y_true, y_pred, beta = 2, epsilon = 1e-4):
    
    beta_squared = beta**2
    
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(tf.greater(tf.cast(y_pred, tf.float32), tf.constant(0.5)), tf.float32)
    
    tp = tf.reduce_sum(y_true * y_pred, axis = 1)
    fp = tf.reduce_sum(y_pred, axis = 1) - tp
    fn = tf.reduce_sum(y_true, axis = 1) - tp
    
    precision = tp/(tp+fp+epsilon)
    recall = tp/(tp+fn+epsilon)
    
    fb = (1+beta_squared)*precision*recall / (beta_squared*precision+recall+epsilon)
    return fb

In [24]:
def multi_label_acc(y_true, y_pred, epsilon = 1e-4):
    
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(tf.greater(tf.cast(y_pred, tf.float32), tf.constant(0.5)), tf.float32)
    
    tp = tf.reduce_sum(y_true * y_pred, axis = 1)
    fp = tf.reduce_sum(y_pred, axis = 1) - tp
    fn = tf.reduce_sum(y_true, axis = 1) - tp
    
    y_true = tf.cast(y_true, tf.bool)
    y_pred = tf.cast(y_pred, tf.bool)
        
    tn = tf.reduce_sum(tf.cast(tf.logical_not(y_true), tf.float32) * tf.cast(tf.logical_not(y_pred), tf.float32), 
                       axis = 1)
    return (tp+tn)/(tp+tn+fp+fn+epsilon)

In [25]:
def build_model():
    model = Sequential()
    model.add(BatchNormalization(input_shape=(128, 128, 3)))
    model.add(Conv2D(32, kernel_size=(3, 3), padding='same', activation='relu'))
    model.add(Conv2D(32, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.2))

    model.add(Conv2D(64, kernel_size=(3, 3), padding='same', activation='relu'))
    model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.2))

    model.add(Conv2D(128, kernel_size=(3, 3), padding='same', activation='relu'))
    model.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.2))

    model.add(Conv2D(256, kernel_size=(3, 3), padding='same', activation='relu'))
    model.add(Conv2D(256, kernel_size=(3, 3), activation='relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.2))

    model.add(Flatten())
    model.add(Dense(512, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(17, activation='sigmoid'))

    opt = Adam(lr=1e-4)

    model.compile(loss='binary_crossentropy',
              
              optimizer=opt,
              metrics=[multi_label_acc, fbeta])

    return model

In [26]:
save_best_check_point = ModelCheckpoint(filepath = 'best_model.hdf5', monitor = 'val_fbeta', mode = 'max',
                                       save_best_only = True, save_weights_only = True)

In [27]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [28]:
!unzip gdrive/MyDrive/Train-jpg.zip

Archive:  gdrive/MyDrive/Train-jpg.zip
replace train-jpg/train_0.jpg? [y]es, [n]o, [A]ll, [N]one, [r]ename: N


In [29]:
train_image_gen = ImageDataGenerator(rescale = 1/255, validation_split = 0.2)

train_generator = train_image_gen.flow_from_dataframe(dataframe=train_classes1,
                                                directory ="/content/train-jpg",  
                                                x_col="image_name", y_col=columns, subset="training", 
                                                batch_size=16,seed=2021, shuffle=True, 
                                                class_mode="raw", target_size=(128,128))

#generating validation data which is expected to be 20% of the train dataset since validation split is 0.2
val_generator = train_image_gen.flow_from_dataframe(dataframe=train_classes1,
                                                directory ="/content/train-jpg",  
                                                x_col="image_name", y_col=columns, subset="validation", 
                                                batch_size=16,seed=2021, shuffle=True, 
                                                class_mode="raw", target_size=(128,128))

Found 32384 validated image filenames.
Found 8095 validated image filenames.


In [30]:
step_train_size = int(np.ceil(train_generator.samples / train_generator.batch_size))
step_val_size = int(np.ceil(val_generator.samples / val_generator.batch_size))

model1 = build_model()

model1.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
batch_normalization_1 (Batch (None, 128, 128, 3)       12        
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 128, 128, 32)      896       
_________________________________________________________________
conv2d_9 (Conv2D)            (None, 126, 126, 32)      9248      
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 63, 63, 32)        0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 63, 63, 32)        0         
_________________________________________________________________
conv2d_10 (Conv2D)           (None, 63, 63, 64)        18496     
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 61, 61, 64)       

  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [31]:
model1.fit(x = train_generator, steps_per_epoch = step_train_size, validation_data = val_generator, 
           validation_steps = step_val_size,epochs = 1, 
           callbacks = [save_best_check_point])



<keras.callbacks.History at 0x7f6ca838dcd0>

In [32]:
model2 = build_model()

  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [34]:
sample_submission = pd.read_csv('/content/sample_submission.csv')
sample_submission1 = sample_submission.copy()
sample_submission1['image_name'] = sample_submission1['image_name'].apply(lambda x: '{}.jpg'.format(x))
sample_submission1.head()

Unnamed: 0,image_name,tags
0,test_0.jpg,primary clear agriculture road water
1,test_1.jpg,primary clear agriculture road water
2,test_2.jpg,primary clear agriculture road water
3,test_3.jpg,primary clear agriculture road water
4,test_4.jpg,primary clear agriculture road water


In [35]:
model2.load_weights('best_model.hdf5')

test1_df = sample_submission1.iloc[:40669]['image_name'].reset_index().drop('index', axis =1)
test1_df.head()

Unnamed: 0,image_name
0,test_0.jpg
1,test_1.jpg
2,test_2.jpg
3,test_3.jpg
4,test_4.jpg


In [37]:
!unzip gdrive/MyDrive/test-jpg.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: test-jpg/test_5499.jpg  
  inflating: test-jpg/test_55.jpg    
  inflating: test-jpg/test_550.jpg   
  inflating: test-jpg/test_5500.jpg  
  inflating: test-jpg/test_5501.jpg  
  inflating: test-jpg/test_5502.jpg  
  inflating: test-jpg/test_5503.jpg  
 extracting: test-jpg/test_5504.jpg  
  inflating: test-jpg/test_5505.jpg  
  inflating: test-jpg/test_5506.jpg  
  inflating: test-jpg/test_5507.jpg  
  inflating: test-jpg/test_5508.jpg  
  inflating: test-jpg/test_5509.jpg  
  inflating: test-jpg/test_551.jpg   
  inflating: test-jpg/test_5510.jpg  
  inflating: test-jpg/test_5511.jpg  
  inflating: test-jpg/test_5512.jpg  
  inflating: test-jpg/test_5513.jpg  
  inflating: test-jpg/test_5514.jpg  
  inflating: test-jpg/test_5515.jpg  
  inflating: test-jpg/test_5516.jpg  
  inflating: test-jpg/test_5517.jpg  
  inflating: test-jpg/test_5518.jpg  
  inflating: test-jpg/test_5519.jpg  
  inflating: test-jpg/t

In [40]:
test_image_gen = ImageDataGenerator(rescale = 1/255)


#creating a generator for the images found in the first test image files
test_generator1 = test_image_gen.flow_from_dataframe(dataframe=test1_df, 
                                                directory="/content/test-jpg", 
                                                x_col="image_name", y_col=None, batch_size=16, 
                                                shuffle=False, class_mode=None, target_size=(128,128))

step_test_size1 = int(np.ceil(test_generator1.samples/test_generator1.batch_size))

Found 40669 validated image filenames.


In [41]:
test_generator1.reset()
pred1 = model2.predict(test_generator1, steps = step_test_size1, verbose = 1)



In [42]:
file_names1 = test_generator1.filenames


pred_tags1 = pd.DataFrame(pred1)
pred_tags1 = pred_tags1.apply(lambda x: ' '.join(np.array(labels)[x>0.5]), axis = 1)


result1 = pd.DataFrame({'image_name': file_names1, 'tags': pred_tags1})
result1.head()

Unnamed: 0,image_name,tags
0,test_0.jpg,clear primary
1,test_1.jpg,clear primary
2,test_2.jpg,partly_cloudy primary
3,test_3.jpg,clear primary
4,test_4.jpg,partly_cloudy primary


In [43]:
test2_df = sample_submission1.iloc[40669:]['image_name'].reset_index().drop('index', axis =1)
test2_df.head()

Unnamed: 0,image_name
0,file_0.jpg
1,file_1.jpg
2,file_10.jpg
3,file_100.jpg
4,file_1000.jpg


In [44]:
!unzip gdrive/MyDrive/Test-jpg-additional.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: test-jpg-additional/file_5499.jpg  
  inflating: test-jpg-additional/file_55.jpg  
  inflating: test-jpg-additional/file_550.jpg  
  inflating: test-jpg-additional/file_5500.jpg  
  inflating: test-jpg-additional/file_5501.jpg  
  inflating: test-jpg-additional/file_5502.jpg  
  inflating: test-jpg-additional/file_5503.jpg  
  inflating: test-jpg-additional/file_5504.jpg  
  inflating: test-jpg-additional/file_5505.jpg  
  inflating: test-jpg-additional/file_5506.jpg  
  inflating: test-jpg-additional/file_5507.jpg  
  inflating: test-jpg-additional/file_5508.jpg  
  inflating: test-jpg-additional/file_5509.jpg  
  inflating: test-jpg-additional/file_551.jpg  
  inflating: test-jpg-additional/file_5510.jpg  
  inflating: test-jpg-additional/file_5511.jpg  
  inflating: test-jpg-additional/file_5512.jpg  
  inflating: test-jpg-additional/file_5513.jpg  
  inflating: test-jpg-additional/file_5514.jpg  
  inflat

In [45]:
test_generator2 = test_image_gen.flow_from_dataframe(dataframe=test2_df, 
                                                directory="/content/test-jpg-additional", 
                                                x_col="image_name", y_col=None, batch_size=16, 
                                                shuffle=False, class_mode=None, target_size=(128,128))

step_test_size2 = int(np.ceil(test_generator2.samples/test_generator2.batch_size))

Found 20522 validated image filenames.


In [46]:
test_generator2.reset()
pred2 = model2.predict(test_generator2, steps = step_test_size2, verbose = 1)



In [47]:
file_names2 = test_generator2.filenames
pred_tags2 = pd.DataFrame(pred2)
pred_tags2 = pred_tags2.apply(lambda x: ''.join(np.array(labels)[x>0.5]), axis = 1)

result2 = pd.DataFrame({'image_name': file_names2, 'tags': pred_tags2})
result2.head()

Unnamed: 0,image_name,tags
0,file_0.jpg,clearprimary
1,file_1.jpg,agricultureroadclearprimary
2,file_10.jpg,cloudy
3,file_100.jpg,agricultureclearprimary
4,file_1000.jpg,clearprimary


In [48]:
last_result = pd.concat([result1, result2])

last_result = last_result.reset_index().drop('index', axis =1)

print(last_result.shape)
last_result.head()

(61191, 2)


Unnamed: 0,image_name,tags
0,test_0.jpg,clear primary
1,test_1.jpg,clear primary
2,test_2.jpg,partly_cloudy primary
3,test_3.jpg,clear primary
4,test_4.jpg,partly_cloudy primary


In [49]:
last_result['image_name'] = last_result['image_name'].apply(lambda x: x[:-4])
last_result.head()

Unnamed: 0,image_name,tags
0,test_0,clear primary
1,test_1,clear primary
2,test_2,partly_cloudy primary
3,test_3,clear primary
4,test_4,partly_cloudy primary


In [51]:
last_result.to_csv('submission1.csv', index = False)

In [52]:
from google.colab import files
files.download('submission1.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>