## 1. PREPROCESS DATA



In [None]:
import tensorflow as tf
print(tf.__version__)
print(tf.config.list_physical_devices('GPU'))
tf.random.set_seed(142)

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, BatchNormalization
from tensorflow.keras.layers import Conv2D, MaxPooling2D
import tensorflow_addons as tfa
from tensorflow_addons.metrics import FBetaScore 

import tqdm.notebook as tq
import os
import logging
import warnings
warnings.filterwarnings('ignore')
logging.getLogger("tensorflow").setLevel(logging.ERROR)

In [None]:
# Input layout: the dataset root contains a nested "planet" folder that holds
# the training JPEGs and the CSV with the image tags.
PROJECT_FOLDER = '../input/planets-dataset/planet'
DATA_PATH = os.path.join(PROJECT_FOLDER, "planet")
TRAIN_JPG_DIR, TRAIN_CSV_PATH = (
    os.path.join(DATA_PATH, leaf) for leaf in ("train-jpg", "train_classes.csv")
)

In [None]:
df = pd.read_csv(TRAIN_CSV_PATH)
df.head(5)

One-hot encode the labels.

In [None]:
# Expand the space-separated `tags` string into one 0/1 indicator column per
# tag and append those columns to the original frame.
dummies = df['tags'].str.get_dummies(sep=' ')
df = pd.concat([df, dummies], axis='columns')

# Tag names (the indicator column names) and how many there are.
labels = dummies.columns.values
N_LABELS = labels.size
dummies

In [None]:
print(f"There are {N_LABELS} unique labels including {labels}")

In [None]:
# Number of images carrying each label, least frequent first.
label_count = dummies.sum().sort_values(ascending=True)
print(label_count)

In [None]:
# Horizontal bar chart of the label frequencies, annotated with the raw counts.
label_count.plot.barh(figsize=(15, 10))
for i, count in enumerate(label_count):
    # Place the number just to the right of the end of the i-th bar.
    plt.text(count + 4, i, count, va='center')

In [None]:
# Pick one example image file per label.
# Rows are selected through the label's one-hot indicator column instead of a
# substring search on `tags`, so "cloudy" no longer also matches
# "partly_cloudy" rows. As before, the i-th matching row is used for the i-th
# label.
images_title = [
    df.loc[df[label] == 1, 'image_name'].iloc[i] + '.jpg'
    for i, label in enumerate(labels)
]

In [None]:
# 6 x 3 grid of panels; each label's example image goes into its own panel.
_, axs = plt.subplots(nrows=6, ncols=3, sharex='col', sharey='row', figsize=(10, 20))
axs = axs.ravel()

for i, (image_name, label) in enumerate(zip(images_title, labels)):
    panel = axs[i]
    panel.imshow(plt.imread(os.path.join(TRAIN_JPG_DIR, image_name)))
    panel.set_title(f'{image_name} - {label}')

In [None]:
pip install iterative-stratification

In [None]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [None]:
y = df[labels].values
X = df['image_name'].values

# Assign every row to exactly one of 5 multilabel-stratified folds.
# The fold ids are collected in a separate array and written by column name,
# so the result does not depend on 'fold' being the last column of `df`
# (which is no longer true once this cell is run a second time).
mskf = MultilabelStratifiedKFold(n_splits=5)
fold_ids = np.full(len(df), -1, dtype='int')
for i, (_, test_index) in enumerate(mskf.split(X, y)):
    fold_ids[test_index] = i
df['fold'] = fold_ids

# Fold 0 is flagged as the validation split. A single column assignment
# replaces the chained `df['is_valid'][mask] = True`, which writes through an
# intermediate object and is not guaranteed to update `df`.
df['is_valid'] = df['fold'] == 0

In [None]:
# Number of observations of each tag in each fold.
df.groupby('fold')[labels].sum()

In [None]:
# Row counts of the validation split and of everything else.
VAL_SIZE = df['is_valid'].sum()
TRAIN_SIZE = len(df) - VAL_SIZE

In [None]:
# Converting the values into features

def _image_feature(value):
    """Return a single-element bytes_list Feature holding `value` JPEG-encoded."""
    jpeg_bytes = tf.io.encode_jpeg(value).numpy()
    payload = tf.train.BytesList(value=[jpeg_bytes])
    return tf.train.Feature(bytes_list=payload)

def _int64_feature(value):
    """Return an int64_list Feature from an int or a list/tuple of ints.

    Args:
        value: a single integer, or a list or tuple of integers. A single
            value is wrapped into a one-element list.
    """
    # isinstance also accepts tuples and list subclasses, which the previous
    # exact type comparison wrapped into a nested one-element list.
    if not isinstance(value, (list, tuple)):
        value = [value]
    return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

def _bytes_feature(value):
    """Return a single-element bytes_list Feature from a string / byte value."""
    eager_tensor_type = type(tf.constant(0))
    if isinstance(value, eager_tensor_type):
        # The value is a tensor: unwrap it to its numpy value before storing it.
        value = value.numpy()
    payload = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=payload)

def serialize_array(array):
    """Serialize `array` with `tf.io.serialize_tensor` and return the result."""
    return tf.io.serialize_tensor(array)


def image_feature(path, label):
    """Build a `tf.train.Example` for the image file at `path` and its `label`.

    The Example stores the first three entries of the image's shape
    ('height', 'width', 'channel'), the serialized pixel array ('image') and
    the label ('label').
    """
    pixels = plt.imread(path)
    shape = pixels.shape

    fields = {}
    fields['height'] = _int64_feature(shape[0])
    fields['width'] = _int64_feature(shape[1])
    fields['channel'] = _int64_feature(shape[2])
    fields['image'] = _bytes_feature(serialize_array(pixels))
    fields['label'] = _int64_feature(label)

    features = tf.train.Features(feature=fields)
    return tf.train.Example(features=features)


def create_record(df, record_name):
    """Write every image listed in `df`, with its labels, to a TFRecord file.

    Args:
        df: frame with an `image_name` column (file name without extension)
            and one column per entry of the module-level `labels`.
        record_name: output path without the `.tfrecords` extension.
    """
    all_image_paths = df['image_name'].apply(lambda x: os.path.join(TRAIN_JPG_DIR, x + '.jpg')).values
    all_labels = df[labels].values

    record_path = f"{record_name}.tfrecords"
    # The context manager closes the writer even when reading or encoding an
    # image raises part-way through.
    with tf.io.TFRecordWriter(record_path) as writer:
        for path, label in tq.tqdm(zip(all_image_paths, all_labels), total=len(all_image_paths)):
            example = image_feature(path, label.tolist())
            writer.write(example.SerializeToString())

In [None]:
# for i in range(5): 
#     create_record(df[df['fold'] == i], f'fold_{i}')

In [None]:
#arch
# Sorted so that the positional train/validation selection (RECORDS[:4] and
# RECORDS[4]) does not depend on the order in which the files are listed.
RECORDS = sorted(tf.io.gfile.glob('../input/tfrecord-files/*.tfrecords'))
RECORDS

In [None]:
IMG_WIDTH = IMG_HEIGHT = 192
CHANNELS = 3

def read_tfrecord(example):
    """Parse one serialized training Example into an `(image, label)` pair.

    The image is restored to its stored shape, reduced to its first three
    channels when it has four, and resized to the module-level target size.
    The label is the 17-element int64 vector stored in the record.
    """
    feature_spec = {
        "height": tf.io.FixedLenFeature([], tf.int64),
        "width": tf.io.FixedLenFeature([], tf.int64),
        "channel": tf.io.FixedLenFeature([], tf.int64),
        "image": tf.io.FixedLenFeature([], tf.string),
        "label": tf.io.FixedLenFeature([17], tf.int64, default_value=[0] * 17),
    }
    parsed = tf.io.parse_single_example(example, feature_spec)

    height, width, channel = parsed['height'], parsed['width'], parsed['channel']

    # The pixels were written with tf.io.serialize_tensor: decode them and
    # restore the original (height, width, channel) shape.
    image = tf.io.parse_tensor(parsed['image'], out_type=tf.uint8)
    image = tf.reshape(image, shape=[height, width, channel])
    if channel == 4:
        # Keep only the first three channels.
        image = image[:, :, :3]

    # NOTE(review): tf.image.resize expects [height, width]; the width-first
    # order here is harmless only while both sizes are equal — confirm before
    # using a non-square size.
    image = tf.image.resize(image, [IMG_WIDTH, IMG_HEIGHT])

    return (image, parsed['label'])

In [None]:
BATCH_SIZE = 64              # examples per batch
SHUFFLE_BUFFER_SIZE = 1024   # buffer size passed to Dataset.shuffle
AUTOTUNE = tf.data.AUTOTUNE  # let tf.data choose parallelism / prefetch depth

In [None]:
def augmentation(image, label):
    """Randomly perturb `image`; `label` is returned unchanged.

    Applies, in order: random brightness, random contrast, random horizontal
    flip, random vertical flip.
    """
    # NOTE(review): a brightness delta of 0.1 is tiny if pixel values are on a
    # 0-255 scale — confirm the intended value range.
    image = tf.image.random_brightness(image, max_delta=.1)
    image = tf.image.random_contrast(image, 0.0, 1.0)
    image = tf.image.random_flip_up_down(tf.image.random_flip_left_right(image))
    return image, label

def load_dataset(filenames, shuffle=False, augment=False):
    """Build a batched, prefetched dataset of `(image, label)` pairs.

    Args:
        filenames: path, or list of paths, of TFRecord files to read.
        shuffle: when true, shuffle with a `SHUFFLE_BUFFER_SIZE` buffer and
            repeat the data indefinitely.
        augment: when true, apply `augmentation` to every example.

    Returns:
        A `tf.data.Dataset` yielding batches of `BATCH_SIZE` examples.
    """
    dataset = tf.data.TFRecordDataset(filenames)
    dataset = dataset.map(read_tfrecord, num_parallel_calls=AUTOTUNE)

    if shuffle:
        dataset = dataset.shuffle(buffer_size=SHUFFLE_BUFFER_SIZE).repeat()

    if augment:
        # Dataset transformations return a new dataset rather than modifying
        # the existing one; the result has to be kept, otherwise the
        # augmentation is silently dropped.
        dataset = dataset.map(augmentation, num_parallel_calls=AUTOTUNE)

    return dataset.batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)

In [None]:
RECORDS

In [None]:
# The first four record files feed the shuffled, repeating training set;
# the fifth record file is held out for validation.
train_ds = load_dataset(RECORDS[:4], shuffle=True, augment=True)
val_ds = load_dataset(RECORDS[4], shuffle=False, augment=False)

In [None]:
# Preview the image at index 1 of the first training batch, scaled to [0, 1].
for images, _ in train_ds.take(1):
    preview = images[1].numpy() / 255.
    plt.imshow(preview)
    plt.show()

In [None]:
sample = df[df['fold'] == 0]
sample.head()

In [None]:
sample[labels].sum() / df[labels].sum()

In [None]:
def build_model(trainable=False, fine_tune_at=0):
    """Build a MobileNetV3Large-based multi-label classifier.

    Args:
        trainable: when true, the backbone is trainable except for its first
            `fine_tune_at` layers; otherwise the whole backbone is frozen.
        fine_tune_at: number of leading backbone layers kept frozen when
            `trainable` is true.

    Returns:
        A `tf.keras.Model` mapping an image batch to `N_LABELS` sigmoid scores.
    """
    backbone = tf.keras.applications.MobileNetV3Large(
        input_shape=(IMG_WIDTH, IMG_HEIGHT, CHANNELS), include_top=False)

    backbone.trainable = bool(trainable)
    if trainable:
        # Keep the earliest layers frozen and fine-tune only the rest.
        for layer in backbone.layers[:fine_tune_at]:
            layer.trainable = False

    # `inputs` / `outputs` avoid shadowing the builtin `input`.
    inputs = tf.keras.Input(shape=(IMG_WIDTH, IMG_HEIGHT, CHANNELS), name='input')
    x = tf.keras.applications.mobilenet_v3.preprocess_input(inputs)
    x = backbone(x)
    # The Dense layer is applied to the backbone's feature map before the
    # global average pooling.
    x = tf.keras.layers.Dense(1024, activation='relu')(x)
    x = tf.keras.layers.GlobalAveragePooling2D()(x)
    outputs = tf.keras.layers.Dense(N_LABELS, activation='sigmoid')(x)
    return tf.keras.Model(inputs, outputs)

In [None]:
model = build_model()
model.summary()

In [None]:
# Smoke test: run the freshly built model on the first training batch only.
batch = next(iter(train_ds))
print(model.predict(batch[0]))

In [None]:
ES_PATIENCE = 5     # `patience` given to the EarlyStopping callback
RLROP_PATIENCE = 3  # `patience` given to the ReduceLROnPlateau callback
DECAY_DROP = 0.5    # `factor` given to the ReduceLROnPlateau callback

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau


In [None]:
# Both callbacks watch the validation loss, where lower is better.
early_stop = EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=ES_PATIENCE,
    restore_best_weights=True,
    verbose=1,
)
lr_decay = ReduceLROnPlateau(
    monitor='val_loss',
    mode='min',
    patience=RLROP_PATIENCE,
    factor=DECAY_DROP,
    min_lr=1e-6,
    verbose=1,
)

# Note: despite its name, this is the list of both callbacks.
cp_callback = [early_stop, lr_decay]

In [None]:
LR = 1e-5
EPOCHS = 60
# Batches per epoch, rounded up so that a final partial batch is counted.
# Plain Python ints are used for the step counts instead of the float tensors
# that tf.math.ceil returns.
num_steps_train = int(np.ceil(TRAIN_SIZE / BATCH_SIZE))
num_steps_val = int(np.ceil(VAL_SIZE / BATCH_SIZE))

# Support-weighted F2 over all labels, with scores thresholded at 0.2.
fbeta = FBetaScore(num_classes=N_LABELS,
                   average='weighted',
                   beta=2.0,
                   threshold=0.2,
                   name='fbeta')

In [None]:
# Compile model with optimizer
# Adam at learning rate LR, binary cross-entropy on the sigmoid outputs,
# and F-beta plus AUC tracked as metrics.
model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = LR),
               loss = 'binary_crossentropy',
               metrics = [fbeta, tf.keras.metrics.AUC()])

In [None]:
# Train model
# `cp_callback` is already the list [early_stop, lr_decay]. Wrapping it in
# another list together with `lr_decay` produced a nested list that names
# `lr_decay` twice, so the callback list is passed as-is.
history = model.fit(train_ds,
                    steps_per_epoch=num_steps_train,
                    epochs=EPOCHS,
                    validation_data=val_ds,
                    validation_steps=num_steps_val,
                    callbacks=cp_callback,
                    verbose=2)

In [None]:
 def plot_stats(training_stats, val_stats, x_label='Training Steps', stats='loss'):
    """Plot a training curve and a validation curve of one metric in a new figure.

    Args:
        training_stats: sequence of training values of the metric.
        val_stats: sequence of validation values; spread evenly over the same
            x-range as the training values.
        x_label: x-axis label (title-cased before display).
        stats: metric name (title-cased before display). For 'loss' the legend
            goes to the upper right, otherwise to the lower right.
    """
    # Choose the legend corner from the raw metric name *before* title-casing
    # it; comparing the title-cased 'Loss' with 'loss' could never match.
    legend_loc = 'upper right' if stats.lower() == 'loss' else 'lower right'
    stats, x_label = stats.title(), x_label.title()
    training_steps = len(training_stats)
    test_steps = len(val_stats)

    plt.figure()
    plt.ylabel(stats)
    plt.xlabel(x_label)
    plt.plot(training_stats, label='Training ' + stats)
    plt.plot(np.linspace(0, training_steps, test_steps), val_stats, label='Validation ' + stats)
    # Anchor the y-axis at zero and keep the automatically chosen upper limit.
    plt.ylim([0, max(plt.ylim())])
    plt.legend(loc=legend_loc)
    plt.show()

In [None]:
# plot_stats opens its own figure for every call, so a figure created here
# would only be rendered as an empty extra figure; none is created.
plot_stats(history.history['loss'], history.history['val_loss'], x_label='Epochs', stats='loss')
plot_stats(history.history['fbeta'], history.history['val_fbeta'], x_label='Epochs', stats='fbeta');

In [None]:
# Save the trained model to an .h5 file under the Kaggle working directory.
SAVE_PATH = '/kaggle/working/model/1_full_decay.h5'
model.save(SAVE_PATH)

In [None]:
model = tf.keras.models.load_model(SAVE_PATH)

In [None]:
def read_tfrecord_label_only(example):
    """Parse one serialized training Example and return only its label vector.

    Only the 'label' feature is requested from the parser; the other stored
    features (height, width, channel, image) are not needed here and were
    previously extracted into unused locals.

    Returns:
        The 17-element int64 label tensor of the example.
    """
    tfrecord_format = {
        "label": tf.io.FixedLenFeature([17], tf.int64, default_value=np.zeros((17,)).astype('int').tolist())
    }
    example = tf.io.parse_single_example(example, tfrecord_format)
    return example['label']

In [None]:
# Label-only view of the first four record files: read in file order,
# without shuffling or batching.
train_label_ds = tf.data.TFRecordDataset(RECORDS[:4])
train_label_ds = train_label_ds.map(read_tfrecord_label_only, num_parallel_calls=AUTOTUNE)

In [None]:
# Collect every label vector of the label-only dataset into one array.
true = np.array(list(train_label_ds.as_numpy_iterator()))
true.shape

In [None]:
train_image_ds = load_dataset(RECORDS[:4], shuffle=False, augment=False)

In [None]:
# Score the unshuffled images, then binarise the scores at 0.2.
predictions = model.predict(train_image_ds)
final_predictions = np.greater(predictions, 0.2).astype('int')
final_predictions.shape

In [None]:
from sklearn.metrics import fbeta_score
fbeta_score(true, final_predictions, average='weighted', beta=2)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(true, final_predictions))

In [None]:
# Take the first batch and keep (up to) its first 40 images, label masks and
# the matching rows of the binarised predictions.
data = next(iter(train_image_ds.take(1)))

sample_images = data[0].numpy().astype('int')[:40]
sample_labels = data[1].numpy().astype('bool')[:40]
sample_predictions = final_predictions[:40]

In [None]:
# 10 x 4 grid: each sample image titled with its predicted and true tags.
fig, axes = plt.subplots(nrows=10, ncols=4, figsize=(20, 30))
axes = axes.ravel()

for i, (image, label) in enumerate(zip(sample_images, sample_labels)):
    panel = axes[i]
    panel.imshow(image)
    predicted_names = ', '.join(labels[sample_predictions[i] == 1])
    true_names = ', '.join(labels[label])
    panel.set_title(f"PREDICT: {predicted_names} \nCORRECT: {true_names}")

plt.subplots_adjust(wspace=1, hspace=1)
plt.show()

## 3. TUNING THRESHOLD

In [None]:
def perf_grid(y_hat_val, y_val, label_names, n_thresh=100):
    """Tabulate per-label classification metrics over a grid of thresholds.

    Args:
        y_hat_val: array of predicted scores, indexed [sample, label].
        y_val: array of 0/1 ground-truth indicators, indexed [sample, label].
        label_names: the label name for each column.
        n_thresh: number of threshold steps; `n_thresh + 1` evenly spaced
            thresholds from 0 to 1 are evaluated.

    Returns:
        DataFrame with one row per (label, threshold) pair and the columns
        id, label, freq, threshold, tp, fp, fn, precision, recall, f1, f2.
        All metrics, including f2, describe the positive class of the label.
    """
    eps = 1e-16  # keeps the ratios defined when a denominator is zero
    n_samples = len(y_val)

    # Find label frequencies in the validation set
    label_freq = y_val.sum(axis=0)

    # Define thresholds
    thresholds = np.linspace(0, 1, n_thresh + 1).astype(np.float32)

    # Compute all metrics for all labels
    ids, names, freqs = [], [], []
    tps, fps, fns = [], [], []
    precisions, recalls, f1s, f2s = [], [], [], []

    for i in tq.tqdm(range(len(label_names))):
        # Per-label values that do not change with the threshold.
        positives = y_val[:, i].astype(bool)
        scores = y_hat_val[:, i]
        freq = round(label_freq[i] / n_samples, 2)

        for thresh in thresholds:
            y_pred = scores > thresh

            tp = np.count_nonzero(y_pred & positives)
            fp = np.count_nonzero(y_pred & ~positives)
            fn = np.count_nonzero(~y_pred & positives)

            ids.append(i)
            names.append(label_names[i])
            freqs.append(freq)
            tps.append(tp)
            fps.append(fp)
            fns.append(fn)
            precisions.append(tp / (tp + fp + eps))
            recalls.append(tp / (tp + fn + eps))
            f1s.append(2 * tp / (2 * tp + fn + fp + eps))
            # F-beta with beta = 2 for the positive class:
            # (1 + b^2) * tp / ((1 + b^2) * tp + b^2 * fn + fp).
            # The previous support-weighted average over both classes mixed in
            # the negative class, unlike the f1 computed next to it.
            f2s.append(5 * tp / (5 * tp + 4 * fn + fp + eps))

    # Create the performance dataframe
    grid = pd.DataFrame({'id': ids,
                         'label': names,
                         'freq': freqs,
                         'threshold': list(thresholds) * len(label_names),
                         'tp': tps,
                         'fp': fps,
                         'fn': fns,
                         'precision': precisions,
                         'recall': recalls,
                         'f1': f1s,
                         'f2': f2s})

    return grid

In [None]:
predictions.shape

In [None]:
true.shape

In [None]:
# Performance table
grid = perf_grid(predictions, true, labels)

In [None]:
grid[grid['label'].str.contains('primary')].head(20)

In [None]:
# For every label, keep the grid row (threshold) with the highest f2.
grid_max = grid.loc[grid.groupby(['id', 'label'])['f2'].idxmax().to_numpy()]
grid_max

## 4. PREDICT ON TEST 

In [None]:
# JPEG paths of the two test sets (main and additional).
test_paths = tf.io.gfile.glob(f'{DATA_PATH}/test-jpg/*.jpg')
additional_paths = tf.io.gfile.glob(
    '../input/planets-dataset/test-jpg-additional/test-jpg-additional/*.jpg')

In [None]:
len(test_paths)

In [None]:
len(additional_paths)

In [None]:
def create_test_record(data_folder, record_name):
    """Write every JPEG of `<data_folder>/<record_name>/` to `./<record_name>.tfrecords`.

    Each Example stores the first three entries of the image's shape
    ('height', 'width', 'channel') and the serialized pixel array ('image').
    Images are written in the order returned by the glob.

    Args:
        data_folder: folder that contains the `record_name` sub-folder.
        record_name: sub-folder name, also used as the output file stem.
    """
    paths = tf.io.gfile.glob(data_folder + f'/{record_name}/*.jpg')

    record_path = f"./{record_name}.tfrecords"
    # The context manager closes the writer even when an image fails to load.
    with tf.io.TFRecordWriter(record_path) as writer:
        for path in tq.tqdm(paths):
            image = plt.imread(path)
            feature = {'height': _int64_feature(image.shape[0]),
                       'width': _int64_feature(image.shape[1]),
                       'channel': _int64_feature(image.shape[2]),
                       'image': _bytes_feature(serialize_array(image))}
            example = tf.train.Example(features=tf.train.Features(feature=feature))
            writer.write(example.SerializeToString())

In [None]:
create_test_record(DATA_PATH,'test-jpg')
create_test_record("../input/planets-dataset/test-jpg-additional",'test-jpg-additional')

In [None]:
def read_test_record(example):
    """Parse one serialized test Example into a resized image tensor.

    The image is restored to its stored shape, reduced to its first three
    channels when it has four, and resized to the module-level target size.
    """
    scalar_int = tf.io.FixedLenFeature([], tf.int64)
    feature_spec = {
        "height": scalar_int,
        "width": scalar_int,
        "channel": scalar_int,
        "image": tf.io.FixedLenFeature([], tf.string),
    }
    parsed = tf.io.parse_single_example(example, feature_spec)

    height, width, channel = parsed['height'], parsed['width'], parsed['channel']

    # The pixels were written with tf.io.serialize_tensor: decode them and
    # restore the original (height, width, channel) shape.
    image = tf.io.parse_tensor(parsed['image'], out_type=tf.uint8)
    image = tf.reshape(image, shape=[height, width, channel])
    if channel == 4:
        # Keep only the first three channels.
        image = image[:, :, :3]

    return tf.image.resize(image, [IMG_WIDTH, IMG_HEIGHT])

In [None]:
def load_test_dataset(filenames, shuffle=False, augment=False):
    """Build a batched, prefetched dataset of test images from TFRecord files.

    `shuffle` and `augment` are accepted but not used: the records are always
    read in file order and left unaugmented.
    """
    records = tf.data.TFRecordDataset(filenames)
    images = records.map(read_test_record, num_parallel_calls=AUTOTUNE)
    return images.batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)

In [None]:
RECORDS = tf.io.gfile.glob('./*.tfrecords')
RECORDS

In [None]:
# Select the record files by the names create_test_record wrote them under,
# rather than by their position in the glob result: positions change silently
# when another .tfrecords file is present or the listing order differs.
test_ds = load_test_dataset('./test-jpg.tfrecords')
additional_ds = load_test_dataset('./test-jpg-additional.tfrecords')

In [None]:
RECORDS[-2]

In [None]:
test_predictions = model.predict(test_ds)
additional_predictions = model.predict(additional_ds)

In [None]:
len(test_predictions)

In [None]:
len(additional_predictions)

In [None]:
# Per-label decision thresholds.
threshold = { 'agriculture':0.164,
          'artisinal_mine':0.114,
          'bare_ground':0.138,
          'blooming':0.168,
          'blow_down':0.2,
          'clear':0.13,
          'cloudy':0.076,   
          'conventional_mine':0.1,
          'cultivation':0.204,
          'habitation':0.17,
          'haze':0.204,
          'partly_cloudy':0.112,
          'primary':0.204,
          'road':0.156,
          'selective_logging':0.154,
          'slash_burn':0.38,
          'water':0.182
            }

# Look every threshold up by label name so the array is aligned with `labels`
# by construction instead of relying on the dict being typed in the same
# order; a label without a threshold raises a KeyError.
thresholds = np.array([threshold[name] for name in labels], dtype=float)

In [None]:
def get_tag(prediction):
    """Return the space-separated names of the labels whose score reaches its threshold."""
    selected = prediction >= thresholds
    return ' '.join(labels[selected])

In [None]:
get_tag

In [None]:
# Turn every row of scores into its space-separated tag string.
final_test_predictions = [get_tag(scores) for scores in test_predictions]
final_additional_predictions = [get_tag(scores) for scores in additional_predictions]

In [None]:
len(final_additional_predictions)

In [None]:
len(final_test_predictions)

In [None]:
# File name without folder and extension, taken with os.path instead of a
# hard-coded '/' separator and a fixed 4-character extension.
test_filenames = [os.path.splitext(os.path.basename(path))[0] for path in test_paths]
additional_filenames = [os.path.splitext(os.path.basename(path))[0] for path in additional_paths]

In [None]:
len(additional_filenames)


In [None]:
len(test_filenames)

In [None]:
submission = pd.DataFrame({'image_name': test_filenames, 'tags': final_test_predictions})
# Sort rows by the number after the last underscore of the file name.
# str.strip('test_') removes any of the characters t/e/s/_ from both ends
# rather than the prefix, so the numeric part is taken by splitting instead.
submission['count'] = submission['image_name'].str.split('_').str[-1].astype('int')
submission = submission.sort_values('count', ascending=True).reset_index(drop=True)
submission = submission.drop(columns=['count'])
submission

In [None]:
submission_2 = pd.DataFrame({'image_name': additional_filenames, 'tags': final_additional_predictions})
# Sort rows by the number after the last underscore of the file name.
# str.strip('file_') removes any of the characters f/i/l/e/_ from both ends
# rather than the prefix, so the numeric part is taken by splitting instead.
submission_2['count'] = submission_2['image_name'].str.split('_').str[-1].astype('int')
submission_2 = submission_2.sort_values('count', ascending=True).reset_index(drop=True)
submission_2 = submission_2.drop(columns=['count'])
submission_2

In [None]:
# Stack the two result tables. ignore_index gives the combined frame one
# continuous 0..n-1 index instead of repeating each part's own row numbers.
final_submission = pd.concat([submission, submission_2], axis=0, ignore_index=True)
final_submission

In [None]:
final_submission.to_csv("Stage_D_Submission.csv", index=False)