# In [None]:
#########################################################
# Submission script for the Petfinder.my kaggle contest #
#########################################################


####################
# Python libraries #
####################

import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import random
import imgaug.augmenters as iaa
import tensorflow as tf

from glob import glob
from PIL import Image
from time import time
from sklearn.model_selection import train_test_split
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.layers import Flatten, Dense, Dropout, Activation,\
    Conv2D, GlobalAveragePooling2D, Lambda, Concatenate, BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau,\
    EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model, load_model, save_model
from tensorflow.keras.metrics import RootMeanSquaredError
from tensorflow.keras.utils import Sequence
from tensorflow.keras import Input

tf.random.set_seed(48)

# Class importation from kaggle database
sys.path.append('../input/swin-transformer-tf')
from swintransformer import SwinTransformer


#########################
# Constants definitions #
#########################


# Path of the EfficientNet-B0 weights file. Adjacent string literals are used
# instead of a backslash continuation inside the literal, which would embed
# the indentation of the continued line in the path.
WEIGHTS = ("/kaggle/input/efficientnet-keras-noisystudent-weights-b0b7"
           "/imagenet/imagenet.notop-b0.h5")
# Competition data directory (keeps its trailing slash: file names are
# appended to it by plain concatenation).
INPUT = "/kaggle/input/petfinder-pawpularity-score/"
TRAIN = INPUT + "train"
# Directory of the saved model loaded by the script.
MODEL = "/kaggle/input/swin-large-01"
TEST = INPUT + "test"
# Output directory for the saved model and the submission file.
OUT = "/kaggle/working/"
# Size the images are resized to.
SIZE = (224, 224)
BATCH = 16
# Seed used when shuffling the generator data.
SEED = 48
# Dataframe columns fed to the model next to the image.
FEATS = ['ratio', 'pixels', 'Subject Focus', 'Eyes', 'Face', 'Near', 'Action',
         'Accessory', 'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur']

########################
# Processing functions #
########################


def create_train_df(name, input_dir=INPUT, pics_dir=TRAIN):
    """
    Build the reference dataframe of the training samples.

    Every file found in pics_dir is opened to record its path, file name,
    size, width, height, aspect ratio (always >= 1), pixel count and mode.
    The result is merged on "Id" with the csv read from input_dir + name.
    Two label columns are derived from "Pawpularity" ("classe": 5 bins
    "a" to "e", "dizaine": 10 bins "9" to "0"), then "Pawpularity" is
    divided by 100 and "ratio" / "pixels" are divided by their maximum.

    Returns the dataframe, the maximum ratio and the maximum pixel count
    (both taken before the scaling).
    """
    # (exclusive lower bound, label) pairs, scanned from the highest bin down.
    decade_bins = ((89, "9"), (79, "8"), (69, "7"), (59, "6"), (49, "5"),
                   (39, "4"), (29, "3"), (19, "2"), (9, "1"))
    coarse_bins = ((79, "a"), (59, "b"), (39, "c"), (19, "d"))

    def labeller(bins, fallback):
        """Return a function mapping a score to the label of its bin."""
        def to_label(score):
            for bound, label in bins:
                if score > bound:
                    return label
            return fallback
        return to_label

    meta = pd.read_csv(input_dir + name)
    picture_paths = glob(pics_dir + "/*")

    records = {}
    for path in picture_paths:
        with Image.open(path) as picture:
            file_name = path.split("/")[-1]
            # The sample id is the file name up to its first dot.
            sample_id = file_name.split(".")[0]
            width, height = picture.size[0], picture.size[1]
            records[sample_id] = (path, file_name, picture.size,
                                  width, height,
                                  max([width/height, height/width]),
                                  width*height, picture.mode)

    df = pd.DataFrame.from_dict(records, orient='index',
                                columns=["chemin", "nom", "taille",
                                         "longueur", "hauteur",
                                         "ratio", "pixels", "mode"])
    df = df.rename_axis('Id').reset_index()
    df = df.merge(meta, left_on="Id", right_on="Id")

    # Labels are computed on the raw score, before it is divided by 100.
    df["classe"] = df["Pawpularity"].map(labeller(coarse_bins, "e"))
    df["dizaine"] = df["Pawpularity"].map(labeller(decade_bins, "0"))
    df["Pawpularity"] = df["Pawpularity"]/100

    max_ratio, max_pixels = df["ratio"].max(), df["pixels"].max()
    df["ratio"] = df["ratio"]/max_ratio
    df["pixels"] = df["pixels"]/max_pixels

    return df, max_ratio, max_pixels


def create_test_df(name, input_dir=INPUT, pics_dir=TEST,
                   max_ratio=None, max_pixels=None):
    """
    Build the reference dataframe of the test samples.

    Every file found in pics_dir is opened to record its path, file name,
    size, width, height, aspect ratio (always >= 1), pixel count and mode.
    The result is merged on "Id" with the csv read from input_dir + name.

    max_ratio and max_pixels are the divisors applied to the "ratio" and
    "pixels" columns. When left to None they fall back to the module-level
    MAX_RATIO and MAX_PIXELS, which must then have been assigned before the
    call (the script assigns them from create_train_df).

    Returns the dataframe.
    """
    # Resolve the scaling constants first so that a missing global fails
    # before any file is opened.
    if max_ratio is None:
        max_ratio = MAX_RATIO
    if max_pixels is None:
        max_pixels = MAX_PIXELS

    df_data = pd.read_csv(input_dir + name)
    test_pics = glob(pics_dir + "/*")

    dico = {}

    for file in test_pics:

        with Image.open(file) as img:

            image_name = file.split("/")[-1]
            # The sample id is the file name up to its first dot.
            pic_id = image_name.split(".")[0]
            width, height = img.size[0], img.size[1]
            ratio = max([width/height, height/width])
            dico[pic_id] = (file, image_name, img.size, width, height,
                            ratio, width*height, img.mode)

    df = pd.DataFrame.from_dict(dico, orient='index',
                                columns=["chemin", "nom", "taille", "longueur",
                                         "hauteur", "ratio", "pixels", "mode"])

    df = df.rename_axis('Id').reset_index()
    df = df.merge(df_data, left_on="Id", right_on="Id")

    df["ratio"] = df["ratio"]/max_ratio
    df["pixels"] = df["pixels"]/max_pixels

    return df


def _balancing_weights(labels):
    """
    Map each distinct value of the Series to (1/count) rescaled so that the
    most frequent value... gets the smallest weight and the rarest gets ~1.
    """
    sizes = {label: int(size)
             for label, size in labels.value_counts().items()}

    if not sizes:
        # Nothing to weight (empty column).
        return {}

    # Same arithmetic as a two-pass computation: the inverse of the largest
    # raw weight 1/size, then that factor divided by each class size.
    factor = 1/max(1/size for size in sizes.values())

    return {label: factor/size for label, size in sizes.items()}


def weights_data(df):
    """
    Add inverse-frequency weight columns to the dataframe, in place.

    "classe_w" is computed from the "classe" column and "dizaine_w" from the
    "dizaine" column. For each column, the weight of a value is inversely
    proportional to the number of rows holding it, scaled so that the value
    with the fewest rows has a weight of about 1. An empty dataframe only
    gains the two (empty) columns.

    Returns None: the dataframe passed as argument is modified.
    """
    classe_weights = _balancing_weights(df["classe"])
    dizaine_weights = _balancing_weights(df["dizaine"])

    df["classe_w"] = df["classe"].map(classe_weights)
    df["dizaine_w"] = df["dizaine"].map(dizaine_weights)


def tts_df(df):
    """
    Create balanced train and validation data taking into account
    the pawpularity values.

    The rows are split 80/20 separately inside each value of the "dizaine"
    column, so both parts keep the same distribution of that column. The
    split is seeded with SEED so that it is identical from one run to the
    next. Both parts are returned sorted by the "nom" column.

    Returns (df_train, df_val). An empty dataframe gives two empty ones.
    """
    train_parts = []
    val_parts = []

    for val in sorted(set(df["dizaine"].values)):

        df_temp = df[df["dizaine"] == val]
        df_temp_train, df_temp_val = train_test_split(df_temp,
                                                      test_size=0.2,
                                                      random_state=SEED)
        train_parts.append(df_temp_train)
        val_parts.append(df_temp_val)

    if not train_parts:

        # No row at all: pd.concat refuses an empty list.
        return df.iloc[0:0].copy(), df.iloc[0:0].copy()

    # One concatenation at the end instead of one per loop iteration.
    df_train = pd.concat(train_parts).sort_values("nom")
    df_val = pd.concat(val_parts).sort_values("nom")

    return df_train, df_val


########################
# Data generator class #
########################

class DataGen(Sequence):
    """
    Generator class.

    Serves batches made of a list [images, features], where the images are
    read from `directory` using the "nom" column of the dataframe and the
    features are the `feat_col` columns. When `training` is True the batch
    also holds the `labels` column, and the "dizaine_w" column as sample
    weights when `weight` is True.

    Parameters:
        directory: folder holding the image files.
        df: dataframe describing the samples (copied, never modified).
        labels: name of the target column (only read when training).
        batch_size: number of samples per batch.
        input_size: size passed to cv2.resize for every image.
        feat_col: list of the feature columns.
        iteration: number of times the dataframe is repeated in one epoch.
        weight: if True, training batches also return the sample weights.
        training: if True, batches return the labels.
        aug: if True, apply random flip, zoom and translations.
        shuffle: if True, shuffle the samples at creation and at the end
            of each epoch.
        crop: if True, center-crop each image to a square before resizing.
        pp: if True, apply pp_func to each image.
        pp_func: preprocessing function, called with the image array.
    """
    def __init__(self,
                 directory,
                 df,
                 labels,
                 batch_size,
                 input_size,
                 feat_col,
                 iteration=1,
                 weight=False,
                 training=True,
                 aug=True,
                 shuffle=True,
                 crop=True,
                 pp=False,
                 pp_func=None):

        super().__init__()

        self.directory = directory
        self.df = df.copy()
        self.batch_size = batch_size
        self.input_size = input_size
        self.feat_col = feat_col
        self.iteration = iteration
        self.weight = weight
        self.training = training
        self.aug = aug
        self.labels = labels
        self.shuffle = shuffle
        self.crop = crop
        self.pp = pp
        self.pp_func = pp_func

        # The augmenters live under their own names so that the boolean
        # `crop` flag above is not overwritten by the cropping augmenter.
        self.zoom = iaa.geometric.Affine(scale=(1, 1.15))
        self.cropper = iaa.size.CenterCropToSquare()
        # NOTE(review): imgaug documents `percent` as a fraction of the axis
        # size, so (-7, 7) looks like -700%..+700% rather than +/-7% -
        # confirm whether (-0.07, 0.07) was intended before changing it.
        self.tx = iaa.TranslateX(percent=(-7, 7), mode="reflect")
        self.ty = iaa.TranslateY(percent=(-7, 7), mode="reflect")

        # Repeat the samples `iteration` times (none at all for 0).
        if self.iteration > 0:

            self.data = pd.concat([self.df] * self.iteration)

        else:

            self.data = pd.DataFrame()

        self.nb_samples = len(self.data)
        self.on_epoch_end()

    def __len__(self):
        """Number of batches per epoch (the last one may be shorter)."""
        return int(np.ceil(self.nb_samples / self.batch_size))

    def on_epoch_end(self):
        """Reshuffle the samples, with a fixed seed, when shuffle is on."""
        if self.shuffle:

            self.data = self.data.sample(
                frac=1, random_state=SEED).reset_index(drop=True)

    def _load_image(self, name):
        """Read one image and apply crop, resize, augmentation and pp."""
        path = os.path.join(self.directory, name)
        # NOTE(review): cv2.imread returns channels in BGR order - confirm
        # this is the order the model was trained with.
        img = cv2.imread(path)

        if img is None:

            # cv2.imread signals a missing or unreadable file with None.
            raise OSError(f"Unable to read image: {path}")

        if self.crop:

            img = self.cropper.augment_image(img)

        img = cv2.resize(img, tuple(self.input_size),
                         interpolation=cv2.INTER_LINEAR)

        if self.aug:

            # Horizontal flip for one image out of two on average.
            if random.getrandbits(1):

                img = cv2.flip(img, 1)

            img = self.zoom.augment_image(img)
            img = self.tx.augment_image(img)
            img = self.ty.augment_image(img)

        if self.pp:

            img = self.pp_func(img)

        return np.array(img, dtype='float32')

    def __getitem__(self, index):
        """
        Return the batch number `index`:
        inputs only, (inputs, labels) or (inputs, labels, weights).
        """
        temp_data = self.data.iloc[index*self.batch_size:
                                   (index+1)*self.batch_size]
        images = [self._load_image(name) for name in temp_data["nom"].values]
        inputs = [np.array(images), temp_data[self.feat_col].values]

        if not self.training:

            return inputs

        if self.weight:

            return inputs,\
                    temp_data[self.labels].values,\
                    temp_data["dizaine_w"].values

        return inputs, temp_data[self.labels].values


#########################################
# Submission and modelization functions #
#########################################

def submission(model, X_test, df_test):
    """
    Create the submission csv file.

    The model predictions for X_test are multiplied by 100 and stored in
    the "Pawpularity" column of df_test (the dataframe passed as argument
    is modified). The "Id" and "Pawpularity" columns are then written to
    "submission.csv" in the OUT directory.

    Parameters:
        model: object whose predict method is called with X_test.
        X_test: test inputs; must give one prediction per row of df_test,
            in the same order.
        df_test: test dataframe holding an "Id" column.
    """
    # Flatten rather than squeeze: squeezing a single prediction gives a
    # 0-d array, which cannot be turned into a one-element column.
    preds = np.asarray(model.predict(X_test)).reshape(-1)
    df_test["Pawpularity"] = preds * 100
    df_out = df_test[["Id", "Pawpularity"]]
    df_out.to_csv(os.path.join(OUT, "submission.csv"), index=False)


def modelize(model, nb_epochs, train_gen, val_gen, train_steps, val_steps,
             batch, mod_name, opti="adam", verbose=1, graph=True):
    """
    Modelization function.

    Compile the model (binary cross-entropy loss, RMSE metric), train it
    with the given generators, save it to OUT + mod_name and print a short
    summary of the training.

    Parameters:
        model: Keras model to train.
        nb_epochs: maximum number of epochs.
        train_gen, val_gen: training and validation generators.
        train_steps, val_steps: number of batches per epoch for each.
        batch: kept for backward compatibility and not used; the batch
            size is the one of the generators.
        mod_name: name used for the checkpoint, the saved model and the
            graph titles.
        opti: optimizer passed to compile.
        verbose: verbosity passed to fit.
        graph: if True, plot the loss and RMSE curves.

    Returns the model as it is at the end of the training.
    """
    start = time()

    monitored = "val_root_mean_squared_error"

    # The learning rate is halved after one epoch without improvement and
    # the training stops after four.
    reduce_lr = ReduceLROnPlateau(monitor=monitored,
                                  patience=1,
                                  factor=0.5,
                                  verbose=1,
                                  mode='min')

    stop = EarlyStopping(monitor=monitored, patience=4, mode='min')

    # NOTE(review): the checkpoint goes to the relative path mod_name while
    # the final save below goes to OUT + mod_name. If the working directory
    # is OUT, the final save replaces the best checkpoint - confirm.
    check = ModelCheckpoint(mod_name,
                            save_weights_only=False,
                            monitor=monitored,
                            mode='min',
                            save_best_only=True)

    model.compile(loss="binary_crossentropy",
                  optimizer=opti,
                  metrics=[RootMeanSquaredError()])

    # No batch_size here: the generators already produce whole batches.
    history = model.fit(train_gen,
                        steps_per_epoch=train_steps,
                        epochs=nb_epochs,
                        validation_data=val_gen,
                        validation_steps=val_steps,
                        verbose=verbose,
                        callbacks=[check, stop, reduce_lr])

    model.save(OUT + mod_name)

    train_rmse = history.history['root_mean_squared_error']
    val_rmse = history.history['val_root_mean_squared_error']
    train_loss = history.history["loss"]
    val_loss = history.history["val_loss"]

    # Number of epochs actually run (early stopping may cut the training).
    nb_ep = len(train_loss)

    if graph:

        epochs = range(1, nb_ep+1)

        plt.figure(figsize=(11, 9))

        plt.subplot(2, 1, 1)
        plt.plot(epochs, train_loss, label='Train loss', color="green")
        plt.plot(epochs, val_loss, label='Validation loss', color="yellow")
        plt.title('Train & validation losses - ' + mod_name)
        plt.xlabel("Epochs")
        plt.ylabel("Losses")
        plt.legend()

        plt.subplot(2, 1, 2)
        plt.plot(epochs, train_rmse, label='Train RMSE', color="red")
        plt.plot(epochs, val_rmse, label='Validation RMSE', color="blue")
        plt.title('Train & validation RMSE - ' + mod_name)
        plt.xlabel("Epochs")
        plt.ylabel("RMSE")
        plt.legend()

        plt.show()

    print()
    print(f"Entrainement pour {nb_ep} epochs : {time()-start:.0f} secondes.")
    print(f"Meilleure train_RMSE = {np.min(train_rmse):.3f}")
    print(f"Meilleure val_RMSE = {np.min(val_rmse):.3f}")
    print()

    return model


#####################################################################
# Model creation function.                                          #
# To deal with the "no internet" kaggle competition rule, the model #
# is not created here but imported as an "added data".              #
#####################################################################

def create_swin(mod):
    """
    Build the two-input regression model around a feature extractor.

    The image input (224 x 224 x 3) is cast to float32, preprocessed with
    the "torch" mode of the Keras imagenet utilities and passed to `mod`.
    Its output, after a dropout, is concatenated with a second input of 14
    values, then goes through two Dense / BatchNormalization / Dropout
    stages (128 and 64 units) and a final single sigmoid unit.

    Parameters:
        mod: callable applied to the preprocessed image tensor.

    Returns the Keras Model taking [images, 14 values] as inputs.
    """
    inputs = Input(shape=(224, 224, 3))
    fx = Lambda(lambda data: tf.keras.applications.imagenet_utils.
                preprocess_input(tf.cast(data, tf.float32),
                                 mode="torch"))(inputs)
    fx = mod(fx)
    fx = Dropout(0.2)(fx)

    # The shape is given as a tuple, as documented, rather than a bare int.
    meta = Input(shape=(14,))

    x = Concatenate()([fx, meta])

    x = Dense(128, activation="relu")(x)
    x = BatchNormalization()(x)
    x = Dropout(0.2)(x)
    x = Dense(64, activation="relu")(x)
    x = BatchNormalization()(x)
    x = Dropout(0.2)(x)

    output = Dense(1, activation="sigmoid")(x)

    model = Model(inputs=[inputs, meta], outputs=output)

    return model


##########
# Script #
##########

# Reference dataframes: the train one also gives the scaling constants
# and receives the weight columns before being split.
df, MAX_RATIO, MAX_PIXELS = create_train_df(name="train.csv")
weights_data(df)
df_train, df_val = tts_df(df)

df_test = create_test_df(name="test.csv")

# Train generator: samples seen twice per epoch, with augmentation.
train_gen = DataGen(directory=TRAIN,
                    df=df_train,
                    labels="Pawpularity",
                    batch_size=BATCH,
                    input_size=SIZE,
                    feat_col=FEATS,
                    iteration=2,
                    weight=False,
                    training=True,
                    aug=True,
                    shuffle=True,
                    crop=True,
                    pp=False,
                    pp_func=None)

# Validation generator: same settings, one pass and no augmentation.
val_gen = DataGen(directory=TRAIN,
                  df=df_val,
                  labels="Pawpularity",
                  batch_size=BATCH,
                  input_size=SIZE,
                  feat_col=FEATS,
                  iteration=1,
                  weight=False,
                  training=True,
                  aug=False,
                  shuffle=True,
                  crop=True,
                  pp=False,
                  pp_func=None)

train_steps, val_steps = len(train_gen), len(val_gen)

# Test generator: one sample per batch, kept in the dataframe order,
# returning the inputs only.
test_gen = DataGen(directory=TEST,
                   df=df_test,
                   labels="Pawpularity",
                   batch_size=1,
                   input_size=SIZE,
                   feat_col=FEATS,
                   iteration=1,
                   weight=False,
                   training=False,
                   aug=False,
                   shuffle=False,
                   crop=True,
                   pp=False,
                   pp_func=None)

# If the kaggle rules allowed it, we would instantiate a SWIN model here,
# pass it to our "create_swin" function, then freeze the first layers
# to train the classification layers only:
#
# swin = SwinTransformer("swin_large_224", include_top=False,
#                        pretrained=True, use_tpu=False)
#
# model = create_swin(swin)
# for layer in model.layers[:4]:
#
#    layer.trainable = False

# Because of the "no internet" rule, the complete model is loaded
# from an added database instead.
model = load_model(MODEL)

# Model training
train_model = modelize(model=model,
                       nb_epochs=30,
                       train_gen=train_gen,
                       val_gen=val_gen,
                       train_steps=train_steps,
                       val_steps=val_steps,
                       batch=BATCH,
                       mod_name="large_swin_extractor",
                       opti="adam",
                       verbose=1,
                       graph=False)

# Results submission
submission(model=train_model, X_test=test_gen, df_test=df_test)