<a href="https://colab.research.google.com/github/HyeonhoonLee/NIPA2020/blob/main/image_classification_with_blending_ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## 1. First Impressions and Getting Tools Ready

from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True

# loading packages

import pandas as pd
import numpy as np

#

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

#

import seaborn as sns
import plotly.express as px

#

import os
import random
import re
import math
import time

from tqdm import tqdm
from tqdm.keras import TqdmCallback


from pandas_summary import DataFrameSummary

import warnings


warnings.filterwarnings('ignore') # Suppress every warning so cell outputs stay readable.



# Optional global seeding for `random` and NumPy; currently disabled.
# seed_val = 42
# random.seed(seed_val)
# np.random.seed(seed_val)

# Hex colour palette; not referenced anywhere else in the visible code.
orange_black = [
    '#fdc029', '#df861d', '#FF6347', '#aa3d01', '#a30e15', '#800000', '#171820'
]

# Apply matplotlib's 'ggplot' style to every plot drawn after this point.
plt.style.use('ggplot')

# Notebook leftover: the returned working directory is neither stored nor printed.
os.getcwd()

# File-system layout for this task. Every dataset location is expressed
# relative to `base_path` (which already ends with a path separator).
base_path = '/home/workspace/data/.train/.task148/'

# Image folders read by the Keras data generators.
train_img_path = base_path + 'data/train/images/'
test_img_path = base_path + 'data/test/images/'

# CSV files listing the image IDs (and, for training, their labels).
train_custom_img_stats_path = base_path + 'data/train/train_custom.csv'
train_img_stats_path = base_path + 'data/train/train.csv'
test_img_stats_path = base_path + 'data/test/test.csv'

# Root folder for checkpoints and a copy of the prediction file.
DATA_OUT_PATH = '/home/workspace/user-workspace/kb'

## 2. Loading Data

# Training configuration shared by the data generators, the model and the optimizer.
seed = 1234  # passed as `seed=` to every flow_from_dataframe call
BATCH_SIZE = 32  
LEARNING_RATE = 3e-4  # learning rate handed to the Adam optimizer
EPOCHS = 100  # upper bound on epochs; the EarlyStopping callback may stop sooner
num_classes=5  # width of each Dense classification head
img_size = 128  # images are resized to img_size x img_size

# Load the tables describing the train and test images.
train_df = pd.read_csv(train_img_stats_path)
test_df = pd.read_csv(test_img_stats_path)

# train_custom_df = pd.read_csv(train_custom_img_stats_path)
# train_custom_df.head()

# Notebook preview of the first rows (no effect when run as a script).
train_df.head()

def add_png(name):
    """Return ``name`` with the ``.png`` file extension appended."""
    return name + ".png"

# Turn the bare image IDs into file names so the generators can find the
# images inside the image directory.
train_df["ID"] = train_df["ID"].apply(add_png)
train_df.head()

# Encode the five wall-type labels as integer class indices 0-4.
# The mapping is applied to the "Target" column only (not to the whole frame)
# and the result is cast to int64 explicitly, so the labels handed to the
# generators with class_mode="raw" are guaranteed to be numeric. A label that
# is missing from the mapping becomes NaN and makes the cast raise, instead of
# silently leaving a string in the column.
label_to_index = {
    '10_콘크리트외벽': 0,
    '20_조적외벽': 1,
    '30_판넬외벽': 2,
    '40_유리외벽': 3,
    '50_기타외벽': 4,
}
train_df["Target"] = train_df["Target"].map(label_to_index).astype("int64")

import tensorflow as tf

from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization, Input
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras import regularizers, optimizers, Model

# Fraction of train_df held out for validation. Both generators below must use
# the same value so that the "training" and "validation" subsets stay disjoint.
_validation_split = 0.2

# Training images are rescaled to [0, 1] and randomly augmented.
train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1./255,
    shear_range=0.2,
    zoom_range=0.2,
    rotation_range=10,
    width_shift_range=0.2,
    height_shift_range=0.2,
    horizontal_flip=True,
    validation_split=_validation_split,
)

# Validation images are only rescaled. Drawing them from the augmenting
# generator would randomly distort them and make the validation metrics
# (which drive checkpointing and early stopping) noisy.
valid_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1./255,
    validation_split=_validation_split,
)

# class_mode="raw" yields the values of the "Target" column unchanged.
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=train_img_path,
    x_col="ID",
    y_col="Target",
    subset="training",
    batch_size=BATCH_SIZE,
    seed=seed,
    shuffle=True,
    class_mode="raw",
    target_size=(img_size, img_size),
)

# No shuffling is needed for evaluation-only data.
validation_generator = valid_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=train_img_path,
    x_col="ID",
    y_col="Target",
    subset="validation",
    batch_size=BATCH_SIZE,
    seed=seed,
    shuffle=False,
    class_mode="raw",
    target_size=(img_size, img_size),
)

# Three ImageNet-pretrained EfficientNet backbones share one image input.
# Each backbone gets its own classification head, so the model has three
# outputs that are averaged at prediction time.
model_input = tf.keras.Input(shape=(img_size, img_size, 3),
                             name='img_input')

# Identity layer kept from the original graph; every backbone branches off it.
dummy = tf.keras.layers.Lambda(lambda x: x)(model_input)

# NOTE(review): recent tf.keras EfficientNet models are documented to rescale
# their input internally and to expect pixel values in [0, 255], while the
# data generators in this file feed images scaled by 1/255 - confirm against
# the installed TensorFlow version.
outputs = []
for backbone in (tf.keras.applications.EfficientNetB3,
                 tf.keras.applications.EfficientNetB4,
                 tf.keras.applications.EfficientNetB5):
    features = backbone(include_top=False,
                        weights='imagenet',
                        input_shape=(img_size, img_size, 3),
                        pooling='avg')(dummy)
    # The classes are mutually exclusive and the loss is sparse categorical
    # cross-entropy on probabilities, so each head must emit a softmax
    # distribution rather than independent sigmoids. The heads are created in
    # the same order as before, so their auto-generated names
    # (dense, dense_1, dense_2) are unchanged.
    outputs.append(
        tf.keras.layers.Dense(num_classes, activation='softmax')(features))

model = tf.keras.Model(model_input, outputs, name='aNetwork')

model.summary()

# Loss for integer class labels compared against already-activated model
# outputs (hence from_logits=False).
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)

# Adam optimizer with every hyperparameter spelled out.
adam = tf.keras.optimizers.Adam(
    learning_rate=LEARNING_RATE,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-07,
    amsgrad=False,
    name='Adam',
)


def recall(y_target, y_pred):
    """Return recall = TP / (TP + FN) as a single tensor value.

    Both inputs are clipped to [0, 1] and rounded, so every entry counts as
    either 0 (negative) or 1 (positive) before the counts are taken.

    NOTE(review): this assumes ``y_target`` holds 0/1 indicators with a shape
    compatible with ``y_pred``; confirm for the sparse integer labels used in
    training.
    """
    backend = tf.keras.backend

    # Binarise the ground truth and the predictions.
    actual = tf.round(backend.clip(y_target, 0, 1))
    predicted = tf.round(backend.clip(y_pred, 0, 1))

    # True positives: entries that are 1 in both tensors.
    true_positives = backend.sum(actual * predicted)

    # TP + FN equals the number of actual positives.
    actual_positives = backend.sum(actual)

    # epsilon() guards against division by zero when there are no positives.
    return true_positives / (actual_positives + backend.epsilon())

def precision(y_target, y_pred):
    """Return precision = TP / (TP + FP) as a single tensor value.

    Both inputs are clipped to [0, 1] and rounded, so every entry counts as
    either 0 (negative) or 1 (positive) before the counts are taken.

    NOTE(review): this assumes ``y_target`` holds 0/1 indicators with a shape
    compatible with ``y_pred``; confirm for the sparse integer labels used in
    training.
    """
    backend = tf.keras.backend

    # Binarise the predictions and the ground truth.
    predicted = tf.round(backend.clip(y_pred, 0, 1))
    actual = tf.round(backend.clip(y_target, 0, 1))

    # True positives: entries that are 1 in both tensors.
    true_positives = backend.sum(actual * predicted)

    # TP + FP equals the number of predicted positives.
    predicted_positives = backend.sum(predicted)

    # epsilon() guards against division by zero when nothing is predicted positive.
    return true_positives / (predicted_positives + backend.epsilon())

def f1score(y_target, y_pred):
    """Return the F1 score (harmonic mean of precision and recall) as a tensor.

    ``recall`` and ``precision`` compare 0/1 indicator tensors element-wise.
    When the labels arrive as sparse class indices (last dimension of size 1
    while ``y_pred`` has several class columns), clipping them to [0, 1]
    would only encode "class != 0" and broadcast that across every class
    column. Such labels are therefore one-hot encoded first. Labels that
    already match the shape of ``y_pred`` are passed through untouched.

    Args:
        y_target: ground-truth labels, either indicators shaped like
            ``y_pred`` or class indices with a trailing dimension of 1.
        y_pred: predicted per-class scores in [0, 1].

    Returns:
        A scalar tensor with the F1 score.
    """
    y_target = tf.convert_to_tensor(y_target)
    y_pred = tf.convert_to_tensor(y_pred)

    n_classes = y_pred.shape[-1] if y_pred.shape.rank else None
    labels_are_sparse = (
        n_classes is not None
        and n_classes > 1
        and y_target.shape.rank is not None
        and y_target.shape.rank == y_pred.shape.rank
        and y_target.shape[-1] == 1
    )
    if labels_are_sparse:
        class_ids = tf.cast(tf.squeeze(y_target, axis=-1), tf.int32)
        y_target = tf.one_hot(class_ids, depth=n_classes, dtype=y_pred.dtype)

    _recall = recall(y_target, y_pred)
    _precision = precision(y_target, y_pred)

    # epsilon() guards against division by zero when both terms are zero.
    _f1score = (2 * _recall * _precision) / (
        _recall + _precision + tf.keras.backend.epsilon())

    # Return a single tensor value.
    return _f1score

# One loss entry per model output (three heads); the metrics list is applied
# to every output.
model.compile(
    optimizer=adam,
    loss=[loss] * 3,
    metrics=[
        tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy'),
        f1score,
    ],
)

import datetime

# A timestamp makes every run write into its own output folder.
dt = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
model_name = "tf2_kb" + dt

# Checkpoint location: <DATA_OUT_PATH>/<model_name>/weights.h5
checkpoint_dir = os.path.join(DATA_OUT_PATH, model_name)
checkpoint_path = os.path.join(checkpoint_dir, 'weights.h5')

# Create the folder unless it is already there, and report which case applied.
if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir, exist_ok=True)
    print("{} -- Folder create complete \n".format(checkpoint_dir))
else:
    print("{} -- Folder already exists \n".format(checkpoint_dir))

# All callbacks watch 'val_loss', the combined validation loss of the three
# heads. The previous key 'val_dense_1_loss' covered only the second head and
# depended on an auto-generated layer name: if the model were rebuilt in the
# same session the name would change, no checkpoint would ever be saved, and
# the later model.load_weights(checkpoint_path) would fail.

# Early stopping to prevent overfitting.
# min_delta: an epoch counts as an improvement only if the loss drops by at
#            least this much.
# patience:  number of epochs without improvement before training stops.
earlystop_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', min_delta=0.0001, patience=5, mode='min')

# Keep only the weights of the best epoch seen so far.
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_path, monitor='val_loss', verbose=1,
    save_best_only=True, save_weights_only=True, mode='min')

# Lower the learning rate after two epochs without improvement.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=2)

# Number of full batches per epoch for each subset.
STEP_SIZE_TRAIN = train_generator.n // train_generator.batch_size
STEP_SIZE_VALID = validation_generator.n // validation_generator.batch_size

# Model.fit accepts generators directly; fit_generator is deprecated.
history = model.fit(
    train_generator,
    steps_per_epoch=STEP_SIZE_TRAIN,
    validation_data=validation_generator,
    validation_steps=STEP_SIZE_VALID,
    epochs=EPOCHS,
    verbose=1,
    callbacks=[cp_callback, earlystop_callback, reduce_lr],
)

# Dump the per-epoch loss and metric values.
print(history.history)

def plot_graphs(history, string):
    """Plot the training curve and the validation curve of one metric.

    Args:
        history: object whose ``history`` attribute maps metric names to
            per-epoch value sequences (as returned by the training call).
        string: name of the training metric; the validation curve is read
            from the key ``'val_' + string``.
    """
    curves = history.history
    plt.plot(curves[string])
    validation_key = 'val_' + string
    plt.plot(curves[validation_key], '')
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, validation_key])
    plt.show()

# Draw one figure per tracked metric, in the same order as before.
for metric_name in (
    'dense_accuracy',
    'dense_1_accuracy',
    'dense_loss',
    'dense_1_loss',
    'dense_2_loss',
    'dense_f1score',
    'dense_1_f1score',
):
    plot_graphs(history, metric_name)

## Submission

# Restore the best weights saved by the checkpoint callback (.h5 file).
model.load_weights(checkpoint_path)

test_df.head()

total_number = len(test_df["ID"])
total_number

# Turn the bare image IDs into file names, as was done for the training set.
test_df["ID"] = test_df["ID"].apply(add_png)
test_df.head()

# Test images are only rescaled, never augmented.
test_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1./255)

# shuffle=False keeps the predictions in the same order as the rows of test_df.
test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    directory=test_img_path,
    x_col="ID",
    y_col=None,
    batch_size=BATCH_SIZE,
    seed=seed,
    shuffle=False,
    class_mode=None,
    target_size=(img_size, img_size),
)

# Ceil division: the final, possibly partial, batch must be predicted too.
# A hard-coded step count only works for one specific dataset size, and floor
# division would drop the trailing images.
STEP_SIZE_TEST = (
    (test_generator.n + test_generator.batch_size - 1)
    // test_generator.batch_size
)
test_generator.reset()

# Model.predict accepts generators directly; predict_generator is deprecated.
pred = model.predict(test_generator,
                     steps=STEP_SIZE_TEST,
                     verbose=1)

# Blend the heads: `pred` holds one prediction array per model output.
# Presumably each array is (n_samples, num_classes) - verify against the
# model definition.
pred

len(pred[0])

# Stack the heads along a new leading axis: (n_heads, n_samples, n_classes).
probs = np.stack(pred)

# Average over the head axis. A reshape to a hard-coded
# (n_samples, n_heads, n_classes) must not be used here: it would mix values
# from different samples and only fits one dataset size.
probs2 = probs.mean(axis=0)

len(probs2)

# The predicted class of each sample is the column with the highest mean score.
answer = probs2.argmax(axis=-1)
targetlist = answer.tolist()
len(targetlist)

# Assignment raises if the number of predictions differs from the number of
# test rows, which catches a wrong prediction step count early.
test_df['Target'] = targetlist
test_df.head()

len(targetlist)

def remove_png(name):
    """Return ``name`` without its final file extension.

    Only the text after the last dot is removed, so an ID that itself
    contains dots (``"a.b.png"`` -> ``"a.b"``) is not truncated at the first
    one. A name without any dot is returned unchanged.
    """
    stem, dot, _extension = name.rpartition(".")
    return stem if dot else name

# Restore the bare image IDs expected in the submission file.
test_df["ID"] = test_df["ID"].apply(remove_png)
test_df.head()

# Class index -> original label string (indices 0-4 in list order).
target_dic = dict(enumerate([
    '10_콘크리트외벽',
    '20_조적외벽',
    '30_판넬외벽',
    '40_유리외벽',
    '50_기타외벽',
]))

# Translate the predicted indices in the "Target" column back into labels.
submit_df = test_df.replace({'Target': target_dic})
submit_df.head()

# Distribution of the predicted labels.
submit_df['Target'].value_counts()

from nipa.taskSubmit import nipa_submit

# Identifiers passed to nipa_submit below.
team_id = "1429"
task_no= "148"
# File handed to nipa_submit as `result`.
prediction_path = '/home/workspace/user-workspace/prediction/prediction148.csv'
# Second copy stored under this run's output folder.
save_path = os.path.join(DATA_OUT_PATH, model_name, 'prediction148.csv')

# Write the same table to both locations, with a header row and without the index.
submit_df.to_csv(prediction_path, index = False, header = True)
submit_df.to_csv(save_path, index = False, header = True)

# Report whether the prediction file exists before submitting.
print("is file: ", os.path.isfile(prediction_path))

# Hand the prediction file to the submission helper.
nipa_submit(team_id=team_id,
task_no=task_no,
result=prediction_path
)