<a href="https://colab.research.google.com/github/changdaeoh/DACON_Dirty-MNIST/blob/main/0218_fit_on_newdata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Connect Google Drive to this Colab runtime
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# setting
import tensorflow as tf
from tensorflow import  keras

!pip install tensorflow-addons
import tensorflow_addons as tfa

import numpy as np
import pandas as pd 

from tensorflow.keras.preprocessing.image import ImageDataGenerator

import os
os.chdir('/content/drive/MyDrive/project_dataset/dacon_v2')


# 데이터 경로
train_dir = "/content/drive/MyDrive/project_dataset/dacon_v2/dirty_mnist_2nd"
test_dir = "/content/drive/MyDrive/project_dataset/dacon_v2/test_route"

# GPU 확인
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [None]:
# # 현재 경로에 새로운 디렉토리 생성
# # 생성한 디렉토리에 zip파일 압축해제

from google.colab import output

!mkdir "dirty_mnist_2nd"
!unzip "dirty_mnist_2nd.zip" -d "dirty_mnist_2nd"

# !mkdir "test_dirty_mnist"
# !unzip "test_dirty_mnist.zip" -d "./test_dirty_mnist"

output.clear()

# Training 
### 1. Data Preparation

In [None]:
# Label table: the 'index' column identifies the image, the remaining columns are the targets.
meta_df = pd.read_csv('dirty_mnist_2nd_answer.csv')
# Turn the numeric id into the image file name, e.g. 7 -> '00007.png'
meta_df['index'] = meta_df['index'].apply(lambda x: "{0:05d}.png".format(x))
columns = list(meta_df.columns[1:])

# Fraction of rows held out for validation; both generators below must use the same value
val_split = 0.1

# Data augmentation: applied to the training subset only
datagen_ag = ImageDataGenerator(rescale=1./255.,
                                rotation_range=10,
                                width_shift_range=0.2,
                                height_shift_range=0.2,
                                horizontal_flip=True,
                                vertical_flip=True,
                                validation_split=val_split,
                                fill_mode="nearest")

# Rescaling only, for the validation subset. Drawing validation batches from the
# augmenting generator would randomly shift/rotate/flip the validation images and make
# val_loss (which drives checkpointing and early stopping) noisy.
datagen_val = ImageDataGenerator(rescale=1./255., validation_split=val_split)

# Arguments shared by the training and validation iterators
flow_kwargs = dict(dataframe=meta_df,
                   directory=train_dir,
                   x_col='index',
                   y_col=columns,
                   batch_size=32,
                   seed=42,
                   color_mode="rgb",
                   class_mode='raw',
                   target_size=(256, 256))

# Same dataframe and same validation_split on both generators, so the two subsets
# are complementary (see the Keras `subset` / `validation_split` documentation).
train_gen_ag = datagen_ag.flow_from_dataframe(subset='training', **flow_kwargs)
# Name kept for the cells that use it; this iterator no longer augments.
val_gen_ag = datagen_val.flow_from_dataframe(subset='validation', **flow_kwargs)

Found 45000 validated image filenames.
Found 5000 validated image filenames.


### 2. Model Building

In [None]:
# Backbone: InceptionResNetV2 with ImageNet weights and without its classification top
base_model = keras.applications.inception_resnet_v2.InceptionResNetV2(include_top=False,
                                                                      weights='imagenet',
                                                                      input_shape = (256, 256, 3))
# model body: backbone -> global average pooling -> three Dense/BatchNorm/ELU stages
# -> 26 sigmoid outputs
incep_res = keras.Sequential([
                              base_model,

                              keras.layers.GlobalAveragePooling2D(), 
                              keras.layers.Dense(1024, kernel_initializer='he_normal'),
                              keras.layers.BatchNormalization(),
                              keras.layers.Activation('elu'),
                              keras.layers.Dense(512, kernel_initializer='he_normal'),
                              keras.layers.BatchNormalization(),
                              keras.layers.Activation('elu'),
                              keras.layers.Dense(256, kernel_initializer='he_normal'),
                              keras.layers.BatchNormalization(),
                              keras.layers.Activation('elu'),
                              keras.layers.Dense(26, kernel_initializer='glorot_normal', activation='sigmoid')
])      

# optimizer & metrics
optimizer = keras.optimizers.Nadam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07)
BAcc = keras.metrics.BinaryAccuracy()

# model compile
incep_res.compile(optimizer = optimizer, loss = "binary_crossentropy", metrics = [BAcc])

# Make sure the checkpoint folder exists before anything is written into it;
# nothing else in this notebook creates it.
os.makedirs('./model', exist_ok=True)

# callbacks: keep only the best model on disk (default monitor), stop after
# 10 epochs without improvement
checkpoint = keras.callbacks.ModelCheckpoint('./model/incep_res_cd_0219_v1.h5', 
                                             save_best_only=True, verbose=1)
early_stop_cb = keras.callbacks.EarlyStopping(patience = 10, restore_best_weights = True)

# fitting
history = incep_res.fit(train_gen_ag, epochs = 35, 
                        validation_data = val_gen_ag, 
                        callbacks = [checkpoint, early_stop_cb])

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_resnet_v2/inception_resnet_v2_weights_tf_dim_ordering_tf_kernels_notop.h5
Epoch 1/35

Epoch 00001: val_loss improved from inf to 0.67056, saving model to ./model/incep_res_cd_0219_v1.h5
Epoch 2/35

Epoch 00002: val_loss improved from 0.67056 to 0.62884, saving model to ./model/incep_res_cd_0219_v1.h5
Epoch 3/35

Epoch 00003: val_loss improved from 0.62884 to 0.61823, saving model to ./model/incep_res_cd_0219_v1.h5
Epoch 4/35

Epoch 00004: val_loss improved from 0.61823 to 0.59933, saving model to ./model/incep_res_cd_0219_v1.h5
Epoch 5/35

Epoch 00005: val_loss improved from 0.59933 to 0.57490, saving model to ./model/incep_res_cd_0219_v1.h5
Epoch 6/35

Epoch 00006: val_loss improved from 0.57490 to 0.54299, saving model to ./model/incep_res_cd_0219_v1.h5
Epoch 7/35

Epoch 00007: val_loss improved from 0.54299 to 0.52788, saving model to ./model/incep_res_cd_0219_v1.h5
Epoch 8/35

Epoch 00008: 

### 훈련데이터를 더 써서 두 번째 훈련
* 49000 : 1000

In [None]:
# Second-stage split: hold out 2% of the rows for validation.
# Both generators below must use the same value.
val_split2 = 0.02

# Data augmentation: applied to the training subset only
datagen_ag2 = ImageDataGenerator(rescale=1./255.,
                                 rotation_range=10,
                                 width_shift_range=0.2,
                                 height_shift_range=0.2,
                                 horizontal_flip=True,
                                 vertical_flip=True,
                                 validation_split=val_split2,
                                 fill_mode="nearest")

# Rescaling only, for the validation subset. Drawing validation batches from the
# augmenting generator would randomly shift/rotate/flip the validation images and make
# val_loss (which drives checkpointing and early stopping) noisy.
datagen_val2 = ImageDataGenerator(rescale=1./255., validation_split=val_split2)

# Arguments shared by the training and validation iterators
flow_kwargs2 = dict(dataframe=meta_df,
                    directory=train_dir,
                    x_col='index',
                    y_col=columns,
                    batch_size=32,
                    seed=42,
                    color_mode="rgb",
                    class_mode='raw',
                    target_size=(256, 256))

# Same dataframe and same validation_split on both generators, so the two subsets
# are complementary (see the Keras `subset` / `validation_split` documentation).
train_gen_ag2 = datagen_ag2.flow_from_dataframe(subset='training', **flow_kwargs2)
# Name kept for the cells that use it; this iterator no longer augments.
val_gen_ag2 = datagen_val2.flow_from_dataframe(subset='validation', **flow_kwargs2)

Found 49000 validated image filenames.
Found 1000 validated image filenames.


In [None]:
# Callbacks for the second run: write a checkpoint only when the monitored value
# improves, and stop after 5 epochs without improvement.
early_stop_cb2 = keras.callbacks.EarlyStopping(restore_best_weights=True, patience=5)
checkpoint2 = keras.callbacks.ModelCheckpoint(
    filepath='./model/incep_res_cd_0219_v1_add.h5',
    verbose=1,
    save_best_only=True,
)

# Continue fitting the already-trained model on the second pair of generators
history = incep_res.fit(
    x=train_gen_ag2,
    validation_data=val_gen_ag2,
    epochs=10,
    callbacks=[checkpoint2, early_stop_cb2],
)

Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.32388, saving model to ./model/incep_res_cd_0219_v1_add.h5
Epoch 2/10

Epoch 00002: val_loss improved from 0.32388 to 0.31705, saving model to ./model/incep_res_cd_0219_v1_add.h5
Epoch 3/10

Epoch 00003: val_loss improved from 0.31705 to 0.31417, saving model to ./model/incep_res_cd_0219_v1_add.h5
Epoch 4/10

Epoch 00004: val_loss did not improve from 0.31417
Epoch 5/10

Epoch 00005: val_loss did not improve from 0.31417
Epoch 6/10

Epoch 00006: val_loss did not improve from 0.31417
Epoch 7/10

Epoch 00007: val_loss improved from 0.31417 to 0.31187, saving model to ./model/incep_res_cd_0219_v1_add.h5
Epoch 8/10

Epoch 00008: val_loss did not improve from 0.31187
Epoch 9/10

Epoch 00009: val_loss did not improve from 0.31187
Epoch 10/10
  81/1532 [>.............................] - ETA: 13:13 - loss: 0.2486 - binary_accuracy: 0.9018

KeyboardInterrupt: ignored

# Prediction

In [None]:
# Unlabelled test images, loaded without shuffling so that the prediction order is fixed
test_ds = tf.keras.preprocessing.image_dataset_from_directory(
    directory = test_dir,
    label_mode = None,
    class_names = None,
    color_mode = "rgb",
    batch_size = 64,
    image_size = (256, 256),
    shuffle = False,
    seed = None,
    validation_split = None
)

# Id column for the submission files. The prediction cells read `test_df.iloc[:, 0]`,
# but `test_df` was never created anywhere in the notebook. Build it from the dataset's
# own file list so that row i corresponds to prediction i. `file_paths` has to be read
# here, before prefetch() wraps the dataset.
# NOTE(review): assumes the submission id is the image file name without its extension
# and that the id column is called 'index' — confirm against the sample submission.
test_df = pd.DataFrame({
    'index': [os.path.splitext(os.path.basename(path))[0] for path in test_ds.file_paths]
})

test_ds = test_ds.prefetch(1)

Found 5000 files belonging to 1 classes.


In [None]:
# test_ds is built without any rescaling step, so a Rescaling(1/255) layer is
# placed in front of the trained model for inference.
# NOTE(review): this uses whatever weights `incep_res` currently holds; no checkpoint
# is reloaded in this cell — confirm that is intended.
mod = keras.Sequential()
mod.add(keras.layers.experimental.preprocessing.Rescaling(1./255.))
mod.add(incep_res)

# Predicted probabilities for every test image
result = mod.predict(test_ds)

# Hard labels: strictly above 0.5 -> 1, at or below 0.5 -> 0
is_negative = result <= 0.5
is_positive = result > 0.5
result2 = result.copy()
result2[is_negative] = 0
result2[is_positive] = 1

# Assemble the submission table: id column followed by the integer labels.
# NOTE(review): `test_df` is not created in this cell; it must exist before this runs.
res = pd.DataFrame(data=result2, columns=columns)
int_rest = res.astype(int)
submit = pd.concat([test_df.iloc[:, 0], int_rest], axis=1)

submit.to_csv("IncepRes_0219_v1.csv", index=False)


In [None]:
# Final training run on all of the data (no validation data is passed to fit)

# Rescaling only: this generator applies no augmentation
datagen_fin = ImageDataGenerator(rescale=1./255.)

# Iterator over the label table; no validation_split is configured on datagen_fin
train_gen_fin = datagen_fin.flow_from_dataframe(
    meta_df,
    train_dir,
    x_col='index',
    y_col=columns,
    class_mode='raw',
    color_mode="rgb",
    target_size=(256, 256),
    batch_size=32,
    seed=42,
    subset='training',
)

# New optimizer (learning rate 0.0005) and metric object for the re-compiled model
BAcc = keras.metrics.BinaryAccuracy()
optimizer = keras.optimizers.Nadam(epsilon=1e-07, beta_2=0.999, beta_1=0.9, learning_rate=0.0005)

# Re-compile with the new optimizer
incep_res.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=[BAcc])

# Checkpoint callback without save_best_only
checkpoint3 = keras.callbacks.ModelCheckpoint(filepath='./model/incep_res_cd_0219_v1_fin.h5', verbose=1)

# Two epochs, no validation
history_fin = incep_res.fit(x=train_gen_fin, callbacks=[checkpoint3], epochs=2)

Found 50000 validated image filenames.
Epoch 1/2
Epoch 2/2


In [None]:
# Write the current state of the model to the final-stage HDF5 file
incep_res.save(filepath='./model/incep_res_cd_0219_v1_fin.h5')

In [None]:
# test_ds is built without any rescaling step, so a Rescaling(1/255) layer is
# placed in front of the model for inference.
rescale_layer = keras.layers.experimental.preprocessing.Rescaling(1./255.)
mod = keras.Sequential([rescale_layer, incep_res])

# Predict on the test set and round the probabilities to class labels
yhat = np.round(mod.predict(test_ds))

# Assemble the submission table: id column followed by the integer labels.
# NOTE(review): `test_df` is not created in this cell; it must exist before this runs.
res = pd.DataFrame(data=yhat, columns=columns)
int_rest = res.astype(int)
submit = pd.concat([test_df.iloc[:, 0], int_rest], axis=1)

submit.to_csv("IncepRes_0219_v2.csv", index=False)