In [1]:
# Widen the notebook container so wide outputs (e.g. the model summary) fit.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [2]:
import os
import csv

from glob import glob
from pathlib import Path

import pandas as pd
import numpy as np

from tensorflow.keras.layers import Flatten, Dense, Dropout, GlobalAveragePooling2D, BatchNormalization
from tensorflow.keras.initializers import GlorotNormal, Zeros
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
from tensorflow.keras.callbacks import ModelCheckpoint

from sklearn.model_selection import StratifiedKFold

In [3]:
# Read the training labels into a DataFrame and preview the first rows.
labels_csv = "data/train_label.csv"
df = pd.read_csv(labels_csv)
df.head()

Unnamed: 0,ID,Label
0,0,1
1,1,0
2,2,1
3,3,1
4,4,1


In [4]:
# Cell-local imports: the pretrained backbone, plus TensorFlow itself for seeding.
import tensorflow as tf
from tensorflow.keras.applications import VGG16, Xception

# Seed NumPy *and* TensorFlow. The new Dense layers are initialised by
# TensorFlow's random generator, so seeding NumPy alone does not make the
# classifier head reproducible across kernel restarts.
np.random.seed(5242)
tf.random.set_seed(5242)

target_size = (224, 224)
no_of_classes = df['Label'].nunique()  # one softmax unit per distinct label

input_shape = list(target_size) + [3]  # image size plus 3 colour channels
base_model = VGG16(input_shape=input_shape, weights='imagenet', include_top=False)

# don't train existing weights
for layer in base_model.layers:
    layer.trainable = False

# w_init = GlorotNormal()

# Classification head stacked on the frozen convolutional features.
x = Flatten()(base_model.output)
# Alternative head kept from earlier experiments:
# x = Dropout(0.2)(x)
# x = Dense(256, activation="relu", kernel_initializer=w_init,  kernel_regularizer="l2")(x)
# x = BatchNormalization()(x)
x = Dense(1024, activation="relu")(x)
x = Dropout(0.2, seed=5242)(x)
prediction = Dense(no_of_classes, activation='softmax')(x)

# create a model object (backbone input -> new softmax head)
model = Model(inputs=base_model.input, outputs=prediction)

# view the structure of the model
model.summary()

# tell the model what cost and optimization method to use
model.compile(
  loss='categorical_crossentropy',
  optimizer='adam',
  metrics=['accuracy']
)

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0     

In [5]:
# Build each image's file name ("<ID>.png") from its ID and store the labels
# as strings; later cells compare them against '0', '1' and '2'.
df['Filename'] = df['ID'].map("{}.png".format)
df['Label'] = df['Label'].astype(str)
df.head()

Unnamed: 0,ID,Label,Filename
0,0,1,0.png
1,1,0,1.png
2,2,1,2.png
3,3,1,3.png
4,4,1,4.png


In [6]:
# Undersample so every class contributes the same number of rows.
# Split the frame per label first; filtering does not consume random numbers.
class_frames = [df[df['Label'] == label] for label in ('0', '1', '2')]

np.random.seed(5242)

# The smallest class dictates how many rows are kept from each class.
min_row_count = min(frame.shape[0] for frame in class_frames)

# Sample in label order 0, 1, 2 so the random draws are consumed in a fixed order.
df_0, df_1, df_2 = [frame.sample(n=min_row_count) for frame in class_frames]

df = pd.concat([df_0, df_1, df_2])
df.head()

Unnamed: 0,ID,Label,Filename
199,199,0,199.png
558,558,0,558.png
100,100,0,100.png
831,831,0,831.png
709,709,0,709.png


In [7]:
# Renumber the rows 0..n-1 after the class-balancing shuffle and drop the ID
# column, which is no longer needed once 'Filename' exists.
# `drop=True` discards the old index instead of materialising an 'index'
# column, and `errors='ignore'` keeps the cell re-runnable: a second execution
# no longer raises KeyError because 'ID' is already gone.
df = df.reset_index(drop=True).drop(columns=['ID'], errors='ignore')

In [8]:
# Stratified k-fold configuration.
random_state = 5242
k = 3

# Features are the image file names, targets are the (string) class labels.
x_train = df['Filename']
y_train = df['Label']

# Materialise the (train_idx, val_idx) index pairs once so they can be reused.
splitter = StratifiedKFold(n_splits=k, shuffle=True, random_state=random_state)
folds = [fold for fold in splitter.split(x_train, y_train)]

In [9]:
batch_size = 32

# Training and validation images get the same treatment: pixel values are
# multiplied by 1/255, with no augmentation.
pixel_scale = 1./255

train_datagen = ImageDataGenerator(rescale=pixel_scale)
val_datagen = ImageDataGenerator(rescale=pixel_scale)
                                 
# val_datagen = ImageDataGenerator(rescale = 1./255, 
#                             shear_range = 0.1,
#                             zoom_range = 0.1,
#                             width_shift_range = 0.1,
#                             height_shift_range = 0.1,
#                             rotation_range = 10,
#                             horizontal_flip = True,
#                             vertical_flip = True)

In [10]:
# Where the best model seen so far is written.
checkpoint_filepath = os.path.join("checkpoint", "vgg16")

# Save the full model (not just weights), and only when the monitored
# validation loss reaches a new minimum.
checkpoint_options = dict(
    filepath=checkpoint_filepath,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=False,
)
model_checkpoint_callback = ModelCheckpoint(**checkpoint_options)

In [11]:
data_dir = 'data'

def get_dir(data_dir, dir_name):
    """Return the folder under ``data_dir`` that holds the PNG images.

    Looks in ``data_dir/dir_name`` first; if that folder has no ``*.png``
    files, it tries one level deeper at ``data_dir/dir_name/dir_name``.

    Raises:
        OSError: if ``data_dir/dir_name`` does not exist.
        FileNotFoundError: if neither candidate folder contains a PNG file.
    """
    def count_pngs(folder):
        # Number of PNG files directly inside `folder`.
        return len(glob("{}/*.png".format(folder)))

    outer = os.path.join(data_dir, dir_name)

    # Guard: the target directory must exist at all.
    if not os.path.exists(outer):
        raise OSError("!!! {} folder doesn't exist !!!".format(outer))

    # Images directly in the target directory: use it.
    if count_pngs(outer) != 0:
        return outer

    # Otherwise look one layer deeper, in a same-named sub-folder.
    inner = os.path.join(outer, dir_name)
    if count_pngs(inner) != 0:
        return inner

    raise FileNotFoundError("!!! No images found at both {} and {} !!!".format(outer, inner))

# Resolve the training and test image folders (in that order) and show both paths.
train_dir, test_dir = [get_dir(data_dir, folder) for folder in ('train_image', 'test_image')]
print(train_dir, test_dir, sep="\n")

data\train_image
data\test_image


In [12]:
# Snapshot the untrained weights so every fold can start from the same state.
# Run this cell right after the model has been built: if the model has already
# been trained in this kernel, the snapshot would capture trained weights.
initial_weights = model.get_weights()

for idx, (train_idx, val_idx) in enumerate(folds):
    print("\nFold: {}".format(idx))

    # Reset the network and the optimizer for each fold. Without this the model
    # keeps training across folds, so a later fold's validation images have
    # already been used for training in an earlier fold and val_loss (which
    # drives the checkpoint) is no longer an honest estimate.
    # Keep these compile settings in sync with the model-construction cell.
    model.set_weights(initial_weights)
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )

    # The fold indices are positional, so select rows with .iloc rather than
    # relying on the index labels happening to be 0..n-1.
    df_train = pd.DataFrame({
        'Filename': x_train.iloc[train_idx],
        'Label': y_train.iloc[train_idx],
    })
    df_val = pd.DataFrame({
        'Filename': x_train.iloc[val_idx],
        'Label': y_train.iloc[val_idx],
    })

    train_generator = train_datagen.flow_from_dataframe(dataframe=df_train,
                                              directory=train_dir,
                                              validate_filenames=False,
                                              x_col="Filename",
                                              y_col="Label",
                                              class_mode="categorical",
                                              target_size=target_size,
                                              batch_size=batch_size)

    validation_generator = val_datagen.flow_from_dataframe(dataframe=df_val,
                                                  directory=train_dir,
                                                  validate_filenames=False,
                                                  x_col="Filename",
                                                  y_col="Label",
                                                  class_mode="categorical",
                                                  target_size=target_size,
                                                  batch_size=batch_size)

    # NOTE(review): the same checkpoint callback instance is reused for every
    # fold, so its running best val_loss is expected to carry over and the
    # checkpoint should end up holding the best model across all folds —
    # confirm for the installed Keras version.
    model.fit(
        train_generator,
        epochs=20,
        validation_data=validation_generator,
        callbacks=[model_checkpoint_callback]
    )


Fold: 0
Found 624 non-validated image filenames belonging to 3 classes.
Found 312 non-validated image filenames belonging to 3 classes.
  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train for 20 steps, validate for 10 steps
Epoch 1/20
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
INFO:tensorflow:Assets written to: checkpoint\vgg16\assets
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

Fold: 1
Found 624 non-validated image filenames belonging to 3 classes.
Found 312 non-validated image filenames belonging to 3 classes.
  ...
    to  
  ['...']
  ...
    to  
  ['...']
Train for 20 steps, validate for 10 steps
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [13]:
# Reload the best checkpointed model and classify every test image.
best_model = load_model(checkpoint_filepath)

test_files = glob("{}/*.png".format(test_dir))
result = []

for file in test_files:
    # `color_mode="rgb"` alone selects 3-channel loading; the deprecated
    # `grayscale` argument is no longer passed.
    img = load_img(file, color_mode="rgb", target_size=target_size)
    # Batch of one image, scaled by 1/255 like the training generators.
    img_arr = np.array([img_to_array(img)])/255
    predictions = best_model.predict(img_arr)

    img_id = Path(file).stem  # file name without the ".png" extension
    # NOTE(review): assumes the softmax index equals the label value, i.e. the
    # training generator mapped labels '0', '1', '2' to indices 0, 1, 2 — confirm
    # against train_generator.class_indices.
    result.append([img_id, int(np.argmax(predictions))])

# newline='' is required by the csv module; without it every row is followed
# by an extra blank line on Windows.
with open('vgg16-submission3.csv', 'w', newline='') as result_file:
    wr = csv.writer(result_file)
    wr.writerow(['ID', 'Label'])
    wr.writerows(result)

In [14]:
# def train_val_split(df):
#     return np.split(df.sample(frac=1), [int(.7*len(df))])

# df_0_train, df_0_val = train_val_split(df_0)
# df_1_train, df_1_val = train_val_split(df_1)
# df_2_train, df_2_val = train_val_split(df_2)

# df_train = pd.concat([df_0_train, df_1_train, df_2_train])
# df_val = pd.concat([df_0_val, df_1_val, df_2_val])

In [15]:
# # fit the model
# r = model.fit(
#     train_generator,
#     epochs=20,
#     validation_data=validation_generator,
#     callbacks=[model_checkpoint_callback]
# )

In [16]:
# import matplotlib.pyplot as plt

# # loss
# plt.plot(r.history['loss'], label='train loss')
# plt.plot(r.history['val_loss'], label='val loss')
# plt.legend()

# plt.show()

In [17]:
# plt.figure(figsize=(18, 4))
# plt.subplot(1, 2, 1)
# plt.title('Loss')
# plt.plot(r.history['loss'], label="train loss")
# plt.plot(r.history['val_loss'], label="val loss")

# plt.subplot(1, 2, 2)
# plt.title('Accuracy')
# plt.plot(r.history['accuracy'], label="train accuracy")
# plt.plot(r.history['val_accuracy'], label="val accuracy")

# plt.legend()
# plt.show()