# Intro
In this notebook I am going to explore the data a little bit.
The goal of the competition is to estimate the probability that an image contains a dog rather than a cat.
The submission score is measured by LogLoss, where
* y=1 -> dog
* y=0 -> cat

We will try to minimize the score
      

# Preprocess the data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os, cv2
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Dropout, Flatten, Dense
from tensorflow.keras.optimizers import SGD
from tensorflow.keras import regularizers
from tqdm import tqdm      # a nice pretty percentage bar for tasks.

 Creating lists of image file names for both cats and dogs

In [2]:
TRAIN_DIR = '../input/train/' 
TEST_DIR = '../input/test/'

# List each directory once. os.listdir returns entries in arbitrary order, so
# sort them: the train/validation split below uses a fixed random_state and is
# only reproducible when the input order is stable between runs.
train_file_names = sorted(os.listdir(TRAIN_DIR))
test_file_names = sorted(os.listdir(TEST_DIR))

# Full paths of every training image, then the same list split by the class
# name that appears in the file name.
image_list = [TRAIN_DIR + name for name in train_file_names]
dog_image_list = [TRAIN_DIR + name for name in train_file_names if 'dog' in name]
cat_image_list = [TRAIN_DIR + name for name in train_file_names if 'cat' in name]
test_image_list = [TEST_DIR + name for name in test_file_names]


In [3]:
# Report how many file paths ended up in each list.
count_report = (
    ('Total images:\t', image_list),
    ('\nDog Images:\t', dog_image_list),
    ('\nCat Images:\t', cat_image_list),
    ('\nTestImages:\t', test_image_list),
)
# Flatten to label, count, label, count, ... and let print() join them with spaces.
print(*[part for label, files in count_report for part in (label, len(files))])

Total images:	 25000 
Dog Images:	 12500 
Cat Images:	 12500 
TestImages:	 12500


We can see that there is an equal number of cat and dog images.

Now we are going to load the images and resize them to 256x256x3 (ROWS x COLS x CHANNELS).

In [4]:
# Target image geometry used throughout the notebook.
ROWS = 256
COLS = 256
CHANNELS = 3

def load_image(file_path, size=(ROWS,COLS)):
    """Read an image file and return it as a resized RGB array.

    Args:
        file_path: Path of the image file to read.
        size: Target size as (rows, cols). Defaults to (ROWS, COLS).

    Returns:
        The resized image as returned by cv2.resize, with channels in
        R, G, B order.

    Raises:
        IOError: If OpenCV could not read the file (cv2.imread returned None).
    """
    img = cv2.imread(file_path, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise IOError("Could not read image: {}".format(file_path))

    # OpenCV loads colour images in B, G, R channel order; matplotlib expects RGB.
    rgb_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    rows, cols = size
    # cv2.resize takes dsize as (width, height), i.e. (cols, rows).
    return cv2.resize(rgb_img, (cols, rows))
    

def load_image_list(file_list, ret_labels=True):
    """Load every image in file_list into one uint8 array.

    Args:
        file_list: Sequence of image file paths.
        ret_labels: When True, derive a label for each file from its name:
            1 if the file name contains 'dog', otherwise 0.

    Returns:
        A tuple (data, labels). data is a uint8 array of shape
        (len(file_list), ROWS, COLS, CHANNELS). labels is a Python list with
        one int per file, or an empty list when ret_labels is False.
    """
    count = len(file_list)
    data = np.empty((count, ROWS, COLS, CHANNELS), dtype=np.uint8)
    labels = []

    # Wrap the list itself (not the enumerate object) so tqdm knows the total
    # and can show a percentage bar.
    for i, image_name in enumerate(tqdm(file_list)):
        data[i] = load_image(image_name)
        if ret_labels:
            # Inspect only the file name, so a directory whose name happens to
            # contain 'dog' cannot mislabel every image beneath it.
            labels.append(1 if 'dog' in os.path.basename(image_name) else 0)
    return data, labels

In [6]:
# Uncomment to experiment quickly on a small subset of the data:
# dog_image_list = dog_image_list[:1000]
# cat_image_list = cat_image_list[:1000]

# Each class is split separately so that train and validation both keep the
# same dog/cat ratio.
print("Loading Dogs' Images...")
dog_data_images, dog_data_labels = load_image_list(dog_image_list)
dog_train_images, dog_val_images, dog_train_labels, dog_val_labels = train_test_split(
    dog_data_images, dog_data_labels, test_size=0.2, random_state=2)
# Only the split arrays are used from here on; release the full array early.
del dog_data_images


print("Loading Cats' Images...")
cat_data_images, cat_data_labels = load_image_list(cat_image_list)
cat_train_images, cat_val_images, cat_train_labels, cat_val_labels = train_test_split(
    cat_data_images, cat_data_labels, test_size=0.2, random_state=2)
del cat_data_images

# Dogs come first, then cats. The label lists are concatenated in the same
# order and converted to arrays so they can be passed straight to Keras.
train_data = np.concatenate((dog_train_images, cat_train_images), axis=0)
train_labels = np.asarray(dog_train_labels + cat_train_labels)
# train_labels = to_categorical(train_labels, num_classes=2)

val_data = np.concatenate((dog_val_images, cat_val_images), axis=0)
val_labels = np.asarray(dog_val_labels + cat_val_labels)
# val_labels = to_categorical(val_labels, num_classes=2)

del cat_train_images
del cat_val_images
del dog_train_images
del dog_val_images

# Normalize the data to [0, 1].
# Dividing a uint8 array by 255.0 yields float64 (8 bytes per value): roughly
# 31 GB for 20,000 images of 256x256x3, which is the likely cause of the
# MemoryError recorded in the earlier run. float32 halves that; the division is
# done in place to avoid a second temporary copy.
# NOTE(review): float32 still needs ~16 GB at this resolution. Lower ROWS/COLS
# or feed batches from disk if that does not fit.
train_data = train_data.astype(np.float32)
train_data /= 255.0
val_data = val_data.astype(np.float32)
val_data /= 255.0

print ("Training data shape: {}".format(train_data.shape))
print ("Labels length: {}".format(len(train_labels)))

Loading Dogs' Images...


12500it [00:48, 258.29it/s]


Loading Cats' Images...


12500it [00:46, 268.35it/s]


MemoryError: 

## Let's show some images
Some dog images

In [None]:
# Select the first four training images labelled as dogs (label 1) rather than
# relying on hard-coded positions in train_data.
dog_indices = np.flatnonzero(np.asarray(train_labels) == 1)[:4]

f, axarr = plt.subplots(2,2)
# ravel() walks the grid row by row: [0,0], [0,1], [1,0], [1,1].
for ax, idx in zip(axarr.ravel(), dog_indices):
    ax.imshow(train_data[idx])
    ax.set_title('train_data[{}]'.format(idx))
    ax.axis('off')
plt.show()

Some cat images

In [None]:
# train_data holds all dog images first and the cat images after them, so fixed
# positions such as 1000-1003 are still dogs when the full dataset is loaded.
# Select the first four images labelled as cats (label 0) instead.
cat_indices = np.flatnonzero(np.asarray(train_labels) == 0)[:4]

f, axarr = plt.subplots(2,2)
# ravel() walks the grid row by row: [0,0], [0,1], [1,0], [1,1].
for ax, idx in zip(axarr.ravel(), cat_indices):
    ax.imshow(train_data[idx])
    ax.set_title('train_data[{}]'.format(idx))
    ax.axis('off')
plt.show()

We can see that some images are not face close-ups of the animal, but whole-body shots.

We also got some strange images like dog.10801 (Finger in a metal loop???)


In [None]:
# strange_dog = load_image(TRAIN_DIR+'dog.10801.jpg', size=(350,261))
# plt.imshow(strange_dog)

# Building simple classifier

We are going to build a deep network for this task using keras and tensorflow.

In [None]:
# L2 weight penalty shared by the two hidden Dense layers.
regl2 = regularizers.l2(0.01)

# Settings common to every convolution layer: 4x4 kernels, output the same
# spatial size as the input, ReLU activation.
conv_args = dict(kernel_size=(4,4), padding='same', activation='relu')

model = Sequential()

# BLOCK 1: three 64-filter convolutions, then 2x2 max-pooling.
model.add(Conv2D(filters=64, input_shape=(ROWS,COLS,CHANNELS), **conv_args))
model.add(Conv2D(filters=64, **conv_args))
model.add(Dropout(rate=0.5))
model.add(Conv2D(filters=64, **conv_args))
model.add(MaxPool2D(pool_size=(2,2)))
model.add(Dropout(rate=0.5))

# BLOCK 2: three 32-filter convolutions, then 2x2 max-pooling.
model.add(Conv2D(filters=32, **conv_args))
model.add(Conv2D(filters=32, **conv_args))
model.add(Dropout(rate=0.5))
model.add(Conv2D(filters=32, **conv_args))
model.add(MaxPool2D(pool_size=(2,2)))
model.add(Dropout(rate=0.5))

# BLOCK 3: two 64-filter convolutions, then 2x2 max-pooling.
model.add(Conv2D(filters=64, **conv_args))
model.add(Conv2D(filters=64, **conv_args))
model.add(MaxPool2D(pool_size=(2,2)))
model.add(Dropout(rate=0.5))

# BLOCK 4: classifier head.
model.add(Flatten())
model.add(Dense(256, activation='relu', kernel_regularizer=regl2))
model.add(Dense(256, activation='relu', kernel_regularizer=regl2))
# model.add(Dropout(rate=0.5))
# A single sigmoid unit outputs P(dog). Softmax over one unit would always
# return 1, so switching to softmax would need two output units and one-hot
# (to_categorical) labels.
model.add(Dense(1, activation='sigmoid'))

# `learning_rate` is the current argument name; `lr` is a deprecated alias
# that newer Keras releases reject.
optimizer = SGD(learning_rate=0.01)


# binary_crossentropy is the log-loss the competition scores on.
model.compile(optimizer=optimizer, loss="binary_crossentropy",
              metrics=["accuracy"])

In [None]:
# Checkpoint file, written relative to the working directory. The explicit
# .h5 extension selects the HDF5 format; newer Keras releases reject a path
# without a recognised extension.
filepath = "cat_dog_best_reg2_v2.h5"

# Keep only a single checkpoint: the one with the lowest validation loss.
checkpoint = ModelCheckpoint(filepath,
                            monitor='val_loss',
                            verbose=1,
                            save_best_only=True,
                            mode='min')

epochs = 150

batch_size = 50

# Pass the labels as NumPy arrays: plain Python lists of ints are not accepted
# as targets by every Keras version when x is an ndarray.
history = model.fit(x=train_data, y=np.asarray(train_labels), batch_size=batch_size,
                    epochs=epochs, verbose=2,
                    validation_data=(val_data, np.asarray(val_labels)),
                    callbacks=[checkpoint])

# Visualize the accuracy gain
Let's plot the run history and see whether the model converges. In the first run we reached a loss of 0.4122; we can do better.

In [None]:
# Depending on the Keras version, the accuracy metric is recorded as
# 'acc'/'val_acc' or as 'accuracy'/'val_accuracy'. Use whichever is present.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
val_acc_key = 'val_' + acc_key

fig, ax = plt.subplots(2,1)
# Loss Plot
ax[0].plot(history.history['loss'], color='b', label="Training Loss")
ax[0].plot(history.history['val_loss'], color='r', label='Validation Loss')
ax[0].set_ylabel('Loss')
legend = ax[0].legend(loc='best', shadow=True)

# Accuracy Plot
ax[1].plot(history.history[acc_key], color='b', label='Training Accuracy')
ax[1].plot(history.history[val_acc_key], color='r', label='Validation Accuracy')
ax[1].set_xlabel('Epoch')
ax[1].set_ylabel('Accuracy')
legend = ax[1].legend(loc='best', shadow=True)
plt.show()

# Predicting on the test data

In [None]:
print("Loading Test Images...")
# Test images carry no labels, so the second return value is discarded.
test_data, _ = load_image_list(test_image_list, False)

# Normalize to [0, 1] as float32. Dividing the uint8 array by 255.0 directly
# would produce float64 and double the memory needed.
test_data = test_data.astype(np.float32)
test_data /= 255.0

print ("Test data shape: {}".format(test_data.shape))

In [None]:
# Run the trained model on the normalized test images (verbose=1 shows progress).
# The model ends in a single sigmoid unit and dogs were labelled 1, so each
# prediction row holds one value: the predicted probability of "dog".
test_predictions = model.predict(test_data, verbose=1)

In [None]:
print ("Test prediction shape: {}".format(test_predictions.shape))

print("Saving the prediction results...")
# Write the header and all rows in one pass.
with open('submission_file.csv','w') as f:
    f.write('id,label\n')
    # Predictions are in the same order as test_image_list, which follows the
    # directory listing rather than the numeric image id. Take the id from each
    # file name instead of using the loop position.
    # NOTE(review): assumes test files are named '<id>.jpg' - confirm against
    # the competition data.
    for image_path, prediction in tqdm(zip(test_image_list, test_predictions),
                                       total=len(test_image_list)):
        image_id = os.path.splitext(os.path.basename(image_path))[0]
        f.write('{},{}\n'.format(image_id, prediction[0]))