In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import keras as ks # neural network models
import matplotlib.pyplot as plt 

# For working with images
import cv2 as cv2
import matplotlib.image as mpimg
import tqdm

# Potentially useful tools - you do not have to use these
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras.utils import to_categorical
from keras.applications.vgg16 import preprocess_input, decode_predictions

from random import randint

import os
from keras.models import Sequential
from keras.layers import BatchNormalization
from keras.layers.core import Dense, Activation, Flatten, Dropout
from keras.layers.convolutional import Conv2D, Convolution2D, MaxPooling2D, SeparableConv2D
from mpl_toolkits.axes_grid1 import ImageGrid

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder

# Input data files are available in the "../input/" directory.
# Any results you write to the current directory are saved as output.

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# CONSTANTS
# You may not need all of these, and you may find it useful to set some extras

CATEGORIES = ['airplane','car','cat','dog','flower','fruit','motorbike','person']

IMG_WIDTH = 100
IMG_HEIGHT = 100
TRAIN_PATH = '../input/natural_images/natural_images/'
TEST_PATH = '../input/evaluate/evaluate/'

In [3]:
# To find data:
folders = os.listdir(TRAIN_PATH)

images = []

for folder in folders:
    files = os.listdir(TRAIN_PATH + folder)
    images += [(folder, file, folder + '/' + file) for file in files]

image_locs = pd.DataFrame(images, columns=('class','filename','file_loc'))

# data structure is three-column table
# first column is class, second column is filename, third column is image address relative to TRAIN_PATH
image_locs.head()

Unnamed: 0,class,filename,file_loc
0,airplane,airplane_0372.jpg,airplane/airplane_0372.jpg
1,airplane,airplane_0045.jpg,airplane/airplane_0045.jpg
2,airplane,airplane_0075.jpg,airplane/airplane_0075.jpg
3,airplane,airplane_0667.jpg,airplane/airplane_0667.jpg
4,airplane,airplane_0620.jpg,airplane/airplane_0620.jpg


### Over to you

Now you must create your own solution to the problem. To get the file containing your results, you have to `commit` the kernel and then navigate to [kaggle.com/kernels](https://www.kaggle.com/kernels/), and the 'Your Work' tab, where you will find a list of your notebooks. Click on it and scroll down to the `Output` section.

# Dealing with the Classes

In [4]:
# Your code here
NUM_CLASSES = 8
print(image_locs.shape)
# Get a list of all the unique classes
filter_classes = (image_locs.groupby(['class']).count().sort_values(['class'], ascending=False).head(NUM_CLASSES).index)
print(filter_classes)

(6699, 3)
Index(['person', 'motorbike', 'fruit', 'flower', 'dog', 'cat', 'car',
       'airplane'],
      dtype='object', name='class')


In [5]:
img_locs_labels = image_locs['class']
print(img_locs_labels)
targets = pd.Series(img_locs_labels)
one_hot = pd.get_dummies(targets, sparse=True, dummy_na=False)
print(one_hot)
one_hot_labels = np.asarray(one_hot)
print(one_hot_labels)

0       airplane
1       airplane
2       airplane
3       airplane
4       airplane
          ...   
6694       fruit
6695       fruit
6696       fruit
6697       fruit
6698       fruit
Name: class, Length: 6699, dtype: object
      airplane  car  cat  dog  flower  fruit  motorbike  person
0            1    0    0    0       0      0          0       0
1            1    0    0    0       0      0          0       0
2            1    0    0    0       0      0          0       0
3            1    0    0    0       0      0          0       0
4            1    0    0    0       0      0          0       0
...        ...  ...  ...  ...     ...    ...        ...     ...
6694         0    0    0    0       0      1          0       0
6695         0    0    0    0       0      1          0       0
6696         0    0    0    0       0      1          0       0
6697         0    0    0    0       0      1          0       0
6698         0    0    0    0       0      1          0       0

[66

## One Hot Encode the Classes

In [6]:
data = np.array(filter_classes)
onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoded = onehot_encoder.fit_transform(data.reshape(-1,1))

print(onehot_encoded)
print(len(onehot_encoded))

[[0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0.]]
8


In [7]:
# Function to transform the one hot encode back to its original label
# This will be used for transforming predictions in actual labels
def one_hot_to_label(prediction):
#     print(prediction)
#     k = 0
    for i in range(0, len(onehot_encoded)):
        if np.array_equal(prediction, onehot_encoded[i]):
            break
#         k = k + 1
    return filter_classes[i]

# Read In The Images

In [8]:
def read_img(img_id, train_or_test):
    img = cv2.imread(train_or_test + format(img_id))
    return cv2.resize(img, (IMG_WIDTH, IMG_HEIGHT)) 

In [9]:
images = []
image_ids = []
classes = []

image_ids = image_locs['file_loc']

classes = image_locs['class'].tolist()

print(len(classes))

for img_id in image_ids:
  images.append(read_img(img_id, TRAIN_PATH))

print(len(images))

6699
6699


# Split Data Into Training/Test

In [10]:
# convert to np array
X = np.array(image_ids)
Y = np.array(one_hot_labels)

print(X)
print(Y)

# Split data
train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.25, random_state=45)
print(train_x.shape)
print(test_x.shape)
print(train_y.shape)
print(test_y.shape)

['airplane/airplane_0372.jpg' 'airplane/airplane_0045.jpg'
 'airplane/airplane_0075.jpg' ... 'fruit/fruit_0496.jpg'
 'fruit/fruit_0329.jpg' 'fruit/fruit_0880.jpg']
[[1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 1 0 0]]
(5024,)
(1675,)
(5024, 8)
(1675, 8)


## Load images for x train and test

In [11]:
x_train_images = []
x_test_images = []

for img_id in train_x:
  x_train_images.append(read_img(img_id, TRAIN_PATH))
  
for img_id in test_x:
  x_test_images.append(read_img(img_id, TRAIN_PATH))

print(len(x_train_images))
print(len(x_test_images))

5024
1675


# Model

In [12]:
# BUILD THE MODEL
def create_model():
    # Channels first tells the pooling layer to use the (Height, Width, Depth) format instead of the (Depth, Height, Width)
    data_format="channels_first"
    # Create a sequential model
    model = Sequential()
    # A convolutional layer
    model.add(Conv2D(32, (3, 3), input_shape=(IMG_WIDTH, IMG_HEIGHT, 3)))
    # Activation layer
    model.add(Activation('relu'))
    # Pooling layer 
    model.add(MaxPooling2D(data_format=data_format, pool_size=(2, 2)))
    #Add another convolution layer 
    model.add(Conv2D(32, (3,3), input_shape=(IMG_WIDTH, IMG_HEIGHT,3)))
    #Add another relu activation
    model.add(Activation('relu'))
    #Add max pooling
    model.add(MaxPooling2D(data_format=data_format, pool_size=(2, 2)))
    model.add(BatchNormalization())
    #Add another convolution layer
    model.add(Conv2D(32, (3,3), input_shape=(IMG_WIDTH, IMG_HEIGHT, 3)))
    #Add another activation of relu
    model.add(Activation('relu'))
    #Add a max pooling layer
    model.add(MaxPooling2D(data_format=data_format, pool_size=(2, 2)))
    model.add(BatchNormalization())
    # Flatten Squashes the output of the previous layer to an array with 1 dimension
    model.add(Flatten())
    # A dense layer
    model.add(Dense(64))
    #Add another activation of relu 
    model.add(Activation('relu'))
    #Add dropout
    model.add(Dropout(0.1))
    #  Dense layer - last layer must be equal to the number of classes
    model.add(Dense(8))
    model.add(BatchNormalization())
    # Add an activation Sigmoid
    model.add(Activation('sigmoid'))
    # Compile the model
    model.compile(optimizer = 'sgd', 
                      loss='binary_crossentropy', 
                      metrics=['accuracy'])
    
    return model

In [13]:
# Convert images to np array
x_train_images_np = np.array(x_train_images)
x_test_images_np = np.array(x_test_images)

## Fit Model 

In [14]:
model = create_model()

'''
epocs = 100
batch size = 50
'''
model.fit(x_train_images_np, train_y, epochs=100, batch_size=50, validation_data=(x_test_images_np, test_y))


Train on 5024 samples, validate on 1675 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100

<keras.callbacks.History at 0x7f6824119f60>

## Predictions

In [15]:
# Predict the labels on the test set
# print(x_test_images_np)
predictions = model.predict(x_test_images_np)
# print(predictions)
print(model.evaluate(x_test_images_np, test_y))

[0.1407877215698584, 0.9737313434259215]


## View Predictions Against Actual

In [16]:
IMAGES_TO_PLOT = 50
i = 0
predictions = (predictions == predictions.max(axis=1)[:,None]).astype(int)
for img in x_test_images[0:IMAGES_TO_PLOT]:
    print("Pred: " + one_hot_to_label(predictions[i]))
    print('Actual Label: ' + one_hot_to_label(test_y[i]))
    print('----')
    i = i + 1

Pred: flower
Actual Label: flower
----
Pred: cat
Actual Label: dog
----
Pred: cat
Actual Label: cat
----
Pred: motorbike
Actual Label: motorbike
----
Pred: motorbike
Actual Label: motorbike
----
Pred: cat
Actual Label: dog
----
Pred: fruit
Actual Label: fruit
----
Pred: fruit
Actual Label: fruit
----
Pred: airplane
Actual Label: airplane
----
Pred: car
Actual Label: car
----
Pred: cat
Actual Label: cat
----
Pred: airplane
Actual Label: airplane
----
Pred: car
Actual Label: car
----
Pred: car
Actual Label: car
----
Pred: airplane
Actual Label: airplane
----
Pred: person
Actual Label: cat
----
Pred: dog
Actual Label: dog
----
Pred: motorbike
Actual Label: motorbike
----
Pred: person
Actual Label: person
----
Pred: person
Actual Label: person
----
Pred: cat
Actual Label: cat
----
Pred: airplane
Actual Label: airplane
----
Pred: flower
Actual Label: flower
----
Pred: dog
Actual Label: dog
----
Pred: airplane
Actual Label: airplane
----
Pred: fruit
Actual Label: fruit
----
Pred: person
Actu

# Run Model Against Unlabelled Data

## Read in Images

In [17]:
folders1 = os.listdir(TEST_PATH)
images1 = []

for folder1 in folders1:
    files1 = os.listdir(TEST_PATH )
    images1 += [(file1, file1) for file1 in files1]

image_locs1 = pd.DataFrame(images1, columns=('filename','file_loc'))
image_locs1.head()

Unnamed: 0,filename,file_loc
0,39.jpg,39.jpg
1,150.jpg,150.jpg
2,199.jpg,199.jpg
3,146.jpg,146.jpg
4,129.jpg,129.jpg


In [18]:
# Example values:
# filenames = ['test001','test002','test003','test004']
# predictions = ['car','cat','fruit','motorbike']
new_images = []
new_image_locs = image_locs1['file_loc']
new_image_names = image_locs1['filename']

In [19]:
print(len(new_image_names))

for img_id in new_image_locs:
    new_images.append(read_img(img_id, TEST_PATH))

print(len(new_images))

40000
40000


## Convert to np array

In [20]:
 new_images_np = np.array(new_images)

## Create Predictions

In [21]:
new_img_predictions = model.predict(new_images_np)

## Print Predictions

In [22]:
RESULTS_TO_CHECK = 20
i = 0
new_img_predictions = (new_img_predictions == new_img_predictions.max(axis=1)[:,None]).astype(int)
for img in new_images[0:RESULTS_TO_CHECK]:
    print("Pred: " + one_hot_to_label(new_img_predictions[i]))
    print('----')
    i = i + 1

Pred: car
----
Pred: flower
----
Pred: person
----
Pred: flower
----
Pred: motorbike
----
Pred: car
----
Pred: airplane
----
Pred: fruit
----
Pred: airplane
----
Pred: cat
----
Pred: flower
----
Pred: flower
----
Pred: airplane
----
Pred: dog
----
Pred: motorbike
----
Pred: person
----
Pred: airplane
----
Pred: motorbike
----
Pred: car
----
Pred: car
----


In [23]:
pred_one_hot = []

for pred in new_img_predictions:
    label = one_hot_to_label(pred)
    pred_one_hot.append(label)
    
# print(pred_one_hot)

# Save Results

In [24]:
# Save results

# results go in dataframe: first column is image filename, second column is category name
# category names are: airplane, car, cat, dog, flower, fruit, motorbike, person
df = pd.DataFrame()
df['filename'] = new_image_names
df['label'] = pred_one_hot
df = df.sort_values(by='filename')
df2 = df.drop_duplicates(keep="first")
df2.head()
df2.to_csv('results.csv', header=True, index=False)
