# Google Landmark Recognition Challenge

## Using Keras tensorflow CNN

### The Imports

In [1]:
#Necessary Imports for this algorithm

import tensorflow as tf
import pandas as pd
import numpy as np
#import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import itertools

from keras.utils.np_utils import to_categorical # convert to one-hot-encoding
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D
from keras.optimizers import RMSprop
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ReduceLROnPlateau

sns.set(style='white', context='notebook', palette='deep')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## The Data

In [3]:
train_df = pd.read_csv('/Users/JoonH/Desktop/Kaggle_data/landmarks_train.csv')
test_df = pd.read_csv('/Users/JoonH/Desktop/Kaggle_data/landmarks_test.csv')

# The approach
### Decode all images in the train and build a CNN based on that
### General Process:
#### 1. Generate X_train and Y_train from train_df with images downloaded (remove rows with broken links)
#### 2. Generate X_test as a numpy array using images downloaded
#### 3. Build CNN with keras tensorflow
#### 4. Train!
#### 5. Predict with trained model

In [4]:
train_df.head()

Unnamed: 0,id,url,landmark_id
0,cacf8152e2d2ae60,http://static.panoramio.com/photos/original/70...,4676
1,0a58358a2afd3e4e,http://lh6.ggpht.com/-igpT6wu0mIA/ROV8HnUuABI/...,6651
2,6b2bb500b6a38aa0,http://lh6.ggpht.com/-vKr5G5MEusk/SR6r6SJi6mI/...,11284
3,b399f09dee9c3c67,https://lh3.googleusercontent.com/-LOW2cjAqubA...,8429
4,19ace29d77a5be66,https://lh5.googleusercontent.com/-tnmSXwQcWL8...,6231


In [5]:
test_df.head()

Unnamed: 0,id,url
0,000088da12d664db,https://lh3.googleusercontent.com/-k45wfamuhT8...
1,0001623c6d808702,https://lh3.googleusercontent.com/-OQ0ywv8KVIA...
2,0001bbb682d45002,https://lh3.googleusercontent.com/-kloLenz1xZk...
3,0002362830cfe3a3,https://lh3.googleusercontent.com/-N6z79jNZYTg...
4,000270c9100de789,https://lh3.googleusercontent.com/-keriHaVOq1U...


In [6]:
#find all the names

import os
rootdir_train = 'C:/Users/JoonH/Desktop/Landmarks_data/train'
rootdir_test = 'C:/Users/JoonH/Desktop/Landmarks_data/test'

names_train = []
#names_test = []

for subdir, dirs, files in os.walk(rootdir_train):
    for file in files:
        name = file[:-4]
        names_train.append(name)
        
#for subdir, dirs, files in os.walk(rootdir_test):
#    for file in files:
#        name = file[:-4]
#        names_test.append(name)
        

In [56]:
names_train


['00001106ce16f5fc',
 '0000222a89e243df',
 '00003092db9c8d35',
 '000036547d852a99',
 '00004d453fd979d0',
 '0000523e34f86fc3',
 '000053a907102752',
 '000058168141573f',
 '000064ff7079b1da',
 '00007a175a6775cc',
 '000090980ba6b551',
 '0000affb2cbdf699',
 '0000b9df596a3187',
 '0000cb0474bd5c93',
 '0000cf432f82f744',
 '0000e272a73c5186',
 '0000ec7396b3caf1',
 '0000f9971ac99874',
 '00010051a5b0b060',
 '000109188efbcb60',
 '00010a9d72c54031',
 '000110fc8d56bd9f',
 '000117de8e6d3d2a',
 '00011bcd49a14b1b',
 '00011ced2c81d968',
 '00012fae9f7fec26',
 '00013c1f4d443a46',
 '000148270e6a0c5d',
 '0001574de35357d7',
 '00016316f1ebe481',
 '000165ef8a66a32f',
 '00016c18279d0ed0',
 '00016ceaac039874',
 '000182e311643015',
 '00019b0dccee7110',
 '0001b2b23681d40b',
 '0001bb27a457f549',
 '0001d4d13d38d9f4',
 '0001ed8e32298154',
 '0001f599bc81055e',
 '0001f6554f0699a5',
 '00021d2240b40178',
 '0002361e26c7755f',
 '000254ba5df240b9',
 '00025e270442ef2e',
 '000264f84513f971',
 '00026e47e4716740',
 '000289e4047

In [57]:
#names_test

['000088da12d664db',
 '0001623c6d808702',
 '0001bbb682d45002',
 '0002362830cfe3a3',
 '000270c9100de789',
 '0002b0fab5d3ccc4',
 '000396be3c24830a',
 '000506dc6ab3a40e',
 '0005292fc4b005a3',
 '0005456a82264bc8',
 '00055cf2bfb5594a',
 '000664eed4a70821',
 '0006aea5b6f4eaaa',
 '0006bbfa00dd6c0f',
 '0008aee1c0abed9d',
 '0008de5f3c25d563',
 '00094466c9f054f4',
 '0009f09a69405693',
 '000a7f4a25af7558',
 '000a9180e4dc8705',
 '000aee511bc53f8b',
 '000b55ddc1160a12',
 '000c08916d77b8ba',
 '000c2a80838aabff',
 '000dad8a776a3631',
 '000e30f1e5b32741',
 '000e47a43eb8a307',
 '00108ec72b2344bd',
 '0010ab920bac8640',
 '0010e7cad366d6fa',
 '001119140e4bb030',
 '0013a06d20c7839f',
 '001490806c884d96',
 '0014be4f4910bbe6',
 '0014ea5d819557be',
 '0015898890608086',
 '001645116c77f6be',
 '001693c1ea808396',
 '00169f449b4c60c0',
 '0016de8fd1c3eab6',
 '0017719d5bba733c',
 '0017deb491895bba',
 '001800346d31f711',
 '00181e5553bf2564',
 '0018761e6c424bd4',
 '0018893bd38632df',
 '0018b97bd2b24bd7',
 '0018d55375f

In [7]:
Y_train = train_df
#test_df
Y_train = Y_train.drop(['url'], axis = 1)
Y_train.head()

Unnamed: 0,id,landmark_id
0,cacf8152e2d2ae60,4676
1,0a58358a2afd3e4e,6651
2,6b2bb500b6a38aa0,11284
3,b399f09dee9c3c67,8429
4,19ace29d77a5be66,6231


In [8]:
Y_train.set_index('id', inplace = True)
#Y_val.set_index('id', inplace = True)

In [9]:
Y_train.head()

Unnamed: 0_level_0,landmark_id
id,Unnamed: 1_level_1
cacf8152e2d2ae60,4676
0a58358a2afd3e4e,6651
6b2bb500b6a38aa0,11284
b399f09dee9c3c67,8429
19ace29d77a5be66,6231


In [10]:
Y_train = Y_train.loc[names_train]
#Y_val = Y_val.loc[names_test]

In [11]:
Y_train = Y_train.reset_index(drop=True)
#Y_val = Y_val.reset_index(drop=False)

In [12]:
Y_train.head()

Unnamed: 0,landmark_id
0,10728
1,3748
2,6599
3,12630
4,6901


### Generating X_train from images downloaded via Kaggle

In [33]:
import cv2
import os

def load_images_from_folder(folder):
    exist = True
    #images = np.array([])
    #images.reshape(256,256)
    
    i = 0
    
    for filename in os.listdir(folder):
        img = cv2.imread(os.path.join(folder,filename))
        if img is not None:
             if i < 2: 
                if exist:
                    images = img
                    exist = False
                else:
                    #images.append(img)
                    #np.concatenate((images,img))
                    np.hstack((images,img))
                #print(1)
                i = i + 1
    return images

In [34]:
X_train = load_images_from_folder("/Users/JoonH/Desktop/Landmarks_data/train")


KeyboardInterrupt: 

In [38]:
import glob

#building X_train
X_train = []
files = glob.glob("/Users/JoonH/Desktop/Landmarks_data/train")

for file in files:
    image = cv2.imread(file)
    X_train.append(image)

In [68]:
import cv2
import os

def load_images_from_folder_new(folder):
    X_train = []
    for filename in os.listdir(folder):
        img = cv2.imread(os.path.join(folder,filename))
        #img = cv2.imread(filename)
        X_train.append(img)
    return X_train

In [72]:
X_train = load_images_from_folder_new("/Users/JoonH/Desktop/Landmarks_data/train")

In [73]:
print('Date shape: ', np.array(X_train).shape)

MemoryError: 

In [101]:
#X_test can be built later for creating submission file 
#TODO: Save train dataset as csv
#X_test = load_images_from_folder("/Users/JoonH/Desktop/Landmarks_data/test")

In [74]:
len(X_train)

1218647

In [79]:
X_train = np.asarray(X_train)

MemoryError: 

In [80]:
np.savetxt('Landmarks_train_data.csv',X_train,delimiter = ',')

MemoryError: 

In [81]:
len(Y_train)

1218647

In [83]:
# Grayscale normalization of the data
# since CNN converges faster with [0,1] when current data goes [0,255]
X_train = np.array(X_train) / 255.0
#test = test / 255.0

MemoryError: 

In [85]:
# Reshape image in 3 dimensions (height = 28px, width = 28px , canal = 1)
#X_train = X_train.reshape((-1,28,28,1))
X_train = np.array(X_train).reshape((-1, 256, 256,1))
#test = test.reshape((-1,28,28,1))

MemoryError: 

In [26]:
# Encode labels to one hot vectors (ex : 2 -> [0,0,1,0,0,0,0,0,0,0])
Y_val = to_categorical(Y_train, num_classes = 10728)
random_seed = np.random.seed(2)

MemoryError: 

In [86]:
# Split the train and the validation set for the fitting
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size = 0.2, random_state= 2)

#I choosed to split the train set in two parts : a small fraction (10%) became the validation set which the model is evaluated and the rest (90%) is used to train the model.
#Since we have 42 000 training images of balanced labels (see 2.1 Load data), a random split of the train set doesn't cause some labels to be over represented in the validation set. Be carefull with some unbalanced dataset a simple random split could cause inaccurate evaluation during the validation.
#To avoid that, you could use stratify = True option in train_test_split function (Only for >=0.17 sklearn versions).


In [87]:
len(X_train)

974917

In [88]:
len(Y_train)

974917

In [89]:
len(X_val)

243730

In [90]:
len(Y_val)

243730

In [91]:
# Set the CNN model 
# my CNN architechture is In -> [[Conv2D->relu]*2 -> MaxPool2D -> Dropout]*2 -> Flatten -> Dense -> Dropout -> Out
# Expanded from origianl example model

model = Sequential()

model.add(Conv2D(filters = 32, kernel_size = (5,5),padding = 'Same', 
                 activation ='relu', input_shape = (28,28,1)))
model.add(Conv2D(filters = 32, kernel_size = (5,5),padding = 'Same', 
                 activation ='relu'))
model.add(MaxPool2D(pool_size=(2,2)))
model.add(Dropout(0.25))


model.add(Conv2D(filters = 64, kernel_size = (3,3),padding = 'Same', 
                 activation ='relu'))
model.add(Conv2D(filters = 64, kernel_size = (3,3),padding = 'Same', 
                 activation ='relu'))
model.add(MaxPool2D(pool_size=(2,2), strides=(2,2)))
model.add(Dropout(0.25))

model.add(Flatten())
model.add(Dense(256, activation = "relu"))
model.add(Dropout(0.5))
model.add(Dense(10, activation = "softmax"))

In [92]:
# Define the optimizer
optimizer = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)

In [93]:
# Compile the model
model.compile(optimizer = optimizer , loss = "categorical_crossentropy", metrics=["accuracy"])

In [94]:
# Set a learning rate annealer, learning rate is checked every two runes to see if accuracy has increased,
# If not showing much improvement learning rate is reduced by factor of 0.5
learning_rate_reduction = ReduceLROnPlateau(monitor='val_acc', 
                                            patience=2, 
                                            verbose=1, 
                                            factor=0.50, 
                                            min_lr=0.00001)

In [98]:
epochs = 10 
batch_size = 16

In [99]:
datagen = ImageDataGenerator(
        featurewise_center=False,  # set input mean to 0 over the dataset
        samplewise_center=False,  # set each sample mean to 0
        featurewise_std_normalization=False,  # divide inputs by std of the dataset
        samplewise_std_normalization=False,  # divide each input by its std
        zca_whitening=False,  # apply ZCA whitening
        rotation_range=0,  # randomly rotate images in the range (degrees, 0 to 180)
        zoom_range = 0, # Randomly zoom image 
        width_shift_range=0,  # randomly shift images horizontally (fraction of total width)
        height_shift_range=0,  # randomly shift images vertically (fraction of total height)
        horizontal_flip=False,  # randomly flip images
        vertical_flip=False)  # randomly flip images


datagen.fit(X_train)

MemoryError: 

In [96]:
history = model.fit_generator(datagen.flow(X_train, Y_train, batch_size=batch_size),
                              epochs = epochs, validation_data = (X_val,Y_val),
                              verbose = 2, steps_per_epoch=X_train.shape[0] // batch_size
                              , callbacks=[learning_rate_reduction])

NameError: name 'datagen' is not defined

In [106]:
X_train = X_train[::2]
Y_train = Y_train[::2]

In [107]:
print(len(X_train), len(Y_train))

243730 243730


In [110]:
X_val = X_val[::2]
Y_val = Y_val[::2]

In [111]:
print(len(X_val), len(Y_val))

60933 60933


In [113]:
cnn = model.fit( x=X_train, y=Y_train, batch_size=86, 
                epochs=15, verbose=1, callbacks=None, 
                validation_split=0.0, validation_data=(X_val,Y_val), 
                shuffle=False, class_weight=None, sample_weight=None, 
                initial_epoch=0, steps_per_epoch=None, validation_steps=None)

MemoryError: 

In [None]:
#Save the Model
# TODO(): YAML saving and loading model

model.save('Brian_Digit_Recognizer.h5')

In [None]:
# predict results
results = model.predict(test)

# select the indix with the maximum probability
# Save final prediction results into csv file
results = np.argmax(results,axis = 1)

results = pd.Series(results,name="Label")

submission = pd.concat([pd.Series(range(1,28001),name = "LandmarkId"),results],axis = 1)
submission.to_csv("Landmarks_submission_1.csv",index=False)