In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [4]:
!pip install -U efficientnet

Collecting efficientnet
  Downloading efficientnet-1.1.1-py3-none-any.whl (18 kB)
Collecting keras-applications<=1.0.8,>=1.0.7
  Downloading Keras_Applications-1.0.8-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 1.5 MB/s eta 0:00:011
Installing collected packages: keras-applications, efficientnet
Successfully installed efficientnet-1.1.1 keras-applications-1.0.8


In [5]:
import numpy as np
import pandas as pd
import glob
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import re
from keras.preprocessing.image import ImageDataGenerator
from keras.applications import MobileNetV2
from keras.utils import to_categorical
from keras.layers import Dense
from keras import Model
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from tensorflow.keras.applications.xception import Xception
import tensorflow as tf
import tensorflow.keras.layers as L

import tensorflow.keras.layers as L
import efficientnet.tfkeras as efn

In [None]:
# Tabular inputs: the training labels and the sample submission file.
train_data = pd.read_csv("../input/landmark-recognition-2020/train.csv")
sample = pd.read_csv("../input/landmark-recognition-2020/sample_submission.csv")

# Image inputs: every file three directory levels below train/ and test/.
train_images, test_images = (
    glob.glob(pattern)
    for pattern in (
        '../input/landmark-recognition-2020/train/*/*/*/*',
        '../input/landmark-recognition-2020/test/*/*/*/*',
    )
)

# EDA

In [None]:
# Show the first five rows of the training labels.
train_data.head(n=5)

In [None]:
# Number of training rows per landmark class, most frequent first.
train_data.landmark_id.value_counts()

In [None]:
# Check whether all image ids are distinct.
# `is_unique` answers the question with a single boolean, instead of
# displaying a value_counts() table that has one row per id.
train_data['id'].is_unique

In [None]:
# Count the rows that exactly repeat an earlier row (all columns equal).
train_data.duplicated(keep='first').sum()

Label each training image path with its landmark id from the training CSV file.

In [None]:
# Lookup table: image id -> landmark id.
# Built in a single pass over the two columns. The previous version looked up
# `train_data['id'][i]` row by row, which is slow on a large frame and only
# works while the frame still has its default 0..n-1 index.
image_id = dict(zip(train_data['id'], train_data['landmark_id']))

In [None]:
# Creating a dataframe with landmark id and train image path.
# The image id is the file name (without ".jpg") that sits below three
# single-character directories. The dot before "jpg" is escaped and the id
# group cannot span a "/", so only the final path component is captured.
train_path_pattern = re.compile(r'/[0-9A-Za-z]/[0-9A-Za-z]/[0-9A-Za-z]/([^/]+)\.jpg$')

dict_image_target = {}
for image_path in train_images:
    id_match = train_path_pattern.search(image_path)
    if id_match is None:
        # Fail with the offending path instead of a bare IndexError.
        raise ValueError("Unexpected train image path: {!r}".format(image_path))
    # A KeyError here means the image has no row in train.csv.
    dict_image_target[image_path] = image_id[id_match.group(1)]

df_images_train = pd.DataFrame(list(dict_image_target.items()), columns=['Image_path','Target'])
    

In [None]:
# Show the first five (image path, landmark id) pairs.
df_images_train.head(n=5)

In [None]:
# One-column frame holding the path of every test image.
df_images_test = pd.DataFrame({'Image_path': test_images})
df_images_test.head(n=5)

# Performing basic visualization

In [None]:
# Kernel density estimate of the landmark ids attached to the training images.
density_fig, density_ax = plt.subplots(figsize=(12, 8))

sns.kdeplot(df_images_train['Target'], color="yellow", shade=True, ax=density_ax)
density_ax.set_xlabel("LandMark IDs")
density_ax.set_ylabel("Probability Density")
density_ax.set_title('Class Distribution - Density plot')

plt.show()

In [None]:
# Bar chart of the ten landmark ids with the most training rows.
fig = plt.figure(figsize = (12,8))

# Per-landmark row counts in descending order, cut to the first ten entries.
count = train_data.landmark_id.value_counts().sort_values(ascending=False).iloc[:10]

# The index of `count` is exactly the ten landmark ids, in plotting order.
top10_ax = sns.countplot(x=train_data.landmark_id, order=count.index)

plt.xticks(rotation = 90)

top10_ax.set_xlabel("LandMark Id")
top10_ax.set_ylabel("Frequency")
top10_ax.set_title("Top 10 Classes in the Dataset")

plt.show()

In [None]:
# One example image for each of the five most frequent landmarks.
head_5 = train_data.landmark_id.value_counts().sort_values(ascending=False)[:5].index

images = []

for landmark in head_5:
    landmark_paths = df_images_train.loc[df_images_train.Target == landmark, 'Image_path'].values
    # Index 1 (the second path of the landmark) is the same pick as before.
    img = cv2.imread(landmark_paths[1])
    if img is None:
        # cv2.imread signals an unreadable file by returning None.
        raise FileNotFoundError("Could not read image: {!r}".format(landmark_paths[1]))
    # OpenCV loads BGR; matplotlib expects RGB.
    images.append(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

f, ax = plt.subplots(3,2, figsize=(20,15))
# Switch every panel off first so the unused sixth panel of the 3x2 grid
# does not show an empty frame with ticks.
for panel in ax.ravel():
    panel.axis('off')
# ravel() walks the grid row by row, i.e. panel i is ax[i//2, i%2].
for panel, image in zip(ax.ravel(), images):
    panel.imshow(image)

In [None]:
# Some images from the test dataset.
# Rows 1..12 of the test frame, i.e. the same images as before. The selection
# gets its own name so the global `test_images` list (used to build
# `df_images_test`) is no longer overwritten, and it is positional, so it does
# not depend on the frame's index labels.
sample_test_paths = df_images_test['Image_path'].iloc[1:13]
images = []

for path in sample_test_paths:
    img = cv2.imread(path)
    if img is None:
        # cv2.imread signals an unreadable file by returning None.
        raise FileNotFoundError("Could not read image: {!r}".format(path))
    # OpenCV loads BGR; matplotlib expects RGB.
    images.append(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

f, ax = plt.subplots(3,4, figsize=(20,15))
# Switch every panel off first so panels without an image stay blank.
for panel in ax.ravel():
    panel.axis('off')
# ravel() walks the grid row by row, i.e. panel i is ax[i//4, i%4].
for panel, img in zip(ax.ravel(), images):
    panel.imshow(img)

# Model Building

In [None]:
# val_rate:   fraction of the labelled images held out for validation (20%).
# batch_size: number of images per batch produced by the generators.
val_rate, batch_size = 0.2, 5

In [None]:
# Store the landmark ids as strings (the generators below use
# class_mode="categorical" on this column).
df_images_train['Target'] = df_images_train['Target'].map(str)

In [None]:
# Image pre-processing: one generator object provides both subsets, split by
# `validation_split`.
gen = ImageDataGenerator(validation_split=val_rate)

# Options shared by the training and the validation iterator; the two calls
# below differ only in `subset`.
shared_flow_options = dict(
    directory="",
    x_col="Image_path",
    y_col="Target",
    weight_col=None,
    target_size=(256, 256),
    color_mode="rgb",
    classes=None,
    class_mode="categorical",
    batch_size=batch_size,
    shuffle=True,
    interpolation="nearest",
    validate_filenames=False,
)

train_gen = gen.flow_from_dataframe(df_images_train, subset="training", **shared_flow_options)

val_gen = gen.flow_from_dataframe(df_images_train, subset="validation", **shared_flow_options)

In [None]:
# Width of the softmax layer. It is taken from the training iterator so it
# always equals the width of the one-hot labels the iterator yields, instead
# of a hard-coded class count.
num_classes = len(train_gen.class_indices)

# ImageNet-initialised EfficientNetB2 backbone (no classification top),
# global average pooling, then one softmax unit per landmark class.
model = tf.keras.Sequential([
    efn.EfficientNetB2(
        input_shape=(256, 256, 3),
        weights='imagenet',
        include_top=False
    ),
    L.GlobalAveragePooling2D(),
    L.Dense(num_classes, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss = 'categorical_crossentropy',
    metrics=['categorical_accuracy']
)

In [None]:
# training parameters
epochs = 1 # maximum number of epochs
# Batches per epoch, derived from the split sizes (whole batches only).
train_steps = int(len(df_images_train)*(1-val_rate))//batch_size
val_steps = int(len(df_images_train)*val_rate)//batch_size

# The model is a tf.keras model, so the callback is taken from tf.keras too;
# callbacks from the standalone `keras` package are not guaranteed to work
# with tf.keras training loops.
# NOTE(review): the file name says "B3" while the model above is built with
# EfficientNetB2; the name is left unchanged in case something reads it.
model_checkpoint = tf.keras.callbacks.ModelCheckpoint("model_efnB3.h5", save_best_only=True, verbose=1)

In [None]:
# Train from the generators. `Model.fit` accepts generators directly;
# `fit_generator` is its deprecated predecessor.
history = model.fit(
    train_gen,
    steps_per_epoch=train_steps,
    epochs=epochs,
    validation_data=val_gen,
    validation_steps=val_steps,
    callbacks=[model_checkpoint],
)

# Save the model as it is after the last epoch.
model.save("model.h5")

In [None]:
# The model was built and saved with tf.keras, so it is loaded with tf.keras
# as well, rather than with the standalone `keras` loader.
# NOTE(review): assumes importing efficientnet.tfkeras has registered its
# custom layers/activations with tf.keras - confirm if loading fails.
stored_model = tf.keras.models.load_model("model.h5")

In [None]:
# Iterator over the test images, one image per batch, no labels.
# - x_col is "Image_path", the only column of df_images_test.
# - directory is "" (as for the training iterators) because Image_path already
#   holds the complete path; a non-empty directory would be prepended to it.
# - shuffle is off so prediction i belongs to row i of df_images_test.
test_gen = ImageDataGenerator().flow_from_dataframe(
    df_images_test,
    directory="",
    x_col="Image_path",
    y_col=None,
    weight_col=None,
    target_size=(256, 256),
    color_mode="rgb",
    classes=None,
    class_mode=None,
    batch_size=1,
    shuffle=False,
    subset=None,
    interpolation="nearest",
    validate_filenames=False)

In [None]:
# One pass over the test iterator: len(test_gen) is its number of batches.
test_steps = len(test_gen)

# `Model.predict` accepts generators directly; `predict_generator` is its
# deprecated predecessor.
# NOTE: row i of the result matches row i of df_images_test only if test_gen
# was created with shuffle=False.
y_pred_mod = stored_model.predict(test_gen, steps=test_steps, verbose=1)

# Most probable class per image, taken from the predictions computed above.
# NOTE(review): these are class indices of the softmax layer, presumably the
# ones in train_gen.class_indices; map them back to landmark ids before
# building a submission.
y_pred = np.argmax(y_pred_mod, axis=1)