This notebook is for Shopee's Product Detection Competition. All datasets belong to Shopee and can be found 

In [None]:
import numpy as np
import pandas as pd
import cv2
from sklearn.base import BaseEstimator, TransformerMixin
import tensorflow as tf
from tensorflow import keras
import glob
import matplotlib.pyplot as plt
%matplotlib inline
import gc

In [None]:
class ImagePreprocessor(BaseEstimator, TransformerMixin):
    """Scikit-learn style transformer that prepares images (and labels) for the network.

    Each image is resized to ``img_size`` x ``img_size``, converted from BGR to RGB,
    blurred with a 5x5 Gaussian kernel, then divided by 255 and shifted by -0.5.
    Labels, when supplied, are one-hot encoded over ``num_classes`` classes.

    Parameters
    ----------
    img_size : int
        Side length, in pixels, of the square output images.
    num_classes : int, default=42
        Width of the one-hot label matrix.
    """

    def __init__(self, img_size, num_classes=42):
        # BaseEstimator.get_params() (used by repr() and clone()) looks attributes up
        # under the exact constructor argument names, so no leading underscore here.
        self.img_size = img_size
        self.num_classes = num_classes

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer has no learned state."""
        return self

    def transform(self, X, y=None):
        """Preprocess every image in *X* and, if given, one-hot encode *y*.

        Returns the stacked image array when *y* is None, otherwise the tuple
        ``(images, one_hot_labels)``.
        """
        images = np.array([self.preprocess_image(image) for image in X])
        if y is None:
            return images
        return images, self.one_hot_encode(y)

    def preprocess_image(self, image):
        """Resize, recolour, blur and rescale a single image.

        Raises
        ------
        ValueError
            If *image* is None, which is what ``cv2.imread`` returns for a file
            it could not read.
        """
        if image is None:
            raise ValueError("image is None; the file could not be read by cv2.imread")
        resized = tf.image.resize(image, (self.img_size, self.img_size))
        # cv2 reads images in BGR order, so convert to RGB for the network.
        rgb = cv2.cvtColor(np.float32(resized), cv2.COLOR_BGR2RGB)
        denoised = cv2.GaussianBlur(rgb, (5, 5), 0)
        # No grayscale conversion: the pretrained backbone expects 3 channels.
        # NOTE: GridMask augmentation was planned here but is not implemented.
        return denoised / 255 - 0.5

    def one_hot_encode(self, y):
        """Return an ``(n_samples, num_classes)`` float array with a 1 at each label.

        *y* may be any array-like of integer class ids (e.g. a list, a 1-D array or
        a single-column DataFrame).
        """
        labels = np.array(y)
        encoded = np.zeros((labels.shape[0], self.num_classes))
        for row, label in enumerate(labels):
            encoded[row, label] = 1
        return encoded

In [None]:
# Side length, in pixels, of the square images produced by the preprocessor.
IMG_SIZE = 96
preprocessor = ImagePreprocessor(IMG_SIZE)

In [None]:
# Training metadata; later cells match image files against its "filename" column.
df = pd.read_csv("train.csv")

In [None]:
def sort_names(string):
    """Return the bare file name of an image path.

    The directory part is removed by splitting on the last path separator
    instead of slicing off a fixed 15-character prefix, so the result no longer
    depends on the exact folder layout. Both ``/`` and ``\\`` are accepted
    because ``glob`` on Windows returns paths that mix the two.
    """
    return string.replace("\\", "/").rsplit("/", 1)[-1]


class Pipeline():
    """Chain a preprocessor and a model so raw images can be fitted and predicted.

    Parameters
    ----------
    preprocessor
        Object whose ``transform(X, y)`` returns ``(X, y)`` and whose
        ``transform(X)`` returns ``X``.
    model
        Keras-style model exposing ``fit`` and ``predict``.
    """

    def __init__(self, preprocessor, model):
        self._preprocessor = preprocessor
        self._model = model

    def fit(self, X_train, X_test, y_train, y_test, batch_size = 200, epochs = 5):
        """Preprocess both splits and train the model, validating on the test split."""
        print('preprocessing...')
        X_train, y_train = self._preprocessor.transform(X_train, y_train)
        X_test, y_test = self._preprocessor.transform(X_test, y_test)

        print('training...')
        # Keras expects the held-out split as `validation_data=(x_val, y_val)`.
        self._model.fit(X_train, y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        validation_data=(X_test, y_test))

    def predict(self, X_pred):
        """Return the index of the highest-scoring class for every image in *X_pred*."""
        X_pred = self._preprocessor.transform(X_pred)
        # Use the model this pipeline was built with, not a module-level global.
        return np.argmax(self._model.predict(X_pred), axis = 1)

In [None]:
# # filenames = glob.glob("train/train/00/*.jpg")
# filenames = []
# for n in range(42):
#     train_file = ""
#     if n < 10:
#         train_file = "0" + str(n)
#     else:
#         train_file = str(n)
    
#     filepath = 'train/train/' + train_file + '/*.jpg'
#     filenames += glob.glob(filepath)
    
# #shuffle 10 times:
# for z in range(10):
#     np.random.shuffle(filenames)

In [None]:
# Gather up to 240 randomly chosen .jpg paths from each of the 42 class folders
# (train/train/00 ... train/train/41) into one flat list.
filenames = []
for class_id in range(42):
    folder = str(class_id).zfill(2)  # folder names are zero-padded to two digits
    pattern = 'train/train/' + folder + '/*.jpg'
    candidates = np.array(glob.glob(pattern))
    np.random.shuffle(candidates)
    filenames.extend(candidates.tolist()[:240])

# Mix the classes together: shuffle the combined list ten times.
for _ in range(10):
    np.random.shuffle(filenames)

In [None]:
IMG_SHAPE = (IMG_SIZE, IMG_SIZE, 3)
# Number of leading backbone layers that are frozen during fine-tuning.
UNTRAINABLE_BEFORE_LAYER = 50

# Alternative backbone: MobileNetV2
#base_model  = tf.keras.applications.MobileNetV2(input_shape=IMG_SHAPE, include_top = False)

# ResNet101V2 backbone without its classification head
base_model = tf.keras.applications.ResNet101V2(input_shape=IMG_SHAPE, include_top=False)

# Fine-tune everything except the first UNTRAINABLE_BEFORE_LAYER layers.
base_model.trainable = True
for layer in base_model.layers[:UNTRAINABLE_BEFORE_LAYER]:
    layer.trainable = False

# Classification head: one probability per class (42 classes, single label per image).
model = keras.Sequential([
    base_model,
    keras.layers.GlobalAveragePooling2D(),
    keras.layers.Dense(42, activation="softmax"),
])
# The output layer already applies softmax, so the loss must receive probabilities
# (from_logits=False); feeding activated outputs to a from_logits=True loss
# applies the normalisation twice.
# NOTE(review): the `decay` argument is kept as in the original; confirm it is still
# accepted by the RMSprop optimizer of the installed TensorFlow/Keras version.
model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=0.0001, decay = 1e-6),
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])


In [None]:
# Print the layer-by-layer summary of the backbone network.
base_model.summary()

In [None]:
pipeline = Pipeline(preprocessor, model)
# Number of image paths loaded into memory per training chunk.
sample_size = 1000


def _labels_in_order(names):
    """Return the label rows of ``df`` for *names*, sorted to follow their order.

    The "filename" column and the temporary "order" column are dropped, so only
    the remaining label column(s) of ``df`` are returned.
    """
    # Work on a copy: adding the helper column to a filtered slice of ``df``
    # triggers pandas' SettingWithCopyWarning.
    rows = df[df["filename"].isin(names)].copy()
    rows["order"] = pd.Categorical(rows["filename"], categories=names, ordered=True)
    return rows.sort_values("order").drop(columns=["filename", "order"])


def main():
    """Train the pipeline for 4 rounds over ``filenames`` in chunks of ``sample_size``.

    Each chunk is split 90/10 into a training part and a validation part, the
    images are read with ``cv2.imread`` and one epoch is fitted per chunk.
    """
    n_train = int(sample_size * 0.9)
    for r in range(4):
        print("round: " + str(r+1) + "/4")
        n = 0
        # Only chunks that end strictly before len(filenames) are used, so the
        # tail of the list is never trained on.
        while n + sample_size < len(filenames):
            print(str(n + sample_size) + ' samples')
            chunk = filenames[n : n + sample_size]
            names = list(map(sort_names, chunk))
            X_train = list(map(cv2.imread, chunk[:n_train]))
            X_test = list(map(cv2.imread, chunk[n_train:]))

            y_train = _labels_in_order(names[:n_train])
            y_test = _labels_in_order(names[n_train:])

            # Free the chunk bookkeeping before the memory-heavy training step.
            del chunk, names
            gc.collect()

            pipeline.fit(X_train, X_test, y_train, y_test, batch_size=42, epochs = 1)

            del X_train, X_test, y_train, y_test
            gc.collect()
            n += sample_size


if __name__ == "__main__":
    main()

In [None]:
# Evaluation set: every path after the first 10000 entries of the shuffled list.
test_files = filenames[10000:]
test_images = list(map(cv2.imread,test_files))
test_files = list(map(sort_names, test_files))

# Look up the labels and sort them into the same order as test_files.
# .copy() keeps the helper column off a filtered slice of df, which would
# otherwise trigger pandas' SettingWithCopyWarning.
test_labels = df[df["filename"].isin(test_files)].copy()
test_labels["order"] = pd.Categorical(test_labels["filename"], categories=test_files, ordered = True)
test_labels = test_labels.sort_values("order").drop(columns = ["filename", "order"])
test_labels = np.array(test_labels)

In [None]:
# Preprocess the evaluation images and take the highest-scoring class per image.
test_images = preprocessor.transform(test_images)
predictions = np.argmax(model.predict(test_images), axis = 1)
# Flatten the labels to 1-D so they compare element-wise with `predictions`.
test_labels = test_labels.reshape(test_labels.shape[0],)

In [None]:
# Accuracy: fraction of evaluation images whose predicted class equals the label.
(predictions == test_labels).sum()/test_labels.shape[0]

In [None]:
# test_files

In [None]:
# test_labels = df[df["filename"].isin(test_files)]
# test_labels

In [None]:
# test_labels["order"] = pd.Categorical(
#     test_labels["filename"],
#     categories = test_files,
#     ordered = True
# )
# test_labels.sort_values("order", inplace = True)
# test_labels.drop(columns=["filename", "order"])

In [None]:
# Inputs for the final predictions: the test image files and test.csv.
pred_path = "test/test/*.jpg"
pred_df = pd.read_csv("test.csv")
pred_image_files = glob.glob(pred_path)

In [None]:
# Display the contents of test.csv.
pred_df

In [None]:
# Reduce each globbed path to its bare file name. Splitting on the last path
# separator (either "/" or "\") replaces the fixed 10-character slice, which was
# only correct for the exact prefix "test/test/".
pred_images = [path.replace("\\", "/").rsplit("/", 1)[-1] for path in pred_image_files]

In [None]:
# Reorder the rows of pred_df to follow the order of pred_images, discard the
# "category" column together with the temporary sort key, and renumber the index.
file_order = pd.Categorical(pred_df["filename"], categories=pred_images, ordered=True)
pred_df = (
    pred_df.assign(order=file_order)
    .sort_values("order")
    .drop(columns=["category", "order"])
    .reset_index(drop=True)
)

In [None]:
# Preview the reordered frame.
pred_df.head()

In [None]:
# Keep only the image files that are listed in pred_df.
# A new list is built instead of calling remove() while iterating, which skips
# the element following every removal; the set makes each membership test O(1).
listed_files = set(pred_df["filename"])
pred_images = [file for file in pred_images if file in listed_files]

In [None]:
import os

# Predict the class of every test image, one image at a time.
# Results are collected in a list and converted once at the end: concatenating
# onto an empty float64 array upcast the integer class ids to floats and copied
# the whole array on every iteration.
predicted = []
for count, file in enumerate(pred_images):
    print(count)
    # os.path.join builds the path with the separator of the current platform.
    image = [cv2.imread(os.path.join("test", "test", file))]
    predicted.extend(pipeline.predict(image))
category = np.array(predicted, dtype=int)

In [None]:
# Attach the predictions; pd.Series(category) is aligned to pred_df by index label.
pred_df["category"] = pd.Series(category)
pred_df.head()

In [None]:
# Write the predictions to the submission file without the index column.
pred_df.to_csv("submission1.csv", index=False)