In [3]:
from PIL import Image, ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
from matplotlib import pyplot as plt
import numpy as np
import sklearn
from skimage import io, feature
from skimage.color import rgb2gray
import pandas as pd
import cv2
import os
import pickle
import tensorflow as tf
from tensorflow import keras

## Goals
Developed for Shopee Code League 2020<br>
The purpose of this notebook is to build a image classification model for multi-label image dataset.<br>
The dataset provided is not given/shown for confidentiality reasons

<br>
<b>To change the dimensions, change the variable PIXEL</b>

## Loading data

In [4]:
train = pd.read_csv("datasets/product detection dataset/train.csv")
test = pd.read_csv("datasets/product detection dataset/test.csv")
train = train.sort_values(by=['category'])
train = train.reset_index(drop=True)
CAT_COUNT = train.groupby(['category']).count()
categories = list(range(0, 42))
PIXEL = 20
print(train.shape)
CAT_COUNT.head(50)

(105392, 2)


Unnamed: 0_level_0,filename
category,Unnamed: 1_level_1
0,2683
1,2702
2,2687
3,2703
4,2703
5,2641
6,2641
7,2660
8,2700
9,2698


### Converting images into standard pixel format and into an array (TRAIN DATASET)

In [None]:
def convertImageToArray(array_counter, i, category):
    # the string here needs to change according to the relative path of where you save your dataset
    string = "datasets/product detection dataset/train"
    if len(str(category))==1:
        string += "/"+"0"+str(category)+"/"+train.iloc[i,0]
    else:
        string += "/"+str(category)+"/"+train.iloc[i,0]
    # Change the values in the resize method to reflect your given dimensions/pixels
    temp = np.array(Image.open(string).convert('L').resize((PIXEL,PIXEL)))
    train_img[array_counter] = temp
    
def convertArrayToTxtFile(cat_array, category):
    #Change the file below to reflect our given dimension/pixel
    if len(str(category))==1:
        filename = "datasets/product detection dataset/train_array/train_img"+"0"+str(category)+".txt"
    else:
        filename = "datasets/product detection dataset/train_array/train_img"+str(category)+".txt"
    with open(filename, 'w') as outfile:
        outfile.write('# Array shape: {0}\n'.format(cat_array.shape))
        for data_slice in cat_array:
            np.savetxt(outfile, data_slice, fmt='%-7.2f')
            outfile.write('# New slice\n')
    #print("txt file writing complete")

CAT_COUNT = train.groupby(['category']).count()
cat = 0
array_counter = 0
train_img = np.empty([CAT_COUNT.iloc[0,0], PIXEL, PIXEL])
for i in range(train.shape[0]):
    if cat==train.iloc[i,1]:
        convertImageToArray(array_counter, i, cat)
        array_counter+=1
    else:
        #print("Category completed: ", cat)
        convertArrayToTxtFile(train_img, cat)
        array_counter = 0
        cat = train.iloc[i,1]
        train_img = np.empty([CAT_COUNT.iloc[cat, 0], PIXEL, PIXEL])
        convertImageToArray(array_counter, i, cat)
        
with open("datasets/product detection dataset/train_array/train_img41.txt", 'w') as outfile:
    outfile.write('# Array shape: {0}\n'.format(train_img.shape))
    for data_slice in train_img:
        np.savetxt(outfile, data_slice, fmt='%-7.2f')
        outfile.write('# New slice\n')
#print("Final txt file writing complete")
print("Data writing complete")

### Converting images into standard pixel format and into an array (TEST DATASET)

In [None]:
def convertImageToArray(i):
    # the string here needs to change according to the relative path of where you save your dataset
    string = "datasets/product detection dataset/test/"+test.iloc[i,0]
    # Change the values in the resize method to reflect your given dimensions/pixels
    temp = np.array(Image.open(string).convert('L').resize((PIXEL,PIXEL)))
    test_img[i] = temp
    
def convertArrayToTxtFile(cat_array):
    #Change the file below to reflect our given dimension/pixel
    filename = "datasets/product detection dataset/test_img.txt"
    with open(filename, 'w') as outfile:
        outfile.write('# Array shape: {0}\n'.format(cat_array.shape))
        for data_slice in cat_array:
            np.savetxt(outfile, data_slice, fmt='%-7.2f')
            outfile.write('# New slice\n')
    print("txt file writing complete")

array_counter = 0
test_img = np.empty([test.shape[0], PIXEL, PIXEL])
for i in range(test.shape[0]):
    convertImageToArray(i)
print("Test dataset completed")
convertArrayToTxtFile(test_img)
print("Text file generated")

## Read txt files and populate the values into an array (TRAIN)
This is for development purposes, where the data preparation and model training is broken up into several phases/session

In [None]:
y = [0] * CAT_COUNT.iloc[0,0]
for i in range(1,42):
    temp = [i] * CAT_COUNT.iloc[i,0]
    y = np.append(y, temp, axis=0)
    
print("Array retrieval complete. y values array size: ", y.shape)      

In [None]:
def retrieveAllArrayFromFile(array, i):
    temp = "datasets/product detection dataset/train_array/train_img"
    if(i<10):
        temp += "0"+ str(i) + ".txt"
    else:
        temp += str(i) + ".txt"
    temp_data = np.loadtxt(temp)
    temp_data = temp_data.reshape((CAT_COUNT.iloc[i,0], PIXEL, PIXEL))
    # To set the range of values to be between 0 to 1
    temp_data = temp_data / 255.0
    if i==0:
        return temp_data
    else:
        return np.append(array, temp_data, axis=0)

X1 = 0
for i in range(CAT_COUNT.shape[0]):
    X1 = retrieveAllArrayFromFile(X1, i)
    print("X1 category ",i, ": ", X1.shape)
        
print("Array retrieval complete. X1 values array size: ", X1.shape)

# Building the model
The model used is a simple CNN model

## Layer setup

In [None]:
model1 = keras.Sequential([
    #Represents the input layer shape
    keras.layers.Flatten(input_shape=(PIXEL, PIXEL)),
    #Represents the number of hidden units (hidden layers)
    keras.layers.Dense(256, activation='relu'),
    # Represents the number of output units (should be consistent with your predicted labels)
    keras.layers.Dense(42)
])

## Compile the model

In [None]:
model1.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# Train the model

## Fitting the model

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size=0.2, random_state=17)
model1.fit(X_train, y_train, epochs=30)

### Attempt to remove outliers
The model built does not meet the necessary requirements, as such various methods are used to try to improve the accuracy of the model
1. Use model to prediction on training dataset
2. remove images that where predictions do not align with pre trained model
3. use refined dataset to rebuild the model

#### Method 1
This method removes all values that when tested on the same trained data does not give the same label value

In [None]:
def retrieveArrayFromFile1(i, dataset):
    temp = "datasets/product detection dataset/train_array/train_img"
    if(i<10):
        temp += "0"+ str(i) + ".txt"
    else:
        temp += str(i) + ".txt"
    temp_data = np.loadtxt(temp)
    temp_data = temp_data.reshape((CAT_COUNT.iloc[i,0], PIXEL, PIXEL))
    # To set the range of values to be between 0 to 1
    temp_data = temp_data / 255.0
    return temp_data

def predictionForTrainData1(cat, categoryArray):
    probability_model = tf.keras.Sequential([model, tf.keras.layers.Softmax()])
    predictions = probability_model.predict(categoryArray)
    prediction_label = np.empty(CAT_COUNT.iloc[cat,0])
    if(cat<10):
        stringCat = "0"+str(cat)

    for i in range(predictions.shape[0]):
        prediction_label[i] = np.argmax(predictions[i])

    catData = train.where(train['category']==cat)
    catData = catData.dropna()
    catData['prediction'] = pd.Series(prediction_label, index=catData.index)
    tempdf = catData.where(catData['prediction']==catData['category'])
    return tempdf.dropna()
    
refined_train1 = pd.DataFrame(columns = ['filename', 'category', 'prediction'])
for i in range(42):
    categoryArray = retrieveArrayFromFile1(i, 1)
    refined_train1 = refined_train1.append(predictionForTrainData1(i, categoryArray))
    
REFINED_CAT_COUNT1 = refined_train1.groupby(['category']).count()
print(refined_train1.shape)
REFINED_CAT_COUNT1.head(50)

#### Method 2
Initial dataset was unbalanced, this is done to balance the number of images across all label types

In [20]:
def dataAugmentation(img_array, i):
    #Flip codes
    extra_img = cv2.flip(img_array, -1)
    new_array = cv2.resize(extra_img, (PIXEL, PIXEL))
    training_data.append([new_array, i])
    extra_img = cv2.flip(img_array, 0)
    new_array = cv2.resize(extra_img, (PIXEL, PIXEL))
    training_data.append([new_array, i])
    extra_img = cv2.flip(img_array, 1)
    new_array = cv2.resize(extra_img, (PIXEL, PIXEL))
    training_data.append([new_array, i])

training_data = []
print('category completed: ')
for i in range(42):
    if i<10:
        cat = "0"+str(i)
    else:
        cat = str(i)
    path = os.path.join("datasets/product detection dataset/train/", cat)
    counter = 0
    for img in os.listdir(path):        
        img_array = cv2.imread(os.path.join(path, img), cv2.IMREAD_GRAYSCALE)
        new_array = cv2.resize(img_array, (PIXEL, PIXEL))
        training_data.append([new_array, i])
        #dataAugmentation(img_array, i)
        
        if i == 33:
            for j in range(3):
                blur_factor = j*2+1
                extra_img = cv2.GaussianBlur(img_array, (blur_factor,blur_factor), cv2.BORDER_DEFAULT)
                new_array = cv2.resize(extra_img, (PIXEL, PIXEL))
                training_data.append([new_array, i])
                #dataAugmentation(extra_img, i)
        if i == 17 and counter%3<2:
            extra_img = cv2.GaussianBlur(img_array, (3,3), cv2.BORDER_DEFAULT)
            new_array = cv2.resize(extra_img, (PIXEL, PIXEL))
            training_data.append([new_array, i])
            #dataAugmentation(extra_img, i)
        if (i == 11 or i == 37) and counter%2<1:
            extra_img = cv2.GaussianBlur(img_array, (3,3), cv2.BORDER_DEFAULT)
            new_array = cv2.resize(extra_img, (PIXEL, PIXEL))
            training_data.append([new_array, i])
            #dataAugmentation(extra_img, i)
        if (i == 18 or i == 29) and counter%4<1:
            extra_img = cv2.GaussianBlur(img_array, (3,3), cv2.BORDER_DEFAULT)
            new_array = cv2.resize(extra_img, (PIXEL, PIXEL))
            training_data.append([new_array, i])
            #dataAugmentation(extra_img, i)
    print(i, end=', ')

category completed: 
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 

In [21]:
import random

random.shuffle(training_data)
X2 = []
y2 = []
for features, label in training_data:
    X2.append(features)
    y2.append(label)
X2 = np.array(X2).reshape(-1, PIXEL, PIXEL, 1)
y2 = np.array(y2)

In [23]:
test_data = []
for i in range(test.shape[0]):        
    img_array = cv2.imread(os.path.join("datasets/product detection dataset/test", test.iloc[i,0]), cv2.IMREAD_GRAYSCALE)
    new_array = cv2.resize(img_array, (PIXEL, PIXEL))
    test_data.append([new_array])
test_data = np.array(test_data).reshape(-1, PIXEL, PIXEL, 1)

### An alternative dataset preparation using pickle

In [24]:
pickle_out = open("datasets/product detection dataset/X.pickle","wb")
pickle.dump(X2, pickle_out)
pickle_out.close()
pickle_out = open("datasets/product detection dataset/y.pickle","wb")
pickle.dump(y2, pickle_out)
pickle_out.close()

pickle_out = open("datasets/product detection dataset/test.pickle","wb")
pickle.dump(test_data, pickle_out)
pickle_out.close()

## Improved model
This was the final model used, including more layers in the CNN model with dropout functionality

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras.applications.inception_v3 import InceptionV3

X2 = pickle.load(open("datasets/product detection dataset/X.pickle", "rb"))
y2 = pickle.load(open("datasets/product detection dataset/y.pickle", "rb"))

X2 = X2/255.0
y2 = np.array(y2)


model2 = Sequential()
model2.add(Conv2D(128, (3,3), input_shape=(PIXEL, PIXEL, 1)))
model2.add(Activation("relu"))
model2.add(MaxPooling2D(pool_size=(2,2)))
model2.add(Conv2D(256, (5,5)))
model2.add(Activation("relu"))
model2.add(MaxPooling2D(pool_size=(2,2)))
model2.add(Flatten())
model2.add(Dense(64))
model2.add(Activation('sigmoid'))
model2.add(Dropout(0.5))
model2.add(Dense(42))
model2.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              optimizer='adam',
              metrics=['accuracy'])

model2.fit(X2, y2, batch_size=32, epochs=30, validation_split=0.1,)

Train on 104832 samples, validate on 11648 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30

## Train model with new dataset
If the code below fails, check that the REFINED_CAT_COUNT does not have any missing data. It renders this PIXEL value invalid, therefore pls adjust model or PIXEl variables

In [None]:
from sklearn.model_selection import train_test_split
y2 = [0] * REFINED_CAT_COUNT2.iloc[0,0]

for i in range(1,42):
    temp = [i] * REFINED_CAT_COUNT2.iloc[i,0]
    y2 = np.append(y2, temp, axis=0)
    
X2 = np.empty([refined_train2.shape[0], PIXEL, PIXEL])
for i in range(refined_train2.shape[0]):
    X2[i] = X1[refined_train2.index[i]]

X_train, X_test, y_train, y_test = train_test_split(X2, y2, test_size=0.2, random_state=17)
model.fit(X_train, y_train, epochs=30)

## Evaluate accuracy

In [None]:
test_loss, test_acc = model.evaluate(X_test,  y_test, verbose=2)

print('\nTest accuracy:', test_acc)

## Use model to predict with test dataset

In [None]:
test_data = np.loadtxt("datasets/product detection dataset/test_img.txt")
#For model1
#test_data = test_data.reshape((test.shape[0], PIXEL, PIXEL))
#For model2
test_data = pickle.load(open("datasets/product detection dataset/test.pickle", "rb"))
# To set the range of values to be between 0 to 1
test_data = test_data / 255.0

probability_model = tf.keras.Sequential([model2, tf.keras.layers.Softmax()])
predictions = probability_model.predict(test_data)

# Create results and put in CSV file

In [None]:
print(predictions.shape)
print(test.shape)
test_label = np.empty([predictions.shape[0]])
for i in range(predictions.shape[0]):
    test_label[i] = np.argmax(predictions[i])
test['category'] = pd.Series(test_label, index=test.index)
test['category']= test['category'].astype(str)
test['category'] = test['category'].apply(lambda x: x.zfill(4))
test['category'] = test['category'].apply(lambda x: x[:-2])
#test.groupby(['category']).count().head(50)
test.head(50)

## Export to CSV file

In [None]:
test.to_csv('datasets/product detection dataset/results_'+str(PIXEL)+"px.csv", index=False)