In [None]:
from google_drive_downloader import GoogleDriveDownloader as gdd
import random, os, json, re, math, statistics
import numpy as np
from numpy import load
from numpy import save
import scipy as sp
from scipy import io
import cv2 as cv 
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from keras.models import load_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization, Conv2D, Dropout, Flatten, MaxPooling2D, Activation
from sklearn.metrics import classification_report, plot_confusion_matrix
from keras.utils.vis_utils import plot_model

In [None]:
# Make Google Drive reachable from the Colab runtime; later cells read and
# write project files through this mount.
from google.colab import drive
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Download the zipped dataset (unzip=True asks the downloader to extract it),
# then remove the archive once the call has returned.
archive_path = './RetailDataset.zip'
gdd.download_file_from_google_drive(
    file_id='15M2oVwpl1UwUgreZDXj1QlLY8pv0cdSl',
    dest_path=archive_path,
    unzip=True,
)
os.remove(archive_path)

In [None]:
# Names of all files in the training image folder. os.listdir returns them in
# arbitrary order, so they are sorted to get a stable ordering.
folder = './train2019/'
lst = sorted(os.listdir(folder))

# Encoding each class: category name -> integer class label
class_name = {'stationery':0, 'puffed_food':1, 'dried_fruit':2, 'dried_food':3, 'instant_drink':4, 'instant_noodles':5,
              'dessert':6, 'drink':7, 'alcohol':8, 'milk':9, 'canned_food':10, 'chocolate':11,
              'gum':12, 'candy':13, 'seasoner':14, 'personal_hygiene':15, 'tissue':16}

# Import image information
with open('./drive/My Drive/MSc Project/instances_train2019.json') as json_data:
  instances = json.load(json_data)

# Create dictionary that links product SKU to [class name, class label]
class_code = {}
for product in instances['__raw_Chinese_name_df']:
  sku_class = product['sku_class']
  class_code[product['code']] = [sku_class, class_name[sku_class]]

# Save class_code in .json file.
# Serialise first so the target file is only opened once the JSON text exists,
# and use a context manager so the handle is closed even if the write fails.
json_data = json.dumps(class_code)
with open('./drive/My Drive/MSc Project/class_code.json','w') as f:
  f.write(json_data)

# Load class_code back from disk. JSON object keys are always strings, so after
# this round trip the dictionary is keyed by strings, which is what
# extract_code() returns.
with open('./drive/My Drive/MSc Project/class_code.json') as json_data:
  class_code = json.load(json_data)

def extract_code(name):
  '''
  Pull the SKU code out of an image file name.

  Arg:
    name: string, image name; the part before the first '-', '_' or '~'
          must be an integer literal
  Return:
    string, that leading integer rendered without leading zeros
  '''
  leading_part = re.split('-|_|~', name, maxsplit=1)[0]
  sku_number = int(leading_part)
  return str(sku_number)

# Integer class label of every image, in the same order as lst
class_lst = [class_code[extract_code(name)][1] for name in lst]

# Separate image names into .txt files according to class.
# The names are grouped in memory first and each class file is then written
# exactly once in 'w' mode: re-running this cell rebuilds the files instead of
# appending duplicate entries, and every file handle is closed right away.
os.makedirs('./classes', exist_ok=True)
names_by_class = {}
for name, label in zip(lst, class_lst):
  names_by_class.setdefault(label, []).append(name+'\n')
for label, names in names_by_class.items():
  file_name = './classes/class_'+str(label)+'.txt'
  with open(file_name,'w') as class_file:
    class_file.writelines(names)

In [None]:
### Exploratory Data Analysis
# Count how many image names are listed in each of the 17 per-class files.
class_count = []
for num in range(0,17):
  file_name = './classes/class_'+str(num)+'.txt'
  # Context manager so the handle is closed before the next file is opened
  with open(file_name) as class_file:
    lines = class_file.readlines()
  class_count.append(len(lines))

### Visualisation of Data Distribution
# Horizontal bar chart of class_count, one bar per class, saved as a PNG.

# y-axis tick labels, in the form "<class name> <class label>"
category_names = ['stationery', 'puffed_food', 'dried_fruit', 'dried_food',
                  'instant_drink', 'instant_noodles', 'dessert', 'drink',
                  'alcohol', 'milk', 'canned_food', 'chocolate', 'gum',
                  'candy', 'seasoner', 'personal_hygiene', 'tissue']
classes = [label+' '+str(idx) for idx, label in enumerate(category_names)]

fig, ax = plt.subplots(figsize=(15,12))
for side in ('top', 'right'):
  ax.spines[side].set_visible(False)
ax.set_xlim([0,7000])

y_pos = list(range(len(classes)))
ax.barh(y_pos, class_count, alpha=0.7)
# Current tick locations/labels; not used again within this cell
ylocs, ylabs = plt.yticks()

ax.set_ylabel('Class', fontsize=20)
ax.set_xlabel('Number of Examples', fontsize=20)
ax.set_title('Data Distribution', fontsize=24)
plt.yticks(y_pos, classes, ha='right', fontsize=20)
plt.xticks(fontsize=18)

# Write each count just past the end of its bar
for row, count in enumerate(class_count):
  ax.text(count + 3, row - 0.15, str(count), color='#1f77b4', alpha=0.9,
          fontweight='bold', fontsize=20)

fig.savefig('Data Distribution.png', bbox_inches='tight')
plt.close(fig)

In [None]:
### Data Splitting
# Split every class into train:validation:test with a proportion of 8:1:1.
y_test = []
y_val = []
y_train = []
train_class_count = []
# Test/validation image names are collected in memory and written once after
# the loop. The files are opened in 'w' mode so that re-running this cell
# cannot leave them with more entries than y_test / y_val.
x_test_name = []
x_val_name = []
for num in range(0,17):
  # Shuffle image names within each class
  file_name = './classes/class_'+str(num)+'.txt'
  with open(file_name) as class_file:
    lines = class_file.readlines()
  random.shuffle(lines)

  # A tenth of the class (rounded down) goes to test and another tenth to
  # validation; whatever is left is training data
  test_val_num = len(lines)//10
  train_num = len(lines) - test_val_num*2
  train_class_count.append(train_num)
  y_test += [num]*test_val_num
  y_val += [num]*test_val_num
  y_train += [num]*train_num
  x_test_name += lines[:test_val_num]
  x_val_name += lines[test_val_num:2*test_val_num]
  # Overwrite class file, so that it only contains training data.
  # NOTE: this is destructive - running this cell again splits the already
  # reduced class files a second time.
  with open(file_name,'w') as class_file:
    class_file.writelines(lines[2*test_val_num:])

with open('x_test_name.txt','w') as names_file:
  names_file.writelines(x_test_name)
with open('x_val_name.txt','w') as names_file:
  names_file.writelines(x_val_name)

# Save y_val and y_test (the training names and labels are written by the
# resampling and shuffling cells)
save('y_test.npy', np.array(y_test))
save('y_val.npy', np.array(y_val))

In [None]:
### Distribution of Raw Training Data
# Horizontal bar chart of train_class_count, one bar per class, saved as a PNG.

# y-axis tick labels, in the form "<class name> <class label>"
category_names = ['stationery', 'puffed_food', 'dried_fruit', 'dried_food',
                  'instant_drink', 'instant_noodles', 'dessert', 'drink',
                  'alcohol', 'milk', 'canned_food', 'chocolate', 'gum',
                  'candy', 'seasoner', 'personal_hygiene', 'tissue']
classes = [label+' '+str(idx) for idx, label in enumerate(category_names)]

fig, ax = plt.subplots(figsize=(15,12))
for side in ('top', 'right'):
  ax.spines[side].set_visible(False)
ax.set_xlim([0,5800])

y_pos = list(range(len(classes)))
ax.barh(y_pos, train_class_count, alpha=0.7)
# Current tick locations/labels; not used again within this cell
ylocs, ylabs = plt.yticks()

ax.set_ylabel('Class', fontsize=20)
ax.set_xlabel('Number of Examples', fontsize=20)
ax.set_title('Distribution of Raw Training Data', fontsize=24)
plt.yticks(y_pos, classes, ha='right', fontsize=20)
plt.xticks(fontsize=18)

# Write each count just past the end of its bar
for row, count in enumerate(train_class_count):
  ax.text(count + 3, row - 0.15, str(count), color='#1f77b4', alpha=0.9,
          fontweight='bold', fontsize=20)

fig.savefig('Distribution of Raw Training Data.png', bbox_inches='tight')
plt.close(fig)

In [None]:
### Resampling Training Data
print('median:',statistics.median(train_class_count))
print('mean:',np.mean(train_class_count))

# set number of examples in each class to be 2400:
# classes with more examples are downsampled, classes with fewer are upsampled
y_train = []
train_class_count_new = []
thres = 2400
# Resampled names of all classes; written once after the loop in 'w' mode so
# that re-running this cell keeps x_train_name.txt aligned with y_train
# instead of appending a second copy.
resampled_lines = []
for num in range(0,17):
  # Shuffle image names within each class
  file_name = './classes/class_'+str(num)+'.txt'
  with open(file_name) as class_file:
    lines = class_file.readlines()
  random.shuffle(lines)

  if len(lines)<thres:
    # upsampling
    if not lines:
      raise ValueError('cannot upsample an empty class file: '+file_name)
    # random.sample cannot draw more items than the population holds, so a
    # class smaller than thres/2 is first repeated in full as often as needed
    # and only the remainder is drawn at random (without replacement).
    full_copies, remainder = divmod(thres - len(lines), len(lines))
    repeat_lines = lines*full_copies + random.sample(lines, remainder) # random repeating
    class_lines = lines + repeat_lines
  else:
    # downsampling
    class_lines = random.sample(lines, thres) # random sampling

  length = len(class_lines)
  train_class_count_new.append(length)
  resampled_lines += class_lines
  y_train += [num]*length

with open('x_train_name.txt','w') as names_file:
  names_file.writelines(resampled_lines)

In [None]:
# Shuffle the whole training data, keeping every image name paired with its label
with open('x_train_name.txt') as names_file:
  lines = names_file.readlines()
# zip() silently drops the surplus of the longer input, which would pair names
# with the wrong labels; fail loudly instead.
if len(lines) != len(y_train):
  raise ValueError('x_train_name.txt has %d names but y_train has %d labels'
                   % (len(lines), len(y_train)))
list_combined = list(zip(lines,y_train))
random.shuffle(list_combined)
lines, y_train = zip(*list_combined)
# overwrite x_train_name.txt with the shuffled names
with open('x_train_name.txt','w') as names_file:
  names_file.writelines(lines)
save('y_train.npy', np.array(y_train))

In [None]:
### Distribution of Resampled Training Data
# Horizontal bar chart of train_class_count_new, one bar per class, saved as a PNG.

# y-axis tick labels, in the form "<class name> <class label>"
category_names = ['stationery', 'puffed_food', 'dried_fruit', 'dried_food',
                  'instant_drink', 'instant_noodles', 'dessert', 'drink',
                  'alcohol', 'milk', 'canned_food', 'chocolate', 'gum',
                  'candy', 'seasoner', 'personal_hygiene', 'tissue']
classes = [label+' '+str(idx) for idx, label in enumerate(category_names)]

fig, ax = plt.subplots(figsize=(15,12))
for side in ('top', 'right'):
  ax.spines[side].set_visible(False)
ax.set_xlim([0,5800])

y_pos = list(range(len(classes)))
ax.barh(y_pos, train_class_count_new, alpha=0.7)
# Current tick locations/labels; not used again within this cell
ylocs, ylabs = plt.yticks()

ax.set_ylabel('Class', fontsize=20)
ax.set_xlabel('Number of Examples', fontsize=20)
ax.set_title('Distribution of Resampled Training Data', fontsize=24)
plt.yticks(y_pos, classes, ha='right', fontsize=20)
plt.xticks(fontsize=18)

# Write each count just past the end of its bar
for row, count in enumerate(train_class_count_new):
  ax.text(count + 3, row - 0.15, str(count), color='#1f77b4', alpha=0.9,
          fontweight='bold', fontsize=20)

fig.savefig('Distribution of Resampled Training Data.png', bbox_inches='tight')
plt.close(fig)

In [None]:
def generate_data(path,num,img_name):
  '''
  Load, crop, rescale and resize the first num images listed in img_name.

  Args:
    path: string, prefix prepended to every image name
    num: int, number of images to load
    img_name: sequence of strings, image file names; surrounding whitespace
              (e.g. a trailing newline) is stripped before use
  Return:
    numpy array of shape (num,256,256,3) holding pixel values divided by 255
  Raises:
    FileNotFoundError: if an image cannot be read
  '''
  data = np.zeros((num,256,256,3))
  for i in range(num):
    img_path = path+img_name[i].strip()
    img = cv.imread(img_path)
    if img is None:
      # cv.imread reports an unreadable file by returning None rather than
      # raising, which would otherwise fail on the slice below
      raise FileNotFoundError('cannot read image: '+img_path)
    # crop image to (1600,1600); assumes the image is at least 1772x2096 - TODO confirm
    crop_img = img[172:1772,496:2096]/255
    data[i] = cv.resize(crop_img, (256,256)) # resize image to (256,256)
  return data

### load the validation / test image names written by the data splitting cell
# (x_val_name and x_test_name are not assigned anywhere else before this point)
with open('x_val_name.txt') as names_file:
  x_val_name = names_file.readlines()
with open('x_test_name.txt') as names_file:
  x_test_name = names_file.readlines()
# Every image must have exactly one label, otherwise images and labels would
# be paired incorrectly
if len(x_val_name) != len(y_val) or len(x_test_name) != len(y_test):
  raise ValueError('image name files and label lists differ in length: '
                   'val %d/%d, test %d/%d'
                   % (len(x_val_name), len(y_val), len(x_test_name), len(y_test)))

### generate validation data
x_val = generate_data('./train2019/',len(y_val),x_val_name)
save('./drive/My Drive/MSc Project/x_val.npy', x_val)

### generate test data
x_test = generate_data('./train2019/',len(y_test),x_test_name)
save('./drive/My Drive/MSc Project/x_test.npy', x_test)