## Load the data

In [1]:
# Mount Google Drive under /content/gdrive so later cells can read from and
# write to MyDrive.
# NOTE(review): google.colab is only importable inside Colab; on any other
# runtime this import raises ModuleNotFoundError.
from google.colab import drive

drive.mount('/content/gdrive')

ModuleNotFoundError: No module named 'google.colab'

In [None]:
# Extract the kitchenware.tar.gz archive stored on the mounted Drive into the
# current working directory (-x extract, -z gunzip, -v list files, -f archive)
!tar -xzvf "/content/gdrive/MyDrive/kitchenware.tar.gz"

## EDA

In [None]:
import os

# List the top-level entries of the extracted dataset directory
data_dir_list  = os.listdir('/content/kitchenware-dataset')
# Bare expression so the notebook displays the listing
data_dir_list

In [None]:
# New base directory that will hold the train/validation split.
# NOTE: 'kitchware' (sic) is the spelling every later hard-coded path uses,
# so it must not be corrected in this cell alone.
base_dir = '/content/kitchware-data'
# Directory the raw dataset was extracted to
project_dir = '/content/kitchenware-dataset'
os.makedirs(base_dir, exist_ok=True)

In [None]:
# Create the train directory with one subdirectory per class (6 classes).
# os.makedirs(..., exist_ok=True) keeps this cell re-runnable; plain os.mkdir
# raises FileExistsError when the directory is already there.
train_dir = os.path.join(base_dir, 'train')
os.makedirs(train_dir, exist_ok=True)

train_cup_dir = os.path.join(train_dir, 'cup')
train_fork_dir = os.path.join(train_dir, 'fork')
train_glass_dir = os.path.join(train_dir, 'glass')
train_knife_dir = os.path.join(train_dir, 'knife')
train_plate_dir = os.path.join(train_dir, 'plate')
train_spoon_dir = os.path.join(train_dir, 'spoon')

for class_dir in (train_cup_dir, train_fork_dir, train_glass_dir,
                  train_knife_dir, train_plate_dir, train_spoon_dir):
  os.makedirs(class_dir, exist_ok=True)

In [None]:
# Create the validation directory with one subdirectory per class (6 classes).
# os.makedirs(..., exist_ok=True) keeps this cell re-runnable; plain os.mkdir
# raises FileExistsError when the directory is already there.
val_dir = os.path.join(base_dir, 'validation')
os.makedirs(val_dir, exist_ok=True)

val_cup_dir = os.path.join(val_dir, 'cup')
val_fork_dir = os.path.join(val_dir, 'fork')
val_glass_dir = os.path.join(val_dir, 'glass')
val_knife_dir = os.path.join(val_dir, 'knife')
val_plate_dir = os.path.join(val_dir, 'plate')
val_spoon_dir = os.path.join(val_dir, 'spoon')

for class_dir in (val_cup_dir, val_fork_dir, val_glass_dir,
                  val_knife_dir, val_plate_dir, val_spoon_dir):
  os.makedirs(class_dir, exist_ok=True)

In [None]:
import random
from shutil import copyfile

# Function to split image data into train and validation
def split_data(source, training, validation, split_size, seed=None):
  """Copy the non-empty files in `source` into a training and a validation directory.

  Every usable file ends up in exactly one of the two output directories.

  Args:
    source: Directory containing the files to split.
    training: Existing directory that receives the training share.
    validation: Existing directory that receives the remaining files.
    split_size: Fraction of the usable files that goes to `training`; the
      resulting count is truncated with int().
    seed: Optional seed for a reproducible shuffle. When None, the
      module-level `random` generator is used.
  """
  files = []
  # Sorted so the candidate list does not depend on os.listdir's arbitrary order
  for filename in sorted(os.listdir(source)):
    if os.path.getsize(os.path.join(source, filename)) > 0:
      files.append(filename)
    else:
      print(f'{filename} is zero length, so ignoring.')

  # Shuffle and split exactly once, after the whole list has been collected.
  # This must stay outside the loop above: re-drawing the split per file would
  # copy files repeatedly and put the same file in both output directories.
  rng = random if seed is None else random.Random(seed)
  shuffled_set = rng.sample(files, len(files))
  training_length = int(len(files) * split_size)
  training_set = shuffled_set[:training_length]
  val_set = shuffled_set[training_length:]

  # os.path.join works whether or not the directory arguments end with '/'
  for filename in training_set:
    copyfile(os.path.join(source, filename), os.path.join(training, filename))

  for filename in val_set:
    copyfile(os.path.join(source, filename), os.path.join(validation, filename))

In [None]:
# Source, train, and validation directories for each class.
# Every path ends with '/' so a bare file name can be appended directly.

# Where the original images of each class live
cup_source_dir = '/content/kitchenware-dataset/train/cup/'
fork_source_dir = '/content/kitchenware-dataset/train/fork/'
glass_source_dir = '/content/kitchenware-dataset/train/glass/'
knife_source_dir = '/content/kitchenware-dataset/train/knife/'
plate_source_dir = '/content/kitchenware-dataset/train/plate/'
spoon_source_dir = '/content/kitchenware-dataset/train/spoon/'

# Destination of the training share of each class
train_cup_dir = '/content/kitchware-data/train/cup/'
train_fork_dir = '/content/kitchware-data/train/fork/'
train_glass_dir = '/content/kitchware-data/train/glass/'
train_knife_dir = '/content/kitchware-data/train/knife/'
train_plate_dir = '/content/kitchware-data/train/plate/'
train_spoon_dir = '/content/kitchware-data/train/spoon/'

# Destination of the validation share of each class
val_cup_dir = '/content/kitchware-data/validation/cup/'
val_fork_dir = '/content/kitchware-data/validation/fork/'
val_glass_dir = '/content/kitchware-data/validation/glass/'
val_knife_dir = '/content/kitchware-data/validation/knife/'
val_plate_dir = '/content/kitchware-data/validation/plate/'
val_spoon_dir = '/content/kitchware-data/validation/spoon/'

In [None]:
# Split image data into 85% training and 15% validation data
split_size = .85

# Seed the module-level generator that split_data shuffles with, so re-running
# the notebook draws the same split (given the same directory listing order)
random.seed(42)

for class_source_dir, class_train_dir, class_val_dir in (
    (cup_source_dir, train_cup_dir, val_cup_dir),
    (fork_source_dir, train_fork_dir, val_fork_dir),
    (glass_source_dir, train_glass_dir, val_glass_dir),
    (knife_source_dir, train_knife_dir, val_knife_dir),
    (plate_source_dir, train_plate_dir, val_plate_dir),
    (spoon_source_dir, train_spoon_dir, val_spoon_dir),
):
  split_data(class_source_dir, class_train_dir, class_val_dir, split_size)
print('completed!')

In [None]:
import matplotlib.pyplot as plt
from matplotlib.image import imread

# Count the images per class in the training data
img_folder = ['cup', 'fork', 'glass', 'knife', 'plate', 'spoon']
num_imgs = {
  class_name: len(os.listdir(train_dir + '/' + class_name + '/'))
  for class_name in img_folder
}

# Report the count of every class
for class_name, n_images in num_imgs.items():
  print(f"Training {class_name} images are: {n_images}")

In [None]:
# Bar chart of the per-class image counts in the training data
class_names = list(num_imgs.keys())
class_counts = list(num_imgs.values())
bar_positions = range(len(num_imgs))

plt.figure(figsize=(9, 6))
plt.bar(bar_positions, class_counts, align='center')
# Label each bar with its class name
plt.xticks(bar_positions, class_names)
plt.title('Distribution of classes in Training Dataset')
plt.show()

In [None]:
# Count the images per class in the validation data.
# NOTE: this rebinds num_imgs, which previously held the training counts.
img_folder = ['cup', 'fork', 'glass', 'knife', 'plate', 'spoon']
num_imgs = {
  class_name: len(os.listdir(val_dir + '/' + class_name + '/'))
  for class_name in img_folder
}

# Report the count of every class
for class_name, n_images in num_imgs.items():
  print(f"Validation {class_name} images are: {n_images}")

## Move train and test images

In [None]:
# Install the kaggle command-line client used below to download the
# competition data (-q keeps pip's output quiet)
!pip install -q kaggle

In [None]:
from google.colab import files

# Upload a file from the local machine into the Colab working directory;
# the next cell expects it to be kaggle.json
files.upload()

In [None]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle competitions download -c kitchenware-classification
!mkdir data
!unzip kitchenware-classification.zip -d data > /dev/null
!rm kitchenware-classification.zip

In [None]:
# List the working directory to check what the previous cell left there
!ls

In [None]:
# pandas is not imported by any visible earlier cell, so bring it into scope
# here; the import is harmless if it has already happened
import pandas as pd

# Data directory
data_dir = 'data/images/'

# Read train csv; 'Id' is kept as a string so it can be concatenated into a path
df_train_full = pd.read_csv('data/train.csv', dtype={'Id': str})
# Add new column 'filename' holding the path of each train image
df_train_full['filename'] = data_dir + df_train_full['Id'] + '.jpg'
# Preview the first rows instead of displaying a throwaway copy of the frame
df_train_full.head()

In [None]:
from pathlib import Path

In [None]:
# Source layout: root of the unpacked competition data and its image folder
project_dir = Path('/content/data/')
img_dir = project_dir.joinpath('images')

In [None]:
# Display the source data root
project_dir

In [None]:
# Display the source image directory
img_dir

In [None]:
# Target layout: ./kitchenware/train and ./kitchenware/test.
# NOTE: train_dir is rebound here to a Path; earlier cells set it to a string.
dataset_dir = Path('./kitchenware')
train_dir, test_dir = (dataset_dir.joinpath(split) for split in ('train', 'test'))

In [None]:
# Display the root of the new dataset layout
dataset_dir

In [None]:
# Display the new train directory
train_dir

In [None]:
# Display the new test directory
test_dir

In [None]:
# Work on copies so the loaded frames are not modified by the cells below
train_df = df_train_full.copy()
# NOTE(review): df_submission is not defined in any visible cell. It must
# already exist with 'Id', 'label' and 'filename' columns or this line raises
# NameError on a fresh kernel - confirm where it is created.
test_df = df_submission[['Id', 'label', 'filename']].copy()
# Show the first rows of both frames
display(train_df.head(), test_df.head())

In [None]:
# One sub-directory of train_dir per distinct label in train_df
for class_name in train_df['label'].unique():
  (train_dir / class_name).mkdir(parents=True, exist_ok=True)

In [None]:
# One sub-directory of test_dir per distinct label in test_df
for class_name in test_df['label'].unique():
  (test_dir / class_name).mkdir(parents=True, exist_ok=True)

In [None]:
# Create new column 'image' in train_df holding the bare file name taken from
# 'filename'. Keeping everything after the last '/' works for any Id length;
# a fixed-width slice of the last 8 characters is only right for
# 4-character Ids.
train_df['image'] = train_df['filename'].str.rsplit('/', n=1).str[-1]
train_df.head()

In [None]:
# Create new column 'image' in test_df holding the bare file name taken from
# 'filename'. Keeping everything after the last '/' works for any Id length;
# a fixed-width slice of the last 8 characters is only right for
# 4-character Ids.
test_df['image'] = test_df['filename'].str.rsplit('/', n=1).str[-1]
test_df.head()

In [None]:
# Collect the path of every .jpg file directly inside img_dir
images = [jpg_path for jpg_path in img_dir.glob('*.jpg')]
print(f'Found {len(images)} images')

In [None]:
# Copy every train image into the sub-directory of its class.
# Walking (image, label) pairs row by row avoids filtering the whole frame
# once per image, and failed copies are reported instead of silently dropped.
count = 0
missing = []
for img, label in zip(train_df['image'], train_df['label']):
  train_img_path = img_dir / img
  new_train_img_path = train_dir.absolute() / label / img
  if not new_train_img_path.exists():
    try:
      new_train_img_path.write_bytes(train_img_path.read_bytes())
    except FileNotFoundError:
      missing.append(img)
      continue
  # Files already in place are counted too, so a re-run reports the real total
  count += 1

print(f'Total number of images in train directory: {count}')
if missing:
  print(f'{len(missing)} images could not be copied '
        f'(source file or target directory not found), e.g. {missing[:5]}')

In [None]:
# Copy every test image into the sub-directory of its label.
# Walking (image, label) pairs row by row avoids filtering the whole frame
# once per image, and failed copies are reported instead of silently dropped.
count = 0
missing = []
for img, label in zip(test_df['image'], test_df['label']):
  test_img_path = img_dir / img
  new_test_img_path = test_dir.absolute() / label / img
  if not new_test_img_path.exists():
    try:
      new_test_img_path.write_bytes(test_img_path.read_bytes())
    except FileNotFoundError:
      missing.append(img)
      continue
  # Files already in place are counted too, so a re-run reports the real total
  count += 1

print(f'Total number of images in test directory: {count}')
if missing:
  print(f'{len(missing)} images could not be copied '
        f'(source file or target directory not found), e.g. {missing[:5]}')

## Save data to google drive

In [None]:
# Recursively zip the /content/kitchenware directory into
# preprocessed-kitchenware-dataset.zip in the current working directory
!zip -r preprocessed-kitchenware-dataset.zip /content/kitchenware

In [None]:
# Copy the archive into MyDrive on the mounted Google Drive
# (requires the drive mount at /content/gdrive)
!cp -r /content/preprocessed-kitchenware-dataset.zip /content/gdrive/MyDrive

## Download data to disk

In [None]:
from google.colab import files

# Download the zipped dataset from the Colab VM to the local machine
files.download("/content/preprocessed-kitchenware-dataset.zip")