# Setup Kaggle API and Download Kitchenware Dataset

In [1]:
# Install the Kaggle CLI. %pip (rather than !pip) installs into the environment
# of the running kernel, so the package is importable/usable from this notebook.
%pip install -q kaggle

[0m

In [2]:
# Colab-only helper module: outside Google Colab this import raises
# ModuleNotFoundError.
from google.colab import files

# Ask the user to upload a file; the next cell expects `kaggle.json`
# (the Kaggle API token) to be present in the working directory afterwards.
files.upload()

ModuleNotFoundError: No module named 'google.colab'

In [None]:
# Put the uploaded API token into ~/.kaggle/.
# -p keeps the cell re-runnable when ~/.kaggle already exists.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# The token is a credential: make it readable and writable by the owner only.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the competition archive, unpack it into ./kitchenware-classification
# and delete the archive afterwards.
!kaggle competitions download -c kitchenware-classification
# -p: do not fail when the directory is left over from a previous run.
!mkdir -p kitchenware-classification
# -o: overwrite existing files without prompting, so a re-run cannot stall on
# unzip's interactive "replace?" question.
!unzip -o kitchenware-classification.zip -d kitchenware-classification > /dev/null
!rm kitchenware-classification.zip

In [None]:
# List the working directory; the extracted `kitchenware-classification`
# folder created by the previous cell should appear here.
!ls

## Kitchenware Classification Dataset Generator

In [None]:
import PIL
import pandas as pd
import matplotlib.pyplot as plt

from hashlib import md5
from pathlib import Path
from collections import defaultdict

In [None]:
# Source layout: the extracted Kaggle competition files
project_dir = Path('/content/kitchenware-classification/')
img_dir = project_dir.joinpath('images')      # all images, train and test mixed
train_data = project_dir.joinpath('train.csv')
test_data = project_dir.joinpath('test.csv')

# Target layout: a dataset folder with separate train and test sub-folders
dataset_dir = Path('./kitchenware-dataset')
train_dir, test_dir = (dataset_dir.joinpath(split) for split in ('train', 'test'))

## Import Dataset Files


In [None]:
# Read the train and test tables and, in the same step, derive a 'filename'
# column from the numeric image Id: zero-padded to at least four digits and
# suffixed with '.jpg'.
train_df = pd.read_csv(train_data).assign(
    filename=lambda frame: frame['Id'].map(lambda image_id: f"{image_id:04d}.jpg")
)
test_df = pd.read_csv(test_data).assign(
    filename=lambda frame: frame['Id'].map(lambda image_id: f"{image_id:04d}.jpg")
)

In [None]:
# Preview the first rows of the train table, then of the test table.
display(*(frame.head() for frame in (train_df, test_df)))

## Check Duplicate Images

In [None]:
# Group image paths by the MD5 digest of their raw bytes: files with
# byte-identical content end up in the same list.
hash_dict = defaultdict(list)
for image_path in img_dir.glob('*.jpg'):
    digest = md5(image_path.read_bytes()).hexdigest()
    hash_dict[digest].append(image_path)

# Number of distinct image contents found.
len(hash_dict)

In [None]:
# Collect every image whose content hash is shared with at least one other file.
# All members of a group are kept, so three or more identical files are
# reported in full rather than only the first two. The paths come from a single
# directory, so the names inside one group are necessarily different.
duplicate_img = []
for paths in hash_dict.values():
    if len(paths) > 1:
        duplicate_img.extend(paths)
        print(paths)

# Total number of files that have at least one byte-identical twin.
len(duplicate_img)

In [None]:
# Show the duplicated images in a two-column grid.
n_cols = 2
# Keep at least 6 rows (the usual layout) but grow the grid when there are more
# than 12 images; a fixed 6x2 grid would raise on the 13th subplot.
n_rows = max(6, (len(duplicate_img) + n_cols - 1) // n_cols)

if not duplicate_img:
    print('No duplicate images to display')
else:
    # 8 inches for 6 rows; scale the height so extra rows keep the same size.
    plt.figure(figsize=(12, 8 * n_rows / 6))
    for idx, img in enumerate(duplicate_img):
        plt.subplot(n_rows, n_cols, idx + 1)
        # imshow copies the pixel data, so the file can be closed right after.
        with PIL.Image.open(img) as im:
            plt.imshow(im)
        plt.axis('off')
    plt.show()

## Create Directory Structure

In [None]:
# Create the target folders: a flat test directory, plus one train
# sub-directory per distinct class label found in train_df.
test_dir.mkdir(parents=True, exist_ok=True)

for class_name in train_df['label'].unique():
    (train_dir / class_name).mkdir(parents=True, exist_ok=True)

## Copy the Images into the Train and Test Directories

In [None]:
# Gather every .jpg path in the source image folder and report how many exist.
images = [*img_dir.glob('*.jpg')]
print(f'Found {len(images)} images')

In [None]:
# Copy each training image into train/<label>/ (the source files stay in place).
# Images already present at the destination are left untouched, so the cell can
# be re-run; `count` includes them so the printed total stays correct on re-runs.
count = 0
missing_train_images = []
train_root = train_dir.absolute()

# One pass over (filename, label) pairs instead of re-filtering train_df for
# every image. drop_duplicates keeps the first label listed for a filename.
train_pairs = train_df.drop_duplicates(subset='filename')[['filename', 'label']]
for img, label in train_pairs.itertuples(index=False, name=None):
    train_img_path = img_dir / img
    new_train_img_path = train_root / label / img
    if new_train_img_path.exists():
        count += 1
        continue
    if not train_img_path.is_file():
        # Listed in the train table but absent from the image folder.
        missing_train_images.append(img)
        continue
    # Make sure the class folder exists so the copy cannot fail silently.
    new_train_img_path.parent.mkdir(parents=True, exist_ok=True)
    new_train_img_path.write_bytes(train_img_path.read_bytes())
    count += 1

print(f'Total number of images in train directory: {count}')
if missing_train_images:
    print(f'Skipped {len(missing_train_images)} train images missing from {img_dir}')

In [None]:
# Copy each test image into the flat test directory (the source files stay in
# place). Images already present are left untouched, so the cell can be re-run;
# `count` includes them so the printed total stays correct on re-runs.
count = 0
missing_test_images = []
test_root = test_dir.absolute()
# Make sure the destination exists so the copies cannot fail silently.
test_root.mkdir(parents=True, exist_ok=True)

for img in test_df['filename'].drop_duplicates().to_list():
    test_img_path = img_dir / img
    new_test_img_path = test_root / img
    if new_test_img_path.exists():
        count += 1
        continue
    if not test_img_path.is_file():
        # Listed in the test table but absent from the image folder.
        missing_test_images.append(img)
        continue
    new_test_img_path.write_bytes(test_img_path.read_bytes())
    count += 1

print(f'Total number of images in test directory: {count}')
if missing_test_images:
    print(f'Skipped {len(missing_test_images)} test images missing from {img_dir}')

## Check Results

In [None]:
# List the new dataset folder; it should contain the `train` and `test`
# directories created above.
!ls kitchenware-dataset

In [None]:
# List the train folder: one sub-directory per class label is expected here.
!ls kitchenware-dataset/train

## Save Data to Google Drive

In [None]:
# Colab-only helper module; this cell does not work outside Google Colab.
from google.colab import drive

# Mount Google Drive at /content/drive so the archive can be copied to
# /content/drive/MyDrive in the last cell.
drive.mount('/content/drive')

In [None]:
# Recursively archive the generated dataset folder into kitchenware-dataset.zip.
# NOTE(review): the images were written to ./kitchenware-dataset relative to the
# working directory; this absolute path only matches when the notebook runs
# from /content (the Colab default) - confirm before running elsewhere.
!zip -r kitchenware-dataset.zip /content/kitchenware-dataset

In [None]:
# Copy the archive into the mounted Google Drive (requires the drive.mount
# cell above to have been run).
!cp -r /content/kitchenware-dataset.zip /content/drive/MyDrive