In [1]:
import ast
import math
import os
import shutil
import warnings
from itertools import chain

import dask
import imagehash
import matplotlib.pyplot as plt
import pandas as pd
from dask import bag, diagnostics
from keras.preprocessing.image import ImageDataGenerator
from PIL import Image

%matplotlib inline

warnings.filterwarnings('ignore')

In [35]:
# Read the training manifest and count how many distinct labels it holds.
train_csv = 'data/train.csv'
df = pd.read_csv(train_csv)
n_categories = df.loc[:, 'category'].nunique()

In [37]:
# Row count of the manifest before any filtering has been applied.
print('Number of images before preprocessing:', df.shape[0])

Number of images before preprocessing: 105392


In [9]:
def getImagePath(category, file_name):
    """Build the relative path of a training image.

    Args:
        category: Integer category id; rendered zero-padded to two digits.
        file_name: Name of the image file inside the category folder.

    Returns:
        The string ``data/train/<category>/<file_name>``.
    """
    path_template = 'data/train/{:02d}/{}'
    return path_template.format(category, file_name)

def displayImage(img_path):
    """Show the image stored at ``img_path`` in the current matplotlib figure.

    Args:
        img_path: Path of the image file to open with PIL.
    """
    # The context manager releases the file handle once the pixels have been
    # handed to matplotlib, instead of leaving the file open.
    with Image.open(img_path) as image:
        plt.axis('off')
        plt.imshow(image)
    plt.show()

def saveImage(fig, save_directory, p_hash):
    """Write ``fig`` to ``<save_directory>/<p_hash>`` with a tight bounding box.

    Args:
        fig: Object exposing ``savefig`` (a matplotlib figure in this notebook).
        save_directory: Folder the file is written into.
        p_hash: Value used as the file name.
    """
    target = '{}/{}'.format(save_directory, p_hash)
    fig.savefig(target, bbox_inches='tight')

In [16]:
# remove blank images
def checkBlankImage(img_path):
    """Tell whether the image at ``img_path`` consists of exactly one colour.

    Args:
        img_path: Path of the image file to open with PIL.

    Returns:
        ``True`` when the image contains a single distinct colour, else ``False``.
    """
    # The context manager closes the file even if getcolors raises.
    with Image.open(img_path) as img:
        # With maxcolors=1, getcolors returns None as soon as a second colour
        # shows up, so there is no need to tally every colour in the image.
        colors = img.getcolors(1)
    return colors is not None and len(colors) == 1

# All intermediate CSV files of this notebook are written below 'output'.
os.makedirs('output', exist_ok=True)

# Build one image path per manifest row straight from the two columns.
sequence = [
    getImagePath(category, file_name)
    for category, file_name in zip(df['category'], df['filename'])
]
dask_bag = bag.from_sequence(sequence).map(checkBlankImage)

with diagnostics.ProgressBar():
    # Keep the flags in a separate mask aligned to df, so no helper column
    # has to be added to df and dropped again afterwards.
    is_blank = pd.Series(dask_bag.compute(), index=df.index, dtype=bool)

# Blank images are written to their own file.
blank_df = df[is_blank]
blank_df.to_csv('output/blank.csv')

# Everything else continues through the pipeline.
df = df[~is_blank]
df.to_csv('output/img_removed_blank.csv')

# without dask: 74m40.4s
# with dask bag: 41m29.8s

[########################################] | 100% Completed | 41min 28.3s


In [14]:
# Compute a perceptual hash (p-hash) for every image that survived blank removal.
blank_free_csv = 'output/img_removed_blank.csv'
df = pd.read_csv(blank_free_csv, index_col=0)

def getImagePHash(img_path):
    """Return the ``imagehash.phash`` of an image as a string.

    Args:
        img_path: Path of the image file to open with PIL.

    Returns:
        ``str()`` of the hash object produced by ``imagehash.phash``.
    """
    # Close the file as soon as the hash is computed instead of leaking the handle.
    with Image.open(img_path) as image:
        return str(imagehash.phash(image))

# Build the image paths from the two columns directly; calling df.iloc[i]
# twice per row materialises a full row Series each time.
sequence = [
    getImagePath(category, file_name)
    for category, file_name in zip(df['category'], df['filename'])
]
dask_bag = bag.from_sequence(sequence).map(getImagePHash)

with diagnostics.ProgressBar():
    df['p-hash'] = dask_bag.compute()

df.to_csv('output/img_p-hashes.csv')

# group images by p-hash
# Columns are selected with a list: the tuple form ['filename', 'category']
# is rejected by pandas >= 2.0.
grouped_df = (
    df.groupby('p-hash')[['filename', 'category']]
    .apply(lambda group: group.to_dict(orient='records'))
    .reset_index(name='duplicates')
)

# extract duplicated images: hashes shared by more than one file
duplicate_df = grouped_df[grouped_df['duplicates'].map(len) > 1]
duplicate_df.to_csv('output/duplicates.csv')

[########################################] | 100% Completed |  7min 32.2s


In [12]:
@dask.delayed
def saveDuplicatedImages(row, save_directory):
    """Render all images of one duplicate group into a grid and save it.

    Args:
        row: Mapping/Series with a ``'duplicates'`` entry (a list of dicts with
            ``'category'`` and ``'filename'`` keys, or the text form of such a
            list as read back from CSV) and a ``'p-hash'`` entry used as the
            output file name.
        save_directory: Folder the figure is written into.
    """
    duplicate_list = row['duplicates']
    # Rows read back from CSV carry the list as text. Parse it before
    # measuring its length, otherwise the grid is sized by the number of
    # characters instead of the number of images.
    if isinstance(duplicate_list, str):
        duplicate_list = ast.literal_eval(duplicate_list)

    n_cols = 5
    n_rows = math.ceil(len(duplicate_list) / n_cols)
    fig = plt.figure(figsize=(n_cols * 2.5, n_rows * 1.5))
    try:
        for n, duplicate in enumerate(duplicate_list):
            img_path = getImagePath(duplicate['category'], duplicate['filename'])
            axes = fig.add_subplot(n_rows, n_cols, n + 1)
            # Title shows the category only; append duplicate['filename'] to
            # also see which file each tile comes from.
            axes.set_title(f"{duplicate['category']:02d}")
            axes.axis('off')
            with Image.open(img_path) as image:
                axes.imshow(image)
        saveImage(fig, save_directory, row['p-hash'])
    finally:
        # Figures stay registered with pyplot until closed; release each one
        # so long runs do not accumulate them in memory.
        plt.close(fig)

In [13]:
# save duplicated images for visualization
duplicate_df = pd.read_csv('output/duplicates.csv', index_col=0)
save_directory = 'output/duplicates'
# savefig does not create missing folders, so make sure the target exists.
os.makedirs(save_directory, exist_ok=True)

# One delayed task per duplicate group; nothing is rendered until dask.compute runs.
tasks = [saveDuplicatedImages(row, save_directory) for _, row in duplicate_df.iterrows()]

with diagnostics.ProgressBar():
    result = dask.compute(tasks, scheduler='processes', num_workers=8)

# without dask: 29m11.7s
# with dask: 4m48.6s (8 workers), 5m1.4s (4 workers), 8m28.1s (3 workers)
# with dask bag: 6m4.6s

[########################################] | 100% Completed |  5min  4.9s


In [17]:
# group duplicated images with similar p-hashes
duplicate_df = pd.read_csv('output/duplicates.csv', index_col=0)
# The 'duplicates' column comes back from CSV as text; literal_eval parses
# list/dict literals only and cannot execute arbitrary expressions.
duplicate_df['duplicates'] = duplicate_df['duplicates'].apply(ast.literal_eval)
# Give every row its own empty list (also works when the frame has no rows).
duplicate_df['similar_p-hashes'] = [[] for _ in range(len(duplicate_df))]

list_of_dict = duplicate_df.to_dict(orient='records')

def groupSimilarImages(item_dict, max_distance=6):
    """Merge into one group every later group whose p-hash is close enough.

    Only entries of the module-level ``list_of_dict`` that come after position
    ``item_dict['i']`` are compared, so each pair is examined once.

    Args:
        item_dict: Dict with ``'i'`` (position of the item in ``list_of_dict``)
            and ``'item'`` (the record itself, with ``'p-hash'``,
            ``'duplicates'`` and ``'similar_p-hashes'`` keys).
        max_distance: Largest hash difference (as computed by subtracting two
            ``imagehash`` hashes) that still counts as similar.

    Returns:
        A new record with the same keys as ``item_dict['item']`` whose
        ``'duplicates'`` also contains the duplicates of every similar later
        group, and whose ``'similar_p-hashes'`` lists those groups' hashes.
    """
    current_i = item_dict['i']
    current_item = item_dict['item']
    # The reference hash does not change inside the loop; decode it once.
    current_hash = imagehash.hex_to_hash(current_item['p-hash'])

    # Work on copies: the records are shared with list_of_dict, and mutating
    # them would make the outcome depend on the order in which a
    # shared-memory scheduler happens to process the items.
    merged_duplicates = list(current_item['duplicates'])
    similar_hashes = list(current_item['similar_p-hashes'])

    for item in list_of_dict[current_i + 1:]:
        if current_hash - imagehash.hex_to_hash(item['p-hash']) <= max_distance:
            merged_duplicates.extend(item['duplicates'])
            similar_hashes.append(item['p-hash'])

    return {
        **current_item,
        'duplicates': merged_duplicates,
        'similar_p-hashes': similar_hashes,
    }

# Pair every record with its position so the worker knows where to start comparing.
sequence = [{'i': i, 'item': item} for i, item in enumerate(list_of_dict)]
dask_bag = bag.from_sequence(sequence).map(groupSimilarImages)

with diagnostics.ProgressBar():
    grouped_duplicates = dask_bag.compute()

grouped_duplicate_df = pd.DataFrame(grouped_duplicates)
# Flatten the per-row hash lists in one pass; Series.sum() on lists builds a
# new list for every row.
duplicate_list = list(chain.from_iterable(grouped_duplicate_df['similar_p-hashes']))
# Drop groups that were already absorbed into an earlier, similar group.
grouped_duplicate_df = grouped_duplicate_df[~grouped_duplicate_df['p-hash'].isin(duplicate_list)]

grouped_duplicate_df.to_csv('output/duplicates_similar.csv')

[########################################] | 100% Completed |  1min 38.7s


In [18]:
# save grouped similar duplicated images for visualization
duplicate_df = pd.read_csv('output/duplicates_similar.csv', index_col=0)
save_directory = 'output/duplicates_similar'
# savefig does not create missing folders, so make sure the target exists.
os.makedirs(save_directory, exist_ok=True)

# One delayed task per group; nothing is rendered until dask.compute runs.
tasks = [saveDuplicatedImages(row, save_directory) for _, row in duplicate_df.iterrows()]

with diagnostics.ProgressBar():
    result = dask.compute(tasks, scheduler='processes', num_workers=8)

[########################################] | 100% Completed |  5min 53.0s


In [19]:
# Inspect how many distinct categories each duplicate group spans.
similar_duplicates_csv = 'output/duplicates_similar.csv'
duplicate_df = pd.read_csv(similar_duplicates_csv, index_col=0)

def checkNumCategory(row):
    """Count the distinct categories present in one duplicate group.

    Args:
        row: A list of dicts that each have a ``'category'`` key, or the text
            form of such a list (as stored in the CSV files of this notebook).

    Returns:
        Number of different ``'category'`` values in the group.

    Raises:
        ValueError, SyntaxError: If ``row`` is a string that is not a plain
            Python literal.
    """
    # literal_eval only parses literals, so CSV content can never run code.
    duplicates = ast.literal_eval(row) if isinstance(row, str) else row
    return len({item['category'] for item in duplicates})

# Annotate each group with its category count, then split the groups into
# those confined to one category and those spread over several.
duplicate_df['n_categories'] = duplicate_df['duplicates'].map(checkNumCategory)
is_single_category = duplicate_df['n_categories'] == 1

duplicate_single_cat_df = duplicate_df[is_single_category]
duplicate_single_cat_df.to_csv('output/duplicates_single_category.csv')

duplicate_multi_cat_df = duplicate_df[~is_single_category]
duplicate_multi_cat_df.to_csv('output/duplicates_multi_categories.csv')

In [34]:
# save duplicated images which exist in multiple categories for visualization (ignore for now)
duplicate_multi_cat_df = pd.read_csv('output/duplicates_multi_categories.csv', index_col=0)
save_directory = 'output/duplicates_multi_categories'
# savefig does not create missing folders, so make sure the target exists.
os.makedirs(save_directory, exist_ok=True)

# One delayed task per group; nothing is rendered until dask.compute runs.
tasks = [saveDuplicatedImages(row, save_directory) for _, row in duplicate_multi_cat_df.iterrows()]

with diagnostics.ProgressBar():
    result = dask.compute(tasks, scheduler='processes', num_workers=8)

[########################################] | 100% Completed | 16.4s


In [33]:
# Drop the redundant copies of images whose duplicates all share one category.
single_category_csv = 'output/duplicates_single_category.csv'
duplicate_single_cat_df = pd.read_csv(single_category_csv, index_col=0)

def extractDuplicatesToRemove(row):
    """List the file names of every image in a group except the first.

    The first entry of the group is the copy that is kept.

    Args:
        row: A list of dicts that each have a ``'filename'`` key, or the text
            form of such a list (as stored in the CSV files of this notebook).

    Returns:
        List of ``'filename'`` values of all entries after the first.

    Raises:
        ValueError, SyntaxError: If ``row`` is a string that is not a plain
            Python literal.
    """
    # literal_eval only parses literals, so CSV content can never run code.
    duplicates = ast.literal_eval(row) if isinstance(row, str) else row
    return [item['filename'] for item in duplicates[1:]]

# Flatten the per-group lists in one pass. Series.sum() would return the
# integer 0 when there are no groups at all, which isin() below rejects.
duplicates_to_remove = list(chain.from_iterable(
    duplicate_single_cat_df['duplicates'].map(extractDuplicatesToRemove)
))

df = pd.read_csv('output/img_removed_blank.csv', index_col=0)

# NOTE(review): rows are matched by filename only; this assumes file names are
# unique across categories — confirm against the dataset.
removed_duplicate_single_cat_df = df[~df['filename'].isin(duplicates_to_remove)]

removed_duplicate_single_cat_df.to_csv('output/img_removed_duplicates.csv')

In [30]:
# perform image augmentation to resample minority classes
df = pd.read_csv('output/img_removed_duplicates.csv', index_col=0)

categories = [11, 17, 18, 29, 32, 33, 37]
target_count = 2509
# One seed for both the row sampling and the generator so a re-run draws
# the same rows.
random_seed = 42

# The augmentation settings are the same for every category, so the
# generator is configured once instead of once per loop iteration.
image_datagen = ImageDataGenerator(
    rotation_range=45,
    width_shift_range=0.2,
    height_shift_range=0.2,
    brightness_range=[0.5,1.2],
    shear_range=0.2,
    zoom_range=0.2,
    channel_shift_range=90,
    fill_mode='reflect',
    horizontal_flip=True,
    vertical_flip=False
)

for category in categories:
    selected_df = df[df['category']==category]

    current_count = len(selected_df)
    print(f'Category: {category}\tCurrent count: {current_count}')

    save_directory = f'output/generated/{category}'

    # Always start from an empty folder so stale files from a previous run
    # never mix with the new ones.
    if os.path.exists(save_directory):
        shutil.rmtree(save_directory)
    os.makedirs(save_directory)

    n_missing = target_count - current_count
    if current_count == 0 or n_missing <= 0:
        # Nothing to sample from, or the category already meets the target;
        # DataFrame.sample would raise on either.
        continue

    # Draw (with replacement) exactly as many source rows as images are missing.
    selected_df = selected_df.sample(n=n_missing, replace=True, random_state=random_seed)
    # class_mode='categorical' needs string labels.
    selected_df = selected_df.assign(category=selected_df['category'].astype(str))

    image_generator = image_datagen.flow_from_dataframe(
        dataframe=selected_df,
        x_col='filename',
        y_col='category',
        # Zero-padded like getImagePath, so the folder name stays correct
        # for single-digit categories too.
        directory=f'data/train/{category:02d}',
        color_mode='rgb',
        class_mode='categorical',
        batch_size=64,
        seed=random_seed,
        shuffle=True,
        save_to_dir=save_directory,
        save_format='jpg'
    )

    # Each batch pulled from the iterator writes its augmented images into
    # save_to_dir; the built-in next() works whether or not the iterator
    # also exposes a .next() method.
    for _ in range(len(image_generator)):
        next(image_generator)

Category: 11	Current count: 1738
Found 771 validated image filenames belonging to 1 classes.
Category: 17	Current count: 1399
Found 1110 validated image filenames belonging to 1 classes.
Category: 18	Current count: 2014
Found 495 validated image filenames belonging to 1 classes.
Category: 29	Current count: 1750
Found 759 validated image filenames belonging to 1 classes.
Category: 32	Current count: 2094
Found 415 validated image filenames belonging to 1 classes.
Category: 33	Current count: 561
Found 1948 validated image filenames belonging to 1 classes.
Category: 37	Current count: 1632
Found 877 validated image filenames belonging to 1 classes.


In [38]:
# create csv to consolidate generated images with original images
df = pd.read_csv('output/img_removed_duplicates.csv', index_col=0)
# Reuse the shared helper so original image paths are built in one place.
df['filepath'] = [
    getImagePath(category, file_name)
    for category, file_name in zip(df['category'], df['filename'])
]

count_before = len(df)

# Collect one frame per category and concatenate once at the end instead of
# re-copying the growing frame on every iteration.
generated_frames = []
for category in categories:
    gen_directory = f'output/generated/{category}'
    # os.listdir returns entries in arbitrary order; sort for a stable CSV.
    file_names = sorted(os.listdir(gen_directory))
    generated_frames.append(pd.DataFrame({
        'filename': file_names,
        'category': category,
        'filepath': [f'{gen_directory}/{file_name}' for file_name in file_names],
    }))

# NOTE(review): generated rows keep their own 0..n-1 index labels, so labels in
# the written CSV are not unique; pass ignore_index=True here if nothing
# downstream relies on the original labels — confirm.
df = pd.concat([df] + generated_frames)

count_after = len(df)

print(f'{count_after - count_before} generated images added.')

df.to_csv('output/img_balanced.csv')

6375 generated images added.


In [39]:
# Row count of the consolidated manifest (originals plus generated images).
print('Number of images after preprocessing:', df.shape[0])

Number of images after preprocessing: 108518
