In [1]:
import pandas as pd
from pathlib import Path
import shutil
import numpy as np
import traceback
import ujson as json
import umap
import hdbscan
import pickle
import glob
from tqdm.auto import tqdm

In [4]:
# Read the precomputed ResNet activation table; the first CSV column becomes the row index.
df_x = pd.read_csv(
    r'E:\temp\thesisdata\saatchi_micro_engineered_resnet.csv',
    index_col=0,
)
# Bare NumPy array of the activations (row labels stay available through df_x.index).
X = df_x.to_numpy()

In [9]:
def create_embedding(data: np.ndarray,
                     n_neighbors: int,
                     n_components: int,
                     metric: str = 'euclidean',
                     full_dataset: bool = False):
    """Return a UMAP embedding of ``data``, cached as a pickle in the working directory.

    Args:
        data: Array passed to ``umap.UMAP(...).fit_transform``.
        n_neighbors: Forwarded to ``umap.UMAP``.
        n_components: Forwarded to ``umap.UMAP`` (dimensionality of the embedding).
        metric: Forwarded to ``umap.UMAP``.
        full_dataset: Only selects the cache-file prefix
            (``'macro_all_'`` when True, ``'micro_all_'`` otherwise).

    Returns:
        The embedding, either freshly computed or loaded from the cache file.

    Note:
        The cache file name is built from the prefix, ``n_neighbors``,
        ``n_components`` and ``metric`` only. It does not depend on the contents
        of ``data``, so delete the ``.pkl`` file when the input data changes.
    """
    filename_prefix = 'macro_all_' if full_dataset else 'micro_all_'
    embedding_filename = Path(f'{filename_prefix}embedding_{n_neighbors}_{n_components}_{metric}.pkl')

    if embedding_filename.is_file():
        # CAUTION: unpickling can execute arbitrary code; only keep cache files
        # in this directory that were written by this function.
        with open(embedding_filename, 'rb') as f:
            return pickle.load(f)

    # Fit on the `data` argument (previously the notebook-global X was used,
    # which silently ignored whatever the caller passed in).
    clusterable_embedding_ = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=0.0,
        n_components=n_components,
        metric=metric,
        random_state=3,  # fixed seed so repeated runs give the same embedding
    ).fit_transform(data)

    with open(embedding_filename, 'wb') as f:
        pickle.dump(clusterable_embedding_, f)
    return clusterable_embedding_

def get_clusters(clusterable_embedding_, min_cluster_size: int = 250):
    """Fit HDBSCAN on an embedding and return per-point membership vectors.

    Args:
        clusterable_embedding_: Data handed to ``hdbscan.HDBSCAN(...).fit``.
        min_cluster_size: Forwarded to ``hdbscan.HDBSCAN``. Defaults to 250,
            the value that was previously hard-coded.

    Returns:
        Whatever ``hdbscan.all_points_membership_vectors`` returns for the
        fitted clusterer (the caller takes ``argmax`` of each row to get a
        hard label).
    """
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        prediction_data=True,  # requested so membership vectors can be computed afterwards
    )
    clusterer.fit(clusterable_embedding_)
    return hdbscan.all_points_membership_vectors(clusterer)

In [17]:
# Embed the activations, then compute per-point cluster membership vectors.
clusterable_embedding = create_embedding(
    data=X,
    n_neighbors=250,
    n_components=20,
    metric='euclidean',
)
soft_clusters = get_clusters(clusterable_embedding)

# Hard label per row: position of the largest membership value.
class_labels = list(map(np.argmax, soft_clusters))

# Single-column frame ('class') that reuses df_x's row index.
df_x_ = pd.DataFrame({'class': class_labels}, index=df_x.index)
df_x_.head()

Unnamed: 0,class
4612763_4612763_1071830_3682599-NRIWFPCP-7.jpg,1
6482875_6482875_242833_5552553-BVGPYMOP-7.jpg,1
7809282_7809282_850193_6877217-PUYOGUKK-7.jpg,1
6660719_6660719_91068_5730385-ZDZKMEVJ-7.jpg,1
7935829_7935829_1667681_7003425-VZVCGCKM-7.jpg,1


In [18]:
# Number of rows assigned to each 'class' label (df_x_ holds only that column).
df_x_.value_counts()

class
1        13108
0         1729
dtype: int64

In [19]:
# How many distinct 'class' labels were assigned.
df_x_.value_counts().shape[0]


2

In [None]:
# Preview only: the full list has one entry per image and would flood the output.
class_labels[:20]

In [12]:
# Export settings
filenames_and_labels = 'saatchi_micro_umap_hdbscan_clustering_test6.csv'
target_column_name = 'class'
image_input_folder = r'E:\temp\thesisdata\micro_dataset1'
image_output_folder = r'E:\temp\thesisdata\umap_hdbscan_test6'
size_ = 128
image_count_per_class = 1000000

# Persist the cluster labels, then read them back as the target table
# (file name -> label). The file name is defined once above.
df_x_.to_csv(filenames_and_labels)
targets_df = pd.read_csv(filenames_and_labels, index_col=0)

# Keep only the target column (selection instead of dropping columns in place
# while iterating over them).
targets_df = targets_df[[target_column_name]]

# Remove duplicate file names, keeping the first occurrence. Working on the
# index directly avoids relying on reset_index() producing a column literally
# named 'index'; the index is still named 'index' afterwards, as before.
targets_df = targets_df[~targets_df.index.duplicated(keep='first')].rename_axis('index')

In [13]:
def resize_pad_crop_image(input_path: str, output_path: str):
    """Copy a non-empty file into an existing directory, keeping its file name.

    Despite the name, this function only copies; it does not resize, pad or
    crop anything.

    Args:
        input_path: Path of the file to copy.
        output_path: Existing directory that receives the copy.

    Returns:
        None. A zero-byte input is reported on stdout and skipped.

    Raises:
        AssertionError: If ``input_path`` is not a file or ``output_path`` is
            not a directory. The message names the offending path.
    """
    input_path_ = Path(input_path)
    output_path_ = Path(output_path)

    # Explicit raises (rather than `assert`) so the checks survive `python -O`
    # and the exception carries a real message instead of `None`.
    if not input_path_.is_file():
        raise AssertionError(f'Supplied input path is not a file: {input_path_}')
    if not output_path_.is_dir():
        raise AssertionError('Supplied output path is not a directory:' + str(output_path_))

    if input_path_.stat().st_size == 0:
        print(f'Filesize is 0, skipping file: {input_path_}')
        return

    shutil.copy(input_path_, output_path_ / input_path_.name)

In [14]:
# Distinct class labels found in the target column.
label_folder_list = list(np.unique(targets_df[target_column_name].values))
# Per-label tally of exported images, every label starting at zero.
counter = dict.fromkeys(label_folder_list, 0)

# Make sure one output sub-folder exists for each label.
for folder in label_folder_list:
    Path(image_output_folder, str(folder)).mkdir(parents=True, exist_ok=True)

In [15]:
def run(file):
    """Copy one image into the output sub-folder of its class label.

    Looks up the label of ``file`` (by file name) in ``targets_df``, and, while
    that label's tally in ``counter`` is below ``image_count_per_class``, hands
    the file to ``resize_pad_crop_image`` and increments the tally. Does
    nothing once every label has reached the limit.

    Args:
        file: Path of the image file to process.

    Returns:
        None. A missing label (KeyError) or an OSError is reported on stdout
        and the file is skipped; other exceptions propagate.
    """
    filename = None
    try:
        if all(count >= image_count_per_class for count in counter.values()):
            return

        filename = Path(file).name
        label = targets_df.loc[filename, target_column_name]
        if counter[label] < image_count_per_class:
            # Build the folder with pathlib rather than a hard-coded '\\' so it
            # matches the folders created earlier on every platform.
            image_output_folder_with_label = Path(image_output_folder) / str(label)
            resize_pad_crop_image(file, str(image_output_folder_with_label))
            # NOTE(review): the tally is incremented even when the helper skips
            # a zero-byte file; confirm whether that is intended.
            counter[label] += 1
    except KeyError:
        print(f'Label not found for file {file}, skipping!')
    except OSError:
        if filename is None:
            filename = file
        print(f'Skipping file {filename} due to OSError encountered: {traceback.format_exc()}, skipping!')

In [16]:
# List the entries directly inside the input folder. The previous pattern
# (folder + '*/*') also matched every sibling directory whose name merely
# starts with the folder name. glob.escape guards against glob metacharacters
# in the folder path, and sorting makes the processing order reproducible.
filelist = sorted(glob.glob(str(Path(glob.escape(image_input_folder)) / '*')))
print(len(filelist))
for file in tqdm(filelist):
    run(file)

15223


  0%|          | 0/15223 [00:00<?, ?it/s]

Label not found for file E:\temp\thesisdata\micro_dataset1\1547202_1547202_94551_765468-LZWIQDAT-7.jpg, skipping!
Label not found for file E:\temp\thesisdata\micro_dataset1\1731405_1731405_418125_902453-7.jpg, skipping!
Label not found for file E:\temp\thesisdata\micro_dataset1\1784136_1784136_429365_938803-RTKASPON-7.jpg, skipping!
Label not found for file E:\temp\thesisdata\micro_dataset1\1832502_1832502_417160_974813-7.jpg, skipping!
Label not found for file E:\temp\thesisdata\micro_dataset1\1859947_1859947_412755_1005410-7.jpg, skipping!
Label not found for file E:\temp\thesisdata\micro_dataset1\1954361_1954361_344984_1084835-7.jpg, skipping!
Label not found for file E:\temp\thesisdata\micro_dataset1\1976000_1976000_79335_1101883-MDDWHZHO-7.jpg, skipping!
Label not found for file E:\temp\thesisdata\micro_dataset1\1979804_1979804_206047_1104895-HCCBMDXV-7.jpg, skipping!
Label not found for file E:\temp\thesisdata\micro_dataset1\2062029_2062029_680195_1171969-7.jpg, skipping!
Label n

KeyboardInterrupt: 

In [36]:
# Toy 2x3 frame whose column labels mix ints (0, 1) and a string ('x').
l = [['a', 'b', 'c'], ['db_credentials.txt', 'embeddings', 'for']]
df = pd.DataFrame(l, columns=[0, 1, 'x'])
df

Unnamed: 0,0,1,x
0,a,b,c
1,db_credentials.txt,embeddings,for


In [37]:
# Remove, in place, every column whose label is exactly a built-in int.
# Iterate over a snapshot of the labels because df is modified inside the loop.
for col in list(df.columns):
    if type(col) is int:
        del df[col]
df

Unnamed: 0,x
0,c
1,for
