In [10]:
import pandas as pd
from pathlib import Path
import shutil
import numpy as np
import traceback
import ujson as json
import umap
import hdbscan
import pickle
import glob
from tqdm.auto import tqdm

In [11]:
# Load the ResNet activations stored as JSON.
# NOTE(review): despite its name, `data_dir` points at a single JSON file, not a directory.
data_dir = r'C:\Users\Rodney\PycharmProjects\Thesis_cur-AI-tor\notebooks\micro_dataset1_resnet18_output_identity.json'
with open(data_dir) as json_file:
    data_dict_list = json.load(json_file)

# Merge the loaded sequence of mappings into one dict; on duplicate keys the later entry wins.
data_dict = dict()
for partial_mapping in data_dict_list:
    data_dict.update(partial_mapping)

# One row per key of `data_dict`; X is the same data as a plain NumPy array.
df_x = pd.DataFrame.from_dict(data_dict, orient='index')
X = df_x.to_numpy()

In [None]:
# Load the engineered feature table (first CSV column becomes the index)
# and discard the 'color_dominant' column.
engineered_features_path = r'C:\Users\Rodney\Desktop\saatchi\df_full.csv'
df_engineered = (
    pd.read_csv(engineered_features_path, index_col=0)
    .drop(columns='color_dominant')
)

In [79]:
def create_embedding(data: np.ndarray,
                     n_neighbors: int,
                     n_components: int,
                     metric: str = 'euclidean',
                     full_dataset: bool = False):
    """Return a UMAP embedding of ``data``, cached on disk as a pickle file.

    The cache file lives in the current working directory and is named
    ``{macro_|micro_}embedding_{n_neighbors}_{n_components}_{metric}.pkl``.
    The name does NOT depend on the contents of ``data``: if a matching file
    already exists it is returned as-is, even when ``data`` has changed.
    Delete the file to force a recomputation.

    Args:
        data: Samples to embed, passed straight to ``umap.UMAP.fit_transform``.
        n_neighbors: UMAP ``n_neighbors`` setting.
        n_components: Number of dimensions of the embedding.
        metric: Distance metric name handed to UMAP.
        full_dataset: Selects the ``macro_`` cache prefix when True,
            ``micro_`` otherwise.

    Returns:
        The embedding produced by ``fit_transform`` (or loaded from the cache).
    """
    filename_prefix = 'macro_' if full_dataset else 'micro_'
    embedding_filename = Path(f'{filename_prefix}embedding_{n_neighbors}_{n_components}_{metric}.pkl')

    if embedding_filename.is_file():
        # SECURITY NOTE: pickle.load executes arbitrary code from the file;
        # only keep cache files that this notebook wrote itself.
        with open(embedding_filename, 'rb') as f:
            return pickle.load(f)

    # Fit on the `data` argument (previously the global `X` was used, which
    # silently ignored whatever the caller passed in).
    clusterable_embedding_ = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=0.0,
        n_components=n_components,
        metric=metric,
        random_state=3,
    ).fit_transform(data)

    with open(embedding_filename, 'wb') as f:
        pickle.dump(clusterable_embedding_, f)
    return clusterable_embedding_

def get_clusters(clusterable_embedding_, min_cluster_size: int = 1000):
    """Fit HDBSCAN on an embedding and return soft cluster memberships.

    Args:
        clusterable_embedding_: Embedded samples to cluster, handed directly
            to ``hdbscan.HDBSCAN.fit``.
        min_cluster_size: HDBSCAN ``min_cluster_size`` setting. Defaults to
            1000, the value that was previously hard-coded.

    Returns:
        The result of ``hdbscan.all_points_membership_vectors`` for the fitted
        clusterer — presumably one membership vector per sample (see the
        hdbscan soft-clustering docs).
    """
    # prediction_data=True is kept from the original call; the membership
    # vectors are computed from the fitted clusterer right below.
    clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,
                                prediction_data=True).fit(clusterable_embedding_)
    return hdbscan.all_points_membership_vectors(clusterer)

In [81]:
# Embed the activations, then compute soft cluster memberships on the embedding.
clusterable_embedding = create_embedding(data=X, n_neighbors=1250, n_components=25, metric='euclidean')
soft_clusters = get_clusters(clusterable_embedding)

# Hard label per sample = position of the largest entry in its membership vector.
class_labels = list(map(np.argmax, soft_clusters))

# Single-column frame of labels sharing df_x's index.
df_x_ = pd.DataFrame({'class': class_labels}, index=df_x.index)
df_x_.head()

Unnamed: 0,class
3865991_3865991_691412_2935874-DSMUXGTJ-7.jpg,1
7980766_7980766_669333_7048178-XOYQRJZQ-7.jpg,0
3749936_3749936_314728_2819820-JDANXKLD-7.jpg,1
5610715_5610715_91068_4680525-LMQNOWJA-7.jpg,1
6771765_6771765_786228_5841405-PSPFNCAV-7.jpg,1


In [82]:
# Sanity check: how many samples were assigned to each cluster label.
df_x_.value_counts()

class
1        9062
0        6161
dtype: int64

In [74]:
# Persist the cluster labels and configure the image-sorting step.
# The CSV name is defined once and reused for both writing and reading so
# the two can never drift apart.
filenames_and_labels = 'saatchi_micro_umap_hdbscan_clustering_test6.csv'
df_x_.to_csv(filenames_and_labels)
target_column_name = 'class'
image_input_folder = r'E:\temp\thesisdata\micro_dataset1'
image_output_folder = r'E:\temp\thesisdata\umap_hdbscan_test6'
size_ = 128  # NOTE(review): not referenced by any of the visible cells
image_count_per_class = 1000000  # upper bound on images copied per class

# Load target data
targets_df = pd.read_csv(filenames_and_labels, index_col=0)

# Keep only the target column (a new frame; nothing is dropped in place)
targets_df = targets_df.loc[:, targets_df.columns == target_column_name]

# Remove duplicate index entries, keeping the first occurrence. Working on
# the index directly avoids relying on reset_index() producing a column that
# happens to be called 'index'.
targets_df = targets_df[~targets_df.index.duplicated(keep='first')].rename_axis('index')

In [75]:
def resize_pad_crop_image(input_path: str, output_path: str):
    """Copy a non-empty file into an existing output directory.

    NOTE: despite its name, this function does not resize, pad or crop
    anything; it only copies the file under its original file name.

    Args:
        input_path: Path of the file to copy. Must be an existing file.
        output_path: Destination directory. Must already exist.

    Returns:
        None. Zero-byte input files are reported on stdout and skipped.

    Raises:
        AssertionError: If ``input_path`` is not a file or ``output_path`` is
            not a directory. The message now names the offending path
            (previously the message was ``None`` because it was the return
            value of ``print``).
        OSError: Propagated from ``stat``/``shutil.copy`` on I/O failure.
    """
    input_path_ = Path(input_path)
    output_path_ = Path(output_path)

    assert input_path_.is_file(), 'Supplied input path is not a file:' + str(input_path_)
    assert output_path_.is_dir(), 'Supplied output path is not a directory:' + str(output_path_)

    # Guard clause: nothing useful to copy from an empty file.
    if input_path_.stat().st_size == 0:
        print(f'Filesize is 0, skipping file: {input_path_}')
        return

    shutil.copy(input_path, output_path_ / input_path_.name)

In [76]:
# Create list with unique class labels
label_folder_list = list(np.unique(targets_df[target_column_name].values))
counter = {k: 0 for k in label_folder_list}

# Create the folders
for folder in label_folder_list:
    Path(image_output_folder + '/' + str(folder)).mkdir(parents=True, exist_ok=True)

In [77]:
def run(file):
    """Copy one input file into the output sub-folder of its class label.

    Relies on the notebook-level names ``targets_df``, ``target_column_name``,
    ``counter``, ``image_count_per_class``, ``image_output_folder`` and
    ``resize_pad_crop_image``.

    Args:
        file: Path of the input file; its base name is used to look up the
            label in ``targets_df``.

    Returns:
        None. Files with no label (``KeyError``) or that hit an ``OSError``
        are reported on stdout and skipped.
    """
    filename = None
    try:
        # Nothing left to do once every class has reached its quota.
        if all(count >= image_count_per_class for count in counter.values()):
            return

        filename = Path(file).name
        label = targets_df.loc[filename, target_column_name]
        if counter[label] < image_count_per_class:
            # Join with Path instead of a hard-coded '\\' so the folder
            # matches the one created for this label on every platform.
            image_output_folder_with_label = str(Path(image_output_folder) / str(label))
            resize_pad_crop_image(file, image_output_folder_with_label)
            # NOTE(review): this also counts zero-byte files, which the copy
            # helper skips silently without signalling it to the caller.
            counter[label] += 1
    except KeyError:
        print(f'Label not found for file {file}, skipping!')
    except OSError:
        if filename is None:
            filename = file
        print(f'Skipping file {filename} due to OSError encountered: {traceback.format_exc()}, skipping!')

In [78]:
# Gather every path matching "<image_input_folder>*/*" and sort each file into its label folder.
# NOTE(review): there is no path separator before the first '*', so sibling folders whose
# names merely start with the input folder name match as well — confirm this is intended.
filelist = glob.glob(f'{image_input_folder}*/*')
print(len(filelist))
for file in tqdm(filelist):
    run(file)

15223


  0%|          | 0/15223 [00:00<?, ?it/s]