In [1]:
import pandas as pd
import fastdup
import cv2
import os

In [2]:
import numpy as np

In [3]:
import math

# 1. Create dict based on kmeans_assignments.csv

In [4]:
# load 'kmeans_assignments.csv' into the data frame 'df'
kmeans_assignments_csv = "~/Documents/New Fast Image Analysis/Karta_out/kmeans_assignments.csv"
df = pd.read_csv(kmeans_assignments_csv)

In [5]:
# distinct cluster ids found in the 'cluster' column of 'df'
UniqueClusters = df["cluster"].unique()

In [6]:
# dictionary mapping every cluster id to the rows of 'df' that belong to it
DataFrameDict = {
    cluster_id: df[df["cluster"] == cluster_id]
    for cluster_id in UniqueClusters
}

In [7]:
# print each cluster id next to the number of rows stored for it
for cluster_id, cluster_frame in DataFrameDict.items():
    print(cluster_id, len(cluster_frame))

3 438
7 413
6 668
18 122
17 412
2 338
13 308
19 606
16 633
9 265
21 139
20 337
12 577
0 146
8 572
1 435
15 200
23 466
25 182
24 382
22 667
14 683
5 194
4 670
10 713
11 518


# 2. Save images into different folders based on clusters

In [8]:
# parent folder for the sorted images; created only when it does not exist yet
sorted_folder = "/home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images"
os.makedirs(name=sorted_folder, exist_ok=True)

In [38]:
# Folder that the relative paths of the 'filename' column are appended to.
image_root = '/home/yiwei/Documents/New Fast Image Analysis/'

for key in DataFrameDict.keys():
    # cluster specific folder; exist_ok makes a separate existence check unnecessary
    specific_path = os.path.join(sorted_folder, str(key))
    os.makedirs(specific_path, exist_ok=True)

    for image_path in DataFrameDict[key]['filename']:
        source_path = image_root + image_path
        # cv2.imread() returns None (it does not raise) when the file is
        # missing or cannot be decoded; cv2.imwrite() would then fail, so
        # such files are reported and skipped instead of aborting the run.
        img = cv2.imread(source_path)
        if img is None:
            print('Skipping unreadable image:', source_path)
            continue
        # keep only the last path component as the output file name
        filename = image_path.split('/')[-1]
        # Write to the full destination path rather than os.chdir()-ing into
        # the cluster folder: the working directory is never changed, so an
        # error in this loop cannot leave the kernel in another directory
        # (later cells use paths relative to the working directory).
        cv2.imwrite(os.path.join(specific_path, filename), img)

# 3. Define fastdup functions

In [18]:
# define fastdup run function based on cluster id
def fastdup_run_cluster(cluster_id):
    """Run fastdup on the image folder of one cluster.

    Both paths start with './', so they are resolved against the current
    working directory.

    cluster_id: cluster identifier; str(cluster_id) names the input folder
        below './Karta_sorted_images/' and, with the suffix '_out', the
        fastdup work directory.
    """
    cluster_name = str(cluster_id)
    cluster_input_dir = "./Karta_sorted_images/" + cluster_name
    cluster_work_dir = './Karta_sorted_images/' + cluster_name + '_out'
    fastdup.run(input_dir=cluster_input_dir, work_dir=cluster_work_dir)

## 3.1 Use create_similarity_gallery

In [19]:
# define results directory based on cluster id
def results_dir_cluster(cluster_id):
    """Return the absolute results directory path for a cluster.

    The path is the hard-coded 'Karta_sorted_images' folder followed by
    str(cluster_id) and the suffix '_out'.
    """
    sorted_images_root = '/home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/'
    cluster_suffix = str(cluster_id) + '_out'
    return sorted_images_root + cluster_suffix

In [29]:
# define fastdup similar gallery function based on cluster id
# and return similarity dataframe
def fastdup_similar_gallery(results_dir, cluster_id):
    """Create the fastdup similarity gallery of a cluster and export its table.

    results_dir: directory handed to fastdup.create_similarity_gallery(); the
        gallery is saved in its 'similar-gallery' sub-folder.
    cluster_id: used only to name the exported CSV file.

    Returns the object returned by fastdup.create_similarity_gallery(), after
    writing it to 'df_simi_<cluster_id>.csv' inside the gallery folder.
    """
    similar_gallery_save_path = os.path.join(results_dir, 'similar-gallery')
    # exist_ok=True replaces the check-then-create pair, so an already
    # existing folder (e.g. from a previous run) is accepted in one call
    os.makedirs(similar_gallery_save_path, exist_ok=True)

    df_simi = fastdup.create_similarity_gallery(results_dir, save_path=similar_gallery_save_path)

    csv_name = 'df_simi_' + str(cluster_id) + '.csv'
    df_simi.to_csv(os.path.join(similar_gallery_save_path, csv_name))
    return df_simi

## 3.2 Set uniqueness punishment score

In [38]:
# generate uniqueness punishment score based on cluster id
def uniqueness_punishment_score(df_simi, results_dir, cluster_id):
    """Score every entry of a similarity table and export the scores as CSV.

    Rows of df_simi are processed in order:
      * a value first met in the 'from' column is scored 1.0, lowered to 0.5
        when any entry of that row's 'to' sequence has already been scored;
      * a value first met in a 'to' sequence is scored 0.5;
      * a value that has already been scored keeps its score.

    df_simi: table with the columns 'from' (one value per row) and 'to'
        (one sequence of values per row). Values are used as dictionary
        keys, so they must be hashable (presumably file paths - confirm
        against the fastdup output).
    results_dir: directory in which the CSV file is written.
    cluster_id: used only to name the CSV file.

    Writes 'cluster_<cluster_id>_uniqueness_punishment_score.csv' with the
    columns 'filepath' and 'uniqueness punishment'. Returns None.
    """
    # filepath -> punishment; dicts keep insertion order, which becomes the
    # row order of the exported table, and give constant-time membership tests
    scores = {}

    # positional iteration: works for any index, not only 0..n-1 labels
    for source, neighbours in zip(df_simi['from'], df_simi['to']):
        newly_added = source not in scores
        if newly_added:
            scores[source] = 1.0
            # The downgrade targets exactly the entry added for this row.
            # Addressing "the last row" instead would overwrite an unrelated
            # entry whenever 'source' had already been scored earlier.
            if any(neighbour in scores for neighbour in neighbours):
                scores[source] = 0.5

        # neighbours that have not been scored yet enter with 0.5
        for neighbour in neighbours:
            scores.setdefault(neighbour, 0.5)

    df_uniq = pd.DataFrame({
        'filepath': list(scores.keys()),
        'uniqueness punishment': list(scores.values()),
    })

    # export 'df_uniq' as 'cluster_id_uniqueness_punishment_score.csv'
    df_uniq.to_csv(results_dir + '/cluster_'+str(cluster_id)+'_uniqueness_punishment_score.csv')

## 3.3 Use create_outliers_gallery

In [22]:
# generate outliers based on cluster id
def fastdup_outlier_gallery(results_dir, cluster_id):
    """Create the fastdup outlier gallery of a cluster and export its distances.

    results_dir: directory handed to fastdup.create_outliers_gallery(); it
        must also contain 'outliers.csv' with the columns 'from' and
        'distance'. The gallery is saved in its 'outlier-gallery' sub-folder.
    cluster_id: used only to name the exported CSV file.

    Writes 'cluster_<cluster_id>_outlier_distance.csv' into the gallery
    folder with the columns 'filepath' and 'outlier distance', holding the
    first 'distance' listed for every distinct 'from' value. Returns None.
    """
    outlier_gallery_save_path = os.path.join(results_dir, 'outlier-gallery')
    # exist_ok=True replaces the check-then-create pair
    os.makedirs(outlier_gallery_save_path, exist_ok=True)

    fastdup.create_outliers_gallery(results_dir, save_path=outlier_gallery_save_path)

    df_outl_temp = pd.read_csv(results_dir+"/outliers.csv")

    # keep the first row of every 'from' value in one vectorised step instead
    # of growing a data frame row by row with repeated membership scans
    df_outl = (
        df_outl_temp[['from', 'distance']]
        .drop_duplicates(subset='from', keep='first')
        .rename(columns={'from': 'filepath', 'distance': 'outlier distance'})
        .reset_index(drop=True)
    )

    # export 'df_outl' as 'cluster_id_outlier_distance.csv'
    df_outl.to_csv(outlier_gallery_save_path + '/cluster_'+str(cluster_id)+'_outlier_distance.csv')

# 4. Run the functions from step 3 on each cluster

In [47]:
# process the clusters one after another, in dictionary order
for cluster_id in DataFrameDict:
    # fastdup analysis of the cluster's image folder
    fastdup_run_cluster(cluster_id)
    results_dir = results_dir_cluster(cluster_id)
    # similarity gallery, then the scores derived from its table
    df_simi = fastdup_similar_gallery(results_dir, cluster_id)
    uniqueness_punishment_score(df_simi, results_dir, cluster_id)
    # outlier gallery and outlier distances
    fastdup_outlier_gallery(results_dir, cluster_id)

FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
Going to loop over dir Karta_sorted_images/3
Found total 438 images to run on
Found total 438 images to run on■■■■■■■■■■■■■■■■■■■] 100% Estimated: 0 Minutes 0 Features
81) Finished write_index() NN model
Stored nn model index file Karta_sorted_images/3_out/nnf.index
Total time took 6108 ms
Found a total of 0 fully identical images (d>0.990), which are 0.00 %
Found a total of 2 nearly identical images(d>0.980), which are 0.15 %
Found a total of 287 above threshold images (d>0.900), which are 21.84 %
Found a total of 43 outlier images         (d<0.050), which are 3.27 %
Min distance found 0.636 max distance 0.985
[1;32m1676546142 : INFO:     (add_vertices:460): Num vertices for group 0: 438
[0m[1;32m1676546142 : INFO:     (commit_edge_buffer:609): In commit edge buffer (0,0)
[0m[1;32m1676546142 : INFO:     (commit_edge_buffer:680): Shuffling edges ...
[0m[1;32m1676546142 : INFO:     (commit_edge_buffer:688)

100%|██████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 30.82it/s]


Stored similar images view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/3_out/similar-gallery/topk_similarity.html
read outliers 43


100%|███████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 14561.03it/s]


Stored outliers visual view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/3_out/outlier-gallery/outliers.html
FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
Going to loop over dir Karta_sorted_images/7
Found total 413 images to run on
Found total 413 images to run on■■■■■■■■■■■■■■■■■■■] 100% Estimated: 0 Minutes 0 Features
51) Finished write_index() NN model
Stored nn model index file Karta_sorted_images/7_out/nnf.index
Total time took 12085 ms
Found a total of 0 fully identical images (d>0.990), which are 0.00 %
Found a total of 0 nearly identical images(d>0.980), which are 0.00 %
Found a total of 130 above threshold images (d>0.900), which are 10.49 %
Found a total of 41 outlier images         (d<0.050), which are 3.31 %
Min distance found 0.709 max distance 0.976
[1;32m1676546157 : INFO:     (add_vertices:460): Num vertices for group 0: 413
[0m1676546157 : PROGRESS: (_p:516): +-----------------------------+
[0m1676546157 : PROGR

100%|██████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 32.81it/s]


Stored similar images view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/7_out/similar-gallery/topk_similarity.html
read outliers 41


100%|███████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 31583.61it/s]


Stored outliers visual view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/7_out/outlier-gallery/outliers.html
FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
Going to loop over dir Karta_sorted_images/6
Found total 668 images to run on
Found total 668 images to run on■■■■■■■■■■■■■■■■■■■] 100% Estimated: 0 Minutes 0 Features
131) Finished write_index() NN model
Stored nn model index file Karta_sorted_images/6_out/nnf.index
Total time took 22168 ms
Found a total of 0 fully identical images (d>0.990), which are 0.00 %
Found a total of 2 nearly identical images(d>0.980), which are 0.10 %
Found a total of 1005 above threshold images (d>0.900), which are 50.15 %
Found a total of 66 outlier images         (d<0.050), which are 3.29 %
Min distance found 0.667 max distance 0.986
[1;32m1676546181 : INFO:     (add_vertices:460): Num vertices for group 0: 668
[0m[1;32m1676546181 : INFO:     (commit_edge_buffer:609): In commit edge buffer (0,0)


100%|██████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 30.95it/s]


Stored similar images view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/6_out/similar-gallery/topk_similarity.html
read outliers 66


100%|███████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 31619.33it/s]


Stored outliers visual view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/6_out/outlier-gallery/outliers.html
FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
Going to loop over dir Karta_sorted_images/18
Found total 122 images to run on
Found total 122 images to run on■■■■■■■■■■■■■■■■■■■] 100% Estimated: 0 Minutes 0 Features
30) Finished write_index() NN model
Stored nn model index file Karta_sorted_images/18_out/nnf.index
Total time took 5039 ms
Found a total of 0 fully identical images (d>0.990), which are 0.00 %
Found a total of 16 nearly identical images(d>0.980), which are 4.37 %
Found a total of 191 above threshold images (d>0.900), which are 52.19 %
Found a total of 12 outlier images         (d<0.050), which are 3.28 %
Min distance found 0.715 max distance 0.986
[1;32m1676546188 : INFO:     (add_vertices:460): Num vertices for group 0: 122
[0m[1;32m1676546188 : INFO:     (commit_edge_buffer:609): In commit edge buffer (0,0)


100%|██████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 29.64it/s]


Stored similar images view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/18_out/similar-gallery/topk_similarity.html
read outliers 12


100%|█████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 20164.92it/s]


Stored outliers visual view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/18_out/outlier-gallery/outliers.html
FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
Going to loop over dir Karta_sorted_images/17
Found total 412 images to run on
Found total 412 images to run on■■■■■■■■■■■■■■■■■■■] 100% Estimated: 0 Minutes 0 Features
140) Finished write_index() NN model
Stored nn model index file Karta_sorted_images/17_out/nnf.index
Total time took 17181 ms
Found a total of 49 fully identical images (d>0.990), which are 3.96 %
Found a total of 97 nearly identical images(d>0.980), which are 7.85 %
Found a total of 332 above threshold images (d>0.900), which are 26.86 %
Found a total of 41 outlier images         (d<0.050), which are 3.32 %
Min distance found 0.688 max distance 0.997
[1;32m1676546208 : INFO:     (add_vertices:460): Num vertices for group 0: 412
[0m[1;32m1676546208 : INFO:     (commit_edge_buffer:609): In commit edge buffer (0,

100%|██████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 27.65it/s]


Stored similar images view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/17_out/similar-gallery/topk_similarity.html
read outliers 41


100%|███████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 16657.28it/s]


Stored outliers visual view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/17_out/outlier-gallery/outliers.html
FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
Going to loop over dir Karta_sorted_images/2
Found total 338 images to run on
Found total 338 images to run on■■■■■■■■■■■■■■■■■■■] 100% Estimated: 0 Minutes 0 Features
32) Finished write_index() NN model
Stored nn model index file Karta_sorted_images/2_out/nnf.index
Total time took 11051 ms
Found a total of 0 fully identical images (d>0.990), which are 0.00 %
Found a total of 93 nearly identical images(d>0.980), which are 9.17 %
Found a total of 545 above threshold images (d>0.900), which are 53.75 %
Found a total of 33 outlier images         (d<0.050), which are 3.25 %
Min distance found 0.731 max distance 0.989
[1;32m1676546221 : INFO:     (add_vertices:460): Num vertices for group 0: 338
[0m[1;32m1676546221 : INFO:     (commit_edge_buffer:609): In commit edge buffer (0,0)


100%|██████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 27.89it/s]


Stored similar images view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/2_out/similar-gallery/topk_similarity.html
read outliers 33


100%|███████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 31418.01it/s]


Stored outliers visual view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/2_out/outlier-gallery/outliers.html
FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
Going to loop over dir Karta_sorted_images/13
Found total 308 images to run on
Found total 308 images to run on■■■■■■■■■■■■■■■■■■■] 100% Estimated: 0 Minutes 0 Features
64) Finished write_index() NN model
Stored nn model index file Karta_sorted_images/13_out/nnf.index
Total time took 13555 ms
Found a total of 0 fully identical images (d>0.990), which are 0.00 %
Found a total of 0 nearly identical images(d>0.980), which are 0.00 %
Found a total of 375 above threshold images (d>0.900), which are 40.58 %
Found a total of 30 outlier images         (d<0.050), which are 3.25 %
Min distance found 0.635 max distance 0.977
[1;32m1676546236 : INFO:     (add_vertices:460): Num vertices for group 0: 308
[0m1676546236 : PROGRESS: (_p:516): +-----------------------------+
[0m1676546236 : PRO

100%|██████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 26.90it/s]


Stored similar images view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/13_out/similar-gallery/topk_similarity.html
read outliers 30


100%|███████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 25151.03it/s]


Stored outliers visual view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/13_out/outlier-gallery/outliers.html
FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
Going to loop over dir Karta_sorted_images/19
Found total 606 images to run on
Found total 606 images to run on■■■■■■■■■■■■■■■■■■■] 100% Estimated: 0 Minutes 0 Features
73) Finished write_index() NN model
Stored nn model index file Karta_sorted_images/19_out/nnf.index
Total time took 22120 ms
Found a total of 0 fully identical images (d>0.990), which are 0.00 %
Found a total of 45 nearly identical images(d>0.980), which are 2.48 %
Found a total of 1110 above threshold images (d>0.900), which are 61.06 %
Found a total of 60 outlier images         (d<0.050), which are 3.30 %
Min distance found 0.835 max distance 0.986
[1;32m1676546260 : INFO:     (add_vertices:460): Num vertices for group 0: 606
[0m[1;32m1676546260 : INFO:     (commit_edge_buffer:609): In commit edge buffer (0,0

100%|██████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 27.10it/s]


Stored similar images view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/19_out/similar-gallery/topk_similarity.html
read outliers 60


100%|███████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 28976.19it/s]


Stored outliers visual view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/19_out/outlier-gallery/outliers.html
FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
Going to loop over dir Karta_sorted_images/16
Found total 633 images to run on
Found total 633 images to run on■■■■■■■■■■■■■■■■■■■] 100% Estimated: 0 Minutes 0 Features
105) Finished write_index() NN model
Stored nn model index file Karta_sorted_images/16_out/nnf.index
Total time took 23161 ms
Found a total of 0 fully identical images (d>0.990), which are 0.00 %
Found a total of 0 nearly identical images(d>0.980), which are 0.00 %
Found a total of 409 above threshold images (d>0.900), which are 21.54 %
Found a total of 63 outlier images         (d<0.050), which are 3.32 %
Min distance found 0.698 max distance 0.974
[1;32m1676546286 : INFO:     (add_vertices:460): Num vertices for group 0: 633
[0m1676546286 : PROGRESS: (_p:516): +-----------------------------+
[0m1676546286 : P

100%|██████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 28.76it/s]


Stored similar images view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/16_out/similar-gallery/topk_similarity.html
read outliers 63


100%|███████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 31418.01it/s]


Stored outliers visual view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/16_out/outlier-gallery/outliers.html
FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
Going to loop over dir Karta_sorted_images/9
Found total 265 images to run on
Found total 265 images to run on■■■■■■■■■■■■■■■■■■■] 100% Estimated: 0 Minutes 0 Features
59) Finished write_index() NN model
Stored nn model index file Karta_sorted_images/9_out/nnf.index
Total time took 7336 ms
Found a total of 0 fully identical images (d>0.990), which are 0.00 %
Found a total of 2 nearly identical images(d>0.980), which are 0.25 %
Found a total of 432 above threshold images (d>0.900), which are 54.34 %
Found a total of 26 outlier images         (d<0.050), which are 3.27 %
Min distance found 0.657 max distance 0.984
[1;32m1676546297 : INFO:     (add_vertices:460): Num vertices for group 0: 265
[0m[1;32m1676546297 : INFO:     (commit_edge_buffer:609): In commit edge buffer (0,0)
[0

100%|██████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 27.45it/s]


Stored similar images view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/9_out/similar-gallery/topk_similarity.html
read outliers 26


100%|███████████████████████████████████████████████████████████████████████████| 18/18 [00:00<00:00, 21213.11it/s]


Stored outliers visual view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/9_out/outlier-gallery/outliers.html
FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
Going to loop over dir Karta_sorted_images/21
Found total 139 images to run on
Found total 139 images to run on■■■■■■■■■■■■■■■■■■■] 100% Estimated: 0 Minutes 0 Features
10) Finished write_index() NN model
Stored nn model index file Karta_sorted_images/21_out/nnf.index
Total time took 5019 ms
Found a total of 0 fully identical images (d>0.990), which are 0.00 %
Found a total of 12 nearly identical images(d>0.980), which are 2.88 %
Found a total of 102 above threshold images (d>0.900), which are 24.46 %
Found a total of 13 outlier images         (d<0.050), which are 3.12 %
Min distance found 0.702 max distance 0.988
[1;32m1676546306 : INFO:     (add_vertices:460): Num vertices for group 0: 139
[0m[1;32m1676546306 : INFO:     (commit_edge_buffer:609): In commit edge buffer (0,0)


100%|██████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 24.74it/s]


Stored similar images view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/21_out/similar-gallery/topk_similarity.html
read outliers 13


100%|█████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 18724.57it/s]


Stored outliers visual view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/21_out/outlier-gallery/outliers.html
FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
Going to loop over dir Karta_sorted_images/20
Found total 337 images to run on
Found total 337 images to run on■■■■■■■■■■■■■■■■■■■] 100% Estimated: 0 Minutes 0 Features
66) Finished write_index() NN model
Stored nn model index file Karta_sorted_images/20_out/nnf.index
Total time took 12223 ms
Found a total of 4 fully identical images (d>0.990), which are 0.40 %
Found a total of 29 nearly identical images(d>0.980), which are 2.87 %
Found a total of 432 above threshold images (d>0.900), which are 42.73 %
Found a total of 33 outlier images         (d<0.050), which are 3.26 %
Min distance found 0.718 max distance 0.991
[1;32m1676546322 : INFO:     (add_vertices:460): Num vertices for group 0: 337
[0m[1;32m1676546322 : INFO:     (commit_edge_buffer:609): In commit edge buffer (0,0)

100%|██████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 24.64it/s]


Stored similar images view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/20_out/similar-gallery/topk_similarity.html
read outliers 33


100%|███████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 32375.95it/s]


Stored outliers visual view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/20_out/outlier-gallery/outliers.html
FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
Going to loop over dir Karta_sorted_images/12
Found total 577 images to run on
Found total 577 images to run on■■■■■■■■■■■■■■■■■■■] 100% Estimated: 0 Minutes 0 Features
167) Finished write_index() NN model
Stored nn model index file Karta_sorted_images/12_out/nnf.index
Total time took 17209 ms
Found a total of 2 fully identical images (d>0.990), which are 0.12 %
Found a total of 13 nearly identical images(d>0.980), which are 0.75 %
Found a total of 517 above threshold images (d>0.900), which are 29.87 %
Found a total of 57 outlier images         (d<0.050), which are 3.29 %
Min distance found 0.695 max distance 0.990
[1;32m1676546343 : INFO:     (add_vertices:460): Num vertices for group 0: 577
[0m[1;32m1676546343 : INFO:     (commit_edge_buffer:609): In commit edge buffer (0,0

100%|██████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 28.69it/s]


Stored similar images view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/12_out/similar-gallery/topk_similarity.html
read outliers 57


100%|███████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 32413.48it/s]


Stored outliers visual view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/12_out/outlier-gallery/outliers.html
FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
Going to loop over dir Karta_sorted_images/0
Found total 146 images to run on
Found total 146 images to run on■■■■■■■■■■■■■■■■■■■] 100% Estimated: 0 Minutes 0 Features
40) Finished write_index() NN model
Stored nn model index file Karta_sorted_images/0_out/nnf.index
Total time took 1096 ms
Found a total of 0 fully identical images (d>0.990), which are 0.00 %
Found a total of 49 nearly identical images(d>0.980), which are 11.19 %
Found a total of 224 above threshold images (d>0.900), which are 51.14 %
Found a total of 14 outlier images         (d<0.050), which are 3.20 %
Min distance found 0.709 max distance 0.988
[1;32m1676546349 : INFO:     (add_vertices:460): Num vertices for group 0: 146
[0m[1;32m1676546349 : INFO:     (commit_edge_buffer:609): In commit edge buffer (0,0)


100%|██████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 24.80it/s]


Stored similar images view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/0_out/similar-gallery/topk_similarity.html
read outliers 14


100%|█████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<00:00, 28468.13it/s]


Stored outliers visual view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/0_out/outlier-gallery/outliers.html
FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
Going to loop over dir Karta_sorted_images/8
Found total 572 images to run on
Found total 572 images to run on■■■■■■■■■■■■■■■■■■■] 100% Estimated: 0 Minutes 0 Features
174) Finished write_index() NN model
Stored nn model index file Karta_sorted_images/8_out/nnf.index
Total time took 18217 ms
Found a total of 2 fully identical images (d>0.990), which are 0.12 %
Found a total of 8 nearly identical images(d>0.980), which are 0.47 %
Found a total of 418 above threshold images (d>0.900), which are 24.36 %
Found a total of 57 outlier images         (d<0.050), which are 3.32 %
Min distance found 0.750 max distance 0.994
[1;32m1676546371 : INFO:     (add_vertices:460): Num vertices for group 0: 572
[0m[1;32m1676546371 : INFO:     (commit_edge_buffer:609): In commit edge buffer (0,0)
[

100%|██████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 30.01it/s]


Stored similar images view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/8_out/similar-gallery/topk_similarity.html
read outliers 57


100%|███████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 32909.41it/s]


Stored outliers visual view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/8_out/outlier-gallery/outliers.html
FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
Going to loop over dir Karta_sorted_images/1
Found total 435 images to run on
Found total 435 images to run on■■■■■■■■■■■■■■■■■■■] 100% Estimated: 0 Minutes 0 Features
96) Finished write_index() NN model
Stored nn model index file Karta_sorted_images/1_out/nnf.index
Total time took 13126 ms
Found a total of 44 fully identical images (d>0.990), which are 3.37 %
Found a total of 36 nearly identical images(d>0.980), which are 2.76 %
Found a total of 543 above threshold images (d>0.900), which are 41.61 %
Found a total of 43 outlier images         (d<0.050), which are 3.30 %
Min distance found 0.760 max distance 0.996
[1;32m1676546388 : INFO:     (add_vertices:460): Num vertices for group 0: 435
[0m[1;32m1676546388 : INFO:     (commit_edge_buffer:609): In commit edge buffer (0,0)


100%|██████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 34.50it/s]


Stored similar images view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/1_out/similar-gallery/topk_similarity.html
read outliers 43


100%|███████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 33852.33it/s]


Stored outliers visual view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/1_out/outlier-gallery/outliers.html
FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
Going to loop over dir Karta_sorted_images/15
Found total 200 images to run on
Found total 200 images to run on■■■■■■■■■■■■■■■■■■■] 100% Estimated: 0 Minutes 0 Features
16) Finished write_index() NN model
Stored nn model index file Karta_sorted_images/15_out/nnf.index
Total time took 7028 ms
Found a total of 0 fully identical images (d>0.990), which are 0.00 %
Found a total of 0 nearly identical images(d>0.980), which are 0.00 %
Found a total of 342 above threshold images (d>0.900), which are 57.00 %
Found a total of 20 outlier images         (d<0.050), which are 3.33 %
Min distance found 0.831 max distance 0.972
[1;32m1676546399 : INFO:     (add_vertices:460): Num vertices for group 0: 200
[0m1676546399 : PROGRESS: (_p:516): +-----------------------------+
[0m1676546399 : PROG

100%|██████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 23.45it/s]


Stored similar images view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/15_out/similar-gallery/topk_similarity.html
read outliers 20


100%|███████████████████████████████████████████████████████████████████████████| 15/15 [00:00<00:00, 30885.89it/s]


Stored outliers visual view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/15_out/outlier-gallery/outliers.html
FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
Going to loop over dir Karta_sorted_images/23
Found total 466 images to run on
Found total 466 images to run on■■■■■■■■■■■■■■■■■■■] 100% Estimated: 0 Minutes 0 Features
89) Finished write_index() NN model
Stored nn model index file Karta_sorted_images/23_out/nnf.index
Total time took 14137 ms
Found a total of 0 fully identical images (d>0.990), which are 0.00 %
Found a total of 0 nearly identical images(d>0.980), which are 0.00 %
Found a total of 311 above threshold images (d>0.900), which are 22.25 %
Found a total of 46 outlier images         (d<0.050), which are 3.29 %
Min distance found 0.747 max distance 0.976
[1;32m1676546418 : INFO:     (add_vertices:460): Num vertices for group 0: 466
[0m1676546418 : PROGRESS: (_p:516): +-----------------------------+
[0m1676546418 : PR

100%|██████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 33.94it/s]


Stored similar images view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/23_out/similar-gallery/topk_similarity.html
read outliers 46


100%|███████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 34464.29it/s]


Stored outliers visual view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/23_out/outlier-gallery/outliers.html
FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
Going to loop over dir Karta_sorted_images/25
Found total 182 images to run on
Found total 182 images to run on■■■■■■■■■■■■■■■■■■■] 100% Estimated: 0 Minutes 0 Features
13) Finished write_index() NN model
Stored nn model index file Karta_sorted_images/25_out/nnf.index
Total time took 4021 ms
Found a total of 0 fully identical images (d>0.990), which are 0.00 %
Found a total of 0 nearly identical images(d>0.980), which are 0.00 %
Found a total of 258 above threshold images (d>0.900), which are 47.25 %
Found a total of 18 outlier images         (d<0.050), which are 3.30 %
Min distance found 0.818 max distance 0.959
[1;32m1676546426 : INFO:     (add_vertices:460): Num vertices for group 0: 182
[0m1676546426 : PROGRESS: (_p:516): +-----------------------------+
[0m1676546426 : PRO

100%|██████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 27.43it/s]


Stored similar images view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/25_out/similar-gallery/topk_similarity.html
read outliers 18


100%|███████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 32263.88it/s]


Stored outliers visual view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/25_out/outlier-gallery/outliers.html
FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
Going to loop over dir Karta_sorted_images/24
Found total 382 images to run on
Found total 382 images to run on■■■■■■■■■■■■■■■■■■■] 100% Estimated: 0 Minutes 0 Features
45) Finished write_index() NN model
Stored nn model index file Karta_sorted_images/24_out/nnf.index
Total time took 12075 ms
Found a total of 0 fully identical images (d>0.990), which are 0.00 %
Found a total of 33 nearly identical images(d>0.980), which are 2.88 %
Found a total of 538 above threshold images (d>0.900), which are 46.95 %
Found a total of 38 outlier images         (d<0.050), which are 3.32 %
Min distance found 0.798 max distance 0.989
[1;32m1676546443 : INFO:     (add_vertices:460): Num vertices for group 0: 382
[0m[1;32m1676546443 : INFO:     (commit_edge_buffer:609): In commit edge buffer (0,0)

100%|██████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 28.37it/s]


Stored similar images view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/24_out/similar-gallery/topk_similarity.html
read outliers 38


100%|███████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 30515.13it/s]


Stored outliers visual view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/24_out/outlier-gallery/outliers.html
FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
Going to loop over dir Karta_sorted_images/22
Found total 667 images to run on
Found total 667 images to run on■■■■■■■■■■■■■■■■■■■] 100% Estimated: 0 Minutes 0 Features
91) Finished write_index() NN model
Stored nn model index file Karta_sorted_images/22_out/nnf.index
Total time took 20143 ms
Found a total of 0 fully identical images (d>0.990), which are 0.00 %
Found a total of 0 nearly identical images(d>0.980), which are 0.00 %
Found a total of 112 above threshold images (d>0.900), which are 5.60 %
Found a total of 66 outlier images         (d<0.050), which are 3.30 %
Min distance found 0.702 max distance 0.959
[1;32m1676546467 : INFO:     (add_vertices:460): Num vertices for group 0: 667
[0m1676546467 : PROGRESS: (_p:516): +-----------------------------+
[0m1676546467 : PRO

100%|██████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 32.36it/s]


Stored similar images view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/22_out/similar-gallery/topk_similarity.html
read outliers 66


100%|███████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 34592.20it/s]


Stored outliers visual view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/22_out/outlier-gallery/outliers.html
FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
Going to loop over dir Karta_sorted_images/14
Found total 683 images to run on
Found total 683 images to run on■■■■■■■■■■■■■■■■■■■] 100% Estimated: 0 Minutes 0 Features
130) Finished write_index() NN model
Stored nn model index file Karta_sorted_images/14_out/nnf.index
Total time took 22173 ms
Found a total of 0 fully identical images (d>0.990), which are 0.00 %
Found a total of 14 nearly identical images(d>0.980), which are 0.68 %
Found a total of 966 above threshold images (d>0.900), which are 47.14 %
Found a total of 68 outlier images         (d<0.050), which are 3.32 %
Min distance found 0.802 max distance 0.989
[1;32m1676546493 : INFO:     (add_vertices:460): Num vertices for group 0: 683
[0m[1;32m1676546493 : INFO:     (commit_edge_buffer:609): In commit edge buffer (0,0

100%|██████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 27.54it/s]


Stored similar images view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/14_out/similar-gallery/topk_similarity.html
read outliers 68


100%|███████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 35833.44it/s]


Stored outliers visual view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/14_out/outlier-gallery/outliers.html
FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
Going to loop over dir Karta_sorted_images/5
Found total 194 images to run on
Found total 194 images to run on■■■■■■■■■■■■■■■■■■■] 100% Estimated: 0 Minutes 0 Features
15) Finished write_index() NN model
Stored nn model index file Karta_sorted_images/5_out/nnf.index
Total time took 6025 ms
Found a total of 0 fully identical images (d>0.990), which are 0.00 %
Found a total of 2 nearly identical images(d>0.980), which are 0.34 %
Found a total of 373 above threshold images (d>0.900), which are 64.09 %
Found a total of 19 outlier images         (d<0.050), which are 3.26 %
Min distance found 0.822 max distance 0.984
[1;32m1676546503 : INFO:     (add_vertices:460): Num vertices for group 0: 194
[0m[1;32m1676546503 : INFO:     (commit_edge_buffer:609): In commit edge buffer (0,0)
[0

100%|██████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 22.62it/s]


Stored similar images view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/5_out/similar-gallery/topk_similarity.html
read outliers 19


100%|███████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 32896.50it/s]


Stored outliers visual view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/5_out/outlier-gallery/outliers.html
FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
Going to loop over dir Karta_sorted_images/4
Found total 670 images to run on
Found total 670 images to run on■■■■■■■■■■■■■■■■■■■] 100% Estimated: 0 Minutes 0 Features
185) Finished write_index() NN model
Stored nn model index file Karta_sorted_images/4_out/nnf.index
Total time took 21233 ms
Found a total of 2 fully identical images (d>0.990), which are 0.10 %
Found a total of 0 nearly identical images(d>0.980), which are 0.00 %
Found a total of 572 above threshold images (d>0.900), which are 28.46 %
Found a total of 67 outlier images         (d<0.050), which are 3.33 %
Min distance found 0.789 max distance 0.992
[1;32m1676546529 : INFO:     (add_vertices:460): Num vertices for group 0: 670
[0m[1;32m1676546529 : INFO:     (commit_edge_buffer:609): In commit edge buffer (0,0)
[

100%|██████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 31.18it/s]


Stored similar images view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/4_out/similar-gallery/topk_similarity.html
read outliers 67


100%|███████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 22227.37it/s]


Stored outliers visual view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/4_out/outlier-gallery/outliers.html
FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
Going to loop over dir Karta_sorted_images/10
Found total 713 images to run on
Found total 713 images to run on■■■■■■■■■■■■■■■■■■■] 100% Estimated: 0 Minutes 0 Features
122) Finished write_index() NN model
Stored nn model index file Karta_sorted_images/10_out/nnf.index
Total time took 23365 ms
Found a total of 0 fully identical images (d>0.990), which are 0.00 %
Found a total of 0 nearly identical images(d>0.980), which are 0.00 %
Found a total of 800 above threshold images (d>0.900), which are 37.40 %
Found a total of 71 outlier images         (d<0.050), which are 3.32 %
Min distance found 0.810 max distance 0.978
[1;32m1676546559 : INFO:     (add_vertices:460): Num vertices for group 0: 713
[0m1676546559 : PROGRESS: (_p:516): +-----------------------------+
[0m1676546559 : PR

100%|██████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 32.76it/s]


Stored similar images view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/10_out/similar-gallery/topk_similarity.html
read outliers 71


100%|███████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 32313.59it/s]


Stored outliers visual view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/10_out/outlier-gallery/outliers.html
FastDup Software, (C) copyright 2022 Dr. Amir Alush and Dr. Danny Bickson.
Going to loop over dir Karta_sorted_images/11
Found total 518 images to run on
Found total 518 images to run on■■■■■■■■■■■■■■■■■■■] 100% Estimated: 0 Minutes 0 Features
70) Finished write_index() NN model
Stored nn model index file Karta_sorted_images/11_out/nnf.index
Total time took 17102 ms
Found a total of 66 fully identical images (d>0.990), which are 4.25 %
Found a total of 21 nearly identical images(d>0.980), which are 1.35 %
Found a total of 475 above threshold images (d>0.900), which are 30.57 %
Found a total of 51 outlier images         (d<0.050), which are 3.28 %
Min distance found 0.764 max distance 0.997
[1;32m1676546581 : INFO:     (add_vertices:460): Num vertices for group 0: 518
[0m[1;32m1676546581 : INFO:     (commit_edge_buffer:609): In commit edge buffer (0,0

100%|██████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 32.20it/s]


Stored similar images view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/11_out/similar-gallery/topk_similarity.html
read outliers 51


100%|███████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 37415.74it/s]


Stored outliers visual view in  /home/yiwei/Documents/New Fast Image Analysis/Karta_sorted_images/11_out/outlier-gallery/outliers.html


# 5. Set cluster's priority

In [9]:
# folder that contains df_combined.csv
uniqueness_dir = '/home/yiwei/Documents/New Fast Image Analysis/Karta_uniqueness_out/'
# read df_combined.csv; os.path.join avoids the doubled '/' that plain string
# concatenation produced because uniqueness_dir already ends with a separator
df_all_score = pd.read_csv(os.path.join(uniqueness_dir, 'df_combined.csv'))

In [10]:
# the column exported under the header "index" is exposed as 'filepath'
df_all_score = df_all_score.rename(columns={"index": "filepath"})

In [12]:
# Map every cluster id to its own score table.
# Each entry is built directly (no placeholder frames) and is an explicit copy,
# so columns added to it later do not write into a slice of df_all_score.
DF_Dict_Score = {
    cluster_id: df_all_score.loc[df_all_score['cluster'] == cluster_id].copy()
    for cluster_id in UniqueClusters
}

## 5.1 Compute labeled number in clusters

In [15]:
# empty per-cluster labeling summary with its four columns
df_labeled = pd.DataFrame(
    columns=[
        'cluster id',
        'cluster number',
        'labeled number',
        'labeled percent(%)',
    ]
)

In [16]:
# Rebuild the summary from scratch on every run, so executing this cell twice
# never appends duplicate rows to df_labeled.
labeled_rows = []
for key in DF_Dict_Score.keys():
    cluster_frame = DF_Dict_Score[key]
    cluster_number = len(cluster_frame.index)
    labeled_number = int((cluster_frame['Labeled'] == 1).sum())
    # guard: a cluster without any rows would otherwise raise ZeroDivisionError
    percent_value = 100 * labeled_number / cluster_number if cluster_number else 0.0
    # kept as a two-decimal string, exactly as before
    labeled_percent = "{:.2f}".format(percent_value)
    labeled_rows.append([key, cluster_number, labeled_number, labeled_percent])

df_labeled = pd.DataFrame(
    labeled_rows,
    columns=['cluster id', 'cluster number', 'labeled number', 'labeled percent(%)'],
)

In [17]:
# show the per-cluster labeling summary
print(df_labeled)

    cluster id  cluster number  labeled number labeled percent(%)
0            3             438               5               1.14
1            7             413               1               0.24
2            6             668              82              12.28
3           18             122              19              15.57
4           17             412               2               0.49
5            2             338               0               0.00
6           13             308              99              32.14
7           19             606             103              17.00
8           16             633              22               3.48
9            9             265              52              19.62
10          21             139               1               0.72
11          20             337               2               0.59
12          12             577             122              21.14
13           0             146               0               0.00
14        

## 5.2 Compute cluster priority

In [18]:
# add a 'cluster rank' column filled with NaN as a placeholder
df_labeled = df_labeled.assign(**{'cluster rank': np.nan})

In [19]:
# add rank based on cluster's labeled number and labeled percent
# Vectorised: the earlier per-row `df[col][i]` lookups were label based and only
# worked while df_labeled carried a default 0..n-1 index.
labeled_count = pd.to_numeric(df_labeled['labeled number'])
# 'labeled percent(%)' may be stored as formatted text, so convert before comparing
labeled_share = df_labeled['labeled percent(%)'].astype(float)

# conditions are evaluated in order, first match wins (same as the if/elif chain)
rank_conditions = [
    (labeled_count == 0).to_numpy(),
    ((labeled_count < 10) & (labeled_share < 3)).to_numpy(),
    ((labeled_count < 50) & (labeled_share < 30)).to_numpy(),
]
# ranks are stored as floats (1.0 .. 4.0), matching the NaN-initialised column
df_labeled['cluster rank'] = np.select(rank_conditions, [1, 2, 3], default=4).astype(float)

In [21]:
# separate clusters based on cluster rank
# Boolean masks replace the row-by-row copy through `df_labeled.values[i]`, which
# repeated the column list four times and routed every row through a mixed-type
# array (dropping the numeric column dtypes).
cluster_rank = df_labeled['cluster rank']
df_labeled_1 = df_labeled[cluster_rank == 1].reset_index(drop=True)
df_labeled_2 = df_labeled[cluster_rank == 2].reset_index(drop=True)
df_labeled_3 = df_labeled[cluster_rank == 3].reset_index(drop=True)
# everything that is not rank 1-3 (including a missing rank) goes to group 4,
# mirroring the former catch-all `else` branch
df_labeled_4 = df_labeled[~cluster_rank.isin([1, 2, 3])].reset_index(drop=True)

In [22]:
# show the clusters whose 'cluster rank' is 1
print(df_labeled_1)

  cluster id cluster number labeled number labeled percent(%) cluster rank
0          2            338              0               0.00          1.0
1          0            146              0               0.00          1.0
2         15            200              0               0.00          1.0
3         25            182              0               0.00          1.0
4         14            683              0               0.00          1.0
5          5            194              0               0.00          1.0
6          4            670              0               0.00          1.0


In [23]:
# Rank the clusters inside each group: group 1 is ordered by its size
# ('cluster number'), groups 2-4 by their 'labeled number'.
for group_frame, ranked_column in (
    (df_labeled_1, "cluster number"),
    (df_labeled_2, "labeled number"),
    (df_labeled_3, "labeled number"),
    (df_labeled_4, "labeled number"),
):
    group_frame["specific rank"] = group_frame[ranked_column].rank()

In [24]:
# show the rank-4 group, now including its 'specific rank' column
print(df_labeled_4)

  cluster id cluster number labeled number labeled percent(%) cluster rank  \
0          6            668             82              12.28          4.0   
1         13            308             99              32.14          4.0   
2         19            606            103              17.00          4.0   
3          9            265             52              19.62          4.0   
4         12            577            122              21.14          4.0   

   specific rank  
0            2.0  
1            3.0  
2            4.0  
3            1.0  
4            5.0  


# Stack the four rank groups (rank 1 first) into one table.
# Built with a single concat so the cell works on a fresh kernel (the former
# initialisation was commented out) and does not grow when it is re-run.
df_labeled_info = pd.concat(
    [df_labeled_1, df_labeled_2, df_labeled_3, df_labeled_4],
    ignore_index=True,
)

In [26]:
# write 'df_labeled_info' to 'cluster_labeled_info.csv' inside sorted_folder
cluster_info_csv = os.path.join(sorted_folder, 'cluster_labeled_info.csv')
df_labeled_info.to_csv(cluster_info_csv)

# 6. Add uniqueness punishment and outlier values to DF_Dict_Score

## 6.1 Define the function that generates the file name column

In [82]:
# define Generate file name column function
def gen_filename_col(df_dict, clutser_id):
    """Add a 'file name' column holding the last '/'-separated part of 'filepath'.

    The frame stored at ``df_dict[clutser_id]`` is replaced by a new frame whose
    previous index has been moved into a column by ``reset_index()`` and which
    carries the 'file name' column.

    Parameters
    ----------
    df_dict : dict
        Maps cluster ids to DataFrames that contain a 'filepath' column.
    clutser_id : hashable
        Key of the frame to update.

    Returns
    -------
    None
        The result is stored back into ``df_dict``.
    """
    frame = df_dict[clutser_id].reset_index()

    # Derive all names in one vectorised step. The previous version pre-filled the
    # column with NaN (a float column) and then wrote strings cell by cell, which
    # depends on silent dtype upcasting.
    frame['file name'] = frame['filepath'].str.split('/').str[-1]

    df_dict[clutser_id] = frame

## 6.2 Define the function that generates the uniqueness punishment column

In [66]:
# define Generate uniqueness punishment column function
def gen_uniq_puni_col(parent_folder, df_dict, clutser_id):
    """Attach the 'uniqueness punishment' score to one cluster's frame.

    Reads ``<parent_folder>/<clutser_id>_out/cluster_<clutser_id>_uniqueness_punishment_score.csv``,
    aligns its rows with ``df_dict[clutser_id]`` on the file name (last part of the
    CSV's 'filepath'), and stores the combined frame back into ``df_dict``.
    Rows without a score receive the value 1.

    Parameters
    ----------
    parent_folder : str
        Folder that contains the ``<clutser_id>_out`` directories.
    df_dict : dict
        Maps cluster ids to DataFrames; the selected frame must already have a
        'file name' column.
    clutser_id : hashable
        Key of the frame to update; also used to build the CSV path.

    Returns
    -------
    None
        The result is stored back into ``df_dict``.
    """
    # read 'cluster_id_uniqueness_punishment_score.csv' and save as 'df_uniq_punish'
    df_uniq_punish = pd.read_csv(parent_folder+'/'+str(clutser_id)+'_out/cluster_'+str(clutser_id)+'_uniqueness_punishment_score.csv')

    # file name = last part of the path, computed in one vectorised step instead
    # of writing strings one by one into a NaN (float) column
    df_uniq_punish['file name'] = df_uniq_punish['filepath'].str.split('/').str[-1]

    # Use re-indexed copies instead of set_index(inplace=True) on the stored frame,
    # so a failure during alignment leaves df_dict[clutser_id] untouched.
    indexed_frame = df_dict[clutser_id].set_index('file name')
    punishment = df_uniq_punish.set_index('file name')['uniqueness punishment']

    # axis=1 concat aligns both sides on the 'file name' index (outer alignment,
    # so file names found only in the CSV are kept as extra rows)
    combined = pd.concat([indexed_frame, punishment], axis=1).reset_index()

    # files without a score get the value 1
    combined['uniqueness punishment'] = combined['uniqueness punishment'].fillna(1)

    df_dict[clutser_id] = combined

## 6.3 Define the function that generates the outlier column

In [67]:
# define Generate outlier column function
def gen_outl_col(parent_folder, df_dict, clutser_id):
    """Attach the 'outlier distance' value to one cluster's frame.

    Reads ``<parent_folder>/<clutser_id>_out/outlier-gallery/cluster_<clutser_id>_outlier_distance.csv``,
    aligns its rows with ``df_dict[clutser_id]`` on the file name (last part of the
    CSV's 'filepath'), and stores the combined frame back into ``df_dict``.
    Rows without a distance receive the value 0.

    Parameters
    ----------
    parent_folder : str
        Folder that contains the ``<clutser_id>_out`` directories.
    df_dict : dict
        Maps cluster ids to DataFrames; the selected frame must already have a
        'file name' column.
    clutser_id : hashable
        Key of the frame to update; also used to build the CSV path.

    Returns
    -------
    None
        The result is stored back into ``df_dict``.
    """
    # read 'cluster_id_outlier_distance.csv' and save as 'df_outl_dist'
    df_outl_dist = pd.read_csv(parent_folder+'/'+str(clutser_id)+'_out/outlier-gallery/cluster_'+str(clutser_id)+'_outlier_distance.csv')

    # file name = last part of the path, computed in one vectorised step instead
    # of writing strings one by one into a NaN (float) column
    df_outl_dist['file name'] = df_outl_dist['filepath'].str.split('/').str[-1]

    # Use re-indexed copies instead of set_index(inplace=True) on the stored frame,
    # so a failure during alignment leaves df_dict[clutser_id] untouched.
    indexed_frame = df_dict[clutser_id].set_index('file name')
    outlier_distance = df_outl_dist.set_index('file name')['outlier distance']

    # axis=1 concat aligns both sides on the 'file name' index (outer alignment,
    # so file names found only in the CSV are kept as extra rows)
    combined = pd.concat([indexed_frame, outlier_distance], axis=1).reset_index()

    # files that are not listed in the outlier CSV get the value 0
    combined['outlier distance'] = combined['outlier distance'].fillna(0)

    df_dict[clutser_id] = combined

## 6.4 Add uniqueness punishment and outlier values to DF_Dict_Score

In [99]:
# Run the three column generators for every cluster. The ids are taken from
# DataFrameDict, while the frames being extended live in DF_Dict_Score.
for cluster_key in DataFrameDict:
    gen_filename_col(DF_Dict_Score, cluster_key)
    gen_uniq_puni_col(sorted_folder, DF_Dict_Score, cluster_key)
    gen_outl_col(sorted_folder, DF_Dict_Score, cluster_key)

In [104]:
# create the 'score' folder inside sorted_folder (no error if it already exists)
score_dir = os.path.join(sorted_folder, 'score')
os.makedirs(score_dir, exist_ok=True)

In [105]:
# write one '<cluster id>_score.csv' per cluster into score_dir
for cluster_key in DataFrameDict:
    cluster_csv = os.path.join(score_dir, "{}_score.csv".format(cluster_key))
    DF_Dict_Score[cluster_key].to_csv(cluster_csv)

In [109]:
# Add all score into dataframe 'df_all_score'
# NOTE: this rebinds df_all_score, which previously held the table read from
# df_combined.csv.
cluster_frames = [DF_Dict_Score[key] for key in DataFrameDict.keys()]
# One concat call replaces growing the frame inside a loop, which re-copied all
# earlier rows on every iteration and started from an empty DataFrame.
df_all_score = pd.concat(cluster_frames) if cluster_frames else pd.DataFrame()
# reset index (the per-cluster row labels are kept as a column, as before)
df_all_score = df_all_score.reset_index()
print(df_all_score)

       level_0                        file name  index  Unnamed: 0  \
0            0   186544_94bd6_5919bf66b7cd8.jpg     49          49   
1            1   186544_a9b9c_5919bf519228d.jpg     55          55   
2            2   186544_b32c8_5919c13c5c0d4.jpg     62          62   
3            3   186544_ba53c_5919bf5c1728c.jpg     67          67   
4            4  3659305_d7c07_60c43b9230ccf.jpg    379         379   
...        ...                              ...    ...         ...   
11079      513    54822_8fa60_58cea16dc7f04.jpg  10612       10612   
11080      514    54822_a4473_58cea13232758.jpg  10623       10623   
11081      515    54822_bcba4_58cea13feada8.jpg  10644       10644   
11082      516    54822_d72a3_58cea140a1fb5.jpg  10661       10661   
11083      517    54822_e2477_58cea1353fbde.jpg  10669       10669   

                                                          filepath      blur  \
0       Kartaview_Dataset/Basic_set/186544_94bd6_5919bf66b7cd8.jpg  0.402757   

In [110]:
# write 'df_all_score' to 'all_score.csv' inside score_dir
all_score_csv = os.path.join(score_dir, 'all_score.csv')
df_all_score.to_csv(all_score_csv)