In [14]:
import os

import matplotlib.pyplot as plt
import numpy as np
import torch as torch
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import euclidean_distances

In [15]:
# Load the precomputed embedding matrix (rows = samples, columns = embedding dims).
embeddings_path = "./person_vehicle_embeddings.npy"
embeddings = np.load(embeddings_path)

In [16]:
# Clustering configuration shared by the cells below.
dataset_size, emb_size = embeddings.shape[:2]  # number of samples, embedding width
cluster_num = 2                                # how many k-means clusters to fit
save_folder = "./clusters"                     # root folder for the saved clustering artifacts

In [17]:
# Cluster the embeddings with scikit-learn's k-means.
# n_init is pinned explicitly: its default changed between scikit-learn releases
# (10 restarts historically, 'auto' in newer versions), so relying on the default
# makes the resulting clusters depend on the installed version.
kmeans = KMeans(n_clusters=cluster_num, n_init=10, random_state=0)
kmeans.fit(embeddings)

In [22]:
# Cluster assignment (index of the assigned centroid) for every embedding
labels = kmeans.labels_

# Cluster centers, one row per cluster
centroids = kmeans.cluster_centers_

# Euclidean distance from every embedding to its assigned centroid.
# centroids[labels] lines each sample up with its own center, so the whole
# computation is a single vectorized norm instead of a Python loop per sample.
dist_to_center = np.linalg.norm(embeddings - centroids[labels], axis=1)

# Save cluster information. np.save does not create missing directories,
# so make sure the target folder exists first.
os.makedirs(save_folder, exist_ok=True)
np.save(f"{save_folder}/kmeans_centroids.npy", centroids) # cluster centers
np.save(f"{save_folder}/nearest_cent.npy", labels) # nearest centroid for each data point (embedding)
np.save(f"{save_folder}/dist_to_cent.npy", dist_to_center) # l2 norms to center

In [31]:
# Sub-folder (under save_folder) that will hold one file per cluster.
save_sorted_clusters_loc = "sorted_clusters"

# Group the samples by cluster: every entry is [sample_index, distance_to_centroid].
# The lists are still in dataset order here; ordering by distance happens in a later step.
sorted_clusters = {
    cluster_id: [
        [idx, dist_to_center[idx]]
        for idx in range(len(dist_to_center))
        if labels[idx] == cluster_id
    ]
    for cluster_id in range(cluster_num)
}


In [32]:
# Order every cluster's rows from the farthest sample to the closest one
# (descending distance to the centroid); each list is sorted in place.
for members in sorted_clusters.values():
    members.sort(key=lambda row: row[1], reverse=True)

In [34]:
# Write each cluster's [sample_index, distance] rows to its own .npy file.
sorted_clusters_dir = f"{save_folder}/{save_sorted_clusters_loc}"
# np.save does not create missing directories, so create the target folder up front.
os.makedirs(sorted_clusters_dir, exist_ok=True)
for key, value in sorted_clusters.items():
    filename = f"{sorted_clusters_dir}/sorted_cluster_{key}.npy"
    np.save(filename, value)

In [35]:
from argparse import Namespace
from SemDeDup import SemDeDup

# Configuration handed to SemDeDup. Cluster count and locations are derived from the
# constants defined earlier in the notebook so they cannot drift apart from the files
# that were actually written by the clustering cells.
args = Namespace(
    embs_memory_loc = "./person_vehicle_embeddings.npy",  # same file the embeddings were loaded from
    num_clusters = cluster_num,
    sorted_clusters_path = f"{save_folder}/{save_sorted_clusters_loc}",
    save_loc = save_folder,
    which_to_keep = "hard",  # NOTE(review): semantics are defined inside SemDeDup - confirm
    eps_list = [0.2],        # threshold(s) for the pruning tables; the export step must use one of these
    seed = 42
)

sem_de_dup = SemDeDup(args)
sem_de_dup.process_clusters()

100%|██████████| 2/2 [00:00<00:00, 76.60it/s]

./clusters/./clusters/dataframes/cluster_0.pkl
cluster_size:  84
Step time: 0.008594036102294922(s)
DONE cluster:  0
./clusters/./clusters/dataframes/cluster_1.pkl
cluster_size:  138
Step time: 0.0002949237823486328(s)
DONE cluster:  1
DONE in 0.00 minutes





In [36]:
import os
from tqdm import tqdm
import pickle
import numpy as np

# Column of each sorted-cluster row that holds the sample identifier.
IMAGE_NAME_INDEX = 0

def extract_pruned_data(
    sorted_clusters_path,
    semdedup_pruning_tables_path,
    eps,
    num_clusters,
    output_txt_path,
    retreive_kept_samples=True,
):
    """Write the identifiers of the kept (or removed) samples to a text file.

    For every cluster the sorted-cluster array and the matching pruning table are
    loaded, the rows with the requested flag are selected, and their identifier
    column (``IMAGE_NAME_INDEX``) is written to ``output_txt_path``, one per line.

    Args:
        sorted_clusters_path: Folder containing ``sorted_cluster_<id>.npy`` files.
        semdedup_pruning_tables_path: Folder containing ``cluster_<id>.pkl`` tables.
        eps: Threshold whose flag column should be used. The column is looked up
            by the name ``eps=<eps>`` (presumably the naming SemDeDup uses - confirm);
            if no such column exists, the second column of the table is used instead.
        num_clusters: Number of clusters to process (ids ``0 .. num_clusters - 1``).
        output_txt_path: Destination text file; it is overwritten.
        retreive_kept_samples: The tables hold True for samples to be removed.
            True exports rows whose flag is False (kept samples); False exports
            rows whose flag is True (removed samples).

    Returns:
        None. The result is the file written to ``output_txt_path``.
    """
    ## -- list of identifiers of the examples we want to keep/remove.
    example_paths = []
    eps_column = f"eps={eps}"

    for cluster_id in tqdm(range(0, num_clusters)):
        cluster_i = np.load(
            os.path.join(sorted_clusters_path, f"sorted_cluster_{cluster_id}.npy")
        )

        # NOTE(security): pickle.load can execute arbitrary code; only point this
        # at pruning tables produced by a trusted run.
        with open(
            f"{semdedup_pruning_tables_path}/cluster_{cluster_id}.pkl", "rb"
        ) as file:
            semdedup_pruning_tables = pickle.load(file)

        ## -- Pick the flag column for the requested eps; fall back to the second
        ## -- column when the table has no column with that name.
        if eps_column in semdedup_pruning_tables.columns:
            prune_flags = semdedup_pruning_tables[eps_column]
        else:
            prune_flags = semdedup_pruning_tables.iloc[:, 1]

        ## -- See which examples to keep/remove from this cluster.
        ## -- Use retreive_kept_samples=True when kept dataset size <= 50%. This will return a smaller output text file,
        ## -- semdedup_pruning_tables contain True values for the examples to be removed.
        images_to_keep_or_remove = prune_flags[
            prune_flags == (not retreive_kept_samples)
        ].index.to_numpy()

        ## -- Reorder the cluster rows by the table's "indices" column when it exists,
        ## -- so that the table's row labels address the right cluster rows.
        if "indices" in semdedup_pruning_tables.columns:
            cluster_i = cluster_i[semdedup_pruning_tables["indices"].to_numpy()]

        ## -- retrieve only the examples we want and add to the list.
        dedup_cluster = cluster_i[images_to_keep_or_remove]
        example_paths += dedup_cluster[:, IMAGE_NAME_INDEX].astype("<U32").tolist()

    with open(output_txt_path, "w") as fp:
        fp.write("\n".join(example_paths))

    print(f"DONE saving {len(example_paths)} image paths")


In [37]:
# Export sample ids from the SemDeDup pruning tables.
# NOTE: retreive_kept_samples=True selects the rows whose pruning flag is False,
# i.e. the samples SemDeDup KEEPS. Pass retreive_kept_samples=False to list the
# removed samples instead.
# eps matches the threshold the pruning tables were built with (eps_list = [0.2]).
eps = 0.2
output_txt_path = "./pleasework.txt"

extract_pruned_data(
    "./clusters/sorted_clusters",
    "./clusters/clusters/dataframes",
    eps,
    cluster_num,
    output_txt_path,
    retreive_kept_samples=True,
)
print("Done!")

100%|██████████| 2/2 [00:00<00:00, 260.08it/s]

DONE saving 2 image paths
Done!





In [38]:
import numpy as np
# Read the exported sample ids back in. The ids were written as float strings
# (e.g. "111.0"), hence the float -> int conversion. Blank lines are skipped so
# a trailing newline in the file does not raise a ValueError.
# NOTE(review): the name says "removed", but the file above was written with
# retreive_kept_samples=True, i.e. it lists the kept samples - confirm which set is intended.
with open(output_txt_path, "r") as file:
    removed_data = [int(float(line)) for line in file if line.strip()]
    

In [39]:
# Show the parsed sample ids as the cell output.
removed_data

[111, 174]