In [1]:
import platform
import os

# Per-platform default locations as (DATA_PATH, ROOT_PATH).
_DEFAULT_PATHS = {
    'Darwin': (
        "/Users/maltegenschow/Documents/Uni/Thesis/Data.nosync",
        "/Users/maltegenschow/Documents/Uni/Thesis/Thesis",
    ),
    'Linux': (
        "/pfs/work7/workspace/scratch/tu_zxmav84-thesis/Data.nosync",
        "/pfs/work7/workspace/scratch/tu_zxmav84-thesis/Thesis",
    ),
}


def resolve_paths(system=None, env=None):
    """Return ``(data_path, root_path)`` for the current machine.

    Each path is taken from the environment variable ``THESIS_DATA_PATH`` /
    ``THESIS_ROOT_PATH`` when it is set and non-empty, otherwise from the
    built-in default for the given platform.

    Parameters
    ----------
    system : str, optional
        Platform name as returned by ``platform.system()``. Defaults to the
        current platform.
    env : mapping, optional
        Mapping to read the override variables from. Defaults to ``os.environ``.

    Raises
    ------
    RuntimeError
        If a path has neither an override nor a platform default. Without this
        check the path variables would simply stay undefined and a later cell
        would fail with a ``NameError``.
    """
    system = platform.system() if system is None else system
    env = os.environ if env is None else env

    default_data, default_root = _DEFAULT_PATHS.get(system, (None, None))
    data_path = env.get('THESIS_DATA_PATH') or default_data
    root_path = env.get('THESIS_ROOT_PATH') or default_root

    if data_path is None or root_path is None:
        raise RuntimeError(
            f"No default paths configured for platform {system!r}; "
            "set THESIS_DATA_PATH and THESIS_ROOT_PATH."
        )
    return data_path, root_path


DATA_PATH, ROOT_PATH = resolve_paths()

current_wd = os.getcwd()

In [2]:
from glob import glob 
import pandas as pd
import torch
import numpy as np
import itertools
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [3]:
# Output root for the prepared datasets (the trailing slash is kept as-is).
save_path = DATA_PATH + "/Models/InterfaceGAN/Inputs/disentangled_typicality/"
# File holding the latents dictionary that is loaded in the next cell.
latents_path = DATA_PATH + "/Models/e4e/00005_snapshot_1200/inversions/latents_dict.pt"

In [4]:
# Load the latents (indexed by SKU further down in `prepare_data`).
# map_location='cpu' lets the file load on machines without the device it was
# saved from, and keeps the tensors on the CPU, which the later conversion to
# NumPy requires.
latents = torch.load(latents_path, map_location='cpu')

In [5]:
# Metadata table with the typicality scores. The path is relative, so it is
# resolved against the notebook's current working directory.
scores_csv = 'disentangled_typicality_scores.csv'
meta = pd.read_csv(scores_csv)

In [7]:
def prepare_data(embedding_type, sample_sizes=(1000, 2000, 3000),
                 meta_df=None, latents_dict=None, output_dir=None):
    """Build and save binary typicality datasets for one embedding type.

    For every ``n`` in ``sample_sizes`` the rows whose
    ``{embedding_type}_typicality`` score is at or above the n-th largest score
    are labeled 1, and the rows at or below the n-th smallest score are labeled
    0. Rows that satisfy both conditions are labeled 0. Because the thresholds
    are inclusive, tied scores can put more than ``n`` rows into a class. All
    other rows are dropped.

    Written to ``{output_dir}/{embedding_type}/{n}/``:

    * ``latents_concatenated.npy`` -- one flattened latent per labeled row
    * ``target.npy``               -- the labels as an ``(num_rows, 1)`` array
    * ``metadata.csv``             -- the labeled rows including the ``label`` column

    A ``summary_stats.csv`` with one row per ``n`` is written to
    ``{output_dir}/{embedding_type}/``. It is stored per embedding type so that
    successive calls for different embedding types do not overwrite each
    other's statistics.

    Parameters
    ----------
    embedding_type : str
        Prefix of the score column; ``f"{embedding_type}_typicality"`` must be
        a column of the metadata frame.
    sample_sizes : iterable of int, optional
        Values of ``n`` to build datasets for.
    meta_df : pandas.DataFrame, optional
        Metadata with a ``sku`` column and the score column. Defaults to the
        notebook-level ``meta``.
    latents_dict : mapping, optional
        Maps each SKU to a torch tensor. Defaults to the notebook-level
        ``latents``.
    output_dir : str, optional
        Root output directory. Defaults to the notebook-level ``save_path``.

    Returns
    -------
    pandas.DataFrame
        The summary statistics that were written to ``summary_stats.csv``.

    Raises
    ------
    KeyError
        If the score column is missing or a labeled SKU has no latent.
    """
    # Fall back to the notebook-level objects so the existing call
    # `prepare_data(embedding_type)` keeps working unchanged.
    meta_df = meta if meta_df is None else meta_df
    latents_dict = latents if latents_dict is None else latents_dict
    output_dir = save_path if output_dir is None else output_dir

    scores = meta_df[f"{embedding_type}_typicality"]
    type_dir = os.path.join(output_dir, embedding_type)
    os.makedirs(type_dir, exist_ok=True)

    summary_stats = {}
    for n in sample_sizes:
        upper_bound = scores.nlargest(n).min()
        lower_bound = scores.nsmallest(n).max()

        # Build the labels from explicit masks so that a pre-existing `label`
        # column in the metadata cannot leak into the result.
        is_typical = scores >= upper_bound
        is_atypical = scores <= lower_bound
        labeled = is_typical | is_atypical

        # Copy only the labeled rows instead of the whole frame.
        subset = meta_df.loc[labeled].copy()
        # Rows matching both thresholds get 0.
        subset['label'] = np.where(is_atypical[labeled], 0, 1).astype(int)

        # Create target
        target = subset['label'].to_numpy().reshape(-1, 1)

        # Flatten each latent to 1-D and stack them row-wise.
        latents_concatenated = torch.stack(
            [latents_dict[sku].flatten() for sku in subset['sku']]
        )

        # Save everything
        out_dir = os.path.join(type_dir, str(n))
        os.makedirs(out_dir, exist_ok=True)
        np.save(
            os.path.join(out_dir, "latents_concatenated.npy"),
            # detach/cpu: .numpy() only works on CPU tensors without grad.
            latents_concatenated.detach().cpu().numpy(),
        )
        np.save(os.path.join(out_dir, "target.npy"), target)
        subset.to_csv(os.path.join(out_dir, "metadata.csv"), index=False)

        summary_stats[n] = {
            'num_samples': target.shape[0],
            'num_positives': target.sum(),
            'num_negatives': (1 - target).sum(),
            'lower_threshhold': lower_bound,
            'upper_threshhold': upper_bound
        }

    # Save summary stats next to this embedding type's datasets.
    stats = pd.DataFrame(summary_stats).T
    stats.to_csv(os.path.join(type_dir, "summary_stats.csv"))
    return stats

In [8]:
# Derive the embedding types from the score columns. Matching and stripping the
# suffix (rather than any occurrence of 'typicality') guarantees that
# f"{embedding_type}_typicality" maps back to an existing column of `meta`.
TYPICALITY_SUFFIX = '_typicality'
embedding_types = [
    column[:-len(TYPICALITY_SUFFIX)]
    for column in meta.columns
    if column.endswith(TYPICALITY_SUFFIX)
]
embedding_types

['dinov2_vitb14',
 'disentangled_embeddings_concat',
 'disentangled_concat_ex_Color',
 'disentangled_concat_ex_Fabric',
 'disentangled_concat_ex_Fit',
 'disentangled_concat_ex_Neckline',
 'disentangled_concat_ex_Pattern',
 'disentangled_concat_ex_Collar',
 'disentangled_concat_ex_Length',
 'disentangled_concat_ex_Shape',
 'disentangled_concat_ex_Sleeve_Length',
 'Color',
 'Fabric',
 'Fit',
 'Neckline',
 'Pattern',
 'Collar',
 'Length',
 'Shape',
 'Sleeve_Length']

In [9]:
# Build and save the datasets for every embedding type, with a progress bar.
progress = tqdm(embedding_types)
for embedding_type in progress:
    prepare_data(embedding_type)

100%|██████████| 20/20 [00:30<00:00,  1.55s/it]


In [None]:
# ## Code to remove files after training has been completed

# f = glob(f"{save_path}/*/*/*.npy")

# for file in f: 
#     os.remove(file)