In [21]:
import argparse
import glob
import traceback
import pandas as pd
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Configuration for the whole notebook.
root = "csvs"            # directory scanned for input *.csv files
output = "dataset"       # root directory under which <split>/<scale>/ links are created
ratio = [0.8, 0.1, 0.1]  # train / val / test fractions

# The order of glob results depends on the file system. Sort the file list so
# the concatenated row order (and everything derived from it: tie-breaking in
# de-duplication, the seeded train/val/test split) is reproducible.
csv_files = sorted(glob.glob(os.path.join(root, '*.csv')))
if not csv_files:
    # pd.concat([]) would raise a ValueError anyway; fail with a clearer message.
    raise ValueError(f"No CSV files found in {root!r}")

df_list = [pd.read_csv(f) for f in csv_files]
# NOTE: each file keeps its own row labels, so `data.index` is not unique.
data = pd.concat(df_list)



In [22]:

# Metric column that drives all filtering below.
# Candidate metrics: 'psnr', 'ssim', 'lpips', 'chamfer'. The original author's
# (disabled) rule was: sort ascending for 'lpips' / 'chamfer', descending otherwise.
index = "lpips"

# Step 1: keep only rows whose metric is strictly below 0.2.
# The row count is printed before and after so the effect of the cut is visible.
print(len(data))
data = data.loc[data[index].lt(0.2)]
print(len(data))

data.head()


163646
148930


Unnamed: 0.1,Unnamed: 0,scale,path,psnr,ssim,lpips,chamfer
0,0,"(1088, 2176)",/root/autodl-tmp/animedata/anime_dataset/1,25.975638,0.92431,0.1343,6.9e-05
1,1,"(1088, 2176)",/root/autodl-tmp/animedata/anime_dataset/10,32.397934,0.978068,0.07196,2.7e-05
2,2,"(1088, 2176)",/root/autodl-tmp/animedata/anime_dataset/100,25.214299,0.951444,0.183,9.5e-05
3,3,"(1088, 2176)",/root/autodl-tmp/animedata/anime_dataset/1000,25.707103,0.924513,0.1951,0.000104
4,4,"(1088, 2176)",/root/autodl-tmp/animedata/anime_dataset/1001,27.837862,0.968125,0.11035,7e-05


In [23]:
# Sample variance of the metric for every path, as a flat two-column frame
# (path, <metric>) with a fresh positional index.
df_var = data.groupby('path')[index].var().reset_index()
print(len(df_var))

# Step 2.1: keep only paths whose metric variance is strictly below 5e-4.
# NOTE: a path with a single row has NaN variance, and NaN fails this
# comparison, so such paths are dropped here as well.
df_var = df_var.loc[df_var[index].lt(5e-4)]
print(len(df_var))

# Largest variance that survived the cut.
print(df_var[index].max())

16708
16584
0.000484111011111111


In [24]:
# Order the per-path variances from smallest to largest.
df_var = df_var.sort_values(by=index, ascending=True)

print(len(df_var))

# Step 3.1 (disabled): trim the extremes of the variance ranking.
# df_var = df_var.iloc[1500:-500]
# print(len(df_var))




16584


In [28]:
# Ascending order puts the smallest metric value first for each path.
ascending = True

# Keep exactly one row per path: after sorting by (metric, path), the first
# occurrence of each path is its row with the smallest metric value.
data = (
    data.sort_values(by=[index, 'path'], ascending=ascending)
        .drop_duplicates(subset='path', keep='first')
)

print(len(data))

# Step 2.2: drop paths that were rejected by the variance filter.
# .copy() gives an independent frame, so the assignment below cannot trigger
# SettingWithCopyWarning or write into a temporary.
data = data[data['path'].isin(df_var['path'])].copy()

# Step 3: paths whose metric variance is below 5e-7 get scale "any".
# Match on the 'path' key, not on index labels: df_var carries a positional
# index created by reset_index, which is unrelated to the row labels of `data`
# (those come from the source CSV files), so comparing the two indexes would
# mark arbitrary rows.
low_var_paths = df_var.loc[df_var[index] < 5e-7, 'path']
data.loc[data['path'].isin(low_var_paths), 'scale'] = "any"

print(len(data))


16584
16584


In [29]:
# Persist the cleaned table to data_cleaned.csv in the working directory.
# The DataFrame index is written too (pandas default), as the first column.
data.to_csv(path_or_buf="data_cleaned.csv")

In [30]:
# Two-stage split: first carve off the train fraction, then divide the
# remainder between val and test according to their relative shares.
train, test = train_test_split(data, train_size=ratio[0], random_state=42)
val, test = train_test_split(test, train_size=ratio[1] / (ratio[1] + ratio[2]), random_state=42)

# Build <output>/<split>/<scale>/<parent>_<leaf> symlinks pointing at each source path.
for dataset, name in tqdm(zip([train, val, test], ['train', 'val', 'test']), total=3):
    # Iterating the two needed columns avoids building a Series per row (iterrows).
    for src_path, raw_scale in tqdm(zip(dataset['path'], dataset['scale']),
                                    total=len(dataset), desc=name):
        # "(1088, 2176)" -> "1088x2176"; a plain label such as "any" is unchanged.
        scale = raw_scale.replace(",", "x").replace(" ", "").replace("(", "").replace(")", "")
        # Link name is the last two path components joined by "_".
        link_name = '_'.join(src_path.split('/')[-2:])
        link_path_dir = os.path.join(output, name, scale)
        os.makedirs(link_path_dir, exist_ok=True)
        link_path = os.path.join(link_path_dir, link_name)

        # lexists() is also True for a dangling symlink. exists() follows the
        # link and reports False, after which symlink() fails with FileExistsError.
        if os.path.lexists(link_path):
            continue

        try:
            os.symlink(src_path, link_path)
        except OSError:
            # Best effort: report the failure and continue with the next row.
            # Only OSError is caught so Ctrl-C / SystemExit still stop the loop.
            print(f"Error creating symlink for {src_path} to {link_path}, {traceback.format_exc()}")

        # Disabled alternative: blend the images for this row and write a single
        # .jpg into the split/scale folder instead of linking the source directory.
        # img, scale = blend_images(src_path, raw_scale)
        # if img is not None:
        #     folder_path = os.path.join(output, name, scale)
        #     os.makedirs(folder_path, exist_ok=True)
        #     file_name = f"{'_'.join(src_path.split('/')[-2:])}.jpg"
        #     cv2.imwrite(os.path.join(folder_path, file_name), img)

13267it [00:09, 1343.08it/s]
1658it [00:05, 286.09it/s]
1659it [00:01, 912.36it/s] 
3it [00:17,  5.90s/it]
