# Data processing

Processing the ShapeNet dataset to be compatible with IF-Net Texture.

## Locate the folders and identifiers

ShapeNet uses a different folder layout, in which each mesh lives in its own subfolder. In this first step, we find the unique identifier of each model and create new folders at the desired output location.

In [None]:
from pathlib import Path
import os
import numpy as np
from scipy.spatial import cKDTree as KDTree
import trimesh
import data_processing.utils as utils
import data_processing.mesharray as mesharray
import config.config_loader as cfg_loader
import glob

# Where the untouched ShapeNet category lives on disk
datasetLocation = Path("../../datasets/ShapeNet/03001627")
# Root folder that receives the processed dataset
datasetOutFolder = Path("dataset/SHARP2020/shapenet")
# YAML configuration describing how the data must be processed
configFilePath = Path("projects/if-net_texture/config/SHARP2020/shapenet.yaml")

# Ensure the output root and its "test"/"train" subfolders exist (root first)
for outDir in (datasetOutFolder, datasetOutFolder / "test", datasetOutFolder / "train"):
    utils.make_dir_if_not_exist(outDir)

In [None]:
# Recursively collect every obj file below the dataset location
path = datasetLocation / "**" / "*.obj"
print("Looking for all files that match: " + str(path))
# glob returns matches in arbitrary (filesystem-dependent) order; sorting makes
# the file list, and any positional split derived from it, reproducible.
files = sorted(glob.glob(str(path), recursive=True))
print("Found " + str(len(files)) + " files")

## Load the configuration

In [None]:
# Read the processing configuration from the YAML file
cfg = cfg_loader.load(configFilePath)

# Short aliases for the configuration entries used by the later cells.
# NOTE(review): the key is spelled "preprossesing" — presumably it matches the
# YAML file; confirm before renaming it.
trainTestRatio = cfg["preprossesing"]["trainTestRatio"]
bbox = cfg["data_bounding_box"]
res = cfg["input_resolution"]
num_points = cfg["input_points_number"]
bbox_str = cfg["data_bounding_box_str"]

# Grid points generated from the bounding box and resolution, with a KD-tree built over them
grid_points = utils.create_grid_points_from_xyz_bounds(*bbox, res)
kdtree = KDTree(grid_points)

## Create the processed files

1) Load the trimesh scene
2) sample each submesh into a separate pointcloud
3) Combine them into one big material pointcloud
4) Resample to the set number of points
5) cut random holes in the point cloud by selecting nearest neighbours
6) Save the complete sampled shape as a npz file
    > id/id_normalized_color_samples100000_bbox-0.8,0.8,-0.15,2.1,-0.8,0.8.npz
7) save 4 variations in the same folder

In [None]:
# Upper bound on the number of models to process; use len(files) for the full dataset
maxFiles = 100
# Clamp to the number of files actually found so the loop never indexes past the end
nrOfFiles = min(maxFiles, len(files))
# Number of models (counted from the start of the list) assigned to the training set
trainCases = int(np.round(nrOfFiles * trainTestRatio))

for i, objFile in enumerate(files[:nrOfFiles]):
    print("Processing " + str(i) + "/" + str(nrOfFiles))
    # The identifier is the name of the folder two levels above the obj file
    newId = Path(objFile).parent.parent.name
    # Make the new destination folder for the files:
    # the first trainCases models go to "train", the remainder to "test"
    split = "train" if i < trainCases else "test"
    newIdPath = datasetOutFolder / split / newId
    utils.make_dir_if_not_exist(newIdPath)

    # Create a new mesharray object to parse the data
    meshArray = mesharray.MeshArray(
        id=newId,
        bbox=bbox,
        kdtree=kdtree,
        grid_points=grid_points,
        num_points=num_points,
    ).from_trimesh(objFile)

    # Save the full point cloud as a colored point cloud
    meshArray.savez(newIdPath / (newId + "_normalized_color_samples" + str(num_points) + "_bbox" + bbox_str + ".npz"))
    # Create a number of incomplete meshes and save them as voxelised point clouds for standardisation
    meshArray.filter_points(newIdPath, 4, 4, 2e-2)