In [None]:
# Download form HF
# !git clone git@hf.co:datasets/ShapeNet/ShapeNetCore

In [10]:
import np
from tqdm.auto import tqdm
import os
import zipfile

def unzip_files(folder_path, extract_base = ''):
    # Get the list of zip files in the specified folder
    zip_files = [f for f in os.listdir(folder_path) if f.endswith('.zip')]

    # Create a folder for each zip file and extract its contents
    for zip_file in tqdm(zip_files):
        zip_file_path = os.path.join(folder_path, zip_file)
        extract_folder = os.path.join(extract_base, os.path.splitext(zip_file)[0])

        # Create a folder with the same name as the zip file
        os.makedirs(extract_folder, exist_ok=True)

        # Extract the contents of the zip file into the created folder
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(extract_folder)

        print(f"Extracted {zip_file} into {extract_folder}")

In [None]:
# unzip_files('data/ShapeNetCore', extract_base='data/ShapeNetCore_unziped')

In [2]:
import json
taxonomy = json.load(open('data/shapenetcore.taxonomy.json'))

In [3]:
taxonomy[0]['metadata']

{'numInstances': 4045,
 'name': '02691156',
 'numChildren': 11,
 'label': 'airplane,aeroplane,plane'}

In [4]:
tax_map = { tax['metadata']['name']:tax['metadata']['label'] + '\n' + tax['li_attr']['title'].replace('\n', '').strip() for tax in taxonomy}

In [5]:
# Add missing key for cellphone
tax_map['02992529'] = "cellphone,mobile,mobilephone,phone\na handheld device used by people for telecommunication"

In [15]:
import torch
from transformers import BertTokenizerFast, BertModel

tokenizer = BertTokenizerFast.from_pretrained("setu4993/LEALLA-small")
model = BertModel.from_pretrained("setu4993/LEALLA-small").to('mps')
model = model.eval()


def tokenize_sentences(sentences):
    english_inputs = tokenizer(sentences, return_tensors="pt", padding=True, max_length=512, truncation=True).to('mps')
    with torch.no_grad():
        english_outputs = model(**english_inputs).pooler_output

    return english_outputs.cpu().numpy()



In [16]:
import numpy as np

tokenized_taxonomy = tokenize_sentences(list(tax_map.values()))
tokenized_taxonomy_map = {val: {'sentece': list(tax_map.values())[i], 'tokens': tokenized_taxonomy[i]} for i, val in enumerate(list(tax_map.keys()))}

np.save('data/shapnet_tokenized', tokenized_taxonomy_map)

In [1]:
tax_map

NameError: name 'tax_map' is not defined

In [6]:
import os

def find_obj_files(folder_path):
    obj_files = []
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.lower().endswith(".obj"):
                obj_files.append(os.path.join(root, file))
    return obj_files

obj_files = find_obj_files('data/ShapeNetCore_unziped')

In [7]:
from os import path

tax = [{
    'id': file.split('/')[4],
    'category': file.split('/')[2],
    'obj': '/'.join(file.split('/')[2:]),
    'text': tax_map[file.split('/')[3]]
} for file in obj_files]

In [17]:
import numpy as np

np.save('data/shapnet_tokenized', {
    'taxonomy_map': tax,
    'tokenized_taxonomy': tokenized_taxonomy_map
})

In [21]:
import os
import pandas as pd

# Specify the path to the folder containing your CSV files
folder_path = 'data/shapenet_metadata'

# Get a list of all CSV files in the folder
csv_files = [file for file in os.listdir(folder_path) if file.endswith('.csv')]

# Initialize an empty list to store individual DataFrames
dfs = []

# Loop through each CSV file and read it into a DataFrame, then add it to the list
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    dfs.append(df)

# Concatenate the list of DataFrames into a single DataFrame
combined_data = pd.concat(dfs, ignore_index=True)

# Now combined_data contains the data from all CSV files in the folder
combined_data['fullId'] = combined_data['fullId'].str.replace('3dw.', '')

combined_data

Unnamed: 0,fullId,wnsynset,wnlemmas,up,front,name,tags
0,e3a0391e9fdb7671fdb7801f07fef3bd,02808440,"bathtub,bathing tub,bath,tub","0.0\,0.0\,1.0","0.0\,1.0\,0.0",bathtub boat,
1,19da369ff1db9ecd670204dee97dc037,02808440,"bathtub,bathing tub,bath,tub","0.0\,0.0\,1.0000001","0.0\,-1.0000001\,0.0",Freestanding Bathtub,
2,f9fe136c1738dcb1ec3cc26553cfa06b,02808440,"bathtub,bathing tub,bath,tub","0.0\,0.0\,1.0","0.0\,-1.0\,0.0",Villeroy & Boch Oberon 190x90cm,
3,dd49561080df1a3f798df87163b1956c,02808440,"bathtub,bathing tub,bath,tub","0.0\,0.0\,1.0","0.0\,-1.0\,0.0",Toto Nexus Bath Tub,
4,beafc45419f70db3252e0775cff1c394,02808440,"bathtub,bathing tub,bath,tub","0.0\,0.0\,1.0","0.0\,-1.0\,0.0",bathtub,
...,...,...,...,...,...,...,...
55490,27f58201df188ce0c76e1e2d1feb4ae,02801938,"basket,handbasket","0.0\,0.0\,1.0","0.0\,1.0\,0.0",easter basket,
55491,34fd44c46f40044339da77d05979b2,02801938,"basket,handbasket","0.0\,0.0\,1.0000001","0.0\,1.0000001\,0.0",odpadkový koš,
55492,e3bae8da192ab3d4a17ae19fa77775ff,02801938,"basket,handbasket","0.0\,0.0\,1.0","-1.0\,0.0\,0.0",Stratton Daybed Pottery barn,
55493,dafcdefea7e1445edce1af5c27e3da18,02801938,"basket,handbasket","0.0\,0.0\,1.0","-1.0\,0.0\,0.0","Balloon basket with 6"" keepout area",


In [24]:
tax[0]

{'id': 'eecbec1d6360c2c77b4c98ce79dd9c8f',
 'category': '04468005',
 'obj': '04468005/04468005/eecbec1d6360c2c77b4c98ce79dd9c8f/models/model_normalized.obj',
 'text': "train,railroad train\npublic transport provided by a line of railway cars coupled together and drawn by a locomotive; 'express trains don't stop at Princeton Junction'"}

In [26]:
c = 0
for t in tax:
    if(len(df[df['fullId'].str.contains(t['id'])]) > 0):
        c = c + 1
print(c)

125
