In [24]:
import numpy as np
from stl import mesh
from PIL import Image
import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d as mplot3d

# 1. Load the STL file.
your_mesh = mesh.Mesh.from_file('data/thingiverse/thing-3591512-file-6420490.stl')

# 2. Axis-aligned bounding box of the model.
min_x, max_x = np.min(your_mesh.x), np.max(your_mesh.x)
min_y, max_y = np.min(your_mesh.y), np.max(your_mesh.y)
min_z, max_z = np.min(your_mesh.z), np.max(your_mesh.z)

# Extents along each axis.
width = max_x - min_x
height = max_y - min_y
depth = max_z - min_z

# Output image width in pixels (tune as needed); the height follows the
# model's X/Y aspect ratio.
output_width = 800
if width > 0:
    # Clamp to at least one pixel: a very flat model would otherwise truncate
    # to 0 and produce a zero-height figure.
    output_height = max(1, int(output_width * (height / width)))
else:
    # Degenerate model with no X extent: the ratio is undefined, so fall back
    # to a square canvas instead of dividing by zero.
    output_height = output_width

# 3. Render with matplotlib. figsize is in inches, so pixels / dpi gives an
#    output_width x output_height pixel buffer at dpi=100.
fig = plt.figure(figsize=(output_width / 100, output_height / 100))
fig.set_dpi(100)
ax = fig.add_subplot(111, projection='3d')

# Camera position.
ax.view_init(elev=20, azim=30)

# Draw every triangle of the mesh in a single gray collection.
ax.add_collection3d(mplot3d.art3d.Poly3DCollection(your_mesh.vectors, facecolors='gray'))

# Axis limits: bounding box padded by a fixed margin (in model units).
margin = 10
ax.set_xlim([min_x - margin, max_x + margin])
ax.set_ylim([min_y - margin, max_y + margin])
ax.set_zlim([min_z - margin, max_z + margin])

# Hide the axes.
ax.axis('off')

# Background: white with alpha 0, i.e. fully transparent in the RGBA output.
ax.set_facecolor((1, 1, 1, 0))
fig.patch.set_facecolor((1, 1, 1, 0))

# 4. Convert the rendered figure to a PIL Image. np.array copies the canvas
#    buffer, so the image stays valid after the figure is closed below.
fig.canvas.draw()
img_arr = np.array(fig.canvas.buffer_rgba())
img = Image.fromarray(img_arr)

# Save the image to disk.
img.save('output_image.png')

# Close the figure so it is not displayed inline.
plt.close(fig)


In [42]:
def render_obj(path):
    """Render a Wavefront OBJ mesh to a PNG saved next to the source file.

    Only geometric vertex records (``v``) and face records (``f``) are read.
    For each face entry only the vertex index (the part before the first
    ``/``) is used. The mesh is drawn in gray on a transparent background
    with the axes hidden, inside the bounding box padded by a fixed margin
    of 10 model units.

    Args:
        path: Path to the OBJ text file.

    Returns:
        A ``(img, png_path)`` tuple: the rendered RGBA ``PIL.Image`` and the
        path it was saved to (``path`` with its extension replaced by
        ``.png``).

    Raises:
        ValueError: If the file contains no vertex records, or a vertex
            record has fewer than three coordinates.
    """
    # Function-scope import keeps this cell runnable on its own.
    import os

    vertices = []
    faces = []

    # Stream the file instead of loading every line into memory first.
    with open(path, 'r') as obj_file:
        for line in obj_file:
            parts = line.split()
            if not parts:
                continue
            if parts[0] == 'v':
                # Take only x, y, z: a record may carry extra trailing values
                # (e.g. w or per-vertex colour) that would break unpacking.
                x, y, z = map(float, parts[1:4])
                vertices.append([x, y, z])
            elif parts[0] == 'f':
                face = []
                for ref in parts[1:]:
                    index = int(ref.split('/')[0])
                    # OBJ indices are 1-based; negative ones count back from
                    # the vertices defined so far.
                    face.append(index - 1 if index > 0 else len(vertices) + index)
                faces.append(face)

    if not vertices:
        raise ValueError("no vertex ('v') records found in %r" % (path,))

    vertices = np.array(vertices)
    # Keep faces as a list of per-face arrays: triangles and quads may be
    # mixed, which cannot be packed into one rectangular index array.
    polygons = [vertices[face] for face in faces]

    fig = plt.figure(figsize=(8, 8))
    try:
        ax = fig.add_subplot(111, projection='3d')

        # One collection for the whole mesh rather than one per face.
        if polygons:
            ax.add_collection3d(mplot3d.art3d.Poly3DCollection(polygons, facecolors='gray'))

        min_x, min_y, min_z = vertices.min(axis=0)
        max_x, max_y, max_z = vertices.max(axis=0)

        margin = 10
        ax.set_xlim([min_x - margin, max_x + margin])
        ax.set_ylim([min_y - margin, max_y + margin])
        ax.set_zlim([min_z - margin, max_z + margin])

        ax.axis('off')
        # White with alpha 0, i.e. a transparent background.
        ax.set_facecolor((1, 1, 1, 0))
        fig.patch.set_facecolor((1, 1, 1, 0))

        fig.canvas.draw()
        # np.array copies the canvas buffer so the image outlives the figure.
        img_arr = np.array(fig.canvas.buffer_rgba())
        img = Image.fromarray(img_arr)
    finally:
        # Always release the figure; repeated calls would otherwise pile up
        # open figures.
        plt.close(fig)

    # splitext handles extensions of any length (and paths with none).
    png_path = os.path.splitext(path)[0] + '.png'
    img.save(png_path)

    return img, png_path

# NOTE(review): render_obj reads 'v' / 'f' text records (Wavefront OBJ style),
# yet the path passed here ends in .stl -- it is the same file the first cell
# loads with mesh.Mesh.from_file. The recorded run of this cell failed with
# FileNotFoundError for this path. Confirm the intended input file.
render_obj('data/thingiverse/thing-3591512-file-6420490.stl')

FileNotFoundError: [Errno 2] No such file or directory: 'data/thingiverse/thing-3591512-file-6420490.stl'

In [25]:
from transformers import CLIPProcessor, CLIPModel
# Load the Hugging Face CLIP model and its matching processor from the same
# checkpoint name.
# NOTE: this binds the notebook-level names `model` and `processor`, which the
# BLIP-2 cell further down binds again.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch16"
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

Downloading (…)rocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [40]:
import clip
import torch
from PIL import Image


# Default picture scored by test(). It is used as a default argument value
# (image=image), so the object bound here when `test` is defined is what
# test() uses if no image is passed; rebinding `image` afterwards does not
# change that default.
image = Image.open('r_32.png')

# Loaded CLIP (model, preprocess) pairs keyed by (name, device), so repeated
# test() calls reuse the weights instead of reloading them each time.
_CLIP_CACHE = {}


def _load_clip(device, name="ViT-B/32"):
    """Return the cached ``clip.load(name, device=device)`` result."""
    key = (name, device)
    if key not in _CLIP_CACHE:
        _CLIP_CACHE[key] = clip.load(name, device=device)
    return _CLIP_CACHE[key]


def test(caption, image=image):
    """Score how well the text ``"a " + caption`` matches ``image`` with CLIP.

    Args:
        caption: Label to test; it is prefixed with ``"a "`` before
            tokenisation.
        image: PIL image to score. Defaults to the notebook-level ``image``
            as it was bound when this function was defined.

    Returns:
        A 0-dim tensor holding the cosine similarity between the CLIP image
        embedding and the CLIP text embedding.
    """
    # Same device selection as the BLIP-2 cell: fall back to CPU when no GPU
    # is available instead of failing.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    clip_model, preprocess = _load_clip(device)
    image_input = preprocess(image).unsqueeze(0).to(device)
    text_tokens = clip.tokenize(["a " + caption]).to(device)
    with torch.no_grad():
        image_features = clip_model.encode_image(image_input)
        text_features = clip_model.encode_text(text_tokens)

    # Normalise both embeddings so the dot product is a cosine similarity;
    # raw dot products also depend on the embedding norms, which makes scores
    # for different captions hard to compare.
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

    # One image and one caption, so the mean just collapses the 1x1 matrix.
    similarity_score = (image_features @ text_features.T).mean()
    return similarity_score


In [None]:
from transformers import AutoProcessor, Blip2ForConditionalGeneration
import torch
import subprocess

# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Processor and model are loaded from the same BLIP-2 checkpoint name.
# NOTE: `processor` and `model` are notebook-level names that the CLIP cell
# above also assigns; after this cell runs they refer to BLIP-2.
BLIP2_CHECKPOINT = "Salesforce/blip2-opt-2.7b"
processor = AutoProcessor.from_pretrained(BLIP2_CHECKPOINT)
# The weights are requested in float16 and then moved to the chosen device.
model = Blip2ForConditionalGeneration.from_pretrained(BLIP2_CHECKPOINT, torch_dtype=torch.float16)
model = model.to(device)
        

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

RuntimeError: Input type (torch.cuda.HalfTensor) and weight type (torch.HalfTensor) should be the same

In [38]:
def test_blip(image):
    """Generate a short caption for ``image`` with the notebook-level BLIP-2.

    Relies on the globals ``processor`` and ``model`` being the BLIP-2
    processor and model loaded in the cell above.

    Args:
        image: Image to caption, in a form the processor accepts.

    Returns:
        The decoded caption (at most 20 newly generated tokens), stripped of
        surrounding whitespace.
    """
    # Follow the model's own device and dtype instead of a separate `device`
    # global and a hard-coded float16: if they disagree with where the weights
    # actually live, generation fails with "Input type ... and weight type ...
    # should be the same".
    inputs = processor(image, return_tensors="pt").to(model.device, model.dtype)

    generated_ids = model.generate(**inputs, max_new_tokens=20)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

    return generated_text


In [41]:
# Score the default image against a handful of candidate labels, in order; the
# resulting tuple is the value displayed by this cell.
tuple(test(label) for label in ('chair', 'dog', 'table', 'seat', 'computer', 'person', 'object'))

AttributeError: 'CLIP' object has no attribute 'generate'