In [2]:

from pymilvus import MilvusClient, DataType
client = MilvusClient(uri="http://192.168.3.116:19530")



In [4]:
collection_name = "blackwell"
embedding_dim=2048
if client.has_collection(collection_name=collection_name):
    client.drop_collection(collection_name=collection_name)

In [5]:

schema = client.create_schema(auto_id=False, enable_dynamic_field=False)
schema.add_field(field_name="id", datatype=DataType.VARCHAR, is_primary=True,max_length=65535)
schema.add_field(field_name="vector", datatype=DataType.FLOAT_VECTOR, dim=embedding_dim)
schema.add_field(field_name="text", datatype=DataType.VARCHAR, max_length=65535)
schema.add_field(field_name="metadata", datatype=DataType.VARCHAR, max_length=65535)

index_params = MilvusClient.prepare_index_params()
index_params.add_index(
    field_name="vector",
    metric_type="L2",
    index_type="AUTOINDEX",
)

if client.has_collection(collection_name=collection_name):
    client.drop_collection(collection_name=collection_name)
client.create_collection(
    collection_name=collection_name,
    schema=schema,
    index_params=index_params,
    consistency_level="Strong",
)

client.load_collection(collection_name=collection_name)


In [6]:
from pypdf import PdfReader

def pdf_text(path):# pageは配列
    reader = PdfReader(path)
    #texts = [reader.pages[i].extract_text() or "" for i in page ]
    texts = [ j.extract_text() or "" for j in reader.pages]
    return " ".join(texts)

#def clean_cut_text(text):
#    text2 = text.replace("\n","")
#   return text2.split(".")

def clean_cut_text(text):
    text2 = text.replace("\n", "")
    clean_text_list = [s.strip() for s in text2.split(".") if s.strip()]
    sentence = ""
    for j in clean_text_list:
        sentence = sentence + j + " "
    return sentence

def extract_pdf(path):
    text = pdf_text(path)
    texts = clean_cut_text(text)
    return texts

def chunk_text_by_chars(text, name, chunk_size=512, overlap=50):
    chunks = []
    start = 0
    while start < len(text):
        end = start + chunk_size
        chunks.append(f"{name} is {text[start:end]}")
        start += chunk_size - overlap
    return chunks





In [17]:
import os
#ファイル名と製品名を記入
PATH = "/workspace/intern/Datasheet"
files = os.listdir(f"{PATH}/GPU/Blackwell")
files = [file for file in os.listdir(f"{PATH}/GPU/Blackwell") if file.endswith(".pdf")]

In [18]:
names = ["blackwell-rtx-pro-4000","blackwell-rtx-pro-5000","blackwell-rtx-pro-4500",
          "blackwell-rtx-pro-6000","blackwell-rtx-pro-6000-max-q"]

In [19]:
product_name = []
product_chunks = []
for i in range(len(files)):
    texts = extract_pdf(f"{PATH}/GPU/Blackwell/{files[i]}")
    chunks = chunk_text_by_chars(texts,names[i])
    product_chunks = product_chunks + chunks
    for j in range(len(chunks)):
        product_name.append(names[i])

In [20]:
import requests
import json

url = "http://llama-32-nv-embedqa-1b-v2.runai-clv01.aitrynow-run-inf.macnica.co.jp/v1/embeddings"

headers = {
    "Content-Type": "application/json"
}

def embedding(text):
    payload ={
    "input": text,
    "model": "nvidia/llama-3.2-nv-embedqa-1b-v2",
    "input_type": "passage",
    "encoding_format": "float",
    "dimensions": None,
    "user": "user-identifier",
    "truncate": "NONE"
  }

    return requests.post(url, headers=headers, data=json.dumps(payload)).json()['data'][0]['embedding']
    

In [21]:
import numpy as np
vectors = []

for i in product_chunks:
    vectors.append(np.array(embedding(i)))
print("Dim:", vectors[0].shape)  # Dim: 768 (768,)

data = [
    {"id": str(i), "vector": vectors[i], "text": product_chunks[i],"metadata": product_name[i]}
    for i in range(len(vectors))
]

print("Data has", len(data), "entities, each with fields: ", data[0].keys())
print("Vector dim:", len(data[0]["vector"]))

Dim: (2048,)
Data has 78 entities, each with fields:  dict_keys(['id', 'vector', 'text', 'metadata'])
Vector dim: 2048


In [22]:
res = client.insert(collection_name=collection_name, data=data)

print(res)

{'insert_count': 78, 'ids': ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77']}
