In [2]:
from pymilvus import MilvusClient, DataType
# Connect to the Milvus server; later cells issue all collection operations through `client`.
# NOTE(review): the endpoint is a hard-coded private address — consider moving it to configuration.
client = MilvusClient(uri="http://192.168.3.116:19530")

In [3]:
# Name of the Milvus collection that the cells below create, fill and search.
collection_name = "demo1"
# Dimension used for the FLOAT_VECTOR field; the recorded embedding run further down reports shape (2048,).
embedding_dim=2048
# Drop any existing collection with this name before it is (re)created.
if client.has_collection(collection_name=collection_name):
    client.drop_collection(collection_name=collection_name)

In [4]:

# Index definition for the "vector" field: AUTOINDEX with the L2 metric.
index_params = MilvusClient.prepare_index_params()
index_params.add_index(field_name="vector", metric_type="L2", index_type="AUTOINDEX")

# Explicit schema (caller-supplied ids, no dynamic fields): a string primary
# key, the embedding vector, and the sentence the vector was computed from.
schema = client.create_schema(auto_id=False, enable_dynamic_field=False)
schema.add_field(
    field_name="id",
    datatype=DataType.VARCHAR,
    is_primary=True,
    max_length=65535,
)
schema.add_field(
    field_name="vector",
    datatype=DataType.FLOAT_VECTOR,
    dim=embedding_dim,
)
schema.add_field(
    field_name="text",
    datatype=DataType.VARCHAR,
    max_length=65535,
)

# Remove any collection with the same name so re-running this cell rebuilds it.
if client.has_collection(collection_name=collection_name):
    client.drop_collection(collection_name=collection_name)

client.create_collection(
    collection_name=collection_name,
    schema=schema,
    index_params=index_params,
    consistency_level="Strong",
)
client.load_collection(collection_name=collection_name)


In [5]:
import requests
import json

# Embeddings endpoint that `embedding()` posts to.
url = "http://llama-32-nv-embedqa-1b-v2.runai-clv01.aitrynow-run-inf.macnica.co.jp/v1/embeddings"

# Request headers: the payload is sent as a JSON document.
headers = {
    "Content-Type": "application/json"
}

def embedding(text, input_type="passage", timeout=60):
    """Return the embedding vector for ``text`` from the remote embeddings endpoint.

    Posts a JSON payload to the module-level ``url`` using the module-level
    ``headers`` and returns the ``embedding`` entry of the first item in the
    response's ``data`` list.

    Args:
        text: The input to embed (sent as the payload's ``input``).
        input_type: Value sent as ``input_type``. Defaults to ``"passage"``,
            the value this function always sent before; pass another value
            supported by the service (e.g. for search queries) when needed.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled endpoint cannot hang the notebook indefinitely.

    Returns:
        The embedding as decoded from the JSON response.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        requests.Timeout: If the request exceeds ``timeout``.
    """
    payload = {
        "input": text,
        "model": "nvidia/llama-3.2-nv-embedqa-1b-v2",
        "input_type": input_type,
        "encoding_format": "float",
        "dimensions": None,
        "user": "user-identifier",
        "truncate": "NONE",
    }

    response = requests.post(
        url,
        headers=headers,
        data=json.dumps(payload),
        timeout=timeout,
    )
    # Surface HTTP failures directly instead of as a KeyError on 'data' below.
    response.raise_for_status()
    return response.json()['data'][0]['embedding']
    

In [6]:
import pandas as pd
# Load the personal-info CSV; later cells read its "氏名" and "ローマ字" columns.
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere as-is.
data = pd.read_csv('/data/aiqtoolkit/intern/vector_setup/datas/personal_data/personal_info.csv')

In [8]:
# Favorite foods, assigned as the "food" column of `data` in a later cell.
# Presumably one entry per CSV row, in row order — TODO confirm against the CSV.
food = ['カレー','寿司','天ぷら','ラーメン','ハンバーガー']

In [15]:
# Display `data`. The recorded output is a list of entity dicts (id / vector / text),
# i.e. the value `data` holds after the embedding cell rebinds it, not the DataFrame read from CSV.
data

[{'id': '0',
  'vector': array([ 0.00156307, -0.04971313,  0.02215576, ..., -0.0109787 ,
          0.00145817, -0.03640747]),
  'text': '稲垣洸大(Kohdai Inagaki)の好きな食べ物はカレーです'},
 {'id': '1',
  'vector': array([ 0.00142384, -0.05780029,  0.02093506, ..., -0.00853729,
         -0.0049324 , -0.04696655]),
  'text': '川辺空雅(Kuga Kawabe)の好きな食べ物は寿司です'},
 {'id': '2',
  'vector': array([-0.0041008 , -0.03997803,  0.01319122, ..., -0.0292511 ,
         -0.02142334, -0.04980469]),
  'text': '小寺玲(Rei Kotera)の好きな食べ物は天ぷらです'},
 {'id': '3',
  'vector': array([-0.00562668, -0.0567627 ,  0.00391388, ..., -0.0085907 ,
         -0.02664185, -0.03863525]),
  'text': '北島佑樹(Yuki Kitajima)の好きな食べ物はラーメンです'},
 {'id': '4',
  'vector': array([ 0.0189209 , -0.05471802,  0.00457382, ...,  0.00196457,
          0.00424194, -0.02818298]),
  'text': '吉田龍太郎(Ryutaro Yoshida)の好きな食べ物はハンバーガーです'}]

In [9]:
# Attach the favorite-food list as a new column; this mutates the DataFrame in place.
# NOTE(review): requires `data` to still be the DataFrame loaded from CSV — a later cell rebinds `data` to a list.
data['food'] = food

In [10]:
# Build one sentence per person from the name, romanized name and favorite food.
# The columns are walked positionally, so the result does not depend on the
# DataFrame having a default 0..n-1 index, and the placeholders are plain names
# so the f-string also parses on Python versions before 3.12 (which reject
# reusing the enclosing quote character inside a replacement field).
personal_texts = [
    f"{name}({romaji})の好きな食べ物は{favorite}です"
    for name, romaji, favorite in zip(data["氏名"], data["ローマ字"], data["food"])
]

In [13]:
import numpy as np

# Embed every sentence; each vector is stored as a NumPy array.
vectors = [np.array(embedding(sentence)) for sentence in personal_texts]
print("Dim:", vectors[0].shape)  # the recorded run below shows (2048,)

# One entity per sentence, keyed by its position rendered as a string.
# NOTE: this rebinds `data` from the CSV DataFrame to a list of dicts.
data = [
    {"id": str(position), "vector": vector, "text": sentence}
    for position, (vector, sentence) in enumerate(zip(vectors, personal_texts))
]

first_entity = data[0]
print("Data has", len(data), "entities, each with fields: ", first_entity.keys())
print("Vector dim:", len(first_entity["vector"]))

Dim: (2048,)
Data has 5 entities, each with fields:  dict_keys(['id', 'vector', 'text'])
Vector dim: 2048


In [14]:
# Insert the prepared entity dicts into the collection.
res = client.insert(collection_name=collection_name, data=data)

# The recorded run below reports insert_count 5 with ids '0' through '4'.
print(res)

{'insert_count': 5, 'ids': ['0', '1', '2', '3', '4']}


In [25]:
# Placeholder query: a single constant vector. Its length is taken from
# `embedding_dim` so it always matches the dimension the collection's vector
# field was created with, instead of repeating the number by hand.
query_vectors = [[0.1] * embedding_dim]
res = client.search(
    collection_name=collection_name,
    data=query_vectors,
    limit=2,
    output_fields=["text"],
)

In [26]:
# Hits for the first (and only) query vector.
# NOTE(review): the recorded output shows a single birthday sentence although limit=2 and the
# insert cell above stores food sentences — the output looks stale; confirm by re-running.
res[0]

[{'id': '3', 'distance': 21.618587493896484, 'entity': {'text': '北島佑樹(Yuki Kitajima)は8月5日(August 5th)生まれです'}}]

In [153]:
import pandas as pd

def make_data(j_name, en_name, j_birth, en_birth):
    """Wrap one person's details in a single-row DataFrame.

    Args:
        j_name: Stored under the "氏名" column.
        en_name: Stored under the "ローマ字" column.
        j_birth: Stored under the "生年月日" column.
        en_birth: Stored under the "birthday" column.

    Returns:
        A one-row ``pandas.DataFrame`` with those four columns, in that order.
    """
    columns = ("氏名", "ローマ字", "生年月日", "birthday")
    values = (j_name, en_name, j_birth, en_birth)
    record = dict(zip(columns, values))
    return pd.DataFrame([record])


def insert_data(data, client, collection_name, embedding_func, entity_id=None):
    """Embed one person's birthday sentence and insert it into the collection.

    Args:
        data: DataFrame whose first row provides the "氏名", "ローマ字",
            "生年月日" and "birthday" values.
        client: Object exposing ``insert(collection_name=..., data=...)`` and
            ``flush(collection_name)`` (a ``MilvusClient`` in this notebook).
        collection_name: Target collection.
        embedding_func: Callable mapping the sentence to its embedding vector.
        entity_id: Optional explicit primary key. When omitted, a random UUID
            hex string is generated.

    Returns:
        Whatever ``client.insert`` returns.

    Raises:
        IndexError: If ``data`` has no rows.
    """
    import uuid  # local import keeps this cell runnable on its own

    # Build the sentence from the first row.
    name = data['氏名'].values[0]
    romaji = data['ローマ字'].values[0]
    birth_ja = data['生年月日'].values[0]
    birth_en = data['birthday'].values[0]
    text = f"{name} ({romaji}) は {birth_ja} ({birth_en}) 生まれです"

    # Vectorize the sentence.
    vector = embedding_func(text)

    # Choose a primary key that cannot clash with existing rows. Deriving it
    # from the collection's row_count is unsafe: that statistic can lag behind
    # unflushed inserts and never accounts for deleted rows, so "count + 1" may
    # repeat an id that is already stored.
    new_id = str(entity_id) if entity_id is not None else uuid.uuid4().hex

    # Assemble the entity to insert.
    insert_payload = [{
        "id": new_id,
        "vector": vector,
        "text": text
    }]

    # Insert, then flush the collection.
    res = client.insert(collection_name=collection_name, data=insert_payload)
    client.flush(collection_name)
    return res


In [154]:
# Build a one-row DataFrame for a new person ...
a = make_data("越野陵駕","Koshino Ryoga","8月6日","August 6th")
# ... then embed its sentence with `embedding` and insert it into the collection.
b = insert_data(a,client,collection_name,embedding)