# 本地构建arxiv的向量数据库

学校的向量数据库不方便

使用milvus作为向量数据库，对arxiv的数据的所有文章

向量化abstract，然后和其他所有字段插入数据库

In [2]:
import dask.bag as db
import json

data_path = './arxiv/versions/210/arxiv-metadata-oai-snapshot.json'
papers_db = db.read_text(data_path, blocksize="10mb").map(json.loads)

In [8]:
def display(results,indent=4):
    dict_result = [doc.to_dict() if hasattr(doc, 'to_dict') else vars(doc) for doc in results]
    print(json.dumps(dict_result, indent=indent))
first_two_papers = papers_db.take(2)
print(json.dumps(first_two_papers[0], indent=4))

{
    "id": "0704.0001",
    "submitter": "Pavel Nadolsky",
    "authors": "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
    "title": "Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies",
    "comments": "37 pages, 15 figures; published version",
    "journal-ref": "Phys.Rev.D76:013009,2007",
    "doi": "10.1103/PhysRevD.76.013009",
    "report-no": "ANL-HEP-PR-07-12",
    "categories": "hep-ph",
    "license": null,
    "abstract": "  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement

In [9]:
# 数据清洗与转换：编写预处理辅助函数对数据进行清洗和转换，例如提取论文的发布日期、组合标题和摘要字段等。
# 以下是提取论文发布日期并转换为 Unix 时间戳
import datetime
import time

def v1_date(row):
    """
    提取作者将论文的第一个版上传到arxiv的日期，并将其转换为Unix时间戳，添加到该行的新字段中
    """
    # versions = row["versions"]
    # date = None
    # for version in versions:
    #     if version["version"] == "v1":
    #         date = datetime.datetime.strptime(version["created"], "%a, %d %b %Y %H:%M:%S %Z")
    #         date = int(time.mktime(date.timetuple()))
    #         row["unix_time"] = date
    return row

In [10]:
# 对数据中的abstract字段向量化

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("./all-MiniLM-L12-v2")
def embed_abstract(row):
    abstract = row["abstract"]
    embedding = model.encode(abstract)
    row["abstract_vector"] = embedding
    return row

papers_with_embeddings = papers_db.map(embed_abstract)

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
first_two_papers = papers_with_embeddings.take(1)
first_two_papers

({'id': '0704.0001',
  'submitter': 'Pavel Nadolsky',
  'authors': "C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan",
  'title': 'Calculation of prompt diphoton production cross sections at Tevatron and\n  LHC energies',
  'comments': '37 pages, 15 figures; published version',
  'journal-ref': 'Phys.Rev.D76:013009,2007',
  'doi': '10.1103/PhysRevD.76.013009',
  'report-no': 'ANL-HEP-PR-07-12',
  'categories': 'hep-ph',
  'license': None,
  'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with d

In [15]:
# 存储到 Milvus

from pymilvus import connections

conn = connections.connect(
    host="127.0.0.1",
    port="19530",
    db_name="my_database"
)

In [16]:
# 创建数据集合
from pymilvus import Collection, FieldSchema, CollectionSchema, DataType

fields = [
    FieldSchema(name='id', dtype=DataType.INT64, is_primary=True),
    FieldSchema(name='abstract_vector', dtype=DataType.FLOAT_VECTOR, dim=384),
    FieldSchema(name='abstract', dtype=DataType.VARCHAR,max_length=1200)    
]
schema = CollectionSchema(fields=fields)
collection = Collection(name='arxiv_abstracts', schema=schema)

In [None]:
# 插入集合
import numpy as np

def insert_data(papers):
    data = []
    for row in papers:
        data.append([row["id"], np.array(row["abstract_vector"],row["abstract"], dtype=np.float32)])
    collection.insert(data)

insert_data(papers_with_embeddings)

In [17]:
index_params = {
    "metric_type": "L2",
    "index_type": "IVF_FLAT",
    "params": {"nlist": 1024}
}
collection.create_index(field_name='abstract_vector', index_params=index_params)

Status(code=0, message=)

In [18]:
# 查看集合
from pymilvus import MilvusClient, DataType

client = MilvusClient(
    uri="http://localhost:19530",
    db_name="my_database"
)
res = client.list_collections()

print(res)

['arxiv_abstracts']


In [19]:
res = client.describe_collection(
    collection_name="arxiv_abstracts"
)

print(res)



{'collection_name': 'arxiv_abstracts', 'auto_id': False, 'num_shards': 1, 'description': '', 'fields': [{'field_id': 100, 'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'params': {}, 'is_primary': True}, {'field_id': 101, 'name': 'abstract_vector', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}, {'field_id': 102, 'name': 'abstract', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 1200}}], 'functions': [], 'aliases': [], 'collection_id': 454645014249876585, 'consistency_level': 2, 'properties': {}, 'num_partitions': 1, 'enable_dynamic_field': False}


In [20]:
from pymilvus import MilvusClient

client = MilvusClient(
    uri="http://localhost:19530",
    db_name="my_database"
)
client.load_collection(
    collection_name="arxiv_abstracts"
)
res = client.get(
    collection_name="arxiv_abstracts",
    ids=[0, 1, 2],
    output_fields=["id", "abstract"]
)
print(res)

data: [] 


In [22]:
from pymilvus import Collection
collection = Collection("arxiv_abstracts")
num_entities = collection.num_entities
print(f"集合中的数据量为: {num_entities}")

集合中的数据量为: 0
