In [1]:
# imports and constants
import openai
import tiktoken
import os
import pdfminer
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfpage import PDFPage
from pdfminer.high_level import extract_text


import json
import requests
from tqdm.notebook import tqdm  # for printing progress bars
import numpy as np
# from redis import Redis
# from redis.commands.search.query import Query
# from redis.commands.search.field import (
#     TextField,
#     VectorField,
#     NumericField
# )
# from redis.commands.search.indexDefinition import (
#     IndexDefinition,
#     IndexType
# )
# Import the Milvus client library
from pymilvus import Collection, CollectionSchema, FieldSchema, DataType, connections

from IPython.display import clear_output, display, Markdown
import time
from datetime import datetime, date



# Chat model name constant.
GPT_MODEL = "gpt-4"

INDEX_NAME = "SangforWP"
VECTOR_DIM = 1536                         # dimension of the embedding vectors stored per page
DISTANCE_METRIC = "COSINE"                # distance metric for the vectors (ex. COSINE, IP, L2)

# Milvus connection settings. They default to a local instance but can be
# overridden through the environment so the notebook runs unchanged against a
# remote deployment.
MILVUS_HOST = os.environ.get("MILVUS_HOST", "127.0.0.1")
MILVUS_PORT = os.environ.get("MILVUS_PORT", "19530")

# Connect to the Milvus database under the "default" connection alias.
connections.connect("default", host=MILVUS_HOST, port=MILVUS_PORT)



None


In [None]:
# Locate the white papers to index. Only regular files with a .pdf extension
# are kept, so stray files (macOS '.DS_Store', notes, archives, ...) never
# reach the PDF parser.
data_dir = os.path.join(os.curdir, 'WhitePapers')
pdf_files = sorted(
    name for name in os.listdir(data_dir)
    if name.lower().endswith('.pdf') and os.path.isfile(os.path.join(data_dir, name))
)
if not pdf_files:
    # Fail with a clear message instead of an IndexError on pdf_files[0].
    raise FileNotFoundError(f"No PDF files found in '{data_dir}'")

# Use pdfminer to extract text from the first PDF (in sorted order).
full_path = os.path.join(data_dir, pdf_files[0])
with open(full_path, 'rb') as f:
    # Count pages without materialising every page object in a list.
    number_of_pages = sum(1 for _ in PDFPage.get_pages(f))

# Map page number (starting at 1) -> text extracted from that page.
# extract_text takes zero-based page indices, hence the n / n + 1 split.
pdf_content = {}
for n in range(number_of_pages):
    text = extract_text(full_path, page_numbers=[n])
    pdf_content[n + 1] = text
pdf_content

In [None]:
# Create search index

# 1. Preparation: describe the layout of the Milvus collection.
# (These FieldSchema definitions replace the RediSearch NumericField /
# TextField / FLAT VectorField definitions used by the earlier Redis version.)
# This is where you should add any additional metadata you want to capture.
collection_name = "pdf_page_collection"

# Primary key: the page number.
page_num = FieldSchema(
    name="page_num",
    dtype=DataType.INT64,
    is_primary=True,
)

# Variable-length text field holding the page text, sized on the assumption
# that no page exceeds 3096 characters. Milvus fields cannot be altered once
# the collection is defined, so either estimate a safe maximum up front or
# store long text in several segments.
content = FieldSchema(
    name="content",
    dtype=DataType.VARCHAR,
    max_length=3096,
)

# Float vector holding the page embedding (VECTOR_DIM components).
page_embedding = FieldSchema(
    name="page_embedding",
    dtype=DataType.FLOAT_VECTOR,
    dim=VECTOR_DIM,
)

fields = [page_num, content, page_embedding]

# Build the collection schema, then create the collection from it.
schema = CollectionSchema(
    fields=fields,
    description="A collection for storing PDF page information",
)
collection = Collection(name=collection_name, schema=schema)

# 2. Prepare the rows for one batch insert (column-oriented lists).

# Must match the max_length declared for the 'content' VARCHAR field in the
# collection schema; an over-long value would make the whole insert fail.
# NOTE(review): Milvus is understood to measure VARCHAR max_length in bytes,
# so truncation below is done on the UTF-8 encoding - confirm for your version.
CONTENT_MAX_BYTES = 3096

insert_data = {
    "page_num": [],        # page numbers (primary key)
    "content": [],         # page text
    "page_embedding": []   # embedding vectors
}

for item_key, item_value in pdf_content.items():
    # Skip pages with no extractable text (blank or image-only pages): an
    # empty input is not a valid embedding request, and a whitespace-only page
    # would only add a meaningless vector to the index.
    if not item_value or not item_value.strip():
        continue

    # Call the OpenAI API to embed the full page text.
    response = openai.Embedding.create(input=item_value, model="text-embedding-ada-002")
    embedding = response["data"][0]["embedding"]

    # Convert the embedding to a float32 list, the form Milvus expects.
    embedding = np.array(embedding).astype(np.float32).tolist()

    # Truncate the stored text to the schema limit. errors="ignore" drops a
    # multi-byte character that the byte cut may have split in half.
    stored_content = (
        item_value.encode("utf-8")[:CONTENT_MAX_BYTES].decode("utf-8", errors="ignore")
    )

    # Collect the row.
    insert_data["page_num"].append(item_key)
    insert_data["content"].append(stored_content)
    insert_data["page_embedding"].append(embedding)

# 3. Insert the data (column order must follow the schema field order).
if insert_data["page_num"]:
    collection.insert([insert_data["page_num"], insert_data["content"], insert_data["page_embedding"]])
    # Seal the inserted rows so they are persisted before the index is built.
    collection.flush()
else:
    print("No pages with extractable text were found; nothing was inserted.")

# 4. Create the vector index.
# NOTE(review): the notebook-level DISTANCE_METRIC constant is "COSINE" while
# this index uses "L2". The value is left unchanged because searches must use
# the same metric as the index - confirm which one is intended.
index_params = {
    "metric_type": "L2",        # distance metric, here Euclidean (L2) distance
    "index_type": "IVF_FLAT",   # index type
    "params": {"nlist": 4096},  # index parameters; tune to the data volume
}

# Build the index exactly once. On failure, report it and re-raise so the cell
# stops here instead of continuing to load() without an index.
try:
    collection.create_index(field_name="page_embedding", index_params=index_params)
    print(f"Index for 'page_embedding' created successfully in '{collection_name}'.")
except Exception as e:
    print(f"Failed to create index in '{collection_name}': {e}")
    raise

# Load the collection (and its index) into memory; required before searching
# and important for query performance.
collection.load()

# Fetch and print the collection information.
info = collection.describe()
print(info)


# for item_key, item_value in pdf_content.items():
#     # Create embedding with GPT(ada)
#     page_embedding = openai.Embedding.create(input=item_value, model="text-embedding-ada-002")["data"][0]["embedding"]
#     # Prepare embedding vector for RediSearch
#     page_embedding = np.array(page_embedding).astype(np.float32).tobytes()
#     key = f"{INDEX_NAME}:Reliability:{item_key}"
#     r.hset(key, mapping={'page_num': item_key, 'content': item_value, 'page_embedding': page_embedding})