In [12]:
import langchain
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
from dotenv import load_dotenv
# Load settings from a local .env file (when one is found) into the process
# environment, so secrets stay out of the notebook itself.
load_dotenv()

In [13]:
import os

In [14]:
def read_pdf_files(directory):
    """Load the PDF files found under ``directory``.

    Args:
        directory: Location handed unchanged to ``PyPDFDirectoryLoader``.

    Returns:
        The value produced by the loader's ``load()`` call (in this notebook,
        a list of ``Document`` objects carrying ``page_content`` and
        ``metadata``).
    """
    return PyPDFDirectoryLoader(directory).load()

In [15]:
# Load the PDFs stored under the relative 'documents/' folder via read_pdf_files.
doc=read_pdf_files('documents/')
# Bare expression: shows the loaded list as this cell's output.
doc

[Document(page_content='The EG Agile Playbook\nNext Generation Agile\nExported on 11/29/2023', metadata={'source': 'documents\\The EG Agile Playbook-v3-20231129_183927 2.pdf', 'page': 0}),
 Document(page_content='Next Generation Agile – The EG Agile Playbook\n2Table of Contents\n1 1 - Introduction ................................................................................................ 26\n1.1 1.1 - EG Vision and goal .................................................................................. 26\n1.2 1.2 - Purpose of the playbook ........................................................................ 27\n1.2.1 Content of the playbook ................................................................................. 27\n1.2.2 Target recipients of the playbook .................................................................. 27\n1.3 1.3 - About the tooling .................................................................................... 28\n1.4 1.4 - About the processes

In [23]:
def convert_to_chunk(docs,chunk_size,overlap):
    """Split ``docs`` into smaller pieces with a RecursiveCharacterTextSplitter.

    Args:
        docs: Documents to split; passed straight to ``split_documents``.
        chunk_size: Forwarded as the splitter's ``chunk_size``.
        overlap: Forwarded as the splitter's ``chunk_overlap``.

    Returns:
        The result of ``split_documents(docs)``.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=overlap,
    ).split_documents(docs)

In [24]:
# Split the loaded documents; chunk_size=800 and overlap=50 are passed through
# convert_to_chunk to the RecursiveCharacterTextSplitter.
chunk_array=convert_to_chunk(docs=doc,chunk_size=800,overlap=50)
# Bare expression: shows the resulting chunks as this cell's output.
chunk_array

[Document(page_content='The EG Agile Playbook\nNext Generation Agile\nExported on 11/29/2023', metadata={'source': 'documents\\The EG Agile Playbook-v3-20231129_183927 2.pdf', 'page': 0}),
 Document(page_content='Next Generation Agile – The EG Agile Playbook\n2Table of Contents\n1 1 - Introduction ................................................................................................ 26\n1.1 1.1 - EG Vision and goal .................................................................................. 26\n1.2 1.2 - Purpose of the playbook ........................................................................ 27\n1.2.1 Content of the playbook ................................................................................. 27\n1.2.2 Target recipients of the playbook .................................................................. 27\n1.3 1.3 - About the tooling .................................................................................... 28', metadata={'source': 'documen

In [28]:
# Inspect the first chunk: its text and the 'page' entry of its metadata.
chunk_array[0].page_content,chunk_array[0].metadata['page']

('The EG Agile Playbook\nNext Generation Agile\nExported on 11/29/2023', 0)

In [None]:
# Embed the chunk texts with a sentence-transformer model, store them in a
# Milvus collection, build an IVF_FLAT index and load the collection so it can
# be searched.
#
# Requires a Milvus server reachable at localhost:19530.
# NOTE: every run of this cell inserts the texts again; re-running without
# dropping the collection first will store duplicate rows.
import json
import time

import numpy as np
import spacy
from pymilvus import (
    connections,
    utility,
    FieldSchema, CollectionSchema, DataType,
    Collection,
)
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Single source of truth for the collection name. The previous version checked
# for "hello_milvus" but created and filled "first".
COLLECTION_NAME = "first"

# The assignment here was left unfinished (`descriptions=`), which made the
# whole cell a SyntaxError.
# NOTE(review): assumes the texts to embed are the chunks produced earlier in
# the notebook (`chunk_array`) - confirm this is the intended input.
descriptions = [chunk.page_content for chunk in chunk_array]
if not descriptions:
    raise ValueError("No descriptions to embed; `chunk_array` is empty.")

description_vectors_list = []
description_list = list(descriptions)
# One batched encode call instead of one call per text. The rows are kept as a
# list of 1-D vectors, and the notebook-level `doc` variable (the loaded PDFs)
# is no longer overwritten by an embedding.
reduced_vector_list = list(model.encode(description_list))

fmt = "\n=== {:30} ===\n"
search_latency_fmt = "search latency = {:.4f}s"
# Vector dimension is taken from the embeddings themselves so the schema
# cannot drift from the model (384 for the model above).
dim = len(reduced_vector_list[0])
print(dim)


print(fmt.format("start connecting to Milvus"))
connections.connect("default", host="localhost", port="19530")

has = utility.has_collection(COLLECTION_NAME)
print(f"Does collection {COLLECTION_NAME} exist in Milvus: {has}")


fields = [
    FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="description", dtype=DataType.VARCHAR,max_length=1000),
    FieldSchema(name="vectorembeddings", dtype=DataType.FLOAT_VECTOR, dim=dim)
]

# The description string is kept unchanged on purpose: it is part of the schema
# that is passed alongside the collection name below.
schema = CollectionSchema(fields, "hello_milvus is the simplest demo to introduce the APIs")

print(fmt.format(f"Create collection `{COLLECTION_NAME}`"))
hello_milvus = Collection(COLLECTION_NAME, schema, consistency_level="Strong")


print(fmt.format("Start inserting entities"))
# Column-ordered data matching the non-primary fields of `fields`.
# `pk` is omitted because the schema declares it with auto_id=True.
entities = [
    description_list,      # "description" field
    reduced_vector_list,   # "vectorembeddings" field
]

insert_result = hello_milvus.insert(entities)

hello_milvus.flush()

print(fmt.format("Start Creating index IVF_FLAT"))
index = {
    "index_type": "IVF_FLAT",
    "metric_type": "L2",
    "params": {"nlist": 128},
}

hello_milvus.create_index("vectorembeddings", index)


# Before conducting a search or a query, you need to load the data in the collection into memory.
print(fmt.format("Start loading"))
hello_milvus.load()