In [1]:
!pip install -U pymilvus
!pip install -U sentence-transformers
%pip install -qU langchain-community pypdf
!pip install -qU langchain-openai
%%capture
!pip install retry
!pip install "pymilvus[model]"

Note: you may need to restart the kernel to use updated packages.


UsageError: Line magic function `%%capture` not found.


In [36]:
#Import needed libs
import os
import pprint
import requests
import pandas as pd
from retry import retry
from pymilvus import MilvusClient, model # Model is a embedding function
from sentence_transformers import SentenceTransformer
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

### Ignore above errors

## 1. Loading

Might have to look into the image loading thingy cause this probably does not cover it completely.

In [8]:
file_path = "WEF_Future_of_Jobs_Report_2025.pdf" # You might have to put absolute path in here 
loader = PyPDFLoader(
                    file_path,
                    mode = "single",
                    )

In [10]:
docs = loader.load() # Returns list of length 1 
pprint.pp(docs[0].metadata)

{'producer': 'Adobe PDF Library 17.0',
 'creator': 'Adobe InDesign 20.0 (Macintosh)',
 'creationdate': '2025-01-17T08:10:53+01:00',
 'moddate': '2025-01-17T10:21:50+01:00',
 'trapped': '/False',
 'source': 'WEF_Future_of_Jobs_Report_2025.pdf',
 'total_pages': 290}


## 2. Splitting 

Were going with text structure based splitting for now, even though **[Semantic Splitting](https://python.langchain.com/docs/concepts/text_splitters/#semantic-meaning-based)** might be the move (but this is easier right now)

In [13]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=50)
texts = text_splitter.split_text(docs[0].page_content)

In [15]:
len(texts)

1879

## 3. Storing

### 3.1. Embedding

Trasnform the chunks into vectors

In [19]:
client = MilvusClient("Hackathon.db")

In [21]:
if client.has_collection(collection_name="hackthon_collection"):
    client.drop_collection(collection_name="hackathon_collection")
client.create_collection(
    collection_name="hackthon_collection",
    dimension= 768,
)

In [23]:
embedding_fn = model.DefaultEmbeddingFunction() 

vectors = embedding_fn.encode_documents(texts)
print("Dim:", embedding_fn.dim, vectors[0].shape)  # Dim: 768 (768,)

data = [
    {"id": i, "vector": vectors[i], "text": texts[i], "subject": "history"}
    for i in range(len(vectors))
]

print("Data has", len(data), "entities, each with fields: ", data[0].keys())
print("Vector dim:", len(data[0]["vector"]))

Dim: 768 (768,)
Data has 1879 entities, each with fields:  dict_keys(['id', 'vector', 'text', 'subject'])
Vector dim: 768


### 3.2. Storing into vector store (possibly index for speedy retrieval)

In [25]:
res = client.insert(collection_name="hackthon_collection", data=data)

print(res)

{'insert_count': 1879, 'ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215,


---
## Final: Searching Test

Lets see if the Vector DB works as expected

In [28]:
query_vectors = embedding_fn.encode_queries(["Job stability of network engineers"])

res = client.search(
    collection_name="hackthon_collection",  # target collection
    data=query_vectors,  # query vectors
    limit=2,  # number of returned entities
    output_fields=["text", "subject"],  # specifies fields to be returned
)

print(res[0][0].text)

and encryption technologies (41%, compared 
to 12% globally). Growing job roles in the sector 
are foreseen to include Digital Transformation 
Specialists, Software and Applications Developers, 
and Sales and Marketing Professionals. Alongside 
AI and cybersecurity skills, the industry expects 
stronger emphasis on resilience, flexibility, and 
agility than most other sectors, while more 
employers expect demand for programming and 
design and user experience skills to decline than


---

Context retrieval works!