# Connect to Milvus

In [2]:
from pymilvus import(
    Milvus,
    IndexType,
    Status,
    connections,
    FieldSchema,
    DataType,
    Collection,
    CollectionSchema
)

import os

## Techzone's Standalone Milvus instance
# host = '161.156.196.183'
# port = '8080'
# password = '<redacted - supply it through MILVUS_PASSWORD>'
# user = 'root'
# server_pem_path =  './data/cert.pem'
# server_name = 'localhost'
## Techzone's watsonx.data Milvus service
# Every setting can be overridden with an environment variable so that real
# credentials never have to be written into the notebook. The fallbacks are the
# values this notebook used before, so it still runs unchanged without them.
host = os.environ.get('MILVUS_HOST', 'useast.services.cloud.techzone.ibm.com')
port = os.environ.get('MILVUS_PORT', '28048')
password = os.environ.get('MILVUS_PASSWORD', 'password')
user = os.environ.get('MILVUS_USER', 'ibmlhadmin')
server_pem_path = os.environ.get('MILVUS_SERVER_PEM_PATH', './data/milvus_cert.pem')
server_name = os.environ.get('MILVUS_SERVER_NAME', 'watsonxdata')

# Register the connection under the 'default' alias, which is the alias the
# rest of the notebook relies on.
connections.connect(alias = 'default',
                host = host,
                port = port,
                user = user,
                password = password,
                server_pem_path=server_pem_path,
                server_name = server_name,
                secure = True)

In [3]:
# Show the registered connection aliases with their handlers to confirm that
# the 'default' connection exists.
connections.list_connections()

[('default', <pymilvus.client.grpc_handler.GrpcHandler at 0x22652979ee0>)]

# Creating Milvus schema definition

## News collections

In [6]:
# Field layout shared by all news collections.
fields = [
    # Auto-generated primary key
    FieldSchema("id", DataType.INT64, is_primary=True, auto_id=True),
    # Text content
    FieldSchema("title", DataType.VARCHAR, max_length=300),
    FieldSchema("article", DataType.VARCHAR, max_length=2500),
    # 1024-dimensional vector that the search index is built on
    FieldSchema("embedding", DataType.FLOAT_VECTOR, dim=1024),
    # Metadata
    FieldSchema("school_year", DataType.INT32),
    FieldSchema("in_effect", DataType.VARCHAR, max_length=100),
    FieldSchema("file_links", DataType.VARCHAR, max_length=300),
    FieldSchema("created_at", DataType.VARCHAR, max_length=200),
    FieldSchema("updated_at", DataType.VARCHAR, max_length=200),
]

schema = CollectionSchema(fields, description="news schema")

In [7]:
# Names of the news collections; one Milvus collection is created per entry.
collection_names = [
    'recruitment',
    'timetable',
    'scholarship',
    'academic_affairs',
    'events',
]

Add search index

In [8]:
# The index settings are the same for every collection, so build them once
# instead of re-creating the dict on each loop iteration.
index_params = {
    'metric_type': 'L2',
    'index_type': "IVF_FLAT",
    'params': {"nlist": 2048},
}

for col in collection_names:
    # Bind each news collection to the news schema, then index its vector field.
    collection = Collection(col, schema)
    collection.create_index(field_name='embedding', index_params=index_params)

## For student handbook collections (default collections)

In [9]:
# Field layout of the student handbook collection. It matches the news layout
# except that it carries `page_number`, has no `file_links`, and allows a
# longer `article`.
fields = [
    FieldSchema("id", DataType.INT64, is_primary=True, auto_id=True),  # primary key
    FieldSchema("title", DataType.VARCHAR, max_length=300),
    FieldSchema("article", DataType.VARCHAR, max_length=5000),
    FieldSchema("embedding", DataType.FLOAT_VECTOR, dim=1024),
    FieldSchema("page_number", DataType.INT32),
    FieldSchema("school_year", DataType.INT32),
    FieldSchema("in_effect", DataType.VARCHAR, max_length=100),
    FieldSchema("created_at", DataType.VARCHAR, max_length=200),
    FieldSchema("updated_at", DataType.VARCHAR, max_length=200),
]

# NOTE(review): the description still reads "news schema" although this is the
# handbook schema. It is left as-is here; renaming it may conflict with a
# collection that was already created with this description - confirm first.
schema = CollectionSchema(fields, description="news schema")

collection = Collection('student_handbook', schema)

# Same index settings as the news collections.
index_params = dict(
    metric_type='L2',
    index_type="IVF_FLAT",
    params={"nlist": 2048},
)
collection.create_index(field_name='embedding', index_params=index_params)

Status(code=0, message=)

## Check collection

In [11]:
# List the collections on the 'default' connection through the public
# `utility` API rather than the private `connections._fetch_handler` helper.
from pymilvus import utility

utility.list_collections(using='default')

['timetable',
 'events',
 'student_handbook',
 'recruitment',
 'scholarship',
 'academic_affairs']

UTILS: drop all collections

In [5]:
from pymilvus import utility

# WARNING: destructive - drops every collection visible on the 'default'
# connection. Uses the public `utility` API instead of the private
# `connections._fetch_handler` helper.
for col in utility.list_collections(using='default'):
    utility.drop_collection(col, using='default')

# Process data
- Embedding content
- Preprocess headings
- Add columns
- Remove nulls
- Reformat

## Initialize the embedding model

Import

In [1]:
import os
from ibm_watsonx_ai.metanames import EmbedTextParamsMetaNames as EmbedParams
from ibm_watsonx_ai.foundation_models.utils.enums import EmbeddingTypes
from ibm_watsonx_ai.foundation_models import Embeddings
from dotenv import load_dotenv

Credentials & initialize

In [2]:
# Load variables from the local .env file; WATSONX_APIKEY and
# WATSONX_PROJECT_ID are read from the environment below.
load_dotenv('./data/.env')
my_credentials = dict(
    url="https://us-south.ml.cloud.ibm.com",
    apikey=os.environ['WATSONX_APIKEY'],
)

# Alternative embedding model:
# model_id = 'sentence-transformers/all-minilm-l12-v2'
model_id = 'intfloat/multilingual-e5-large'
gen_parms = None
project_id = os.environ['WATSONX_PROJECT_ID']
space_id = None
# NOTE(review): presumably this turns off TLS certificate verification for the
# watsonx client - confirm that this is intended.
verify = False

# Keep truncate_input_tokens at or below the maximum number of tokens the
# chosen embedding model accepts. Without it, an input longer than the model
# can process produces an error.
embed_params = {EmbedParams.TRUNCATE_INPUT_TOKENS: 512}

model = Embeddings(
    model_id=model_id,
    credentials=my_credentials,
    params=embed_params,
    project_id=project_id,
    verify=verify,
)

Preprocess function
- As mentioned in [the model card](https://huggingface.co/intfloat/multilingual-e5-large), text should be prefixed before it is embedded: `passage: ...` for content and `query: ...` for search queries.

In [3]:
def preprocess_embed(text, isQuery=False):
    """Prefix ``text`` with the marker used for embedding input.

    Args:
        text: String to be embedded.
        isQuery: Truthy to mark ``text`` as a search query; otherwise it is
            marked as a passage (content to be stored).

    Returns:
        ``"query: " + text`` when ``isQuery`` is truthy, else
        ``"passage: " + text``.
    """
    prefix = "query: " if isQuery else "passage: "
    return prefix + text

## DATA: Student handbook

In [4]:
import pandas as pd
from glob import glob

# glob() returns matches in an arbitrary, filesystem-dependent order; sort them
# so the handbook files are processed in the same order on every run.
paths = sorted(glob('./data/student_handbook/*'))

In [13]:
import os

from pypdf import PdfReader
from tqdm.notebook import tqdm

# Build one record per PDF page: the page text, its embedding and metadata.
data = []
for path in tqdm(paths, desc="Documents", position=0):
    # Title = file name up to its first dot. os.path.basename copes with the
    # path separators of the current OS; splitting on '\\' alone produced an
    # empty title for paths such as './data/student_handbook/x.pdf'.
    title = os.path.basename(path).split('.')[0]

    # The context manager closes the file even if a page fails or the cell is
    # interrupted part-way through.
    with open(path, 'rb') as fp:
        reader = PdfReader(fp)
        n_pages = len(reader.pages)

        # Passing total= gives the per-document progress bar a known length.
        for i, page in tqdm(enumerate(reader.pages), desc="Pages", total=n_pages,
                            position=1, leave=False):
            text = page.extract_text() #Article
            # The title is prepended to the page text before embedding.
            embed_text = preprocess_embed(title + '\n' + text)
            embedding = model.embed_query(embed_text) #Embedding

            page_number = i + 1 #Page number (1-based)
            school_year = 2024 #School year
            # Stored as text because the student_handbook schema declares
            # `in_effect` as VARCHAR.
            in_effect = '2024' #In effect till

            record = {
                'title': title,
                'article': text,
                'embedding': embedding,
                'page_number': page_number,
                'school_year': school_year,
                'in_effect': in_effect,
                'created_at': '',
                'updated_at': ''
            }
            data.append(record)

Documents:   0%|          | 0/5 [00:00<?, ?it/s]

Pages: 0it [00:00, ?it/s]

Pages: 0it [00:00, ?it/s]

KeyboardInterrupt: 

In [14]:
# Number of page records currently held in `data`.
len(data)

11

In [32]:
# Spot-check the text extracted from the first page of one handbook PDF.
# Forward slashes keep the path usable on every OS, not only Windows.
path = './data/student_handbook/Khoa công nghệ thông tin.pdf'
# PdfReader receives the path itself, so this cell opens no file handle of its
# own. The former `fp.close()` referred to a handle created by another cell and
# has been removed.
reader = PdfReader(path)

text = reader.pages[0].extract_text()
text

'KHOA \nCÔNG NGH/uni1EC6 THÔNG TIN\nThông tin chung\ninfo@/f_it.hcmus.edu.vn\nH/uni1ED7 tr/uni1EE3 sinh viên: tlsv@/f_it.hcmus.edu.vn Giáo v/uni1EE5: o Chương trình Chu/uni1EA9n: giaovu@/f_it.hcmus.edu.vn o Chương trình Ch/uni1EA5t lư/uni1EE3ng cao: giaovu.clc@/f_it.hcmus.edu.vno Chương trình Tiên ti/uni1EBFn: giaovu@apcs./f_itus.edu.vnC/uni1ED1 v/uni1EA5n h/uni1ECDc t/uni1EADp: cvht@/f_it.hcmus.edu.vn Chương trình đ/uni1EC1 án: ctdb@hcmus.edu.vn(028) 38 354 266 (Ext: 500)(028) 62 884 499 (Ext: 4000)Văn phòng khoa: phòng I.53, 227 Nguy/uni1EC5n Văn C/uni1EEB, Q.5, TP .HCM\nwww./f_it.hcmus.edu.vnBan Ch/uni1EE7 nhi/uni1EC7m:Trư/uni1EDFng Khoa: TS. Đinh Bá Ti/uni1EBFnPhó Trư/uni1EDFng Khoa: TS. Lâm Quang Vũ, PGS.TS. Nguy/uni1EC5n Văn Vũ, ThS. Văn Chí Nam \nM/uni1EE4C TIÊU ĐÀO T/uni1EA0O\nS/uni1EE8 M/uni1EC6NH\n- Cung c/uni1EA5p các tr/uni1EA3i nghi/uni1EC7m gi/uni1EA3ng d/uni1EA1y và h/uni1ECDc t/uni1EADp hàng đ/uni1EA7u cho các chương trình đào t/uni1EA1o b/uni1EADc đ/uni1EA1i h/uni1ECDc

In [15]:
# Print the first record's page text so that its newlines render as line breaks.
print(data[0]['article'])

KHOA 
CÔNG NGH/uni1EC6 THÔNG TIN
Thông tin chung
info@/f_it.hcmus.edu.vn
H/uni1ED7 tr/uni1EE3 sinh viên: tlsv@/f_it.hcmus.edu.vn Giáo v/uni1EE5: o Chương trình Chu/uni1EA9n: giaovu@/f_it.hcmus.edu.vn o Chương trình Ch/uni1EA5t lư/uni1EE3ng cao: giaovu.clc@/f_it.hcmus.edu.vno Chương trình Tiên ti/uni1EBFn: giaovu@apcs./f_itus.edu.vnC/uni1ED1 v/uni1EA5n h/uni1ECDc t/uni1EADp: cvht@/f_it.hcmus.edu.vn Chương trình đ/uni1EC1 án: ctdb@hcmus.edu.vn(028) 38 354 266 (Ext: 500)(028) 62 884 499 (Ext: 4000)Văn phòng khoa: phòng I.53, 227 Nguy/uni1EC5n Văn C/uni1EEB, Q.5, TP .HCM
www./f_it.hcmus.edu.vnBan Ch/uni1EE7 nhi/uni1EC7m:Trư/uni1EDFng Khoa: TS. Đinh Bá Ti/uni1EBFnPhó Trư/uni1EDFng Khoa: TS. Lâm Quang Vũ, PGS.TS. Nguy/uni1EC5n Văn Vũ, ThS. Văn Chí Nam 
M/uni1EE4C TIÊU ĐÀO T/uni1EA0O
S/uni1EE8 M/uni1EC6NH
- Cung c/uni1EA5p các tr/uni1EA3i nghi/uni1EC7m gi/uni1EA3ng d/uni1EA1y và h/uni1ECDc t/uni1EADp hàng đ/uni1EA7u cho các chương trình đào t/uni1EA1o b/uni1EADc đ/uni1EA1i h/uni1ECDc và sau đ

## DATA: News data

In [1]:
import pandas as pd

# Load the news CSV and summarise its columns, dtypes and non-null counts.
df = pd.read_csv('./data/FIT_news.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 640 entries, 0 to 639
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        640 non-null    object
 1   article      640 non-null    object
 2   file_links   640 non-null    object
 3   url          640 non-null    object
 4   type         640 non-null    object
 5   created_at   640 non-null    object
 6   school_year  640 non-null    int64 
 7   in_effect    640 non-null    int64 
dtypes: int64(2), object(6)
memory usage: 40.1+ KB


In [62]:
from glob import glob

# Sort the matches: glob() order depends on the filesystem, and a later cell
# selects a file by position (paths[4]).
paths = sorted(glob('./data/thesis_rag_data/*'))

In [63]:
import json
import os

# Insert every JSON file into the collection named after the file.
data = []  # stays empty when `paths` has no entries
for path in paths:
    with open(path, 'r', encoding='utf-8') as rstream:
        data = json.load(rstream)

    for d in data:
        # Drop the source '_id' key if present; the collection schemas in this
        # notebook use an auto-generated `id` instead.
        d.pop('_id', None)
        # school_year is declared as INT32 in the collection schemas.
        d['school_year'] = int(d['school_year'])

    # Collection name = file name up to its first dot. os.path.basename copes
    # with the path separators of the current OS, whereas splitting on '\\'
    # only worked for Windows-style paths.
    collection_name = os.path.basename(path).split('.')[0]
    collection = Collection(collection_name)
    collection.insert(data)


In [None]:
import os

# Select one file by position and show the collection name derived from it:
# the file name up to its first dot. os.path.basename copes with the path
# separators of the current OS, whereas splitting on '\\' only worked for
# Windows-style paths.
path = paths[4]
os.path.basename(path).split('.')[0]

In [7]:
import json

# Read the JSON file referenced by `path` into `data`.
with open(path, mode='r', encoding='utf-8') as source:
    data = json.loads(source.read())

# Normalise each record in place: remove '_id' when present and store
# school_year as an int.
for record in data:
    if '_id' in record:
        del record['_id']
    record['school_year'] = int(record['school_year'])

In [None]:
# Insert the records held in `data` into every collection in collection_names.
# NOTE(review): presumably `data` comes from a single file at this point, so
# every collection would receive the same rows - confirm this is intended.
for col in collection_names:
    Collection(col).insert(data)

In [134]:
# Get a handle to the 'timetable' collection by name (no schema is passed).
collection = Collection('timetable')

In [None]:
# Insert the records held in `data` into the collection bound to `collection`.
collection.insert(data)

***

In [80]:
# for c in connections._fetch_handler('default').list_collections():
#     Collection(c).drop()

***

In [65]:
import os
from ibm_watsonx_ai.metanames import EmbedTextParamsMetaNames as EmbedParams
from ibm_watsonx_ai.foundation_models.utils.enums import EmbeddingTypes
from ibm_watsonx_ai.foundation_models import Embeddings
from dotenv import load_dotenv

In [66]:
# Sets up the embedding client for the search section. The settings are the
# same as in the "Credentials & initialize" cell earlier in this notebook.
load_dotenv('./data/.env')
my_credentials = {"url": "https://us-south.ml.cloud.ibm.com"}
my_credentials["apikey"] = os.environ['WATSONX_APIKEY']

# Other model that can be used here:
# model_id = 'sentence-transformers/all-minilm-l12-v2'
model_id = 'intfloat/multilingual-e5-large'
gen_parms = None
project_id = os.environ['WATSONX_PROJECT_ID']
space_id = None
verify = False

# truncate_input_tokens must not exceed the maximum number of tokens the
# embedding model accepts. If it is left unset and an input is longer than the
# model can process, an error is generated.
embed_params = {
    EmbedParams.TRUNCATE_INPUT_TOKENS: 512,
}

model = Embeddings(
    project_id=project_id,
    model_id=model_id,
    credentials=my_credentials,
    params=embed_params,
    verify=verify,
)

In [67]:
# Sample question that is embedded and searched for in the following cell.
query = "Khoa công nghệ thông tin là gì"

In [68]:
# The handbook pages were embedded with the "passage: " prefix, so the query
# has to be embedded with the matching "query: " prefix. The model card asks
# for this pairing; previously the raw query was embedded without a prefix.
query_embeddings = model.embed_query(preprocess_embed(query, isQuery=True))

col_name = "student_handbook"
collection = Collection(col_name)
# Load the collection before searching it.
collection.load()
search_params = {
    # Same metric the index on `embedding` was created with.
    "metric_type": "L2",
    "params": {"nprobe": 5}
}
top_k = 3
output_fields = ['title', 'article'] #Section field to be added
results = collection.search(
    data=[query_embeddings],  # one query vector
    anns_field="embedding",
    param=search_params,
    limit=top_k,
    expr=None,  # no scalar filter
    output_fields=output_fields
)

In [None]:
# Show the first hit returned for the first (and only) query vector as a dict.
results[0][0].to_dict()