# Libraries

In [15]:
from pymilvus import connections
from pymilvus import db
from pymilvus import Role, utility
from pymilvus import CollectionSchema, FieldSchema, DataType, Collection
from sentence_transformers import SentenceTransformer
import pandas as pd
import json


# Connect to DB

In [2]:
connections.add_connection(
    learn={"host": "localhost", "port": "19530", "username": "", "password": ""}
)

connection_id = "learn"
connections.connect(connection_id)

print(connections.list_connections())

current_dbs = db.list_database(using=connection_id)
print("Current databases:", current_dbs)

db_name = "wiki_db"

if db_name not in current_dbs:
    print("Creating database:", db_name)
    wiki_db = db.create_database(db_name, using=connection_id)

db.using_database(db_name, using=connection_id)

[('default', None), ('learn', <pymilvus.client.grpc_handler.GrpcHandler object at 0x000001FCEC28A900>)]
Current databases: ['default']
Creating database: wiki_db


# Create New User

In [5]:
current_users = utility.list_usernames(using=connection_id)

print("Current User list:", current_users)

new_user = "wiki_public"

if new_user not in current_users:
    utility.create_user(new_user, "password", using=connection_id)

public_role = Role("public", using=connection_id)
print("Role public exists?", public_role.is_exist())


Current User list: ['root']
Role public exists? True


# Create Collection

In [9]:
#Define fields
course_id = FieldSchema(
    name="course_id",
    dtype=DataType.INT64,
    is_primary=True,
    max_length=32)

title= FieldSchema(
    name="title",
    dtype=DataType.VARCHAR,
    max_length=256)

description= FieldSchema(
    name="description",
    dtype=DataType.VARCHAR,
    max_length=2048)

desc_embedding = FieldSchema(
    name="desc_embedding",
    dtype=DataType.FLOAT_VECTOR,
    dim=384
)

wiki_schema=CollectionSchema(
    fields=[course_id, title, description, desc_embedding],
    description="Courses List",
    enable_dynamic_field=True
)

collection_name="Course_List"

wiki_collection=Collection(
    name=collection_name,
    schema=wiki_schema,
    using=connection_id,
    shard_num=2
)

print("Current collections: ",utility.list_collections(using=connection_id))

r_collection=Collection(collection_name, using=connection_id)
print("\n", r_collection.schema)

Current collections:  ['courses_list', 'Course_List']

 {'auto_id': False, 'description': 'Courses List', 'fields': [{'name': 'course_id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': False}, {'name': 'title', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 256}}, {'name': 'description', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 2048}}, {'name': 'desc_embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}], 'enable_dynamic_field': True}


# Define Embedding Model

In [11]:
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Data Preparation

In [44]:
df = pd.read_csv("course-descriptions.csv")
df.head()

Unnamed: 0,Course ID,Title,Description
0,1001,Architecting Big Data applications,Learn how to architect both simple and complex...
1,1002,MLOps Essentials: Monitoring Model Drift and Bias,As more and more ML models are developed and d...
2,1003,Apache Kafka Essential Training: Getting Started,"In this course, instructor introduces Apache..."
3,1004,Applied AI: Building NLP Apps with Hugging Fac...,Explore models designed for common NLP use cas...
4,1005,Deep Learning : Getting started,Deep learning as a technology has grown leaps ...


In [20]:
i_course_id = df["Course ID"].tolist()
i_title = df["Title"].tolist()
i_description = df["Description"].tolist()

i_desc_embedding=[model.encode(i)
                  for i in i_description]

insert_data=[i_course_id, i_title, i_description, i_desc_embedding]

# Insert Data

In [21]:
course_collection = Collection(collection_name, using=connection_id)

mr=course_collection.insert(insert_data)
print("Inserted data. Now performing flush operation")
course_collection.flush(timeout=180)

Inserted data. Now performing flush operation


In [22]:
# Build Index
index_params = {
    "metric_type":"L2",
    "index_type":"IVF_FLAT",
    "params" :{"nlist":1024}
}

course_collection.create_index(
    field_name="desc_embedding",
    index_params=index_params
)

utility.index_building_progress(collection_name,using=connection_id)

{'total_rows': 5,
 'indexed_rows': 5,
 'pending_index_rows': 0,
 'state': 'Finished'}

# Query Data

# Scalar Search

In [23]:
course_collection.load()
print("Collection loaded")

q_result= course_collection.query(
    expr = "course_id == 1001",
    output_fields = ["title","description"]
)

for i, row in enumerate(q_result):
    print("Title:", row['title'])
    print("Description:\n", row['description'])

Collection loaded
Title: Architecting Big Data applications
Description:
 Learn how to architect both simple and complex batch processing applications, as you discover the basic principles of big data architectures such as horizontal scaling, distributed processing, technology selection and integration, and scheduling.


# Vector Search

In [39]:
def query_search(query, model, params):
    q_vector = model.encode(query)
    search_results = course_collection.search(
        data=[q_vector],
        anns_field="desc_embedding", 
        param=params,
        limit=1, 
        expr=None, 
        output_fields=["title", "description"],
        consistency_level="Strong"
    )
    
    for i in search_results[0]:
        course_id = i.id
        distance = round(i.distance, 2)
        title = i.entity.get("title")
        description = i.entity.get("description")

        paragraph = f"{course_id:03} {distance} {title} \n{description.strip()}"
        print(paragraph)


In [40]:
query = "Give me courses related to MLOps"
query_search(query, model, index_params)

1002 1.05 MLOps Essentials: Monitoring Model Drift and Bias 
As more and more ML models are developed and deployed, the need arises to ensure that the models are effective and safe and that they perform as desired. Model monitoring, a core function of MLOps, helps data scientists and MLOps engineers to meet this need. In this course, data analytics expert   discusses the types of monitoring needed for ML models.


In [47]:
query = "Give Courses related to deep learning"
query_search(query, model, index_params)

1005 0.72 Deep Learning : Getting started 
Deep learning as a technology has grown leaps and bounds in the last few years. More and more AI solutions use deep learning as their foundational technology. Studying this technology, however, has several challenges.  Instructor   starts off with an intro to deep learning, including artificial neural networks and architectures. He navigates through various building blocks of neural networks with simple and easy to understand explanations.


In [49]:
query = "Give me courses related to Apache Kafka Essentials"
query_search(query, model, index_params)

1003 0.48 Apache Kafka Essential Training: Getting Started 
In this course, instructor   introduces Apache Kafka and explains its fundamental concepts and basic operations.  covers basic concepts like messages, topics, logs, and more. He shows you how to use the Kafka command line, as well as partitions and groups. He goes over Kafka Java programming, then concludes with a use case project.
