In [None]:
! pip install openai pymilvus datasets tqdm

Collecting pymilvus
  Downloading pymilvus-2.5.0-py3-none-any.whl.metadata (5.7 kB)
Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting grpcio<=1.67.1,>=1.49.1 (from pymilvus)
  Downloading grpcio-1.67.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Collecting python-dotenv<2.0.0,>=1.0.1 (from pymilvus)
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting ujson>=2.0.0 (from pymilvus)
  Downloading ujson-5.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.3 kB)
Collecting milvus-lite>=2.4.0 (from pymilvus)
  Downloading milvus_lite-2.4.10-py3-none-manylinux2014_x86_64.whl.metadata (9.0 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from

In [None]:
import os
from google.colab import userdata

# Copy the OpenAI key from the Colab `userdata` store into the process
# environment, so the key itself never has to be written into the notebook.
os.environ["OPENAI_API_KEY"] = userdata.get("OPENAI_API_KEY")

In [None]:
from openai import OpenAI

# Shared OpenAI client used by the embedding helper further down. No key is
# passed explicitly; the client is expected to pick up OPENAI_API_KEY from the
# environment, which the previous cell sets.
openai_client = OpenAI()

In [None]:
# Name of the Milvus collection that is created, indexed, filled and searched below.
COLLECTION_NAME = "movie_search"
# Length of one embedding vector. It is used as `dim` of the FLOAT_VECTOR field,
# so it has to equal the size of the vectors returned by the embedding model.
DIMENSION = 1536
# Number of rows that are embedded and inserted together in one request.
BATCH_SIZE = 1000

In [None]:
from pymilvus import MilvusClient

# Connect to Milvus Database. A local file path is given instead of a server
# address, so the data lives in ./milvus_demo.db next to the notebook.
# NOTE(review): presumably this runs on the embedded Milvus Lite engine that the
# pip output above shows being installed -- confirm against the pymilvus docs.
client = MilvusClient("./milvus_demo.db")

DEBUG:pymilvus.milvus_client.milvus_client:Created new connection using: 90b09b952e84458d97d690bba0f35ebc


In [None]:
# Remove collection if it already exists, so that re-running the notebook
# rebuilds the collection from scratch instead of reusing a previous run's data.
if client.has_collection(COLLECTION_NAME):
    client.drop_collection(COLLECTION_NAME)

In [None]:
from pymilvus import DataType

# 1. Create schema: primary keys are generated automatically (auto_id) and
#    dynamic fields are switched off, so only the fields declared here exist.
schema = MilvusClient.create_schema(enable_dynamic_field=False, auto_id=True)

# 2. Add fields to schema
schema.add_field(field_name="id", datatype=DataType.INT64, is_primary=True)

# Text fields as (field name, maximum string length), in declaration order.
_varchar_fields = [
    ("title", 640),
    ("rating", 640),
    ("number_of_rating", 640),
    ("image", 6400),
    ("description", 64000),
]
for _field_name, _max_length in _varchar_fields:
    schema.add_field(
        field_name=_field_name,
        datatype=DataType.VARCHAR,
        max_length=_max_length,
    )

# Vector field that holds the embedding of each description.
schema.add_field(field_name="embedding", datatype=DataType.FLOAT_VECTOR, dim=DIMENSION)

# 3. Create collection with the schema
client.create_collection(schema=schema, collection_name=COLLECTION_NAME)

DEBUG:pymilvus.milvus_client.milvus_client:Successfully created collection: movie_search


In [None]:
# 1. Prepare index parameters
index_params = client.prepare_index_params()

# 2. Add an index on the embedding field
#    "IP" (inner product) is the same metric that query() passes at search time.
#    AUTOINDEX with empty params supplies no manual tuning values.
index_params.add_index(
    field_name="embedding", metric_type="IP", index_type="AUTOINDEX", params={}
)

# 3. Create index
client.create_index(collection_name=COLLECTION_NAME, index_params=index_params)

# 4. Load Collection
#    Done only after the index exists.
#    NOTE(review): presumably loading is required before the collection can be
#    searched -- confirm against the Milvus docs.
client.load_collection(collection_name=COLLECTION_NAME)

DEBUG:pymilvus.milvus_client.milvus_client:Successfully created an index on collection: movie_search


In [None]:
import pandas as pd

# Load the anime catalogue. Columns used later: Image, Name, Rating,
# Number of Episodes and Description.
# NOTE(review): the absolute /content path only exists on Colab -- the CSV
# presumably has to be uploaded there first; confirm where it comes from.
df = pd.read_csv('/content/anime_data.csv')
df

Unnamed: 0,Image,Name,Rating,Number of Episodes,Description
0,https://m.media-amazon.com/images/M/MV5BYWFhOW...,1. Dandadan,8.7,(12K),"When Momo and Okarun's beliefs clash, they're ..."
1,https://m.media-amazon.com/images/M/MV5BNjY4MD...,2. Attack on Titan,9.1,(566K),"After his hometown is destroyed, young Eren Ja..."
2,https://m.media-amazon.com/images/M/MV5BMjgyM2...,3. Bleach: Thousand-Year Blood War,9.0,(50K),The peace is suddenly broken when warning sire...
3,https://m.media-amazon.com/images/M/MV5BMTNjNG...,4. One Piece,9.0,(272K),Monkey D. Luffy sets off on an adventure with ...
4,https://m.media-amazon.com/images/M/MV5BMWU1OG...,5. Demon Slayer: Kimetsu no Yaiba,8.6,(172K),A family is attacked by demons and only two me...
...,...,...,...,...,...
995,https://m.media-amazon.com/images/M/MV5BMGY3Zm...,996. D-1 Devastator,,,"Ryo joins the an automobile company, where tak..."
996,https://m.media-amazon.com/images/M/MV5BOGNkZT...,997. A Christmas to End All Time,8.0,(9),Jimmy Stewart and George Bailey are church pla...
997,https://m.media-amazon.com/images/M/MV5BMzE0OT...,998. Intermezzo,6.4,(33),Bear joins Mimiru for an event held in the Dun...
998,https://m.media-amazon.com/images/M/MV5BYjkxYT...,999. Tales of Destiny,7.7,(12),Video Game


In [None]:
# Strip the leading rank prefix ("1. ", "996. ", ...) from every title: one or
# more digits at the start of the string, a literal dot, then any whitespace.
df['Name'] = df['Name'].str.replace(r'^\d+\.\s*', '', regex=True)
# Show the cleaned titles as a quick visual check.
df["Name"]

Unnamed: 0,Name
0,Dandadan
1,Attack on Titan
2,Bleach: Thousand-Year Blood War
3,One Piece
4,Demon Slayer: Kimetsu no Yaiba
...,...
995,D-1 Devastator
996,A Christmas to End All Time
997,Intermezzo
998,Tales of Destiny


In [None]:
# These columns are stored in VARCHAR fields, so they have to be strings.
# astype(str) alone turns a missing value (NaN) into the literal text "nan",
# which would be inserted and shown in search results as if it were real data,
# so missing cells are mapped to "" instead.
for column in ("Rating", "Number of Episodes", "Image"):
    is_present = df[column].notna()
    df[column] = df[column].astype(str).where(is_present, "")

## Insert the Data
Now that the data is loaded and cleaned, we can embed it and insert it into Milvus. The embedding function takes text (a single string or a list of strings) and returns the embeddings as a list, one embedding vector per input text.

In [None]:
def emb_texts(texts):
    """Embed `texts` with OpenAI's "text-embedding-3-small" model.

    Args:
        texts: The input forwarded unchanged to the embeddings endpoint.

    Returns:
        A list holding the `embedding` of every item in the response's
        `data`, in the order the response lists them.
    """
    response = openai_client.embeddings.create(
        model="text-embedding-3-small",
        input=texts,
    )

    vectors = []
    for record in response.data:
        vectors.append(record.embedding)
    return vectors

In [None]:
from tqdm import tqdm

def _as_text(value):
    """Return `value` as a string, mapping a missing value (None/NaN) to ""."""
    # `value or ""` is not enough here: NaN is truthy, so a missing cell would
    # slip through as a float instead of becoming an empty string.
    return "" if pd.isna(value) else str(value)


# Convert the frame to plain dicts once; indexing df.iloc[i] for every field
# builds a new Series each time.
rows = df.to_dict("records")

# Embed and insert in batches of at most BATCH_SIZE rows
for start in tqdm(range(0, len(rows), BATCH_SIZE)):
    # batch (data to be inserted) is a list of dictionaries
    batch = [
        {
            "title": _as_text(row["Name"]),
            "image": _as_text(row["Image"]),
            "number_of_rating": _as_text(row["Number of Episodes"]),
            "rating": _as_text(row["Rating"]),
            "description": _as_text(row["Description"]),
        }
        for row in rows[start : start + BATCH_SIZE]
    ]

    # The embeddings endpoint rejects empty strings, so a row without a
    # description is embedded by its title instead. A row with neither still
    # fails loudly at the API rather than being stored with a made-up vector.
    texts = [item["description"] or item["title"] for item in batch]
    embeddings = emb_texts(texts)

    for item, emb in zip(batch, embeddings):
        item["embedding"] = emb

    client.insert(collection_name=COLLECTION_NAME, data=batch)

In [None]:
import textwrap


def query(query, top_k=5):
    """Run a semantic search for `query` and print the best matches.

    The query text is embedded with `emb_texts` and searched against
    COLLECTION_NAME with metric_type "IP". Every hit is printed with its
    rank, score, stored metadata and its description wrapped at 88 columns.

    Args:
        query: Text to search for.
        top_k: Value passed to the search as `limit`.

    Returns:
        None; the results are only printed.
    """
    wanted_fields = ["title", "image", "number_of_rating", "rating", "description"]
    ip_search = {"metric_type": "IP", "params": {}}

    query_vectors = emb_texts(query)
    res = client.search(
        collection_name=COLLECTION_NAME,
        data=query_vectors,
        limit=top_k,
        output_fields=wanted_fields,
        search_params=ip_search,
    )

    print("Query:", query)

    # The result is walked as groups of hits (presumably one group per query
    # vector); ranks restart at 1 inside each group.
    for hit_group in res:
        print("Results:")
        rank = 0
        for hit in hit_group:
            rank += 1
            entity = hit["entity"]

            headline = f"\tRank: {rank} Score: {hit['distance']:} Title: {entity.get('title', '')}"
            print(headline)

            details = (
                f"\t\t Image url: {entity.get('image', '')} "
                f"Number of Rating: {entity.get('number_of_rating', '')} "
                f"Rating: {entity.get('rating', '')}"
            )
            print(details)

            wrapped = textwrap.fill(entity.get("description", ""), width=88)
            print(wrapped)
            print()


# Example search: print the top matches for a free-text request.
my_query = "anime about fighting with monsters"
query(my_query)

Query: anime about fighting with monsters
Results:
	Rank: 1 Score: 0.5676732063293457 Title: Digimon Tamers
		 Image url: https://m.media-amazon.com/images/M/MV5BMTUyNTEwNjcwNV5BMl5BanBnXkFtZTcwMDI4MDE2MQ@@._V1_QL75_UY207_CR34,0,140,207_.jpg Number of Rating:  (2.4K) Rating: 7.6
An animated series based on the popular children's toy "Digimon", in which kids raise
electronic monsters to fight against those raised by other kids.

	Rank: 2 Score: 0.5429167747497559 Title: Otakus
		 Image url: https://m.media-amazon.com/images/M/MV5BODRiZTE2OTYtNjg1MC00OTkyLWFlM2MtOWMzNWRiMzIzNWE0XkEyXkFqcGc@._V1_QL75_UY207_CR3,0,140,207_.jpg Number of Rating:  (132) Rating: 8.2
Two fans about Japan animation fight against each other to demonstrate who knows best
about it.

	Rank: 3 Score: 0.5067501664161682 Title: TV Animation X: Unmei No Tatakai
		 Image url: https://m.media-amazon.com/images/M/MV5BNTQwZTVjZDgtZGZkMi00NWNhLWIwZWUtMWFiZWQ1NGNjYWJjXkEyXkFqcGc@._V1_QL75_UY207_CR32,0,140,207_.jpg Number of R