In [2]:
import pandas as pd
import numpy as np

In [3]:
# View config: raise pandas' display limits (rows, columns, line width, cell text width).
display_options = {
    'display.max_rows': 100,
    'display.max_columns': 200,
    'display.width': 1000,
    'max_colwidth': 400,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [4]:
# Load the Goodreads book data from a CSV file in the working directory.
# NOTE(review): df.columns lists an 'Unnamed: 0' column, which looks like a saved
# index; passing index_col=0 would likely drop it — confirm before changing.
df = pd.read_csv('goodreads_data.csv')

In [5]:
# Peek at the first row to see which columns the file provides.
df.head(1)

Unnamed: 0.1,Unnamed: 0,Book,Author,Description,Genres,Avg_Rating,Num_Ratings,URL
0,0,To Kill a Mockingbird,Harper Lee,"The unforgettable novel of a childhood in a sleepy Southern town and the crisis of conscience that rocked it. ""To Kill A Mockingbird"" became both an instant bestseller and a critical success when it was first published in 1960. It went on to win the Pulitzer Prize in 1961 and was later made into an Academy Award-winning film, also a classic.Compassionate, dramatic, and deeply moving, ""To Kill ...","['Classics', 'Fiction', 'Historical Fiction', 'School', 'Literature', 'Young Adult', 'Historical']",4.27,5691311,https://www.goodreads.com/book/show/2657.To_Kill_a_Mockingbird


In [6]:
# List the column labels of the loaded frame.
df.columns

Index(['Unnamed: 0', 'Book', 'Author', 'Description', 'Genres', 'Avg_Rating', 'Num_Ratings', 'URL'], dtype='object')

In [7]:
def create_textual_representation(row):
    """Render one book record as a block of labelled text lines.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) that can be indexed by
            'Book', 'Author', 'Genres' and 'Description'.

    Returns:
        A string with one "<Label>: <value>" line per field, in the order
        Book, Author, Genres, Description, each line ending with a newline.
    """
    # Each label is identical to the key it is read from.
    field_names = ('Book', 'Author', 'Genres', 'Description')
    return ''.join(f"{name}: {row[name]}\n" for name in field_names)


In [8]:
# Build one text blob per book (row-wise apply) and store it as a new column.
df['textual_representation'] = df.apply(create_textual_representation, axis=1)

In [9]:
# Check the first row again, now including the textual_representation column.
df.head(1)

Unnamed: 0.1,Unnamed: 0,Book,Author,Description,Genres,Avg_Rating,Num_Ratings,URL,textual_representation
0,0,To Kill a Mockingbird,Harper Lee,"The unforgettable novel of a childhood in a sleepy Southern town and the crisis of conscience that rocked it. ""To Kill A Mockingbird"" became both an instant bestseller and a critical success when it was first published in 1960. It went on to win the Pulitzer Prize in 1961 and was later made into an Academy Award-winning film, also a classic.Compassionate, dramatic, and deeply moving, ""To Kill ...","['Classics', 'Fiction', 'Historical Fiction', 'School', 'Literature', 'Young Adult', 'Historical']",4.27,5691311,https://www.goodreads.com/book/show/2657.To_Kill_a_Mockingbird,"Book: To Kill a Mockingbird\nAuthor: Harper Lee\nGenres: ['Classics', 'Fiction', 'Historical Fiction', 'School', 'Literature', 'Young Adult', 'Historical']\nDescription: The unforgettable novel of a childhood in a sleepy Southern town and the crisis of conscience that rocked it. ""To Kill A Mockingbird"" became both an instant bestseller and a critical success when it was first published in 1960..."


In [11]:
!pip install "pymilvus[model]"



In [10]:
from pymilvus import model

# Embedding function used to turn each book's text into a vector; the cells
# that build the matrix and the FAISS index are sized for the vectors it returns.
embedding_fn = model.DefaultEmbeddingFunction()

In [12]:
# Plain Python list of every book's text, in DataFrame row order.
docs = df['textual_representation'].tolist()


In [13]:
import numpy as np
import faiss

# Read the vector size from the embedding function rather than hard-coding it
# (the original used the literal 768), so the index and the matrix below cannot
# drift out of sync with the model.
dim = int(embedding_fn.dim)

# Flat FAISS index that compares vectors by L2 (Euclidean) distance.
index = faiss.IndexFlatL2(dim)

# One float32 row per book, zero-initialised; the embedding loop fills it in.
X = np.zeros((len(df), dim), dtype='float32')

In [14]:
# Sanity-check the preallocated matrix: (number of rows in df, embedding dimension).
X.shape

(10000, 768)

In [15]:
# Embed every textual representation and store it as one row of X.
for i, representation in enumerate(df['textual_representation']):
    # encode_documents takes a list of texts and returns one vector per text.
    # Take the single vector explicitly instead of relying on NumPy to squeeze
    # a (1, dim) array into the (dim,) row.
    X[i] = embedding_fn.encode_documents([representation])[0]

    # i is zero-based, so i + 1 documents have been embedded at this point;
    # report the true count rather than "processed 0 instances" after the first.
    if (i + 1) % 30 == 0:
        print('processed', str(i + 1), 'instances')

processed 0 instances
processed 30 instances
processed 60 instances
processed 90 instances
processed 120 instances
processed 150 instances
processed 180 instances
processed 210 instances
processed 240 instances
processed 270 instances
processed 300 instances
processed 330 instances
processed 360 instances
processed 390 instances
processed 420 instances
processed 450 instances
processed 480 instances
processed 510 instances
processed 540 instances
processed 570 instances
processed 600 instances
processed 630 instances
processed 660 instances
processed 690 instances
processed 720 instances
processed 750 instances
processed 780 instances
processed 810 instances
processed 840 instances
processed 870 instances
processed 900 instances
processed 930 instances
processed 960 instances
processed 990 instances
processed 1020 instances
processed 1050 instances
processed 1080 instances
processed 1110 instances
processed 1140 instances
processed 1170 instances
processed 1200 instances
processed 1230

In [16]:
# Add every row of X to the FAISS index.
# NOTE(review): X starts as all zeros, so any row the embedding loop did not
# reach is indexed as a zero vector — confirm the loop ran to completion first.
index.add(X)

In [17]:
# Persist the index to a file named 'index' in the working directory.
faiss.write_index(index, 'index') 