In [14]:
from sentence_transformers import SentenceTransformer

# Instantiate the all-MiniLM-L6-v2 Sentence Transformers model once;
# later code refers to this shared instance through the name `model`.
model = SentenceTransformer(
    "sentence-transformers/all-MiniLM-L6-v2"
)

def generate_embedding(text):
    """Encode ``text`` with the module-level ``model`` and return it as a list.

    Args:
        text: Input forwarded unchanged to ``model.encode``.

    Returns:
        The result of calling ``.tolist()`` on the value ``model.encode``
        returns, i.e. a plain Python list rather than an array object.
    """
    encoded = model.encode(text)
    as_list = encoded.tolist()
    return as_list


In [15]:
import pandas as pd
import numpy as np

# Load the article metadata together with its pre-computed embeddings.
df_pandas = pd.read_parquet("data/data_with_embeddings.parquet")

# Sanity check: show the container type of the first stored embedding.
# Positional access (.iloc) is used so the check does not depend on the
# DataFrame's index happening to contain the label 0.
print(type(df_pandas["embedding"].iloc[0]))

# Wrap each embedding in a float16 NumPy array so later cells can treat the
# column as numeric vectors.
# NOTE(review): float16 is assumed to match the precision the file was written
# with -- confirm against the code that produced the parquet.
df_pandas["embedding"] = df_pandas["embedding"].apply(
    lambda values: np.array(values, dtype=np.float16)
)

# Preview the first rows to confirm the load and conversion.
print(df_pandas.head())


<class 'list'>
                                               title  \
0    'Ring of fire' appearance in COVID-19 pneumonia   
1  'Total relaxation': Buddhist mindfulness-based...   
2                          (Our) world with COVID-19   
3  16S metagenomic analysis reveals adaptability ...   
4  2022 HFCT Focused Update of the 2019 HFCT Hear...   

                                            abstract                 authors  \
0                                               None     ['Piyavisetpat N.']   
1                                               None       ['Suwanvecho S.']   
2  The pandemic COVID-19 certainly has terrible i...        ['Vasuratna A.']   
3  A bacterial consortium, named SWO, was enriche...      ['Muangchinda C.']   
4                                               None  ['Ariyachaipanich A.']   

  published_date language  citedby_count  \
0     2020-06-24      eng            1.0   
1     2019-03-01      eng            0.0   
2     2021-05-01      eng          

In [18]:
import faiss
import numpy as np

# Stack the per-row embeddings into a single C-contiguous float32 matrix,
# which is the layout FAISS works with.
embeddings = np.ascontiguousarray(df_pandas["embedding"].tolist(), dtype=np.float32)
assert embeddings.ndim == 2, (
    "expected a non-empty set of equal-length embeddings, "
    f"got array with shape {embeddings.shape}"
)

# Build an exact (brute-force) L2 index sized from the matrix itself, so the
# dimension does not rely on a label-based lookup of row 0.
dimension = embeddings.shape[1]  # Embedding dimension
index = faiss.IndexFlatL2(dimension)

# Add embeddings to the FAISS index
index.add(embeddings)

# Perform a similarity search
query = "Artificial intelligence"
query_embedding = model.encode(query).astype('float32')
k = 5  # Number of results to retrieve
# FAISS expects a 2-D batch of queries, hence the extra wrapping dimension.
distances, indices = index.search(np.array([query_embedding]), k)

# Display results. FAISS pads the result with -1 when the index holds fewer
# than k vectors; skip those so iloc[-1] does not silently show the last row.
for idx in indices[0]:
    if idx < 0:
        continue
    print(df_pandas.iloc[idx][["title", "subject_codes", "language", "citedby_count", "publisher"]])


title            Recent progress and new developments of applic...
subject_codes             ['ENER', 'EART', 'ENER', 'EART', 'EART']
language                                                       eng
citedby_count                                                 15.0
publisher                                  KeAi Communications Co.
Name: 19159, dtype: object
title            Literature reviews on applying artificial inte...
subject_codes                                             ['COMP']
language                                                       eng
citedby_count                                                  3.0
publisher          CEUR-WSceurws@sunsite.informatik.rwth-aachen.de
Name: 15907, dtype: object
title            AI Builders: Teaching Thai Students to Build E...
subject_codes    ['COMP', 'ENGI', 'ENGI', 'SOCI', 'COMP', 'ENGI...
language                                                       eng
citedby_count                                                  2.0
publishe