In [None]:
from sentence_transformers import SentenceTransformer

# Name of the pretrained checkpoint, kept separate so it is easy to swap out
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'

# Instantiate the sentence-embedding model used to encode queries later on
model = SentenceTransformer(MODEL_NAME)

In [None]:
import pandas as pd
import numpy as np

# Read the Parquet file that holds the records together with their embeddings
df_pandas = pd.read_parquet("data/data_with_embeddings.parquet")

# Inspect the container type of the first stored embedding.
# Positional access (.iloc[0]) is used instead of ["embedding"][0]: the latter is
# a label lookup and fails when the restored index has no label 0.
# NOTE(review): the printed type depends on the Parquet engine (list columns may
# come back as numpy.ndarray rather than list) -- confirm against the real file.
print(type(df_pandas["embedding"].iloc[0]))

# Convert every embedding to a float16 numpy array; np.array() accepts both
# lists and existing arrays, so no string parsing (np.fromstring) is needed.
df_pandas["embedding"] = df_pandas["embedding"].apply(lambda x: np.array(x, dtype=np.float16))

# Preview the first rows, now with embeddings as numpy arrays
print(df_pandas.head())


In [None]:
import numpy as np
from sklearn.preprocessing import normalize
from scipy.spatial.distance import cosine

# Stack the per-row embeddings into one float32 matrix and normalize each row,
# so that a dot product with a normalized query yields cosine similarity.
# Assumes every embedding has the same length -- TODO confirm for this dataset.
embeddings_normalized = normalize(np.array(df_pandas["embedding"].tolist()).astype('float32'))

# Encode the search text with the sentence-transformer model
query = "oral"
query_embedding = model.encode(query).astype('float32')

# Present the query as a single-row 2-D array before normalizing it
query_normalized = normalize(query_embedding.reshape(1, -1))

# Cosine similarity of every row against the query
# (dot product of the normalized vectors, flattened to one score per row)
cosine_similarities = np.dot(embeddings_normalized, query_normalized.T).flatten()

k = 2  # Number of results to retrieve
# Sort descending first, then truncate. For k >= 1 this selects the same rows in
# the same order as argsort()[-k:][::-1], but k = 0 now yields no results
# (the slice [-0:] would have selected every row).
top_k_indices = cosine_similarities.argsort()[::-1][:max(k, 0)]
top_k_similarities = cosine_similarities[top_k_indices]

# Display results
for idx, sim in zip(top_k_indices, top_k_similarities):
    # Fetch the matching row once instead of re-indexing the frame per field
    row = df_pandas.iloc[idx]
    print(f"Title: {row['title']}")
    print(f"Subject Codes: {row['subject_codes']}")
    print(f"Cited by Count: {row['citedby_count']}")
    print(f"Publisher: {row['publisher']}")
    print(f"Language: {row['language']}")
    print(f"Published Date: {row['published_date']}")
    print(f"Authors: {row['authors']}")
    print(f"Cosine Similarity: {sim:.4f}\n")