In [3]:
import os
import re

import numpy as np
import pandas as pd

In [5]:
# Location of the sentence corpus CSV. The original local path is kept as the
# default; set the SENTENCES_CORPUS_CSV environment variable to run the
# notebook against a copy stored elsewhere.
corpus_csv_path = os.environ.get(
    'SENTENCES_CORPUS_CSV',
    'C:/Users/LSheldon/Downloads/sentences_corpus.csv',
)
df = pd.read_csv(corpus_csv_path)

In [7]:
# Preview the first rows of the raw corpus as loaded from the CSV
df.head()

Unnamed: 0,unit,section_name,page_number,page_title,par_in_page,par_header,sent_in_par,sentence_text
0,1,Introduction & Looking at Molecules,10001,Unit 1 Overview,1,,1,Discover O-Chem is split into units.
1,1,Introduction & Looking at Molecules,10001,Unit 1 Overview,2,,1,Each unit corresponds to one exam in the course.
2,1,Introduction & Looking at Molecules,10001,Unit 1 Overview,3,,1,These word clouds were generated by computer a...
3,1,Introduction & Looking at Molecules,10001,Unit 1 Overview,3,,2,This one is a visual representation of Unit 1.
4,1,Introduction & Looking at Molecules,10011,Section Overview,1,,1,Each unit is composed of several sections.


In [9]:
# Columns that are not needed for the semantic search; after this step only
# page_number and sentence_text remain.
unused_columns = [
    'par_header',
    'unit',
    'section_name',
    'page_title',
    'par_in_page',
    'sent_in_par',
]
df = df.drop(columns=unused_columns)

In [11]:
def _strip_punctuation(value):
    """Return `value` with every character that is neither a word character
    nor whitespace removed.

    Anything that is not a string (e.g. a missing value) maps to None so it
    can be dropped afterwards.
    """
    if not isinstance(value, str):
        return None
    return re.sub(r'[^\w\s]', '', value)


df['sentence_text'] = df['sentence_text'].apply(_strip_punctuation)

In [13]:
# Discard rows whose sentence text is missing (non-string entries were set to
# None by the cleaning step).
df = df.dropna(subset=['sentence_text'])

In [15]:
# Renumber rows 0..n-1 and discard the old index, so row labels line up with
# positions in the lists built from this frame.
df = df.reset_index(drop=True)

In [17]:
# Inspect the cleaned frame (page_number and sentence_text only)
df

Unnamed: 0,page_number,sentence_text
0,10001,Discover OChem is split into units
1,10001,Each unit corresponds to one exam in the course
2,10001,These word clouds were generated by computer a...
3,10001,This one is a visual representation of Unit 1
4,10011,Each unit is composed of several sections
...,...,...
5971,20444,The example below shows an example of using an...
5972,20445,Lets look again at the target learning outcome...
5973,20445,Here are some practice problems
5974,20445,Since this is the last section of Unit 8 lets ...


In [19]:
# (rows, columns) of the cleaned frame
df.shape

(5976, 2)

In [21]:
# Plain Python lists of the sentences and their page numbers, in the same
# row order as df.
sents = df['sentence_text'].tolist()
pages = df['page_number'].tolist()

In [23]:
# Sanity check: one sentence per row of df
len(sents)

5976

In [25]:
# Sanity check: must equal len(sents)
len(pages)

5976

In [27]:
# ---------- Embed the corpus sentences with a transformer encoder ---------- #

In [29]:
import torch
from transformers import AutoTokenizer, AutoModel

In [31]:
# Checkpoint identifier passed to from_pretrained; the tokenizer and the
# encoder are both loaded from this same checkpoint.
model_ckpt = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [33]:
# Tokenize the whole corpus in one call, returning PyTorch tensors ("pt").
# padding=True pads to a common length and truncation=True cuts over-long
# sentences, so the batch is rectangular.
encoded_input = tokenizer(sents, padding=True, truncation=True, return_tensors="pt")

In [35]:
# Forward pass with gradient tracking disabled (inference only).
# NOTE(review): the entire corpus goes through the model as a single batch;
# if memory becomes a problem, consider encoding in smaller batches.
with torch.no_grad():
    model_output = model(**encoded_input)

In [37]:
import torch.nn.functional as F

def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked tokens.

    Args:
        model_output: object exposing `last_hidden_state`, the per-token
            embeddings.
        attention_mask: tensor with one entry per token; it is broadcast over
            the embedding dimension and used as a weight (0 drops a token).

    Returns:
        Tensor with the token dimension (dim 1) reduced away: the
        mask-weighted sum of token embeddings divided by the mask total.
    """
    hidden_states = model_output.last_hidden_state
    # Give the mask a trailing axis and stretch it to the embedding shape so
    # it can weight every embedding component.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_sum = (hidden_states * weights).sum(dim=1)
    # Clamp the denominator so a fully masked sequence does not divide by zero.
    token_counts = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_counts

In [39]:
# Collapse token vectors into one vector per sentence, then scale each vector
# to unit L2 norm.
pooled_embeddings = mean_pooling(model_output, encoded_input["attention_mask"])
sentence_embeddings = F.normalize(pooled_embeddings, p=2, dim=1)
print(f"Sentence embeddings shape: {sentence_embeddings.size()}")

Sentence embeddings shape: torch.Size([5976, 384])


In [41]:
# ---------- Local semantic search with scikit-learn nearest neighbours ---------- #

In [43]:
from sklearn.neighbors import NearestNeighbors

In [45]:
# Nearest-neighbour searcher using cosine distance; kneighbors() will return
# 5 neighbours per query unless told otherwise.
knn = NearestNeighbors(n_neighbors=5, metric='cosine')

In [47]:
# Convert the embedding tensor to a NumPy array for scikit-learn
# (assumes the tensor lives on the CPU — TODO confirm if a GPU is ever used).
np_embeddings = sentence_embeddings.detach().numpy()

In [49]:
# Fit the neighbour searcher on the corpus embeddings
knn.fit(np_embeddings)

In [51]:
# Example natural-language query; its embedding is reused for both the local
# k-NN search and the Pinecone search.
query = "What is a functional group, and can you give examples of common functional groups in organic molecules"

In [53]:
# Tokenize the query with the same settings as the corpus (single-element batch)
encoded_query = tokenizer([query], padding=True, truncation=True, return_tensors="pt")

In [55]:
# Encode the query with gradient tracking disabled (inference only)
with torch.no_grad():
    query_output = model(**encoded_query)

In [57]:
# Same pooling + L2 normalisation as the corpus, so query and sentence vectors
# are directly comparable; the NumPy copy is what the search calls consume.
pooled_query = mean_pooling(query_output, encoded_query["attention_mask"])
query_embedding = F.normalize(pooled_query, p=2, dim=1)
np_query_embedding = query_embedding.detach().numpy()

In [59]:
# Look up the nearest corpus sentences; both results hold one row per query
# (a single query here), which is why row 0 is read afterwards.
distances, indices = knn.kneighbors(np_query_embedding)

In [61]:
# Row positions of the neighbours found for the (only) query
neighbor_rows = indices[0]
top5_sentences_knn = df['sentence_text'].iloc[neighbor_rows].values
# The searcher reports cosine distance; similarity is its complement.
top5_similarities_knn = 1 - distances[0]

In [63]:
# Tabulate each matched sentence next to its cosine similarity to the query
result_df = pd.DataFrame({
    'sentence': top5_sentences_knn,
    'similarity_score': top5_similarities_knn
})

In [65]:
# Matches from the local k-NN search, most similar first
result_df

Unnamed: 0,sentence,similarity_score
0,As you know chemists define functional group f...,0.778043
1,We refer to these small groups of atoms as fun...,0.774391
2,As we learned last semester the whole point of...,0.753032
3,Here are some example molecules that contain a...,0.704954
4,This section will build upon your knowledge of...,0.701036


In [67]:
# ---------- Store the embeddings in a Pinecone index ---------- #

In [69]:
from pinecone import Pinecone, ServerlessSpec

In [71]:
# Take the API key from the PINECONE_API_KEY environment variable so the
# secret never has to be typed into (or committed with) the notebook. The
# empty-string fallback matches the previous literal when the variable is unset.
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY", ""))

In [73]:
# Name of the Pinecone index used for the upsert and query cells below
index_name = "semanticsearchllmbased"

# One-time setup, left commented out: creates a serverless index with the
# cosine metric and a dimension taken from the sentence embeddings.
# NOTE(review): presumably this was run once before the upsert — confirm the
# index exists before running the rest of the notebook on a fresh account.
# pc.create_index(
#     name=index_name,
#     dimension=len(sentence_embeddings[0]),
#     metric="cosine",
#     spec=ServerlessSpec(
#         cloud="aws",
#         region="us-east-1"
#     )
# )

In [91]:
# Convert the 2-D embedding tensor into a list of rows, each a list of plain
# Python floats (the form used when building the upsert payload).
# Tensor.tolist() does the conversion in one call instead of invoking float()
# on every element from a nested Python loop.
cleaned_embeddings = sentence_embeddings.tolist()

In [95]:
# Build the upsert payload: one (id, vector, metadata) tuple per sentence.
#
# Rows are paired with their embeddings by *position* (zip) rather than by
# looking the embedding up with the row's index label, so the pairing stays
# correct even if df's index is not 0..n-1. The id is still the row's index
# label, as before.
assert len(df) == len(cleaned_embeddings), (
    "df and cleaned_embeddings are out of sync; re-run the embedding cells"
)

upsert_data = [
    (
        str(row_id),
        list(embedding),
        {
            'page_number': str(page_number),
            'sentence_text': sentence_text,
        },
    )
    for row_id, page_number, sentence_text, embedding in zip(
        df.index, df['page_number'], df['sentence_text'], cleaned_embeddings
    )
]

In [97]:
# Re-display the cleaned frame that the upsert payload was built from
df

Unnamed: 0,page_number,sentence_text
0,10001,Discover OChem is split into units
1,10001,Each unit corresponds to one exam in the course
2,10001,These word clouds were generated by computer a...
3,10001,This one is a visual representation of Unit 1
4,10011,Each unit is composed of several sections
...,...,...
5971,20444,The example below shows an example of using an...
5972,20445,Lets look again at the target learning outcome...
5973,20445,Here are some practice problems
5974,20445,Since this is the last section of Unit 8 lets ...


In [99]:
# Client handle for the Pinecone index named by index_name
# (presumably created beforehand — TODO confirm).
index = pc.Index(index_name)

In [101]:
from pinecone.exceptions import PineconeException

In [103]:
batch_size = 64  # vectors sent per upsert request

# Batches whose upsert failed, stored as (batch_number, batch) so they can be
# inspected or retried instead of being lost after the message is printed.
failed_batches = []

for i in range(0, len(upsert_data), batch_size):
    batch = upsert_data[i:i + batch_size]
    try:
        index.upsert(vectors=batch)
    except PineconeException as e:
        # Catch the exception class this notebook actually imports from
        # pinecone.exceptions. The previous handler named
        # `PineconeApiException`, which was never imported, so a failed
        # upsert raised NameError instead of being reported.
        print(f"Batch {i//batch_size + 1} failed with error: {e}")
        failed_batches.append((i // batch_size + 1, batch))

In [106]:
# ---------- Query the Pinecone index ---------- #

In [125]:
# Flatten the query embedding array (presumably shape (1, dim), one row for
# the single query) into a flat list of Python floats for the Pinecone query.
cleaned_query_embedding = np_query_embedding.flatten().tolist()

In [128]:
top_k = 5  # number of matches to request

# Search the Pinecone index with the query embedding; include_metadata=True
# asks for each match's stored metadata (page number and sentence text).
response = index.query(
    vector=cleaned_query_embedding,
    top_k=top_k,
    include_metadata=True
)

In [131]:
# Print the score, page number and sentence text of every returned match,
# separated by a divider line.
for match in response['matches']:
    print(f"Score: {match['score']}")
    print(f"Page Number: {match['metadata']['page_number']}")
    print(f"Sentence Text: {match['metadata']['sentence_text']}")
    print("----")

Score: 0.778043032
Page Number: 20360
Sentence Text: As you know chemists define functional group families because the molecules in that group have properties that are similar to each other and different from molecules in other families
----
Score: 0.774391174
Page Number: 10284
Sentence Text: We refer to these small groups of atoms as functional groups because they add particular types of functionality to a molecule
----
Score: 0.753032506
Page Number: 20632
Sentence Text: As we learned last semester the whole point of defining functional groups is to cluster compounds based on the similarities of their properties
----
Score: 0.70495373
Page Number: 10284
Sentence Text: Here are some example molecules that contain alcohol and ether functional groups
----
Score: 0.701036
Page Number: 10045
Sentence Text: This section will build upon your knowledge of functional groups and organic reactions especially substitutionatcarbonyl reactions
----
