In [2]:
import os
import re

import numpy as np
import pandas as pd

In [8]:
# Location of the sentence corpus. The original path stays as the default so
# existing runs are unchanged; set SENTENCES_CORPUS_CSV to point at the file on
# another machine instead of editing this cell.
CORPUS_CSV = os.environ.get(
    'SENTENCES_CORPUS_CSV',
    'C:/Users/LSheldon/Downloads/sentences_corpus.csv',
)
df = pd.read_csv(CORPUS_CSV)

In [10]:
# Columns discarded before any further processing.
unused_columns = [
    'par_header', 'unit', 'section_name',
    'page_title', 'par_in_page', 'sent_in_par',
]
df = df.drop(columns=unused_columns)

In [12]:
def _strip_punctuation(value):
    """Remove every character that is neither a word character nor whitespace.

    Non-string inputs yield None. Note that `\\w` keeps underscores, and
    apostrophes are removed like any other punctuation.
    """
    if not isinstance(value, str):
        return None
    return re.sub(r'[^\w\s]', '', value)


df['sentence_text'] = df['sentence_text'].apply(_strip_punctuation)

In [14]:
# Drop rows with a missing sentence_text (the cleaning step above maps
# non-string values to None). Reassign instead of mutating in place so the
# cell has explicit lineage.
df = df.dropna(subset=['sentence_text'])

In [16]:
# Renumber rows 0..n-1 so index labels equal positions; later cells index the
# `embeddings` list with these labels. Reassign rather than mutate in place.
df = df.reset_index(drop=True)

In [18]:
# Display the cleaned frame (the rendered output is abbreviated to its first and last rows).
df

Unnamed: 0,page_number,sentence_text
0,10001,Discover OChem is split into units
1,10001,Each unit corresponds to one exam in the course
2,10001,These word clouds were generated by computer a...
3,10001,This one is a visual representation of Unit 1
4,10011,Each unit is composed of several sections
...,...,...
5971,20444,The example below shows an example of using an...
5972,20445,Lets look again at the target learning outcome...
5973,20445,Here are some practice problems
5974,20445,Since this is the last section of Unit 8 lets ...


In [20]:
# Plain Python lists of the sentence texts and their page numbers, in row order.
sents, pages = (
    df[column].tolist() for column in ('sentence_text', 'page_number')
)

In [22]:
# Take the first 100 sentences as a small sample; the bare len() on the last
# line is the cell's displayed output.
sents_2 = sents[:100]
len(sents_2)

100

In [24]:
# ----------- #

In [26]:
import boto3
import json
from tqdm import tqdm

In [29]:
# Bedrock runtime client used for all embedding requests.
# Credentials are deliberately NOT passed as literals: boto3 resolves them via
# its default chain (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment
# variables, the shared credentials file, SSO, or an instance role), which
# keeps secrets out of the notebook and its saved outputs.
bedrock = boto3.client(
    service_name='bedrock-runtime',
    region_name='us-east-2',
)

In [31]:
# Embedding model identifier, plus the MIME type passed to invoke_model as
# both `accept` and `contentType`.
model_id = 'amazon.titan-embed-text-v2:0'
accept = content_type = 'application/json'

In [33]:
# Embed every sentence with the Bedrock model, one request per sentence.
# `embeddings[i]` stays aligned with `sents[i]` (and therefore with row i of df).
embeddings = []

for position, s in enumerate(tqdm(sents)):
    body = json.dumps({"inputText": s})

    response = bedrock.invoke_model(
        body=body,
        modelId=model_id,
        accept=accept,
        contentType=content_type,
    )

    response_body = json.loads(response['body'].read())
    embedding = response_body.get('embedding')

    # Fail here, naming the offending sentence, instead of storing None and
    # breaking much later in list(embeddings[ind]) or knn.fit(embeddings).
    if embedding is None:
        raise ValueError(
            f"No 'embedding' in Bedrock response for sentence {position}: {s!r}"
        )

    embeddings.append(embedding)

100%|██████████| 5976/5976 [10:02<00:00,  9.92it/s]


In [35]:
# ---------- #

In [37]:
# One (id, vector, metadata) tuple per row. The row's index label doubles as
# the vector id and as the position of its embedding in `embeddings`.
upsert_data = [
    (
        str(ind),
        list(embeddings[ind]),
        {
            'page_number': str(row['page_number']),
            'sentence_text': row['sentence_text'],
        },
    )
    for ind, row in tqdm(df.iterrows())
]

5976it [00:00, 12278.67it/s]


In [39]:
from pinecone import Pinecone, ServerlessSpec

In [43]:
# Read the Pinecone API key from the environment rather than embedding it in
# the notebook; a missing PINECONE_API_KEY fails immediately with a KeyError.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])

In [46]:
index_name = "semanticsearchaws"

# Creating an index whose name already exists is rejected by Pinecone, so only
# create it when absent. This keeps the cell re-runnable after the first run.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Vector size is taken from the first computed embedding.
        dimension=len(embeddings[0]),
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1",
        ),
    )

In [47]:
# Handle to the named index; used below for upserts.
index = pc.Index(index_name)

In [48]:
from pinecone.exceptions import PineconeException

In [49]:
batch_size = 64

# Upload the vectors in fixed-size batches. A failing batch is reported and
# skipped so the remaining batches are still attempted.
for i in range(0, len(upsert_data), batch_size):
    batch = upsert_data[i:i + batch_size]
    try:
        index.upsert(vectors=batch)
    # Catch the name that is actually imported from pinecone.exceptions; the
    # previous `PineconeApiException` was never imported, so any upsert
    # failure surfaced as a NameError instead of being reported.
    except PineconeException as e:
        print(f"Batch {i//batch_size + 1} failed with error: {e}")

In [None]:
# ----------- #

In [17]:
from sklearn.neighbors import NearestNeighbors

In [18]:
# Configure a 5-nearest-neighbour searcher that uses cosine distance.
knn = NearestNeighbors(n_neighbors=5, metric='cosine')

In [19]:
# Fit on the sentence embeddings; neighbour indices returned later are positions in this list.
knn.fit(embeddings)

In [34]:
# Free-text search query to embed and match against the corpus.
query = "Electric voltage and probes"

In [35]:
# Embed the query with the same model and request settings as the corpus sentences.
body = json.dumps({"inputText": query})

response = bedrock.invoke_model(
    modelId=model_id,
    contentType=content_type,
    accept=accept,
    body=body,
)

# The response body is a stream holding a JSON document; the vector is under 'embedding'.
response_body = json.loads(response['body'].read())
embedding = response_body.get('embedding')

In [36]:
# Convert the query vector to a single-row 2-D array (one query, all features in that row).
embedding = np.array(embedding).reshape(1, -1)

In [37]:
# Nearest neighbours of the query; both results have one row per query (a single row here).
distances, indices = knn.kneighbors(embedding)

In [38]:
# Map neighbour positions back to sentence text, and turn cosine distance into
# cosine similarity (similarity = 1 - distance).
top5_sentences_knn = df['sentence_text'].iloc[indices[0]].to_numpy()
top5_similarities_knn = 1 - distances[0]

In [39]:
# Two-column results table: matched sentence and its similarity to the query.
result_df = pd.DataFrame(
    dict(
        sentence=top5_sentences_knn,
        similarity_score=top5_similarities_knn,
    )
)

In [40]:
# Show the ranked matches.
result_df

Unnamed: 0,sentence,similarity_score
0,An electric voltage is connected from the prob...,0.691268
1,Since the probes up and down movements are ver...,0.370758
2,In order to complete the circuit electrons mus...,0.351123
3,At points where the surface is lower the probe...,0.348681
4,A laser is reflected off the probe and the up ...,0.319024


In [41]:
# Full (untruncated) text of the first result row.
result_df['sentence'].iloc[0]

'An electric voltage is connected from the probe to the edge of the surface'

In [33]:
# Cross-check against the corpus: the sentence stored at positional row 53.
df['sentence_text'].iloc[53]

'An electric voltage is connected from the probe to the edge of the surface'