In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
df = pd.read_csv('C:/Users/LShel/Downloads/words_corpus.csv')

In [3]:
unique_words = np.unique(df['word'])

In [4]:
vocab_dict = {word: i for i, word in enumerate(unique_words)}

In [5]:
#split sentence string input into tokens, the build the embedding vector

def generate_frequency_vector(words, vocab_dict):
    frequency_vector = np.zeros(len(vocab_dict))
    
    for word in words:
        if word in vocab_dict:  
            index = vocab_dict[word]
            frequency_vector[index] += 1 
    
    return frequency_vector

In [6]:
sents_df = pd.read_csv('C:/Users/LShel/Downloads/sentences_corpus.csv')

In [7]:
sents_df = sents_df.drop(columns=['par_header', 'unit', 'section_name', 'par_in_page', 'sent_in_par'], axis=1)

In [8]:
sents_df

Unnamed: 0,page_number,page_title,sentence_text
0,10001,Unit 1 Overview,Discover O-Chem is split into units.
1,10001,Unit 1 Overview,Each unit corresponds to one exam in the course.
2,10001,Unit 1 Overview,These word clouds were generated by computer a...
3,10001,Unit 1 Overview,This one is a visual representation of Unit 1.
4,10011,Section Overview,Each unit is composed of several sections.
...,...,...,...
5975,20444,Acetals in Synthesis,The example below shows an example of using an...
5976,20445,Target Learning Outcomes & Practice Problems,Let's look again at the target learning outcom...
5977,20445,Target Learning Outcomes & Practice Problems,Here are some practice problems.
5978,20445,Target Learning Outcomes & Practice Problems,"Since this is the last section of Unit 8, let'..."


In [9]:
sents_df['sentence_text'] = sents_df['sentence_text'].apply(lambda x: re.sub(r'[^\w\s]', '', str(x.lower())) if isinstance(x, str) else '')

In [10]:
sents_df['sentence_embedding'] = sents_df['sentence_text'].apply(
    lambda sentence: generate_frequency_vector(sentence.split(" "), vocab_dict)
)

In [11]:
sents_df

Unnamed: 0,page_number,page_title,sentence_text,sentence_embedding
0,10001,Unit 1 Overview,discover ochem is split into units,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,10001,Unit 1 Overview,each unit corresponds to one exam in the course,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,10001,Unit 1 Overview,these word clouds were generated by computer a...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,10001,Unit 1 Overview,this one is a visual representation of unit 1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,10011,Section Overview,each unit is composed of several sections,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...
5975,20444,Acetals in Synthesis,the example below shows an example of using an...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5976,20445,Target Learning Outcomes & Practice Problems,lets look again at the target learning outcome...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5977,20445,Target Learning Outcomes & Practice Problems,here are some practice problems,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5978,20445,Target Learning Outcomes & Practice Problems,since this is the last section of unit 8 lets ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [12]:
# sents_df.to_csv('C:/Users/LShel/OneDrive/Documents/SemanticSearch_Test/embeddings.csv', index=False)

In [13]:
input_query = "What is the difference between an alkane, alkene, and alkyne"

In [14]:
input = [w.lower() for w in input_query.split(" ")]

In [15]:
input_embedding = generate_frequency_vector(input, vocab_dict)

In [16]:
from sklearn.metrics.pairwise import cosine_similarity

In [17]:
sents_df['similarity'] = cosine_similarity(sents_df['sentence_embedding'].tolist(), input_embedding.reshape(1, -1))

In [18]:
top_5_similar = sents_df.nlargest(5, 'similarity')

In [19]:
top_5_similar

Unnamed: 0,page_number,page_title,sentence_text,sentence_embedding,similarity
4363,20886,Alkane Directing Groups,recall the difference between polar and polari...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.534522
3091,10673,Catalysis,often this difference is so large that it is t...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.516398
1442,10170,Polarized Bonds & Resonance Structures,these two molecules are the same size and shap...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.503115
910,10652,Vibrational Mixing,what is different between the two nh bonds,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.5
1928,10176,The Strength of Acids,the 34 number difference between the pka value...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.492366


In [20]:
ret_string = ""

for ind, row in top_5_similar.iterrows():
    ret_string += str(row['page_number']) + '  -  ' + row['page_title'] + '  -  ' + row['sentence_text'] + '  -  ' + str(row['similarity'])
    ret_string += '\n'

print(ret_string)

20886  -  Alkane Directing Groups  -  recall the difference between polar and polarizable  -  0.5345224838248487
10673  -  Catalysis  -  often this difference is so large that it is the difference between the reaction working very well and not working at all  -  0.5163977794943222
10170  -  Polarized Bonds & Resonance Structures  -  these two molecules are the same size and shape and the only difference between them is that the ch2 group in 2methyl1butene has been replaced with an oxygen atom in butanone  -  0.5031152949374527
10652  -  Vibrational Mixing  -  what is different between the two nh bonds  -  0.4999999999999999
10176  -  The Strength of Acids  -  the 34 number difference between the pka values 50 and 16 indicates that methane is sqrt1034  1017 times less acidic than water and therefore not actually considered to be an acid at all  -  0.49236596391733095



In [21]:
#-----

In [22]:
sent_embeddings = sents_df['sentence_embedding'].tolist()

In [54]:
cleaned_embeddings = []

for s in sent_embeddings:
    clean = []
    for f in s:
        clean.append(float(f))
    cleaned_embeddings.append(clean)

In [55]:
len(cleaned_embeddings[0])

5278

In [66]:
from pinecone import Pinecone, ServerlessSpec

In [67]:
pc = Pinecone(api_key="")

In [68]:
index_name = "semanticsearch"

pc.create_index(
    name=index_name,
    dimension=len(sent_embeddings[0]), 
    metric="cosine", 
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)

In [69]:
sents_df

Unnamed: 0,page_number,page_title,sentence_text,sentence_embedding,similarity
0,10001,Unit 1 Overview,discover ochem is split into units,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.158114
1,10001,Unit 1 Overview,each unit corresponds to one exam in the course,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.117851
2,10001,Unit 1 Overview,these word clouds were generated by computer a...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.083333
3,10001,Unit 1 Overview,this one is a visual representation of unit 1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.125000
4,10011,Section Overview,each unit is composed of several sections,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.133631
...,...,...,...,...,...
5975,20444,Acetals in Synthesis,the example below shows an example of using an...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.273861
5976,20445,Target Learning Outcomes & Practice Problems,lets look again at the target learning outcome...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.106600
5977,20445,Target Learning Outcomes & Practice Problems,here are some practice problems,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.000000
5978,20445,Target Learning Outcomes & Practice Problems,since this is the last section of unit 8 lets ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.258199


In [70]:
upsert_data = []

for ind, row in sents_df.iterrows():

    if 1.0 not in list(cleaned_embeddings[ind]):
        continue
    else:
        metadata = {}
        metadata['page_number'] = str(row['page_number'])
        metadata['sentence_text'] = row['sentence_text']
        data_tup = (str(ind), list(cleaned_embeddings[ind]), metadata)
        upsert_data.append(data_tup)

In [71]:
index = pc.Index(index_name)

In [72]:
batch_size = 64  

for i in range(0, len(upsert_data), batch_size):
    batch = upsert_data[i:i + batch_size]
    try:
        index.upsert(vectors=batch)
    except PineconeApiException as e:
        print(f"Batch {i//batch_size + 1} failed with error: {e}")

In [73]:
#----------#

In [92]:
input_query_2 = "carbon atoms"

In [93]:
input_2 = [w.lower() for w in input_query_2.split(" ")]

In [94]:
input_embedding_2 = generate_frequency_vector(input_2, vocab_dict)

In [95]:
clean_query_embedding = [float(e) for e in input_embedding_2]

In [96]:
top_k = 5

response = index.query(
    vector=clean_query_embedding,
    top_k=top_k,
    include_metadata=True 
)

In [97]:
for match in response['matches']:
    print(f"Score: {match['score']}")
    print(f"Page Number: {match['metadata']['page_number']}")
    print(f"Sentence Text: {match['metadata']['sentence_text']}")
    print("----")

Score: 0.672581077
Page Number: 10623
Sentence Text: halogen atoms form one bond to carbon atoms
----
Score: 0.667541564
Page Number: 10731
Sentence Text: since the weights of the carbon and hydrogen atoms do not change the fact that ch bonds on linear carbon atoms absorb a higher frequency of ir compared to those on tetrahedral carbon atoms suggests that the ch bonds between hydrogen atoms and linear carbon atoms are somewhat stronger than those between hydrogen atoms and tetrahedral carbon atoms
----
Score: 0.586766243
Page Number: 10110
Sentence Text: if you were trying to form a molecule with two carbon atoms and four hydrogen atoms you could start by bonding the two carbon atoms together and bonding each hydrogen to one of the carbon atoms left image below
----
Score: 0.577350259
Page Number: 10977
Sentence Text: carbon atoms begin with six electrons
----
Score: 0.565685451
Page Number: 10443
Sentence Text: although carbon atoms always form four bonds it is still useful to think a