# Semantic search over text

In [6]:
from langchain_community.document_loaders import PyPDFLoader

# Read the source PDF; the path is relative to the notebook's working directory.
# `docs` is what the text splitter consumes in the next step.
loader = PyPDFLoader("data/nke-10k-2023.pdf")
docs = loader.load()

In [7]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Break the loaded documents into overlapping chunks so each piece is small
# enough to embed on its own; neighbouring chunks share 200 units of overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)
all_splits = text_splitter.split_documents(docs)

In [10]:
# Spot-check one arbitrary chunk (index 100) to see what the splitter produced.
all_splits[100].page_content

'and growing our brands will depend on our design and marketing efforts, including advertising and consumer campaigns, product innovation and product quality. Our\ncommitment to product innovation, quality and sustainability, and our continuing investment in design (including materials), marketing and sustainability measures may not\nhave the desired impact on our brand image and reputation. In addition, our success in maintaining, extending and expanding our brand image depends on our ability to\nadapt to a rapidly changing media and digital environment, including our reliance on social media and other digital advertising networks, and digital dissemination of\nadvertising campaigns on our digital platforms and through our digital experiences and products. We could be adversely impacted if we fail to achieve any of these\nobjectives.'

In [72]:
from openai import OpenAI
import numpy as np

class Embedding:
    """Thin wrapper around the OpenAI embeddings endpoint that returns NumPy vectors.

    Instances are callable: ``embedding(text)`` is shorthand for
    ``embedding.embed(text)``.
    """

    def __init__(self, model: str = "text-embedding-ada-002", client=None):
        """Create the embedder.

        Args:
            model: Name of the embedding model sent with every request. The
                default is the model this class previously had hard-coded.
            client: Optional pre-built client exposing
                ``embeddings.create(model=..., input=..., encoding_format=...)``.
                When omitted, a default ``OpenAI()`` client is constructed.
        """
        self.model = model
        # Only build a real client when none is supplied, so the class can be
        # exercised with a stub client instead of a live connection.
        self.client = client if client is not None else OpenAI()

    def __call__(self, *args, **kwds):
        """Forward to :meth:`embed` so the instance can be used like a function."""
        return self.embed(*args, **kwds)

    def embed(self, text: str) -> np.ndarray:
        """Embed a single piece of text.

        Args:
            text: The text to embed; passed unchanged as the request ``input``.

        Returns:
            The embedding of ``text`` as a 1-D NumPy array, taken from the
            first item of the response.
        """
        response = self.client.embeddings.create(
            model=self.model,
            input=text,
            encoding_format="float",
        )
        # A single input yields a single result, so only item 0 is read.
        vector = response.data[0].embedding
        return np.array(vector)

In [73]:
# One shared embedder for the rest of the notebook; constructing it creates the API client.
embedding = Embedding()

In [74]:
# Smoke test: calling the instance forwards to Embedding.embed and returns a NumPy vector.
r = embedding("Hello, world!")

In [75]:
# Dimensionality of a single embedding vector.
r.shape

(1536,)

In [76]:
# Total number of chunks the splitter produced.
len(all_splits)

516

In [77]:
# Embed the first 10 chunks only; each chunk costs one embeddings request.
embeddings = [
    embedding(chunk.page_content)
    for chunk in all_splits[:10]
]

In [78]:
# Peek at the first chunk's embedding vector.
embeddings[0]

array([-0.00860656, -0.03344117, -0.00994162, ..., -0.00637715,
        0.01088782, -0.01173681])

In [91]:
# Stack the vectors along axis 1 so each embedding becomes one COLUMN:
# the result has one row per embedding dimension and one column per chunk.
matrix = np.stack(embeddings, axis=1)

In [92]:
# Expect (embedding_dim, number_of_chunks).
matrix.shape

(1536, 10)

In [93]:
# Display the stacked matrix (rows = embedding dimensions, columns = chunks).
matrix

array([[-0.00860656, -0.01706297, -0.02913401, ..., -0.01411097,
        -0.0001142 , -0.00353322],
       [-0.03344117, -0.01177839, -0.01107629, ..., -0.02933733,
        -0.01172377, -0.01499969],
       [-0.00994162, -0.00939332, -0.00927052, ..., -0.0210627 ,
        -0.00054143, -0.01723583],
       ...,
       [-0.00637715, -0.0134887 ,  0.0095189 , ..., -0.00801524,
        -0.00730249, -0.02286386],
       [ 0.01088782,  0.02120512,  0.01858132, ..., -0.00428647,
        -0.00589397,  0.00584159],
       [-0.01173681, -0.00577562, -0.03509507, ..., -0.00937057,
        -0.01207671, -0.00096104]])

In [94]:
# Show chunk 3, whose embedding is reused as the query vector further down.
all_splits[3].page_content

"accounting standards provided pursuant to Section 13(a) of the Exchange Act.\n¨ \n• whether the registrant has filed a report on and attestation to its management's assessment of the effectiveness of its internal control over financial\nreporting under Section 404(b) of the Sarbanes-Oxley Act (15 U.S.C. 7262(b)) by the registered public accounting firm that prepared or issued its audit\nreport.\nþ \n• if securities are registered pursuant to Section 12(b) of the Act, whether the financial statements of the registrant included in the filing reflect the\ncorrection of an error to previously issued financial statements.\n¨ \n• whether any of those error corrections are restatements that required a recovery analysis of incentive-based compensation received by any of the\nregistrant's executive officers during the relevant recovery period pursuant to § 240.10D-1(b).\n¨ \n• whether the registrant is a shell company (as defined in Rule 12b-2 of the Act). ☐ þ"

In [54]:
# NOTE(review): this cell's execution count (54) is out of sequence with its
# neighbours, and the following cell reassigns `query_vector`, so the value
# computed here is not the one the similarity cells operate on.
query_vector = embedding("accounting standards provided pursuant to Section 13(a) of the Exchange Act")

In [105]:
# Use the stored embedding of chunk 3 (column 3) as the query, so the best
# match is expected to be chunk 3 itself.
query_vector = matrix[:, 3]

In [106]:
# Cosine similarity of the query against every column of `matrix`:
# dot product divided by (norm of query * norm of each column).
cosines = (query_vector @ matrix) / (
    np.linalg.norm(query_vector) * np.linalg.norm(matrix, axis=0)
)

In [107]:
# Numerator of the cosine formula on its own: the query dotted with every column.
# Entry 3 is the query dotted with itself.
query_vector@matrix

array([0.78950021, 0.84323744, 0.86904014, 0.99999998, 0.83298677,
       0.77478341, 0.78846501, 0.82199439, 0.8085074 , 0.75204297])

In [108]:
# Norm of the query vector: the first factor in the cosine denominator.
np.linalg.norm(query_vector)

0.999999988468154

In [109]:
# NOTE(review): axis=1 takes the norm across the chunks for each embedding
# dimension (one value per ROW of `matrix`). The cosine computation uses
# axis=0 (one norm per chunk COLUMN). Confirm which one was meant here.
np.linalg.norm(matrix, axis=1)

array([0.06193406, 0.0599555 , 0.0364599 , ..., 0.04731287, 0.03470184,
       0.05970752])

In [110]:
# Column indices of the least and the most similar chunk to the query.
cosines.argmin(), cosines.argmax()

(9, 3)

In [119]:
class VectorStore:
    """In-memory cosine-similarity index over a fixed set of embedded texts.

    The vectors are stored as unit-length columns of ``self.matrix`` so that a
    single matrix product with a query yields every similarity at once.
    Instances are callable: ``store(text, top_k)`` forwards to :meth:`query`.
    """

    def __init__(self, vectors: list[np.ndarray], content: list[str], embedding: "Embedding"):
        """Build the index.

        Args:
            vectors: One 1-D vector per text, all of the same length.
            content: The texts, in the same order as ``vectors``.
            embedding: Callable mapping a query string to a 1-D vector of the
                same length as the stored vectors.

        Raises:
            AssertionError: If ``vectors`` and ``content`` differ in length.
        """
        assert len(vectors) == len(content), "vectors and content must have the same length"
        stacked = np.stack(vectors, axis=1)
        norms = np.linalg.norm(stacked, axis=0)
        # A zero vector has no direction. Dividing by its zero norm would turn
        # the column into NaN, and NaN sorts last in argsort, which would rank
        # it as the BEST match. Keep such columns as zeros (similarity 0).
        norms[norms == 0] = 1.0
        self.matrix = stacked / norms
        self.content = content
        self.embedding = embedding

    def __call__(self, *args, **kwds):
        """Forward to :meth:`query` so the store can be used like a function."""
        return self.query(*args, **kwds)

    def query(self, text: str, top_k: int = 1) -> list[str]:
        """Return the stored texts most similar to ``text``.

        Args:
            text: Query string; embedded with the store's ``embedding``.
            top_k: Maximum number of results. Values larger than the number
                of stored texts return all of them; ``0`` returns ``[]``.

        Returns:
            Up to ``top_k`` texts ordered from most to least similar.

        Raises:
            ValueError: If ``top_k`` is negative.
        """
        if top_k < 0:
            raise ValueError(f"top_k must be non-negative, got {top_k}")
        if top_k == 0:
            # `[-0:]` would select everything, so handle zero explicitly
            # (this also avoids a pointless embedding request).
            return []
        query_vector = self.embedding(text)
        query_norm = np.linalg.norm(query_vector)
        if query_norm == 0:
            # All dot products are already 0; avoid 0/0 producing NaN scores.
            query_norm = 1.0
        # Columns are unit length, so only the query needs normalising here.
        cosines = (query_vector @ self.matrix) / query_norm
        idxs = cosines.argsort()[-top_k:]
        return [self.content[idx] for idx in idxs[::-1]]

In [120]:
# Index the same 10 chunks that were embedded, pairing each vector with its text.
store = VectorStore(
    vectors=embeddings,
    content=[chunk.page_content for chunk in all_splits[:10]],
    embedding=embedding,
)

In [121]:
# Calling the store forwards to VectorStore.query: returns the 3 best-matching
# chunk texts, most similar first.
store("Section 13(a) of the Exchange Act", top_k=3)

["accounting standards provided pursuant to Section 13(a) of the Exchange Act.\n¨ \n• whether the registrant has filed a report on and attestation to its management's assessment of the effectiveness of its internal control over financial\nreporting under Section 404(b) of the Sarbanes-Oxley Act (15 U.S.C. 7262(b)) by the registered public accounting firm that prepared or issued its audit\nreport.\nþ \n• if securities are registered pursuant to Section 12(b) of the Act, whether the financial statements of the registrant included in the filing reflect the\ncorrection of an error to previously issued financial statements.\n¨ \n• whether any of those error corrections are restatements that required a recovery analysis of incentive-based compensation received by any of the\nregistrant's executive officers during the relevant recovery period pursuant to § 240.10D-1(b).\n¨ \n• whether the registrant is a shell company (as defined in Rule 12b-2 of the Act). ☐ þ",
 'SECURITIES REGISTERED PURSUA