In [29]:
from langchain_community.document_loaders import PyPDFLoader

In [30]:
loader = PyPDFLoader("../openai_dev/data/nke-10k-2023.pdf")

In [31]:
doc = loader.load()

In [32]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, add_start_index=True)
splits = splitter.split_documents(doc)

In [33]:
len(splits)

516

In [34]:
from openai import OpenAI
import numpy as np

class Embedder:
    def __init__(self):
        self.client = OpenAI()

    def embed(self, text:str) -> np.ndarray:
        vector = self.client.embeddings.create(model="text-embedding-ada-002", input=text, encoding_format='float')
        return vector.data[0].embedding

In [35]:
embedder = Embedder()

In [36]:
embedder.embed("Hello, world!")

[0.001459866,
 0.0034020946,
 -0.013020599,
 -0.03340122,
 -0.00949392,
 0.0048747384,
 -0.015461163,
 0.0018767423,
 -0.0029820239,
 -0.024993416,
 0.029925656,
 0.007174746,
 -0.016917834,
 -0.018016726,
 0.010420312,
 -0.0030027877,
 0.02503175,
 -0.015205606,
 0.011423372,
 0.011014481,
 -0.008311972,
 -0.0019070897,
 0.017096724,
 0.0057532135,
 -0.014323937,
 -0.0075644697,
 0.0035458452,
 -0.015818942,
 0.037439015,
 -0.026053976,
 0.010056145,
 -0.0065933554,
 -0.004823627,
 -0.01383838,
 0.011819484,
 -0.019179508,
 0.005018489,
 -0.011506427,
 0.019498954,
 -0.011787539,
 0.004807655,
 0.0056701577,
 0.0031305659,
 -0.006126965,
 -0.0267312,
 0.008241694,
 0.005440157,
 -0.009219198,
 -0.0062068263,
 0.02045729,
 0.020585068,
 -0.012094207,
 -0.0134806,
 0.00785197,
 -0.018796174,
 -0.0004240639,
 -0.03838457,
 0.024060635,
 0.030998992,
 -0.02233563,
 0.019920621,
 0.008126694,
 -0.023166189,
 0.009921977,
 -0.0100817,
 0.0050121,
 0.014579493,
 0.013966157,
 -0.022207852,
 

In [37]:
embed_vectors = [embedder.embed(split.page_content) for split in splits[:10]]

In [49]:
class RAG:
    def __init__(self, embed_vectors:list[np.ndarray], content:list[str]):
        self.client = OpenAI()
        self.embedder = Embedder()
        self.embed_matrix = np.stack(embed_vectors, 1)
        self.embed_matrix /= np.linalg.norm(self.embed_matrix, axis=0, keepdims=True)
        self.content = content

    def query(self, query:str, top_k:int=1) -> list[str]:
        embed_query = self.embedder.embed(query)
        embed_query /= np.linalg.norm(embed_query)
        similarities = embed_query @ self.embed_matrix
        idxs = np.argsort(similarities)[::-1][:top_k]
        return [self.content[idx] for idx in idxs]

In [50]:
content = [split.page_content for split in splits[:10]]

In [51]:
content[3]

"accounting standards provided pursuant to Section 13(a) of the Exchange Act.\n¨ \n• whether the registrant has filed a report on and attestation to its management's assessment of the effectiveness of its internal control over financial\nreporting under Section 404(b) of the Sarbanes-Oxley Act (15 U.S.C. 7262(b)) by the registered public accounting firm that prepared or issued its audit\nreport.\nþ \n• if securities are registered pursuant to Section 12(b) of the Act, whether the financial statements of the registrant included in the filing reflect the\ncorrection of an error to previously issued financial statements.\n¨ \n• whether any of those error corrections are restatements that required a recovery analysis of incentive-based compensation received by any of the\nregistrant's executive officers during the relevant recovery period pursuant to § 240.10D-1(b).\n¨ \n• whether the registrant is a shell company (as defined in Rule 12b-2 of the Act). ☐ þ"

In [52]:
rag = RAG(embed_vectors, content)

In [53]:
rag.query("Section 13(a) of the Exchange Act", top_k=3)

["accounting standards provided pursuant to Section 13(a) of the Exchange Act.\n¨ \n• whether the registrant has filed a report on and attestation to its management's assessment of the effectiveness of its internal control over financial\nreporting under Section 404(b) of the Sarbanes-Oxley Act (15 U.S.C. 7262(b)) by the registered public accounting firm that prepared or issued its audit\nreport.\nþ \n• if securities are registered pursuant to Section 12(b) of the Act, whether the financial statements of the registrant included in the filing reflect the\ncorrection of an error to previously issued financial statements.\n¨ \n• whether any of those error corrections are restatements that required a recovery analysis of incentive-based compensation received by any of the\nregistrant's executive officers during the relevant recovery period pursuant to § 240.10D-1(b).\n¨ \n• whether the registrant is a shell company (as defined in Rule 12b-2 of the Act). ☐ þ",
 'SECURITIES REGISTERED PURSUA