In [None]:
# %pip install pypdf mistralai faiss-cpu

## Read pdf


In [None]:
from pathlib import Path
from pypdf import PdfReader


pdf_files = Path("data").glob("*.pdf")
text = ""

# Concatenate the text of every page of every PDF found in data/.
# The page loop has to be nested inside the file loop: at top level it would
# run once, after the file loop ends, and only see the last PDF's reader.
for pdf_file in pdf_files:
    reader = PdfReader(pdf_file)
    for page in reader.pages:
        # a blank line separates consecutive pages in the combined text
        text += page.extract_text() + "\n\n"

In [None]:
# Sanity check: the first 100 characters, then the total character count.
print(text[:100], len(text), sep="\n")

Benedict Neo
/envel⌢pebenedict.neo@outlook.com /linkedinin/benedictneo /github@benthecoder
Education
3817


## chunking


In [None]:
# Fixed-width character windows; the final chunk may be shorter than chunk_size.
chunk_size = 500
chunks = [
    text[start : start + chunk_size]
    for start in range(0, len(text), chunk_size)
]
len(chunks)

8

In RAG, we need to split documents into smaller chunks so that it is easier to identify and retrieve the most relevant information.

depending on the use case, a smaller chunk size will be beneficial for RAG to identify and extract relevant information more accurately, as larger text chunks can contain filler text that obscures the semantic representation

Here, we combine 500 characters into one chunk, and we get 8 chunks.


## embed


For each text chunk, we create a text embedding, which is a numerical representation of the text in a vector space.

Words with similar meanings are closer to each other in this space.

To create embeddings, we use Mistral AI's embeddings API endpoint.

We create a simple `embed` function that returns the embedding of a single chunk, then store the embeddings of all chunks in a NumPy array.


In [None]:
from mistralai.client import MistralClient
import numpy as np

import os

# Read the API key from the MISTRAL_API_KEY environment variable so the secret
# does not have to be written into the notebook. The original placeholder is
# kept as the fallback for readers who prefer to paste a key here.
client = MistralClient(api_key=os.environ.get("MISTRAL_API_KEY", "YOUR_MISTRAL_KEY"))


def embed(input: str):
    """Return the `mistral-embed` embedding of one piece of text.

    Sends `input` to the module-level `client` and returns the `embedding`
    attribute of the first entry in the response's `data`.
    """
    response = client.embeddings("mistral-embed", input=input)
    first_result = response.data[0]
    return first_result.embedding


# One embed() call per chunk; the resulting vectors are stacked row by row.
embeddings = np.array(list(map(embed, chunks)))
# Size of the second axis, i.e. the length of each embedding vector.
dimension = np.shape(embeddings)[1]

In [None]:
# Display the embedding matrix built above (one row per chunk).
embeddings

array([[-0.03314209,  0.03010559,  0.03341675, ..., -0.00093889,
         0.03649902,  0.01657104],
       [-0.03050232,  0.06610107,  0.06039429, ..., -0.02508545,
        -0.00403595, -0.02178955],
       [-0.02389526,  0.06365967,  0.04605103, ..., -0.02262878,
        -0.00494003, -0.02874756],
       ...,
       [-0.02015686,  0.03216553,  0.04882812, ..., -0.01455688,
         0.00720596, -0.0216217 ],
       [-0.00782013,  0.043396  ,  0.0413208 , ..., -0.02740479,
        -0.002491  , -0.02764893],
       [-0.0513916 ,  0.03872681,  0.03271484, ..., -0.01724243,
         0.03497314, -0.01080322]])

In [None]:
# Length of each embedding vector (second axis of `embeddings`).
dimension

1024

## vector db


once we have the embeddings, we store them in a vector db for efficient processing and retrieval.

Here we use Faiss, an open-source library for efficient vector similarity search developed by Meta, as our vector store.

we create an index to store our embeddings.

look at the different indexes: https://github.com/facebookresearch/faiss/wiki/Faiss-indexes


In [None]:
import faiss

# Flat (exhaustive) L2 index sized to the embedding dimension.
d = embeddings.shape[1]
index = faiss.IndexFlatL2(d)
# Faiss works on float32 matrices; cast explicitly instead of relying on the
# Python wrapper to convert whatever dtype np.array() produced.
index.add(embeddings.astype(np.float32))

## query


When a user asks a question, we embed the question using the same model as before.


In [None]:
question = "Who is Benedict Neo?"
# Embed the question with the same helper used for the chunks and shape the
# vector as a single-row 2-D array (a batch containing one query).
question_embeddings = np.array(embed(question)).reshape(1, -1)

## retrieval


We search our vector index using `index.search`. It takes two parameters: the embedding of our question and `k`, the number of similar vectors to retrieve.

The function returns the distances (D) and indices (I) of the most similar vectors, and based on the indices, we can return the actual text.


In [None]:
# D holds the distances and I the row indices of the k nearest stored vectors,
# one result row per query row.
D, I = index.search(question_embeddings, k=2)  # distance, index
# Faiss pads the result with -1 when the index holds fewer than k vectors;
# skip those entries so chunks[-1] (the last chunk) is not returned by accident.
retrieved_chunk = [chunks[i] for i in I.tolist()[0] if i >= 0]

## Create prompt


we create a prompt template that combines the chunk and the question


In [None]:
# Join the retrieved chunks into plain text first: interpolating the list
# itself would put its Python repr (brackets, quotes, escaped newlines) into
# the prompt instead of the readable document text.
retrieved_context = "\n\n".join(retrieved_chunk)

prompt = f"""
Context information is below.
---------------------
{retrieved_context}
---------------------
Given the context information and not prior knowledge, answer the query.
Query: {question}
Answer:
"""

## Chat model


using the mistral chat completion API with a mistral model, here we're using `mistral-medium`, we generate an answer based on the user question and the context retrieved


In [None]:
from mistralai.client import ChatMessage


def run_mistral(user_message, model="mistral-medium"):
    """Send `user_message` as a single user turn and return the reply text.

    The request goes through the module-level `client`; the returned value is
    the message content of the first choice in the chat response.
    """
    user_turn = ChatMessage(role="user", content=user_message)
    completion = client.chat(model=model, messages=[user_turn])
    first_choice = completion.choices[0]
    return first_choice.message.content


# Send the assembled prompt (retrieved context + question) to the chat model.
run_mistral(prompt)

'Benedict Neo is a current student at Iowa State University pursuing a Bachelor of Science in Statistics with a minor in Computer Science, expected to graduate in December 2023. He has a GPA of 3.95. His coursework includes Experimental Design, Bayesian Statistics, Design & Analysis of Algorithms, and Large-scale Data Analysis.\n\nBenedict has worked as an Undergraduate Research Assistant at Iowa State University since January 2022. In this role, he led the development of the WEPPR R package, improved search speed by implementing concurrent processing in FastAPI, and built Svelte components to render search results in a user-friendly design.\n\nHis skills include programming languages such as Python, R, SQL, SAS, JavaScript (React), Java, HTML/CSS, and Bash. He is also proficient in libraries such as Pandas, NumPy, Matplotlib, Plotly, Tidyverse, Scikit-Learn, NLTK, PyTorch, and PySpark, and tools such as AWS, Google Cloud, Docker, Power BI, Tableau, Git, Linux, Hadoop, Spark, and Airfl

## all together now


In [None]:
from faiss import IndexFlatL2

# Reusable prompt template: `{context}` and `{query}` are filled in by
# `prompt.format(...)` inside `ask`. Note that this rebinds the name `prompt`,
# replacing the already-formatted prompt string built earlier in the notebook.
prompt = """
Context information is below.
---------------------
{context}
---------------------
Given the context information and not prior knowledge, answer the query.
Query: {query}
Answer:
"""


def ask(
    query: str,
    index: IndexFlatL2,
    chunks,
    k: int = 2,
    model: str = "mistral-medium",
):
    """Answer `query` using the chunks most similar to it as context.

    Args:
        query: The user's question.
        index: Faiss index holding one embedding per entry of `chunks`.
        chunks: Text chunks, in the same order they were added to `index`.
        k: Number of nearest chunks to retrieve as context.
        model: Name of the Mistral chat model used to generate the answer.

    Returns:
        The message content of the first choice in the chat response.
    """
    # Single-row float32 matrix: Faiss works on float32 and searches in batches.
    query_vector = np.array([embed(query)], dtype=np.float32)

    _, indexes = index.search(query_vector, k=k)
    # Faiss pads with -1 when fewer than k vectors are stored; skip those so
    # chunks[-1] is not picked up by accident. Joining the chunks keeps the
    # prompt free of a Python list repr (brackets, quotes, escaped newlines).
    context = "\n\n".join(chunks[i] for i in indexes.tolist()[0] if i >= 0)

    user_message = prompt.format(context=context, query=query)

    messages = [ChatMessage(role="user", content=user_message)]
    chat_response = client.chat(model=model, messages=messages)
    return chat_response.choices[0].message.content


# Example query against the index and chunks built earlier in the notebook.
ask("What work experience does he have?", index, chunks)

"Based on the provided context information, the individual has experience as a President of the Google Developer Student Club at Iowa State University from August 2022 to May 2023. In this role, they led a team of 12 core officers to organize and host tech workshops and talks for over 100 students.\n\nAdditionally, they have experience as a Data Analyst Intern at Tesla from May 2022 to August 2022. Their responsibilities at Tesla included architecting and deploying Airflow ETL pipelines with Docker on Linux, designing and optimizing MySQL database schemas, developing Python packages interfacing with various APIs, and building an interactive web app with Streamlit for failure analysis using text embeddings and NLP techniques.\n\nIt's worth noting that the context information does not specify the duration of the individual's experience as a Data Analyst at a company with revenue up to $1 Million/month, but it can be inferred that they have experience with data visualization, hypothesis t

In [None]:
# A yes/no question about a specific skill.
ask("Does he know how to code in Python?", index, chunks)

'Yes, the individual listed Python as one of the languages they know how to code in.'

In [None]:
# An open-ended question; the answer is limited to the chunks retrieved by `ask`.
ask("What projects has he worked on?", index, chunks)

'The person has worked on two projects based on the provided context information.\n\n1. A project related to analyzing work orders text using NLP and machine learning algorithms for categorizing failures. Additionally, they developed 10 PowerBI dashboards, optimized data models using DAX queries, and analyzed shift hours and badging data for peak hour optimization.\n2. A GitHub project called "githubClassGPT" that involved Python, LlamaIndex, LangChain, AWS S3, OpenAI, Docker, and Streamlit. They developed a chatbot with Netlify and Cloud Run, improved search speed using concurrent processing in FastAPI, and built Svelte components to render search results in a user-friendly design.'

In [None]:
# A question about the person's current job-search status.
ask("Is he on the job market?", index, chunks)

'Based on the provided context information, it is not explicitly stated whether the individual is currently on the job market or not. The information only mentions their skills, experiences, and past leadership roles. To determine their employment status, additional context or information is needed.'

## Streamlit


### building the index


In [None]:
# Function to build and cache the index from PDFs in a directory
@st.cache_resource
def build_and_cache_index(data_dir="data", chunk_size=500):
    """Build a Faiss index over the text of every PDF in `data_dir`.

    Args:
        data_dir: Directory searched (non-recursively) for `*.pdf` files.
        chunk_size: Number of characters per text chunk.

    Returns:
        A tuple `(index, chunks)`: an `IndexFlatL2` holding one embedding per
        chunk, and the list of chunk strings in the same order.

    Raises:
        ValueError: If no text could be extracted, since an index cannot be
            built from zero vectors.
    """
    # sorted() fixes the file order, so chunk positions do not depend on the
    # order in which the filesystem happens to list the files.
    pdf_files = sorted(Path(data_dir).glob("*.pdf"))

    pages = []
    for pdf_file in pdf_files:
        reader = PdfReader(pdf_file)
        for page in reader.pages:
            # a blank line separates consecutive pages in the combined text
            pages.append(page.extract_text() + "\n\n")
    text = "".join(pages)

    chunks = [text[i : i + chunk_size] for i in range(0, len(text), chunk_size)]
    if not chunks:
        raise ValueError(f"No text could be extracted from PDFs in {str(data_dir)!r}")

    # Faiss works on float32 matrices, so build the array in that dtype.
    embeddings = np.array([embed(chunk) for chunk in chunks], dtype=np.float32)
    dimension = embeddings.shape[1]
    index = IndexFlatL2(dimension)
    index.add(embeddings)

    return index, chunks

### Streaming


In [None]:
# Function to stream a string with a delay
def stream_str(s, speed=250):
    """Yield the characters of `s` one at a time to simulate streaming.

    Args:
        s: The string to stream.
        speed: Characters per second; the generator sleeps `1 / speed`
            seconds after yielding each character.

    Yields:
        Each character of `s`, in order.
    """
    # Imported locally so the function works even when the surrounding module
    # has not imported `time`.
    import time

    for c in s:
        yield c
        time.sleep(1 / speed)


# Function to stream the response from the AI
def stream_response(response):
    """Yield the text of each streamed chat chunk, escaped for markdown display.

    Args:
        response: Iterable of streaming chunks; the text of each chunk is read
            from `chunk.choices[0].delta.content`.

    Yields:
        The chunk text with every "$" replaced by a backslash-escaped "$".
        Chunks whose content is None are skipped.
    """
    for chunk in response:
        content = chunk.choices[0].delta.content
        # A chunk may carry no text; calling .replace on None would raise.
        if content is None:
            continue
        # prevent $ from rendering as LaTeX ("\\$" is backslash + dollar; the
        # single-backslash spelling is an invalid escape sequence)
        yield content.replace("$", "\\$")

### messages


In [None]:
# Function to add a message to the chat
def add_message(msg, agent="ai", stream=True, store=True):
    """Render one chat message and optionally record it in the session history.

    When `stream` is true the message is written with `st.write_stream`; a
    plain string is first turned into a character stream via `stream_str`.
    Otherwise the message is written in one go with `st.write`. With `store`
    set, the rendered content is appended to `st.session_state.messages`
    together with the agent name.
    """
    needs_char_stream = stream and isinstance(msg, str)
    payload = stream_str(msg) if needs_char_stream else msg

    with st.chat_message(agent):
        if not stream:
            output = payload
            st.write(payload)
        else:
            # the value returned by write_stream is what gets stored
            output = st.write_stream(payload)

    if not store:
        return
    st.session_state.messages.append({"agent": agent, "content": output})

### main


In [None]:
# Main application logic
def main():
    """Run one pass of the chat app: render history, then handle new input.

    Initialises the message history if needed, honours the sidebar reset
    button, loads the cached index, replays stored messages, shows a greeting
    when the history is empty, and answers a newly submitted query.
    """
    # Session state starts without a `messages` entry; create it before any
    # read so the history loop below does not fail on the first run.
    if "messages" not in st.session_state:
        st.session_state.messages = []

    if st.sidebar.button("🔴 Reset conversation"):
        st.session_state.messages = []

    index, chunks = build_and_cache_index()

    # Replay the stored conversation
    for message in st.session_state.messages:
        with st.chat_message(message["agent"]):
            st.write(message["content"])

    query = st.chat_input("Ask something about your PDF")

    # Greet the user while the history is still empty
    if not st.session_state.messages:
        add_message("Ask me anything!")

    if query:
        add_message(query, agent="human", stream=False, store=True)
        # NOTE(review): `reply` is not defined in any cell shown here —
        # confirm it is provided by the app module before running.
        reply(query, index, chunks)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=a605a3e6-1564-47b2-94e7-842290ba7692' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>