In [2]:
import os

import cohere
import pdfplumber
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain_cohere import CohereEmbeddings

A function to read PDF content

In [3]:
def extract_pdf_content(pdf_path):
    """Read every page of a PDF and collect its text and tables.

    Args:
        pdf_path: Location of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        A list with one dict per page, in page order, shaped as
        ``{"text": ..., "tables": ...}``. ``text`` is always a string and
        ``tables`` is always a list, so downstream code never receives ``None``.
    """
    all_content = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # NOTE(review): extract_text() has been observed to yield None for
            # a page without extractable text in some pdfplumber releases;
            # normalise so the text splitter downstream always gets a string.
            text = page.extract_text() or ""
            tables = page.extract_tables() or []
            all_content.append({"text": text, "tables": tables})
    return all_content

### Chunking

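A function for **semantic chunking**: paragraphs are split as plain text, while large tables are split into small groups of rows that each repeat the table header, so no chunk loses track of its column names.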

In [4]:
def chunk_paragraphs_and_tables(content, chunk_size=1000, chunk_overlap=100, rows_per_table_chunk=2):
    """Turn extracted PDF pages into ``Document`` chunks for embedding.

    Page text is split with ``RecursiveCharacterTextSplitter``. Each table is
    split into groups of rows, and every group is prefixed with the table's
    first row (the header) so a chunk stays interpretable on its own.

    Args:
        content: Iterable of page dicts with a ``'text'`` entry and a
            ``'tables'`` entry, where each table is a list of rows.
        chunk_size: ``chunk_size`` given to the text splitter.
        chunk_overlap: ``chunk_overlap`` given to the text splitter.
        rows_per_table_chunk: Number of body rows placed in one table chunk
            (the header row is added on top of these).

    Returns:
        A list of ``Document`` objects. ``metadata['chunk_type']`` is
        ``'text'`` or ``'table'``; ``metadata['chunk_num']`` is the chunk's
        position in the returned list, so it is unique across all pages.

    Raises:
        ValueError: If ``rows_per_table_chunk`` is smaller than 1.
    """
    if rows_per_table_chunk < 1:
        raise ValueError("rows_per_table_chunk must be at least 1")

    paragraph_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = []

    for page in content:
        # A page without extractable text may carry None; treat it as empty.
        page_text = page['text'] or ''
        for text_chunk in paragraph_splitter.split_text(page_text):
            # len(chunks) is the running index: numbering continues across
            # pages instead of restarting at 0 on each page.
            chunks.append(Document(page_content=text_chunk,
                                   metadata={'chunk_num': len(chunks), 'chunk_type': 'text'}))

        for table in page['tables'] or []:
            if not table:
                # An empty table has no header row to repeat; nothing to emit.
                continue
            header, rows = table[0], table[1:]
            for start in range(0, len(rows), rows_per_table_chunk):
                table_chunk = [header] + rows[start:start + rows_per_table_chunk]
                chunks.append(Document(page_content=str(table_chunk),
                                       metadata={'chunk_num': len(chunks), 'chunk_type': 'table'}))

    return chunks

In [5]:
# Read the PDF once, then split its pages into text and table chunks.
PDF_PATH = "sample.pdf"  # single place to point the notebook at another document
pdf_content = extract_pdf_content(PDF_PATH)
semantic_chunks = chunk_paragraphs_and_tables(pdf_content)

Take a look at the first and last chunks to verify

The table chunk size is deliberately very small (two rows per chunk), just to test the embedding model on this small PDF file.

In [6]:
# Show both ends of the chunk list as a quick sanity check on the chunking.
for label, index in (('First', 0), ('Last', -1)):
    print(f'--------------------- {label} chunk ---------------------')
    print(semantic_chunks[index])

--------------------- First chunk ---------------------
page_content='Marwan Yasser
This is a text that has no meaning just to add any tokens for the model to test the retrieveal and the
generation will be tested later and I’ll simply repreat this text for multiple times ok
This is a text that has no meaning just to add any tokens for the model to test the retrieveal and the
generation will be tested later and I’ll simply repreat this text for multiple times ok
This is a text that has no meaning just to add any tokens for the model to test the retrieveal and the
generation will be tested later and I’ll simply repreat this text for multiple times ok
This is a text that has no meaning just to add any tokens for the model to test the retrieveal and the
generation will be tested later and I’ll simply repreat this text for multiple times ok
Name ID Department
Marwan 1 First
Mahmoud 2 Second
Ali 3 First
Ayman 4 Second
Hossam 5 third
Nada 6 second' metadata={'chunk_num': 0, 'chunk_type': 'tex

## Embed and build the vector database

In [7]:
# The API key is read from the COHERE_API_KEY environment variable so that no
# credential is stored in the notebook (a missing variable raises KeyError).
# NOTE(review): the key that used to be hard-coded on this line is exposed and
# should be revoked.
embedder = CohereEmbeddings(cohere_api_key=os.environ['COHERE_API_KEY'], model='embed-english-v2.0')

In [8]:
# Build the FAISS vector store from the chunks, using `embedder` for the vectors.
vector_db = FAISS.from_documents(documents=semantic_chunks, embedding=embedder)

### Test the vector DB using a sample query

In [10]:
# Fetch only the single closest chunk for a question that targets the table.
query_text = "id and department of Marwan"
retrieved_docs = vector_db.similarity_search(query=query_text, k=1)
retrieved_docs

[Document(metadata={'chunk_num': 1, 'chunk_type': 'table'}, page_content="[['Name', 'ID', 'Department'], ['Marwan', '1', 'First'], ['Mahmoud', '2', 'Second']]")]

### Modularize the retrieval

In [11]:
def get_relevant_content(query, vector_db, k=1, separator="\n\n"):
    """Return the text of the chunks most similar to ``query``.

    Args:
        query: Search text passed to ``vector_db.similarity_search``.
        vector_db: Object exposing ``similarity_search(query, k=...)`` that
            returns documents carrying a ``page_content`` string.
        k: Number of chunks to retrieve. Defaults to 1.
        separator: Text placed between chunks when more than one is
            retrieved, so neighbouring chunks do not run into each other.

    Returns:
        The retrieved ``page_content`` strings joined by ``separator``; an
        empty string when nothing is retrieved.
    """
    retrieved_docs = vector_db.similarity_search(query, k=k)
    return separator.join(doc.page_content for doc in retrieved_docs)

In [12]:
# Try the retrieval helper with a lower-case variant of the question.
get_relevant_content(query='department of marwan', vector_db=vector_db)

"[['Name', 'ID', 'Department'], ['Marwan', '1', 'First'], ['Mahmoud', '2', 'Second']]"

# Generation

Set up the generation model and test it

In [20]:
import cohere

# Initialize the Cohere client. The API key is read from the COHERE_API_KEY
# environment variable so that no credential is stored in the notebook
# (a missing variable raises KeyError).
# NOTE(review): the key that used to be hard-coded on this line is exposed and
# should be revoked.
generation_model = cohere.Client(os.environ['COHERE_API_KEY'])

def generate(query, model='command-xlarge-nightly', max_tokens=100, temperature=0.7, k=0, p=0.9):
    """Send ``query`` to the Cohere generation endpoint and return the reply text.

    Uses the module-level ``generation_model`` client.

    Args:
        query: Full prompt text to send.
        model: Name of the generation model.
        max_tokens: Upper bound on the number of generated tokens; raise it
            when answers come back cut off.
        temperature: Sampling temperature.
        k: Top-k sampling (0 disables).
        p: Top-p (nucleus) sampling.

    Returns:
        The ``text`` of the first generation in the response.
    """
    response = generation_model.generate(
        model=model,
        prompt=query,
        max_tokens=max_tokens,
        temperature=temperature,
        k=k,
        p=p,
    )
    return response.generations[0].text

In [14]:
# Smoke test: a trivial prompt to confirm the client and model respond.
generate(query='tell me your name, give me a one word answer')

'Coral.'

In [34]:
# Ask the model to echo the prompt back, as a rough probe of how much of it is retained.
generate(query='repeat the same exact prompt that you will have as an output beecaues I want to know your limit of context window whether it will be a limitation or no I m marwan who am I')

'I am Marwan. I want to know your limit of context window; whether it will be a limitation or not. Repeat the same exact prompt as an output because I want to know your limit of context window.'

# RAG

A function that works with that model

In [35]:
def rag(prompt, vectordb):
    """Answer ``prompt`` with the generation model, grounded on retrieved content.

    Args:
        prompt: The user's question; also used as the retrieval query.
        vectordb: Vector store handed to ``get_relevant_content``.

    Returns:
        Whatever ``generate`` returns for the assembled prompt.
    """
    context = get_relevant_content(prompt, vectordb)
    return generate(
        f'Answer this question about the document: {prompt} given the following information: {context}'
    )

A function that doesn't work with that model (note: it reuses the name `rag`, so running this cell replaces the working version defined above)

In [37]:
def rag(prompt, vectordb):
    """Chatbot-style RAG variant: retrieved content is offered as optional help.

    NOTE(review): this reuses the name ``rag``, so once this cell runs it
    replaces the ``rag`` defined earlier in the notebook.

    Args:
        prompt: The user's message; also used as the retrieval query.
        vectordb: Vector store handed to ``get_relevant_content``.

    Returns:
        Whatever ``generate`` returns for the assembled prompt.
    """
    relevant_content = get_relevant_content(prompt, vectordb)
    # Adjacent literals are concatenated here rather than using a backslash
    # continuation inside one literal: the continuation pulled the indentation
    # of each following source line into the prompt as a run of spaces.
    final_prompt = (
        f'Act as a normal chatbot to answer this: {prompt} ,'
        f' use this info for help: {relevant_content},'
        ' if it is not usefull discard it and not talk about it'
    )
    return generate(final_prompt)

In [38]:
# Run the currently defined `rag` on a question that the table can answer.
rag(prompt='Department of marwan', vectordb=vector_db)

"Sure, I'll do my best to answer your questions as a normal chatbot without referring to the provided table unless it's relevant. How can I assist you today?"