In [1]:
## !pip install -U ipykernel jupyter ipywidgets numpy langchain_community
## !pip install -U sentence_transformers numpy pandas unstructured openpyxl shutup PyPDF2

### Importing general Libraries
import os
import warnings
# Hide FutureWarning messages so they do not clutter the cell outputs.
warnings.filterwarnings("ignore", category=FutureWarning)
# NOTE(review): `shutup` is a third-party package; presumably this call silences the remaining warnings — confirm.
import shutup; shutup.please()

import pandas as pd
import numpy as np
import PyPDF2



In [2]:
# Folder in which the PDF files to be indexed are stored; created if it does not exist yet.
# The variable is deliberately NOT called `map`: that name would shadow Python's
# built-in map() for every cell executed afterwards.
pdf_dir = "LAWTON"
os.makedirs(pdf_dir, exist_ok=True)

In [3]:
from langchain_community.document_loaders import PyPDFDirectoryLoader, PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read every PDF found in the local ./LAWTON folder.
loader = PyPDFDirectoryLoader("./LAWTON")
docs_before_split = loader.load()

# Cut the loaded documents into overlapping chunks
# (chunk_size=900, chunk_overlap=50 — presumably measured in characters; confirm against the splitter docs).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=900, chunk_overlap=50)
docs_after_split = text_splitter.split_documents(docs_before_split)

# Show the first chunk as a quick sanity check.
docs_after_split[0]

Document(metadata={'source': 'LAWTON\\QoL reference Lawton.pdf', 'page': 0}, page_content='Original Research Article\nDement Geriatr Cogn Disord 2004;18:159–164\nDOI: 10.1159/000079196\nA Model for Quality of Life Measures in\nPatients with Dementia:\nLawton’s Next Step\nC. Jonkera,b D.L. Gerritsenb P.R. Bosboomc J.T. Van der Steenb\naDepartment of Psychiatry and bInstitute for Research in Extramural Medicine, VU University Medical Center, and\ncDepartment of Clinical Neuropsychology, VU University, Amsterdam, The Netherlands\nAccepted: January 22, 2004\nPublished online: June 21, 2004\nProf. Dr. C. Jonker, Behavioral Neurologist\nDepartment of Psychiatry and EMGO Institute\nVU University Medical Center, Van der Boechorststraat 7\nNL–1081 BT Amsterdam (The Netherlands)\nTel. +31 20 4446770, Fax +31 20 4446775, E-Mail c.jonker.emgo@med.vu.nl\nABC\nFax +41 61 306 12 34\nE-Mail karger@karger.ch\nwww.karger.com\n© 2004 S. Karger AG, Basel\n1420–8008/04/0182–0159$21.00/0\nAccessible online 

### Split documents to smaller chunks

Large enough to contain enough information to answer a question.

Small enough to fit into the LLM prompt: Mistral-7B-v0.1 input tokens limit  4960 tokens.

Small enough to fit into the embeddings model: BAAI/bge-small-en-v1.5.
The embedder's input must be limited to 512 tokens.
That is roughly 2000 characters. Note: 1 token is approximately 4 characters.

Here we split documents into chunks of at most 900 characters with an overlap of 50 characters.

In [4]:
def avg_doc_length(docs):
    """Return the average number of characters per document, rounded down.

    Args:
        docs: A sized collection of objects that expose a ``page_content`` string.

    Returns:
        The floor of the mean ``len(doc.page_content)`` as an int, or 0 when
        ``docs`` is empty (for instance when no PDF has been loaded yet), so the
        statistics cell does not fail with ``ZeroDivisionError``.
    """
    if not docs:
        return 0
    # A generator expression avoids building a throw-away list just to sum it.
    return sum(len(doc.page_content) for doc in docs) // len(docs)
# Compare the corpus before and after splitting: item counts and mean length in characters.
avg_char_before_split = avg_doc_length(docs_before_split)
avg_char_after_split = avg_doc_length(docs_after_split)

print(
    f'Before split, there was/were {len(docs_before_split)} document(s) loaded, '
    f'with average characters equal to {avg_char_before_split}.'
)
print(
    f'After split, there were {len(docs_after_split)} documents (chunks), '
    f'with average characters equal to {avg_char_after_split} (average chunk length).'
)

Before split, there was/were 6 document(s) loaded, with average characters equal to 4561.
After split, there were 36 documents (chunks), with average characters equal to 767 (average chunk length).


In [5]:
# Render the first chunk to inspect its metadata and text.
display(docs_after_split[0])

Document(metadata={'source': 'LAWTON\\QoL reference Lawton.pdf', 'page': 0}, page_content='Original Research Article\nDement Geriatr Cogn Disord 2004;18:159–164\nDOI: 10.1159/000079196\nA Model for Quality of Life Measures in\nPatients with Dementia:\nLawton’s Next Step\nC. Jonkera,b D.L. Gerritsenb P.R. Bosboomc J.T. Van der Steenb\naDepartment of Psychiatry and bInstitute for Research in Extramural Medicine, VU University Medical Center, and\ncDepartment of Clinical Neuropsychology, VU University, Amsterdam, The Netherlands\nAccepted: January 22, 2004\nPublished online: June 21, 2004\nProf. Dr. C. Jonker, Behavioral Neurologist\nDepartment of Psychiatry and EMGO Institute\nVU University Medical Center, Van der Boechorststraat 7\nNL–1081 BT Amsterdam (The Netherlands)\nTel. +31 20 4446770, Fax +31 20 4446775, E-Mail c.jonker.emgo@med.vu.nl\nABC\nFax +41 61 306 12 34\nE-Mail karger@karger.ch\nwww.karger.com\n© 2004 S. Karger AG, Basel\n1420–8008/04/0182–0159$21.00/0\nAccessible online 

### Text Embeddings with Hugging Face models
At the time of writing (Nov 2024), there are 213 English text embedding models on the Massive Text Embedding Benchmark (MTEB) leaderboard:

https://huggingface.co/spaces/mteb/leaderboard

 For our project, we are using LangChain’s HuggingFaceBgeEmbeddings (BGE models on Hugging Face).

According to LangChain, these are “the best open-source embedding models”.

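 Currently, the **BAAI/bge-small-en-v1.5** model is 26th on the MTEB leaderboard with max tokens: 512 tokens, embedding dimensions: 384 and model size: 0.13GB. Note that the code cell below actually loads **BAAI/bge-large-en-v1.5**, which produces 1024-dimensional embeddings.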

To use, you should have the sentence_transformers python package installed. To use Nomic, make sure the version of sentence_transformers >= 2.3.0.

https://api.python.langchain.com/en/latest/embeddings/langchain_community.embeddings.huggingface.HuggingFaceBgeEmbeddings.html#langchain-community-embeddings-huggingface-huggingfacebgeembeddings.

In [7]:
import torch
from torch import cuda, bfloat16
from transformers import AutoTokenizer, AutoModel

# Use the current CUDA device when a GPU is present, otherwise fall back to the CPU.
device = f'cuda:{cuda.current_device()}' if cuda.is_available() else 'cpu'
display(torch.cuda.is_available())

### ===> takes about 60 seconds to complete the process

from langchain_community.embeddings import HuggingFaceBgeEmbeddings
### EMBEDDING models
# https://python.langchain.com/docs/integrations/text_embedding/

model_name = "BAAI/bge-large-en-v1.5"
#model_name = "BAAI/bge-small-en-v1.5"
#model_name ="jinaai/jina-embeddings-v3"

# Pass the device detected above instead of a hard-coded 'cuda', so that this
# cell also runs on machines without a GPU.
# NOTE(review): trust_remote_code=True allows model repositories to run their own
# code; presumably only needed for the commented-out jina model — confirm.
model_kwargs = {
    'device': device,
    'trust_remote_code': True,
}

encode_kwargs = {
    'normalize_embeddings': True,
    'truncate': True,  # truncate the input to the maximum length the model can handle
}

### Create the embeddings object
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

display(torch.cuda.is_available())

True

True

### Retrieval System for vector embeddings

In [8]:
import numpy as np

# Embed the text of the first chunk and inspect the resulting vector and its shape.
first_chunk_text = docs_after_split[0].page_content
sample_embedding = np.array(huggingface_embeddings.embed_query(first_chunk_text))

print("Sample embedding of a document chunk: ", sample_embedding)
print("Size of the embedding: ", sample_embedding.shape)

Sample embedding of a document chunk:  [ 0.04514517 -0.01253906 -0.01787082 ... -0.02373618 -0.020726
  0.01130795]
Size of the embedding:  (1024,)


In [9]:
## Build the vector store from the split chunks, embedding them with the selected model

from langchain_community.vectorstores import FAISS

vectorstore = FAISS.from_documents(
    documents=docs_after_split,
    embedding=huggingface_embeddings,
)

In [10]:
### Optional, from a shell: pip install huggingface_hub["cli"]
### Then run the command:
### ====> huggingface-cli delete-cache

# Scan the local Hugging Face cache directory and display the resulting report.
from huggingface_hub import scan_cache_dir
hf_cache_info = scan_cache_dir()
display(hf_cache_info)


### Choose the model, then create & test the LLM via the HuggingFace HUB API (online)
from langchain_huggingface import HuggingFaceEndpoint


# Define the LLM.
# Alternative repo_id values kept for reference:
#   'tiiuae/falcon-7b-instruct'
#   "HuggingFaceH4/zephyr-7b-beta"
#   "HuggingFaceH4/mistral-7b-anthropic"
# Optional arguments currently disabled:
#   callbacks=callbacks
#   huggingfacehub_api_token=huggingfacehub_api_token
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-v0.1",
    max_new_tokens=1024,
    top_k=10,
    top_p=0.95,
    typical_p=0.95,
    temperature=0.5,
    repetition_penalty=1.03,
)

# Smoke test: send a stand-alone prompt (no retrieved context) and print the completion.
question = """Explain how to use python code to extract text from a PDF file."""
output = llm.invoke(question)

print(output)





The following code is used to extract text from a PDF file using Python:

import PyPDF2

pdf_file = open('filename.pdf', 'rb')

pdf_reader = PyPDF2.PdfFileReader(pdf_file)

num_pages = pdf_reader.numPages

for i in range(num_pages):

    page = pdf_reader.getPage(i)

    text = page.extractText()

    print(text)

pdf_file.close()

This code uses the PyPDF2 library to read the PDF file and extract the text from each page. The text is then printed to the console.


In [11]:
from langchain.prompts import PromptTemplate

# Instruction text for the QA step; {context} and {question} are the template's two input variables.
prompt_template = """Use the following pieces of context to answer the question at the end. Please follow the following rules:
1. If you don't know the answer, don't try to make up an answer. Just say "I can't find the final answer but you may want to check the following links".
2. If you find the answer, write the answer in a concise way with five sentences maximum.

{context}

Question: {question}

Helpful Answer:
"""

PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)

In [12]:
# Sample question, change to other questions you are interested in.
# Alternative query kept for reference:
#query = """Give a 500 word summary of the paper titled:  A Model for Quality of Life Measures in Patients with Dementia:Lawton’s Next Step. Start with stating the authors en journal + the abstract and then summarize the introduction, methodology, results and conclusion."""
query = """What are the actual pagenumers of the paper titled: A Model for Quality of Life Measures in Patients with Dementia:Lawton’s """

# Look up the chunks in the vector store that are most similar to the query.
relevant_documents = vectorstore.similarity_search(query)
hit_count = len(relevant_documents)
print(f'There are {hit_count} documents retrieved which are relevant to the query. Display the first one:\n')

best_match = relevant_documents[0]
print(best_match.page_content)

There are 4 documents retrieved which are relevant to the query. Display the first one:

Original Research Article
Dement Geriatr Cogn Disord 2004;18:159–164
DOI: 10.1159/000079196
A Model for Quality of Life Measures in
Patients with Dementia:
Lawton’s Next Step
C. Jonkera,b D.L. Gerritsenb P.R. Bosboomc J.T. Van der Steenb
aDepartment of Psychiatry and bInstitute for Research in Extramural Medicine, VU University Medical Center, and
cDepartment of Clinical Neuropsychology, VU University, Amsterdam, The Netherlands
Accepted: January 22, 2004
Published online: June 21, 2004
Prof. Dr. C. Jonker, Behavioral Neurologist
Department of Psychiatry and EMGO Institute
VU University Medical Center, Van der Boechorststraat 7
NL–1081 BT Amsterdam (The Netherlands)
Tel. +31 20 4446770, Fax +31 20 4446775, E-Mail c.jonker.emgo@med.vu.nl
ABC
Fax +41 61 306 12 34
E-Mail karger@karger.ch
www.karger.com
© 2004 S. Karger AG, Basel
1420–8008/04/0182–0159$21.00/0
Accessible online at:


In [13]:
# Build the question-answering chain that answers from retrieved chunks.
# - chain_type="stuff" is the chain variant requested from RetrievalQA.
# - k=10: the retriever hands the 10 most similar chunks to the chain.
#   If the answer is inconsistent, change k to a higher number.
# - return_source_documents=True: the result also carries the chunks that were used.
# - The custom PROMPT replaces the chain's default prompt.

from langchain.chains import RetrievalQA

qa_retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 10},
)

retrievalQA = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=qa_retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT},
)

In [14]:
# Run the QA chain on the query and print only the generated answer text.
result = retrievalQA.invoke({"query": query})
answer_text = result["result"]
print(answer_text)


1. The abstract starts on page 159.
2. The article starts on page 159.
3. The first reference is on page 160.
4. The last reference is on page 164.
5. The first figure is on page 161.
6. The last figure is on page 162.
7. The first table is on page 161.
8. The last table is on page 162.
9. The first box is on page 162.
10. The last box is on page 162.

Accessible online at:
www.karger.com/dem


In [15]:
# Chunks that the QA chain returned alongside its answer.
relevant_docs = result['source_documents']
print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')
print("*" * 100)

# List every retrieved chunk with its source file and page, separated by a rule.
for i, doc in enumerate(relevant_docs, start=1):
    print(f"Relevant Document #{i}:\n Source file: {doc.metadata['source']}, Page: {doc.metadata['page']}\nContent: {doc.page_content}")
    print("#"*100)

# Repeat the total once after the listing. This line used to sit inside the loop,
# which printed the same summary after every single document.
print(f'There are {len(relevant_docs)} documents retrieved which are relevant to the query.')

There are 10 documents retrieved which are relevant to the query.
****************************************************************************************************
Relevant Document #1:
 Source file: LAWTON\QoL reference Lawton.pdf, Page: 0
Content: Original Research Article
Dement Geriatr Cogn Disord 2004;18:159–164
DOI: 10.1159/000079196
A Model for Quality of Life Measures in
Patients with Dementia:
Lawton’s Next Step
C. Jonkera,b D.L. Gerritsenb P.R. Bosboomc J.T. Van der Steenb
aDepartment of Psychiatry and bInstitute for Research in Extramural Medicine, VU University Medical Center, and
cDepartment of Clinical Neuropsychology, VU University, Amsterdam, The Netherlands
Accepted: January 22, 2004
Published online: June 21, 2004
Prof. Dr. C. Jonker, Behavioral Neurologist
Department of Psychiatry and EMGO Institute
VU University Medical Center, Van der Boechorststraat 7
NL–1081 BT Amsterdam (The Netherlands)
Tel. +31 20 4446770, Fax +31 20 4446775, E-Mail c.jonker.emgo@med.vu.nl


In [16]:
# Show the raw list of retrieved source documents (metadata and full page_content).
display(relevant_docs)

[Document(metadata={'source': 'LAWTON\\QoL reference Lawton.pdf', 'page': 0}, page_content='Original Research Article\nDement Geriatr Cogn Disord 2004;18:159–164\nDOI: 10.1159/000079196\nA Model for Quality of Life Measures in\nPatients with Dementia:\nLawton’s Next Step\nC. Jonkera,b D.L. Gerritsenb P.R. Bosboomc J.T. Van der Steenb\naDepartment of Psychiatry and bInstitute for Research in Extramural Medicine, VU University Medical Center, and\ncDepartment of Clinical Neuropsychology, VU University, Amsterdam, The Netherlands\nAccepted: January 22, 2004\nPublished online: June 21, 2004\nProf. Dr. C. Jonker, Behavioral Neurologist\nDepartment of Psychiatry and EMGO Institute\nVU University Medical Center, Van der Boechorststraat 7\nNL–1081 BT Amsterdam (The Netherlands)\nTel. +31 20 4446770, Fax +31 20 4446775, E-Mail c.jonker.emgo@med.vu.nl\nABC\nFax +41 61 306 12 34\nE-Mail karger@karger.ch\nwww.karger.com\n© 2004 S. Karger AG, Basel\n1420–8008/04/0182–0159$21.00/0\nAccessible online