<a href="https://colab.research.google.com/github/Med-Rokaimi/LLMs-document-analysis/blob/main/Document%20analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Analyzing documents LLMs models : PALM2 and Gemini Pro:


*   Summarize each document
*   Quering and retrival documents (Q&A)
*   Identify the documents discussing a specific topic
*   Q&A : Gemini Pro example


# packages

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
!pip install -q -U google-generativeai
!pip -q install langchain
!pip install chromadb
!pip install PyMuPDF

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m794.3/794.3 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.3/46.3 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.4/49.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting chromadb
  Downloading chromadb-0.4.20-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.7/507.7 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting chroma-hnswlib==0.7.3 (from chromadb)
  Downloading chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from langchain import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from pathlib import Path as p
from langchain.embeddings import GooglePalmEmbeddings
from langchain.llms import GooglePalm
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.indexes import VectorstoreIndexCreator #vectorize db index with chromadb
from langchain.vectorstores import Chroma

import pandas as pd

In [None]:
import os
# configure palm
import google.generativeai as palm
GOOGLE_API_KEY = #put your API here. [You can use env to store the api key]
palm.configure(api_key=GOOGLE_API_KEY)

In [4]:
# I prefer to load the pdf files to one csv file
import os
import csv
import fitz  # PyMuPDF

def extract_pdf_content(pdf_path):
    doc = fitz.open(pdf_path)
    content = ""
    for page_num in range(doc.page_count):
        page = doc[page_num]
        content += page.get_text()
    doc.close()
    return content

def process_pdfs(input_folder, output_csv):
    # Create or append to the CSV file
    mode = 'w' if not os.path.exists(output_csv) else 'a'

    with open(output_csv, mode, newline='', encoding='utf-8') as csvfile:
        fieldnames = ['id', 'name', 'content']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        # If the file is being created, write the header
        if mode == 'w':
            writer.writeheader()

        # Iterate through PDF files in the input folder
        for idx, pdf_file in enumerate(os.listdir(input_folder)):
            if pdf_file.endswith(".pdf"):
                pdf_path = os.path.join(input_folder, pdf_file)
                content = extract_pdf_content(pdf_path)
                #print(f"id: {idx + 1}, name: {pdf_file}")
                writer.writerow({'id': idx + 1, 'name': pdf_file, 'content': content})


input_folder_path = "/content/docs"
output_csv_path = "raw_data.csv"
process_pdfs(input_folder_path, output_csv_path)
print("PDFs loaded into CSV successfully.")


PDFs loaded into CSV successfully.


In [None]:
#reading the raw data using langchain CSV loader
loader = CSVLoader(file_path="./raw_data.csv")
documents = loader.load()

In [None]:
#see some content
print(len(documents))
print(f"# of words in the document = {len(documents[0].page_content)}")

8
# of words in the document = 63382


#  Palm2 apis with langchain





In [None]:
llm = GooglePalm(google_api_key=GOOGLE_API_KEY)
llm.temperature = 0.1

## Summerise each book

In [None]:
# I'm gonna use refine method for chunking the pdfs content, as this method retains the context.


# prepare the prompt template for text summerisaion
question_prompt_template = """
                  Please provide a summary of the following text. the first chunk should recognise the document title, and a summary extracted from the introduction section.
                  TEXT: {text}
                  SUMMARY:
                  """
question_prompt = PromptTemplate(
    template=question_prompt_template, input_variables=["text"]
)
refine_prompt_template = """
              Write a concise summary of the following text.
              ```{text}```
              SUMMARY:
              """

refine_prompt = PromptTemplate(
    template=refine_prompt_template, input_variables=["text"]
)
refine_chain = load_summarize_chain(
    llm,
    chain_type="refine",
    question_prompt=question_prompt,
    refine_prompt=refine_prompt,
    return_intermediate_steps=True,
)
text_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 5000,
    chunk_overlap  = 100)


In [None]:
# Fixed-size chunking (note: content-aware could be another chunking option)
try:
    document_chunks = document_chunks = text_splitter.split_documents(documents)
    #print(f"Number chunks {len(document_chunks)}")
    refine_outputs = refine_chain({"input_documents": document_chunks})
except IndexError:
    gotdata = 'null'


Unnamed: 0,concise_summary,doc,file_name,chunks
0,The key challenges for Paraguay include reduci...,"{'source': './raw_data_light.csv', 'row': 0}",raw_data_light,: 0\nCPD_ID: pry_2\ntext: United Nations ...
1,This document presents UNDP's programme strate...,"{'source': './raw_data_light.csv', 'row': 4}",raw_data_light,"benefited approximately 15,000 individuals13. ..."
2,UNDP's proposed programme for Iraq focuses on ...,"{'source': './raw_data_light.csv', 'row': 4}",raw_data_light,which transfers the powers of eight ministries...
3,This document presents the results and resourc...,"{'source': './raw_data_light.csv', 'row': 4}",raw_data_light,to cost-share and provide funding to UNDP-impl...
4,This document outlines the key performance ind...,"{'source': './raw_data_light.csv', 'row': 4}",raw_data_light,institutions as more transparent implement...


In [None]:
#showing chunk summeries
final_refine_data = []
for doc, out in zip(refine_outputs["input_documents"], refine_outputs["intermediate_steps"]):
        output = {}
        output["doc"] = doc.metadata['row']
        output["file_name"] = p(doc.metadata["source"]).stem
        output["chunks"] = doc.page_content.strip()
        output["concise_summary"] = out.strip()
        final_refine_data.append(output)

pdf_refine_summary  = pd.DataFrame.from_dict(final_refine_data)
pdf_refine_summary  = pdf_refine_summary .sort_values(
    by=["file_name"]
)  # sorting the dataframe by filename and page_number
pdf_refine_summary .reset_index(inplace=True, drop=True)
pdf_refine_summary .head(10)

Unnamed: 0,doc,file_name,chunks,concise_summary
0,0,raw_data_light,: 0\nCPD_ID: pry_2\ntext: United Nations ...,The key challenges for Paraguay include reduci...
1,4,raw_data_light,"benefited approximately 15,000 individuals13. ...",This document presents UNDP's programme strate...
2,4,raw_data_light,which transfers the powers of eight ministries...,UNDP's proposed programme for Iraq focuses on ...
3,4,raw_data_light,to cost-share and provide funding to UNDP-impl...,This document presents the results and resourc...
4,4,raw_data_light,institutions as more transparent implement...,This document outlines the key performance ind...
5,4,raw_data_light,Target: Yes ...,This document outlines the results and resourc...
6,4,raw_data_light,DP/DCP/IRQ/2\n ...,This document outlines the results and resourc...
7,5,raw_data_light,: 5\nCPD_ID: sle_1\ntext: United Nations ...,The CPD 2008-2010 for Sierra Leone is based on...
8,5,raw_data_light,hindered effective benchmarking and tracking o...,The Government of Sierra Leone has developed a...
9,4,raw_data_light,: 4\nCPD_ID: irq_2\ntext: United Nations ...,This document presents the country programme d...


In [None]:
#showing a summary for each book
summeries = pdf_refine_summary.groupby('doc')['concise_summary'].agg(lambda x: '\n'.join(x))


# rephrase & improve the context. start by this documents ....

# Print the merged DataFrame
print(summeries.head(5))
summeries.to_csv("summaries.csv", index=True)

doc
0    The key challenges for Paraguay include reduci...
1    Ghana's development challenges are: persistent...
2    This is a document about the development of Be...
3    4 key areas for Cambodia's development plan: r...
4    This document presents UNDP's programme strate...
Name: concise_summary, dtype: object


## Q&A text books

In [None]:
from langchain.document_loaders.csv_loader import CSVLoader
loader = CSVLoader(file_path="./raw_data.csv")
pdfs = loader.load()

In [None]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 5000,
    chunk_overlap  = 100)

pdfs_chunks = text_splitter.split_documents(pdfs)

print(f"Number documents {len(pdfs)}")
print(f"Number chunks {len(pdfs_chunks)}")

pdfs_chunks__=[f"Context: {chunk.page_content} Source: {chunk.metadata['source']}" for chunk in pdfs_chunks]


Number documents 369
Number chunks 4098


In [None]:
print(documents[0])



In [None]:
print(pdfs[0].metadata)

{'source': './raw_data_light.csv', 'row': 0}


In [None]:

embedding=GooglePalmEmbeddings(google_api_key=GOOGLE_API_KEY)

In [None]:
#another way to index chunks
'''
index = VectorstoreIndexCreator(embedding=embedding,
        text_splitter=CharacterTextSplitter(separator = "\n",chunk_size=1000, chunk_overlap=10)).from_documents(pdfs)


chain = RetrievalQA.from_chain_type(llm=llm,
                            chain_type="stuff",
                            retriever=index.vectorstore.as_retriever(),
                            input_key="question")
'''

In [None]:
question = "does the text discusses how political instability and terrorism impact poverty?"

query_prompt_template = """
                  Follow exactly those 3 steps:
                  1. Read the context below and aggregrate this data
                    Context : {text}
                  2. Answer the {question} using only this context by yes or no
                  3. if yes, Show the docoument id of the document answers the user Question: {question}
                  4. if no, just say no, it doesn't discuss it

                 If you don't have any context and are unsure of the answer, reply that you don't know about this topic.
                """

query_prompt = PromptTemplate(
    template=query_prompt_template, input_variables=["text"]
)

In [None]:
persist_directory = 'docs/chroma/'
db = Chroma.from_documents(pdfs_chunks, embedding, persist_directory=persist_directory)


In [None]:
# Expose index to the retriever
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})
# Create chain to answer questions
from langchain.chains import RetrievalQA


qa = RetrievalQA.from_chain_type(
    llm=llm, chain_type="stuff", retriever=retriever, return_source_documents=True
)
response = qa({"query": question})
print(response)



In [None]:
#another way
#print the results along with the source documents, so that we can see how the\
      # retriever performed, i.e. which chunks of text it found to prepend to the query as context for the LLM.
def print_result(response_obj):
    print("SOURCES: \n")
    cnt = 1
    for source_doc in response_obj["source_documents"]:
        print(f"Chunk #{cnt}")
        cnt += 1
        print("Source Metadata: ", source_doc.metadata)
        print("Source Text:")
        print(source_doc.page_content)
        print("\n")
    print("RESULT: \n")
    print(response_obj["result"] + "\n\n")


print_result(response)

In [None]:
#Contextual Compression
def pretty_print_docs(docs):
    print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n\n" + d.page_content for i, d in enumerate(docs)]))

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

# Wrap our vectorstore
compressor = LLMChainExtractor.from_llm(llm)

compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=db.as_retriever()
)

compressed_docs = compression_retriever.get_relevant_documents(question)
pretty_print_docs(compressed_docs)

Q&A : Gemini Pro example

In [None]:
import google.generativeai as genai
genai.configure(api_key=GOOGLE_API_KEY)
model = genai.GenerativeModel('gemini-pro')

In [None]:
loader = CSVLoader(file_path="./raw_data.csv")
documents = loader.load()

In [None]:

retrived_docs = pd.DataFrame()
context =        """
                     I will give you a long text, you are required to find out if the text clearly includes some discussions about how political instability and terrorism impact poverty.:
                     1. You should answer with Yes or No,
                     2. if Yes, show the paragraphs discussing political instability and terrorism impact poverty.
                     3. if no,  show a text expressing that the text doesn't include discussion about the topic.

                """
for pdf in documents:
    text = pdf.page_content
    g_response = model.generate_content([context , text], stream=True)
    g_response.resolve()
    answer = g_response.text
    answer = answer.strip()
    import re
    #answer = re.sub(r"[^\w\s]", "", answer).lower()
    if answer.startswith("Yes") :
        data = {"doc_id": pdf.metadata['row'] , "text": answer[3:20]}
        retrived_docs = retrived_docs.append(pd.Series(data), ignore_index=True)



In [None]:
print(retrived_docs)
retrived_docs.to_csv("result_task2.csv")

   doc_id                                               text
0      99   the discussions about how political instabili...
1     101   under section i programme rationale\nparagrap...


In [None]:
filtered_documents = [
    document
    for document in documents
    if document.metadata["row"] in retrived_docs["doc_id"]
]

