## Summarization Feature

In [None]:
%pip install --quiet langchain boto3 langchain-community
%pip install --quiet "amazon-textract-caller>=0.2.0" amazon-textract-textractor

In [None]:
import boto3
from langchain.document_loaders import AmazonTextractPDFLoader
from langchain.llms import Bedrock
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Both AWS clients are created against the same region.
aws_region = "us-east-1"
textract_client = boto3.client("textract", region_name=aws_region)
bedrock_client = boto3.client("bedrock-runtime", region_name=aws_region)

# S3 URI of the PDF that the rest of the notebook summarizes.
file_path = "s3://bucket-ufms-document/docs/INSTRUCAO NORMATIVA (PROGRAD_RTR) n 4, de 20-06-2023..pdf"

# Extract the PDF text through Textract. The result is indexed like a list
# further down (documents[0]); presumably one entry per page -- confirm
# against the loader documentation.
loader = AmazonTextractPDFLoader(file_path, client=textract_client)
documents = loader.load()

# Dump the raw extraction result for a quick visual check.
print(documents)

In [None]:
# Prompt that wraps the document text in <document> tags and ends with an
# open <summary> tag, so the model's completion is the summary itself.
template = """

Given a full document, give me a concise summary. Skip any preamble text and just give the summary.

<document>{doc_text}</document>
<summary>"""


prompt = PromptTemplate(template=template, input_variables=["doc_text"])
bedrock_llm = Bedrock(client=bedrock_client, model_id="anthropic.claude-v2")

llm_chain = LLMChain(prompt=prompt, llm=bedrock_llm)

# Fail with a clear message instead of an IndexError or an empty prompt
# when the extraction produced nothing.
if not documents:
    raise ValueError("No pages were extracted from the document; nothing to summarize")

# The prompt asks for a summary of the *full* document, so send the text of
# every extracted entry, not only the first one. For a single-entry result
# this is identical to passing documents[0].page_content.
full_text = "\n\n".join(doc.page_content for doc in documents)
summary = llm_chain.run(full_text)

# The model may close the <summary> tag the prompt left open; strip it.
print(summary.replace("</summary>","").strip())

## MAP REDUCE

In [None]:
from langchain.document_loaders import AmazonTextractPDFLoader
from langchain.llms import Bedrock

# Bedrock runtime client and the Claude v2 LLM wrapper used by the
# map-reduce summarization below.
bedrock_client = boto3.client("bedrock-runtime", region_name="us-east-1")
bedrock_llm = Bedrock(client=bedrock_client, model_id="anthropic.claude-v2")

# Run the Textract extraction again on the same file; `file_path` and
# `textract_client` come from an earlier cell.
loader = AmazonTextractPDFLoader(file_path, client=textract_client)
document = loader.load()

# Report how many entries the loader produced (labelled as pages).
num_docs = len(document)
print(f"There are {num_docs} pages in the document")

# Per-page token count, left disabled by the original author ("Deprecated"):
# for index, doc in enumerate(document):
#     num_tokens_first_doc = bedrock_llm.get_num_tokens(doc.page_content)
#     print (f"Page {index+1} has approx. {num_tokens_first_doc} tokens")

In [None]:
from langchain.chains.summarize import load_summarize_chain

# Build LangChain's summarization chain with the map_reduce strategy and run
# it over the whole list of loaded documents, then print the trimmed result.
summary_chain = load_summarize_chain(llm=bedrock_llm, chain_type="map_reduce")
output = summary_chain.run(document)
print(output.strip())