In [None]:
!pip install boto3 langchain langchain-pinecone langchain-community nltk sentence-transformers

In [None]:
import boto3
import os
import time
import json
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain_pinecone import PineconeVectorStore
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
from langchain import PromptTemplate

# Download required NLTK data
# Fetches the NLTK resources used by the preprocessing step further down.
nltk.download('punkt_tab')  # tokenizer data (word_tokenize is called in preprocess_text)
nltk.download('stopwords')  # stopword lists (stopwords.words("english") is loaded later)
nltk.download('wordnet')  # WordNet data (WordNetLemmatizer is used later)

In [12]:
# Configuration
# Every setting is read from an environment variable of the same name so that
# secrets never have to be typed into (and saved with) the notebook. When a
# variable is not set, the value falls back to the default given here, which can
# still be edited in place for a quick local run.

# AWS Credentials
AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID', '')  # AWS Access Key ID
AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY', '')  # AWS Secret Access Key

# AWS Configuration
AWS_REGION = os.environ.get('AWS_REGION', 'us-east-1')  # AWS region (default: us-east-1)

# Pinecone API Key and Index
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', '')  # Pinecone API Key
PINECONE_INDEX = os.environ.get('PINECONE_INDEX', '')  # Pinecone Index Name

# Amazon S3 Bucket and File Details
S3_BUCKET_NAME = os.environ.get('S3_BUCKET_NAME', '')  # S3 bucket where the document is stored
PDF_FILE_NAME = os.environ.get('PDF_FILE_NAME', '')  # Filename (S3 key) of the document to process


In [None]:
# AWS Textract client
client = boto3.client(
    'textract',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=AWS_REGION
)

# Start the asynchronous text-detection job for the document stored in S3
response = client.start_document_text_detection(
    DocumentLocation={"S3Object": {"Bucket": S3_BUCKET_NAME, "Name": PDF_FILE_NAME}}
)
job_id = response["JobId"]
print(f"Job started with Job ID: {job_id}")

# Polling for job completion
# Keep polling only while the job is still running. Textract can also end in
# PARTIAL_SUCCESS, so checking for just SUCCEEDED/FAILED would loop forever.
while True:
    result = client.get_document_text_detection(JobId=job_id)
    status = result["JobStatus"]

    if status != "IN_PROGRESS":
        break

    print("Processing...")
    time.sleep(5)

if status == "FAILED":
    # Surface Textract's own explanation instead of a bare failure message.
    failure_reason = result.get("StatusMessage", "no status message returned")
    raise RuntimeError(f"Textract job failed! {failure_reason}")

if status != "SUCCEEDED":
    # e.g. PARTIAL_SUCCESS: results exist but may be incomplete, so warn and continue.
    print(f"Warning: Textract job ended with status {status}: {result.get('StatusMessage', '')}")

print("Processing completed!")

In [None]:
# Extract Text from Response
# Walk every page of the Textract output, starting from the first page held in
# `result`. The cursor lives in its own variable so `result` is never overwritten:
# re-running this cell collects all pages again instead of only the last one.
extracted_text = []
page = result
while True:
    for block in page.get("Blocks", []):
        # Only LINE blocks are collected; other block types are skipped.
        if block["BlockType"] == "LINE" and "Text" in block:
            extracted_text.append(block["Text"])

    next_token = page.get("NextToken")
    if not next_token:
        break
    page = client.get_document_text_detection(JobId=job_id, NextToken=next_token)

# Combine extracted text into a single string
full_text = "\n".join(extracted_text)

# Save extracted text to a file
# UTF-8 is set explicitly so non-ASCII characters do not depend on the platform default.
output_file_name = "extracted_text.txt"
with open(output_file_name, "w", encoding="utf-8") as output_file_io:
    output_file_io.write(full_text)

# NLP Preprocessing
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalize text into a single space-separated string of lemmatized tokens.

    The input is lower-cased and tokenized. Tokens that are not purely
    alphanumeric (which removes punctuation) or that are English stopwords are
    dropped, and every remaining token is lemmatized.

    Args:
        text: Raw text to normalize.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    kept_tokens = (
        token
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    )
    return " ".join(lemmatizer.lemmatize(token) for token in kept_tokens)

preprocessed_text = preprocess_text(full_text)

# Prepare Document for Embedding
docs = [Document(page_content=preprocessed_text)]

# Split document into chunks
# preprocess_text() returns tokens joined by single spaces, so the text contains no
# newlines. Splitting on "\n" would therefore never split and would produce one
# oversized chunk; splitting on spaces yields chunks of about chunk_size characters.
text_splitter = CharacterTextSplitter(chunk_size=1200, chunk_overlap=250, separator=" ")
split_docs = text_splitter.split_documents(docs)

# Use Embeddings for Text Processing
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-cos-v1")

# Initialize Pinecone
# The API key is exported through the environment before the vector store is created.
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

# Embed every chunk and store it in the configured Pinecone index
docsearch = PineconeVectorStore.from_documents(split_docs, embedding_model, index_name=PINECONE_INDEX)

print("Processing complete!")

In [None]:
# Query Processing
human_input = input("Enter your question: ").strip()
if not human_input:
    # An empty question would still trigger an embedding call and an LLM call for nothing.
    raise ValueError("Question must not be empty.")

# Kept for inspection; similarity_search() below is given the raw question text.
query_embedding = embedding_model.embed_query(human_input)  # Convert query into embeddings

# Search through a handle to the existing index rather than the `docsearch` object
# created during ingestion, so this cell does not depend on that variable.
existing_search = PineconeVectorStore.from_existing_index(index_name=PINECONE_INDEX, embedding=embedding_model)
search_results = existing_search.similarity_search(human_input, k=5)

# Retrieve relevant context
# Each retrieved document is truncated to MAX_CONTEXT_LENGTH characters and labelled
# with its 1-based rank.
MAX_CONTEXT_LENGTH = 6000
context_string = '\n\n'.join(
    f'Document {rank}: ' + doc.page_content[:MAX_CONTEXT_LENGTH]
    for rank, doc in enumerate(search_results, start=1)
)

# Define RAG Prompt Template
# The instructions appear once, followed by the retrieved context and then the
# question. A blank line after {context} keeps the last retrieved document
# visually separate from the question.
RAG_PROMPT_TEMPLATE = '''You are a helpful AI assistant. Use the provided context to answer the question.

If the context is insufficient, rely on your own knowledge to provide the best response.

Context:
{context}

Question: {human_input}

Answer:
'''

# Fill the template with the user's question and the retrieved context
PROMPT = PromptTemplate.from_template(RAG_PROMPT_TEMPLATE)
prompt_data = PROMPT.format(human_input=human_input, context=context_string)

# Query Amazon Titan LLM
# Bedrock runtime client, built from the same credentials and region as the other AWS calls
boto3_bedrock = boto3.client(
    'bedrock-runtime',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=AWS_REGION
)

# Request payload: the rendered prompt plus the text-generation settings
generation_config = {'maxTokenCount': 8192, 'stopSequences': [], 'temperature': 0.7, 'topP': 1}
body_part = json.dumps({'inputText': prompt_data, 'textGenerationConfig': generation_config})

response = boto3_bedrock.invoke_model(
    modelId='amazon.titan-text-express-v1',
    contentType="application/json",
    accept="application/json",
    body=body_part
)

# The response body is a stream; read it once and decode the JSON document
response_payload = json.loads(response['body'].read())

# Take the first result's text and start a new line after each sentence-ending ". "
output_text = response_payload['results'][0]['outputText'].replace(". ", ".\n")
print(f"Answer:\n{output_text}")