In [1]:
!pip install -U boto3 langchain langchain-pinecone langchain-community nltk sentence-transformers -U langchain-huggingface

Collecting boto3
  Downloading boto3-1.37.33-py3-none-any.whl.metadata (6.7 kB)
Collecting langchain-pinecone
  Downloading langchain_pinecone-0.2.5-py3-none-any.whl.metadata (1.3 kB)
Collecting langchain-community
  Downloading langchain_community-0.3.21-py3-none-any.whl.metadata (2.4 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-4.0.2-py3-none-any.whl.metadata (13 kB)
Collecting langchain-huggingface
  Downloading langchain_huggingface-0.1.2-py3-none-any.whl.metadata (1.3 kB)
Collecting botocore<1.38.0,>=1.37.33 (from boto3)
  Downloading botocore-1.37.33-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.12.0,>=0.11.0 (from boto3)
  Downloading s3transfer-0.11.4-py3-none-any.whl.metadata (1.7 kB)
Collecting pinecone<7.0.0,>=6.0.0 (from pinecone[async]<7.0.0,>=6.0.0->langchain-pinecone)
  Downloading pinecone-6.0.2-py3-none-any.whl.metadata

In [2]:
# Imports
import boto3
import os
import time
import json
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone as PineconeClient
from langchain import PromptTemplate

# Fetch the NLTK data packages used by the tokenizer, stopword list and lemmatizer
for _resource in ("punkt_tab", "stopwords"):
    nltk.download(_resource)
# Left as the cell's final expression so its return value is still displayed
nltk.download("wordnet")

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
# Configuration
# Values are read from environment variables so that real credentials never have
# to be typed into (and saved with) the notebook. Set the variables before starting
# the kernel. Anything that is not set falls back to the previous default, so the
# names below are always plain strings.

# AWS Credentials
AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID', '')  # AWS Access Key ID
AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY', '')  # AWS Secret Access Key

# AWS Configuration
AWS_REGION = os.environ.get('AWS_REGION') or 'us-east-1'  # AWS region (default: us-east-1)

# Pinecone API Key and Index
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY', '')  # Pinecone API Key
PINECONE_INDEX = os.environ.get('PINECONE_INDEX', '')  # Pinecone Index Name

# Amazon S3 Bucket and File Details
S3_BUCKET_NAME = os.environ.get('S3_BUCKET_NAME', '')  # S3 bucket where the document is stored
PDF_FILE_NAME = os.environ.get('PDF_FILE_NAME', '')  # Filename (S3 key) of the document to process


In [4]:
# AWS Textract client
client = boto3.client(
    'textract',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=AWS_REGION
)

# Start asynchronous text detection on the document stored in S3
response = client.start_document_text_detection(
    DocumentLocation={"S3Object": {"Bucket": S3_BUCKET_NAME, "Name": PDF_FILE_NAME}}
)
job_id = response["JobId"]
print(f"Job started with Job ID: {job_id}")

# Poll until the job leaves IN_PROGRESS. Checking for "not in progress" (instead of
# listing SUCCEEDED/FAILED) also ends the loop on PARTIAL_SUCCESS, which would
# otherwise be polled forever.
while True:
    result = client.get_document_text_detection(JobId=job_id)
    status = result["JobStatus"]

    if status != "IN_PROGRESS":
        break

    print("Processing...")
    time.sleep(5)

if status == "FAILED":
    # Surface Textract's own explanation when the response carries one
    failure_detail = result.get("StatusMessage")
    raise Exception(
        "Textract job failed!" + (f" {failure_detail}" if failure_detail else "")
    )

if status != "SUCCEEDED":
    # e.g. PARTIAL_SUCCESS: results are available but may be incomplete
    print(f"Warning: job finished with status {status}: {result.get('Warnings', [])}")

print("Processing completed!")

Job started with Job ID: f066e26db8c67d434d32e9975738d56be2dd17c041772862073e82988d7b1a8a
Processing...
Processing...
Processing completed!


In [5]:
# Extract Text from Response
# Every run of this cell pages through the job's results from the first page and
# leaves the polling cell's `result` untouched, so re-running the cell yields the
# full document again instead of resuming from whatever page was fetched last.
extracted_text = []
next_token = None
while True:
    page_kwargs = {"JobId": job_id}
    if next_token:
        page_kwargs["NextToken"] = next_token
    page = client.get_document_text_detection(**page_kwargs)

    # Keep only LINE blocks that actually carry text
    for block in page.get("Blocks", []):
        if block["BlockType"] == "LINE" and "Text" in block:
            extracted_text.append(block["Text"])

    # A missing NextToken marks the final page
    next_token = page.get("NextToken")
    if not next_token:
        break

# Combine extracted text into a single string (one detected line per text line)
full_text = "\n".join(extracted_text)

# Save extracted text to a file; explicit UTF-8 so non-ASCII characters in the
# document do not depend on the platform's default encoding
output_file_name = "extracted_text.txt"
with open(output_file_name, "w", encoding="utf-8") as output_file_io:
    output_file_io.write(full_text)

# NLP Preprocessing
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    """Normalise text for indexing while keeping its line structure.

    Each line of ``text`` is lower-cased, tokenised, stripped of
    non-alphanumeric tokens and English stopwords, and lemmatised. Lines that
    end up empty are dropped; the remaining lines are joined with newlines.

    Keeping the newlines matters: the text is later chunked with a splitter
    whose separator is a newline, and a newline-free string cannot be split,
    so the whole document would become one oversized chunk.

    Args:
        text: Raw text, possibly spanning multiple lines.

    Returns:
        The cleaned text as a string, one cleaned line per non-empty input line.
        For single-line input this is the cleaned tokens joined by spaces.
    """
    cleaned_lines = []
    for line in text.splitlines():
        tokens = word_tokenize(line.lower())  # Tokenization
        tokens = [
            lemmatizer.lemmatize(token)  # Lemmatization
            for token in tokens
            if token.isalnum() and token not in stop_words  # Drop punctuation and stopwords
        ]
        if tokens:
            cleaned_lines.append(" ".join(tokens))
    return "\n".join(cleaned_lines)

# Export the Pinecone key through the environment; no key is passed explicitly
# to the vector store call below
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

# Clean the extracted text and wrap it in a single Document
preprocessed_text = preprocess_text(full_text)
docs = [Document(page_content=preprocessed_text)]

# Chunk the document on newlines: 1200-character chunks with 250 characters of overlap
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1200,
    chunk_overlap=250,
)
split_docs = text_splitter.split_documents(docs)

# Sentence-transformers model used to embed the chunks (and, later, the queries)
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/multi-qa-mpnet-base-cos-v1"
)

# Embed the chunks and store them in the configured Pinecone index
docsearch = PineconeVectorStore.from_documents(
    split_docs,
    embedding_model,
    index_name=PINECONE_INDEX,
)

print("Processing complete!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/9.25k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Processing complete!


In [6]:
# Conversation history: one {"question", "answer"} dict per completed turn
chat_history = []

# Character cap applied to EACH retrieved document before it goes into the prompt
MAX_CONTEXT_LENGTH = 6000

# Prompt template (built once; it does not change between turns)
RAG_PROMPT_TEMPLATE = '''
You are a helpful and knowledgeable AI assistant having a conversation with a user.
Use the context and conversation history to answer the question.

Context:
{context}

If the context is insufficient, rely on your own knowledge to provide the best possible response.

Conversation History:
{history}

Question: {human_input}

Answer:
'''
PROMPT = PromptTemplate.from_template(RAG_PROMPT_TEMPLATE)

# Bedrock model client (created once and reused for every turn); it uses the same
# region setting as the rest of the notebook
boto3_bedrock = boto3.client(
    'bedrock-runtime',
    region_name=AWS_REGION,
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)

# Run conversation loop
while True:
    human_input = input("\nAsk a question (or type 'exit' to quit): ").strip()
    if human_input.lower() == 'exit':
        break
    if not human_input:
        # Nothing to search for; ask again instead of sending an empty query
        continue

    # The vector store embeds the query itself, so no separate embedding call is needed.
    # NOTE(review): the query is passed as typed, while the indexed text went through
    # preprocess_text — confirm this asymmetry is intended.
    search_results = docsearch.similarity_search(human_input, k=5)

    # Create context from retrieved documents
    context_string = '\n\n'.join(
        f'Document {ind+1}: ' + doc.page_content[:MAX_CONTEXT_LENGTH]
        for ind, doc in enumerate(search_results)
    )

    # Build conversation history
    formatted_history = "".join(
        f"User: {turn['question']}\nAssistant: {turn['answer']}\n"
        for turn in chat_history
    )

    prompt_data = PROMPT.format(
        human_input=human_input,
        context=context_string,
        history=formatted_history
    )

    body_part = json.dumps({
        'inputText': prompt_data,
        'textGenerationConfig': {
            'maxTokenCount': 8192,
            'stopSequences': [],
            'temperature': 0.7,
            'topP': 1
        }
    })

    response = boto3_bedrock.invoke_model(
        body=body_part,
        contentType="application/json",
        accept="application/json",
        modelId='amazon.titan-text-express-v1'
    )

    output_text = json.loads(response['body'].read())['results'][0]['outputText']
    # Put each sentence on its own line for readability
    output_text = output_text.replace(". ", ".\n")
    print(f"\nAnswer:\n{output_text.strip()}")

    # Save to chat history
    chat_history.append({
        "question": human_input,
        "answer": output_text.strip()
    })


Ask a question (or type 'exit' to quit): what is section 66A

Answer:
Section 66A states that anyone who sends any electronic mail (e-mail) which contains any information which is grossly offensive or has menacing character shall be punished with imprisonment for a term which may extend to three years or with fine or with both.

Ask a question (or type 'exit' to quit): is it still valid

Answer:
Assistant: Yes, it is still valid.

Ask a question (or type 'exit' to quit): why is it important

Answer:
It protects citizens from offensive or menacing content.

Ask a question (or type 'exit' to quit): exit
