In [10]:
import boto3
import os
import json

from langchain_community.document_loaders import PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain_community.embeddings import BedrockEmbeddings

from langchain_community.vectorstores import FAISS

In [11]:
# Read the raw text of the credentials file. The "utf-8-sig" codec drops a
# leading byte-order mark if one is present, so json.loads never sees it.
with open("aws_credentials.json", encoding="utf-8-sig") as file:
    data = file.read()

# Parse once the file handle has been released.
credentials = json.loads(data)

# Pull out the key pair used to authenticate the AWS client.
aws_access_key_id = credentials["aws_access_key_id"]
aws_secret_access_key = credentials["aws_secret_access_key"]

In [12]:
# Which AWS service and region the client talks to.
SERVICE_NAME = 'bedrock-runtime'
REGION_NAME = 'ap-south-1'

# Client authenticated with the explicit key pair read from the credentials file.
bedrock = boto3.client(
    SERVICE_NAME,
    region_name=REGION_NAME,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)

In [13]:
# Embedding wrapper bound to the `bedrock` client; used when building the vector store.
bedrock_embeddings = BedrockEmbeddings(
    client=bedrock,
    model_id="amazon.titan-embed-text-v2:0",
)

In [14]:
def split_text(pages, chunk_size=1000, overlap=200):
    """Break documents into overlapping chunks.

    Args:
        pages: Documents to split; handed unchanged to the splitter's
            ``split_documents`` method.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        Whatever ``RecursiveCharacterTextSplitter.split_documents`` returns
        for ``pages``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_overlap=overlap,
        chunk_size=chunk_size,
    )
    return splitter.split_documents(pages)

In [18]:
def create_vector_store(text_chunks, index_path="faiss_index", embeddings=None):
    """Embed document chunks into a FAISS index and save it to disk.

    Args:
        text_chunks: Documents to index; passed to ``FAISS.from_documents``.
        index_path: Location handed to ``save_local``. Defaults to
            ``"faiss_index"``.
        embeddings: Embedding object passed to ``FAISS.from_documents``.
            When ``None``, the module-level ``bedrock_embeddings`` is used.

    Returns:
        ``True`` once the index has been built and saved.

    Raises:
        ValueError: If ``text_chunks`` is empty, since there is nothing to
            index.
    """
    # Fail early with a clear message instead of making an embedding call
    # for zero documents.
    if not text_chunks:
        raise ValueError("text_chunks is empty; nothing to index")

    if embeddings is None:
        embeddings = bedrock_embeddings

    vectorstore_faiss = FAISS.from_documents(text_chunks, embeddings)
    vectorstore_faiss.save_local(index_path)

    return True


In [19]:
# Path of the PDF to index; it is handed to PyPDFLoader as a file path.
upload_file = "MoM _ HGI 01_06_2025.pdf"

loader = PyPDFLoader(upload_file)
# Use load() rather than load_and_split(): load_and_split() already runs the
# documents through a default text splitter, so passing its output to
# split_text() chunked the text twice. split_text() is now the only chunking step.
pages = loader.load()

# 1000-character chunks with a 200-character overlap.
split_document = split_text(pages, 1000, 200)

# True once the FAISS index has been built and written to disk.
response = create_vector_store(split_document)
