In [None]:
import boto3
from requests_aws4auth import AWS4Auth
from opensearchpy import RequestsHttpConnection
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import OpenSearchVectorSearch
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_text_splitters import CharacterTextSplitter



In [None]:

# Location of the PDF that gets indexed, relative to the notebook's working directory.
PDF_PATH = "./Peer Review Framework for Predictive Analytics.pdf"

# Read the PDF into LangChain documents.
loader = PyPDFLoader(PDF_PATH)
documents = loader.load()

# Break the loaded documents into chunks (target size 1000, no overlap between chunks).
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
docs = text_splitter.split_documents(documents)


In [None]:
# Embedding model used to vectorize the document chunks and the search query.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# Build the HuggingFace embeddings wrapper around that model.
hf_embeddings = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_NAME,
)

In [None]:
service = "aoss"  # must set the service as 'aoss'
region = "us-east-1"

# Resolve credentials through boto3's default provider chain (environment
# variables such as AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY / AWS_SESSION_TOKEN,
# the shared credentials/config files, or an attached IAM role) instead of
# embedding an access key and secret in the notebook, where they are easily
# committed or shared by accident.
credentials = boto3.Session().get_credentials()
if credentials is None:
    # Fail here with a clear message rather than with an AttributeError on
    # `credentials.access_key` below.
    raise RuntimeError(
        "No AWS credentials found. Configure them via environment variables, "
        "the shared AWS credentials file, or an IAM role before running this cell."
    )

# SigV4 signer used to authenticate HTTP requests to the collection.
# `credentials.token` is passed through so temporary (session) credentials
# work as well; it is None for long-lived keys.
awsauth = AWS4Auth(
    credentials.access_key,
    credentials.secret_key,
    region,
    service,
    session_token=credentials.token,
)

In [None]:
# Options forwarded to the vector store: where to connect, how to
# authenticate and verify the connection, and which index/engine to use.
opensearch_options = {
    "opensearch_url": "<open-search-serverless-collection-url>",
    "http_auth": awsauth,
    "timeout": 300,
    "use_ssl": True,
    "verify_certs": True,
    "connection_class": RequestsHttpConnection,
    "index_name": "aws-os-demo-collection",
    "engine": "faiss",
}

# Embed the chunks with `hf_embeddings` and store them in the target index.
docsearch = OpenSearchVectorSearch.from_documents(
    docs,
    hf_embeddings,
    **opensearch_options,
)



In [None]:

docs = docsearch.similarity_search(
    "What are Common sources of observational rainfall data",
    k=3,
)

In [None]:
print(docs)