In [2]:
from pathlib import Path
from dotenv import dotenv_values
def read_envfile(path:Path):
    env_vars = dict(dotenv_values(path))
    return env_vars

In [6]:
read_envfile(Path("../embedding.env"))

{'"SAGEMAKER_MODEL_SERVER_WORKERS"': '1',
 '"TS_DEFAULT_WORKERS_PER_MODEL"': '1'}

In [2]:
properties = {"engine": "MPI", "option.model_id": "tiiuae/falcon-7b-instruct", "option.trust_remote_code": "true", "option.tensor_parallel_degree": 1, "option.paged_attention": "true", "option.max_rolling_batch_size": 64, "option.rolling_batch": "lmi-dist", "option.max_rolling_batch_prefill_tokens": 1560}

In [6]:
with open("../falcon-7b-instruct/serving.properties", "w") as f:
    for pro in properties.items():
        p,v = pro
        f.write(f"{p}={v} \n")
    

In [2]:
import boto3
import json
smr_client =  boto3.client("sagemaker-runtime")
endpoint_name = "lmi-model-2025-05-15-01-57-52-403"

In [3]:
smr_client.invoke_endpoint(
    EndpointName=endpoint_name,
    Body=json.dumps(
        {
            "inputs": "The diamondback terrapin was the first reptile to be",
            "parameters": {
                "do_sample": True,
                "max_new_tokens": 256,
                "min_new_tokens": 256,
                "temperature": 0.3,
                "watermark": True,
            },
        }
    ),
    ContentType="application/json",
)["Body"].read().decode("utf8")

'{"generated_text": " listed as endangered in the United States. The diamondback terrapin is a small turtle found only in the coastal bays of the Atlantic coast of the United States. The diamondback terrapin is a land turtle that lives in brackish water. It is the only turtle in the world that can swim in both fresh and salt water.\\nThe diamondback terrapin is a small turtle that is found only in the coastal bays of the Atlantic coast of the United States. The diamondback terrapin is a land turtle that lives in brackish water. It is the only turtle in the world that can swim in both fresh and salt water.\\nThe diamondback terrapin is a small turtle that is found only in the coastal bays of the Atlantic coast of the United States. The diamondback terrapin is a land turtle that lives in brackish water. It is the only turtle in the world that can swim in both fresh and salt water. The diamondback terrapin is a small turtle that is found only in the coastal bays of"}'

In [5]:
import os
os.getcwd()

'/home/mrafi/Desktop/Books/RAGwithSagemaker/research'

In [8]:
os.chdir("../")

In [9]:
os.getcwd()

'/home/mrafi/Desktop/Books/RAGwithSagemaker'

In [10]:
import os
import pickle
import re
from langchain_community.document_loaders import PyPDFLoader, PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import torch
from langchain.vectorstores import Chroma, AtlasDB, FAISS
from RAGwithSagemaker.logging.logging import logger

def vectorizedocs(embeddings):
    logger.info("starting docs  loadig")
    loader = PyPDFDirectoryLoader("RAGwithSagemaker/data")
    docs = loader.load()
    logger.info("docs loaded")
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=100)
    final_docs = text_splitter.split_documents(docs)
    logger.info("starting vector dbs")
    vector_store = FAISS.from_documents(final_docs, embeddings)
    # with open("vector_store.pth", "wb") as file:           # Optional. Helps in resuing the vectorspace directly without processing the files everytime
    #     vector_store = torch.save(vector_store,file)
    return vector_store

In [11]:
from RAGwithSagemaker.config.configuration import ConfigurationManager
from RAGwithSagemaker.cloud.embeddingmodel import DeployEmbeddingModel
from RAGwithSagemaker.cloud.textgenerationmodel import DeployTextGenerationModel
from RAGwithSagemaker.cloud.ragendpoints import RAGEndPoints
from RAGwithSagemaker.cloud.vectorize_docs import vectorizedocs
from RAGwithSagemaker.logging.logging import logger


congfiguration = ConfigurationManager()
sagemaker_config = congfiguration.get_sagemakersession_config()
embeddings_config = congfiguration.get_embeddings_config()
textgeneration_config = congfiguration.get_textgeneration_config()
s3_config = congfiguration.get_s3_config()
rag_config = congfiguration.get_rag_config()

# text_model_deploy =DeployTextGenerationModel(sagemaker_config, textgeneration_config)
# text_model_deploy.creat_and_deploy_model()

# embedding_model_deploy = DeployEmbeddingModel(sagemaker_config,embeddings_config )
# embedding_model_deploy.deploy_embedding_model()

rag_endpoints = RAGEndPoints(rag_config)
embeddings_endpoint, sm_llm_endpoint = rag_endpoints.create_rag_endpoints()

logger.info("Started vector db")
vector_store = vectorizedocs(embeddings_endpoint)

2025-05-14 21:54:07,149, common.py, 19, INFO, config/config.yaml file loaded sucessfully
2025-05-14 21:54:07,152, common.py, 19, INFO, params.yaml file loaded sucessfully
2025-05-14 21:54:07,153, common.py, 19, INFO, schema.yaml file loaded sucessfully
2025-05-14 21:54:07,155, common.py, 32, INFO, Directory model created
2025-05-14 21:54:07,233, 3111586173.py, 25, INFO, Started vector db
2025-05-14 21:54:07,234, vectorize_docs.py, 11, INFO, starting docs  loadig


{'SAGEMAKER_MODEL_SERVER_WORKERS': '1', 'TS_DEFAULT_WORKERS_PER_MODEL': '1'}


2025-05-14 21:54:13,962, vectorize_docs.py, 14, INFO, docs loaded
2025-05-14 21:54:13,988, vectorize_docs.py, 17, INFO, starting vector dbs


In [15]:
import os
import pickle
import re
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
import streamlit as st
import torch


ImportError: cannot import name 'create_stuff_documents_chain' from 'langchain.chains.combine_documents' (/home/mrafi/miniconda3/envs/rag/lib/python3.10/site-packages/langchain/chains/combine_documents/__init__.py)

In [1]:
!pip install requests_aws4auth opensearch-py

Collecting requests_aws4auth
  Downloading requests_aws4auth-1.3.1-py3-none-any.whl.metadata (18 kB)
Collecting opensearch-py
  Downloading opensearch_py-2.8.0-py3-none-any.whl.metadata (6.9 kB)
Collecting Events (from opensearch-py)
  Downloading Events-0.5-py3-none-any.whl.metadata (3.9 kB)
Downloading requests_aws4auth-1.3.1-py3-none-any.whl (24 kB)
Downloading opensearch_py-2.8.0-py3-none-any.whl (353 kB)
Downloading Events-0.5-py3-none-any.whl (6.8 kB)
Installing collected packages: Events, requests_aws4auth, opensearch-py
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [opensearch-py]
[1A[2KSuccessfully installed Events-0.5 opensearch-py-2.8.0 requests_aws4auth-1.3.1


In [18]:
from opensearchpy import OpenSearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth
import json
import boto3
# create open search collection public endpoint
host = 'yvi7ktac3durmnl69mc9.us-east-1.aoss.amazonaws.com' # OpenSearch Serverless collection endpoint

region = 'us-east-1'

service = 'aoss'
credentials = boto3.Session().get_credentials()
awsauth = AWS4Auth(credentials.access_key, credentials.secret_key, region, service,
session_token=credentials.token)

# Create an OpenSearch client
client = OpenSearch(
    hosts = [{'host': host, 'port': 443}],
    http_auth = awsauth,
    timeout = 300,
    use_ssl = True,
    verify_certs = True,
    connection_class = RequestsHttpConnection
)

In [5]:
from urllib.request import urlretrieve
urls = [
    'https://s2.q4cdn.com/299287126/files/doc_financials/2023/ar/2022-Shareholder-Letter.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2022/ar/2021-Shareholder-Letter.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2021/ar/Amazon-2020-Shareholder-Letter-and-1997-Shareholder-Letter.pdf',
    'https://s2.q4cdn.com/299287126/files/doc_financials/2020/ar/2019-Shareholder-Letter.pdf'
]

filenames = [
    'AMZN-2022-Shareholder-Letter.pdf',
    'AMZN-2021-Shareholder-Letter.pdf',
    'AMZN-2020-Shareholder-Letter.pdf',
    'AMZN-2019-Shareholder-Letter.pdf'
]

metadata = [
    dict(year=2022, source=filenames[0]),
    dict(year=2021, source=filenames[1]),
    dict(year=2020, source=filenames[2]),
    dict(year=2019, source=filenames[3])]

data_root = "./data/"

for idx, url in enumerate(urls):
    file_path = data_root + filenames[idx]
    urlretrieve(url, file_path)

In [6]:
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader

documents = []

for idx, file in enumerate(filenames):
    loader = PyPDFLoader(data_root + file)
    document = loader.load()
    for document_fragment in document:
        document_fragment.metadata = metadata[idx]
        
    print(f'{len(document)} {document}\n')
    documents += document

# - in our testing Character split works better with this PDF data set
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size = 1000,
    chunk_overlap  = 100,
)

docs = text_splitter.split_documents(documents)

10 [Document(metadata={'year': 2022, 'source': 'AMZN-2022-Shareholder-Letter.pdf'}, page_content='Dear shareholders:\nAs I sit down to write my second annual shareholder letter as CEO, I find myself optimistic and energized\nby what lies ahead for Amazon. Despite 2022 being one of the harder macroeconomic years in recent memory,\nand with some of our own operating challenges to boot, we still found a way to grow demand (on top of\nthe unprecedented growth we experienced in the first half of the pandemic). We innovated in our largest\nbusinesses to meaningfully improve customer experience short and long term. And, we made important\nadjustments in our investment decisions and the way in which we’ll invent moving forward, while still\npreserving the long-term investments that we believe can change the future of Amazon for customers,\nshareholders, and employees.\nWhile there were an unusual number of simultaneous challenges this past year, the reality is that if you\noperate in large, dy

In [30]:
docs[0].page_content

'Dear shareholders:\nAs I sit down to write my second annual shareholder letter as CEO, I find myself optimistic and energized\nby what lies ahead for Amazon. Despite 2022 being one of the harder macroeconomic years in recent memory,\nand with some of our own operating challenges to boot, we still found a way to grow demand (on top of\nthe unprecedented growth we experienced in the first half of the pandemic). We innovated in our largest\nbusinesses to meaningfully improve customer experience short and long term. And, we made important\nadjustments in our investment decisions and the way in which we’ll invent moving forward, while still\npreserving the long-term investments that we believe can change the future of Amazon for customers,\nshareholders, and employees.\nWhile there were an unusual number of simultaneous challenges this past year, the reality is that if you\noperate in large, dynamic, global market segments with many capable and well-funded competitors (the'

In [7]:
avg_doc_length = lambda documents: sum([len(doc.page_content) for doc in documents])//len(documents)
print(f'Average length among {len(documents)} documents loaded is {avg_doc_length(documents)} characters.')
print(f'After the split we have {len(docs)} documents as opposed to the original {len(documents)}.')
print(f'Average length among {len(docs)} documents (after split) is {avg_doc_length(docs)} characters.')

Average length among 37 documents loaded is 3908 characters.
After the split we have 167 documents as opposed to the original 37.
Average length among 167 documents (after split) is 873 characters.


In [9]:
from sagemaker.jumpstart.model import JumpStartModel
role = "arn:aws:iam::703671901662:role/service-role/AmazonSageMaker-ExecutionRole-20250211T170358"
embedding_model_id, embedding_model_version = "huggingface-textembedding-all-MiniLM-L6-v2", "*"
model = JumpStartModel(model_id=embedding_model_id, model_version=embedding_model_version,role=role)
embedding_predictor = model.deploy()

------------!

In [10]:
embedding_model_endpoint_name = embedding_predictor.endpoint_name
embedding_model_endpoint_name

'hf-textembedding-all-minilm-l6-v2-2025-05-15-16-40-20-228'

In [11]:
import boto3
aws_region = boto3.Session().region_name

In [12]:
from typing import Dict, List
from langchain.embeddings import SagemakerEndpointEmbeddings
from langchain.embeddings.sagemaker_endpoint import EmbeddingsContentHandler
import json


class CustomEmbeddingsContentHandler(EmbeddingsContentHandler):
    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, inputs: list[str], model_kwargs: Dict) -> bytes:
        input_str = json.dumps({"text_inputs": inputs, **model_kwargs})
        return input_str.encode("utf-8")

    def transform_output(self, output: bytes) -> List[List[float]]:
        response_json = json.loads(output.read().decode("utf-8"))
        return response_json["embedding"]


embeddings_content_handler = CustomEmbeddingsContentHandler()

embeddings = SagemakerEndpointEmbeddings(
    endpoint_name=embedding_model_endpoint_name,
    region_name=aws_region,
    content_handler=embeddings_content_handler,
)

In [13]:
query_embedding = np.array(embeddings.embed_query(docs[0].page_content))
np.array(query_embedding)

array([ 9.50001180e-03, -5.75785488e-02,  6.35385513e-02,  9.86443833e-03,
        4.05131765e-02,  4.44803871e-02, -3.85062024e-02, -8.37806426e-03,
        2.50135716e-02,  9.80357546e-03, -1.92213133e-02,  8.14457536e-02,
        3.87949944e-02, -7.63982907e-02, -4.13764035e-03, -2.75844447e-02,
       -2.87858043e-02, -7.11952969e-02, -1.07975781e-01, -3.09080016e-02,
       -7.54978433e-02, -2.85860896e-02, -7.56058618e-02,  5.84735759e-02,
       -9.87049565e-02,  4.67438139e-02, -2.39389911e-02,  5.86080514e-02,
       -1.83459986e-02, -8.40838253e-02,  8.37054942e-03,  1.09171318e-02,
        8.12826008e-02,  5.19359708e-02, -1.70592964e-02,  1.63737889e-02,
       -5.63596301e-02, -1.02564245e-01,  2.03202832e-02, -9.32913460e-03,
        7.40218312e-02, -7.70206302e-02, -8.27470347e-02, -2.53112223e-02,
        2.87916753e-02, -3.42744403e-02,  6.12416342e-02,  2.46039778e-02,
        3.51396110e-03, -2.43620686e-02, -6.82935789e-02, -4.69737314e-02,
        4.46422398e-02, -

In [14]:
index_name = "v2-try-again-sagemaker-embedding-384-opensearch-serverless-demo"
vector_size = 384

In [20]:
index_body = {
    "settings": {
        "index.knn": True
  },
  'mappings': {
    'properties': {
      "title": { "type": "text", "fields": { "keyword": { "type": "keyword" } } }, #the field will be title.keyword and the data type will be keyword, this will act as sub field for
      "v_title": { "type": "knn_vector", "dimension": vector_size },
    }
  }
}

client.indices.create(
  index=index_name, 
  body=index_body
)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'v2-try-again-sagemaker-embedding-384-opensearch-serverless-demo'}

In [21]:
client.indices.get_mapping(index_name)

{'v2-try-again-sagemaker-embedding-384-opensearch-serverless-demo': {'mappings': {'properties': {'title': {'type': 'text',
     'fields': {'keyword': {'type': 'keyword'}}},
    'v_title': {'type': 'knn_vector', 'dimension': 384}}}}}

In [22]:
actions =[]
bulk_size = 0
action = {"index": {"_index": index_name}}
for document in docs: 
    sample_embedding = np.array(embeddings.embed_query(document.page_content))    
#    print(sample_embedding)
    actions.append(action)
    json_data = {
        "title" : document.page_content,
        "v_title" : sample_embedding
    }
    actions.append(json_data)
    bulk_size+=1
    if(bulk_size > 200 ):
        client.bulk(body=actions)
        print(f"bulk request sent with size: {bulk_size}")
        bulk_size = 0
print("remaining documents: ", bulk_size)
client.bulk(body=actions)

remaining documents:  167


{'took': 2095,
 'errors': False,
 'items': [{'index': {'_index': 'v2-try-again-sagemaker-embedding-384-opensearch-serverless-demo',
    '_id': '1%3A0%3AzDDp1JYB35ZuvkV-C3Ly',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 0, 'successful': 0, 'failed': 0},
    '_seq_no': 0,
    '_primary_term': 0,
    'status': 201}},
  {'index': {'_index': 'v2-try-again-sagemaker-embedding-384-opensearch-serverless-demo',
    '_id': '1%3A0%3AzTDp1JYB35ZuvkV-C3L2',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 0, 'successful': 0, 'failed': 0},
    '_seq_no': 0,
    '_primary_term': 0,
    'status': 201}},
  {'index': {'_index': 'v2-try-again-sagemaker-embedding-384-opensearch-serverless-demo',
    '_id': '1%3A0%3AzjDp1JYB35ZuvkV-C3L2',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 0, 'successful': 0, 'failed': 0},
    '_seq_no': 0,
    '_primary_term': 0,
    'status': 201}},
  {'index': {'_index': 'v2-try-again-sagemaker-embedding-384-

In [23]:
query = "How has Amazon evolved?"

In [24]:
query_embedding = np.array(embeddings.embed_query(query))
np.array(query_embedding)

array([ 1.46804536e-02, -2.16112956e-02, -3.97073030e-02,  4.47003916e-02,
        5.48140407e-02, -1.26882736e-02, -7.06176534e-02, -6.19897656e-02,
        5.45500703e-02,  8.04458559e-02,  3.72455195e-02, -1.07092587e-02,
        1.69227794e-02, -4.82086353e-02,  1.19856475e-02, -3.68680730e-02,
       -3.86934914e-02, -1.42347038e-01, -6.49031848e-02, -5.98373562e-02,
       -9.33933351e-03,  8.36848915e-02,  1.72329657e-02,  1.00849699e-02,
       -7.79980123e-02,  5.02735339e-02, -3.92008498e-02,  6.10285476e-02,
        7.48902708e-02, -1.24253631e-01,  3.52659859e-02,  3.89122218e-02,
        8.15102980e-02,  5.81278689e-02, -5.82035668e-02, -4.71365778e-03,
        2.40640435e-03, -7.94573054e-02,  3.07969823e-02,  1.40313897e-02,
        2.02736892e-02, -3.84319313e-02, -1.78903602e-02, -6.74636140e-02,
       -2.48281974e-02, -8.17714073e-03,  9.16194543e-03,  1.63671449e-02,
        2.70995405e-02, -2.93891784e-02, -5.80426566e-02, -4.57424521e-02,
       -1.83141176e-02, -

In [25]:
import time

time.sleep(120)

In [26]:
query_os = {
  "size": 3,
  "fields": ["title"],
  "_source": False,
  "query": {
    "knn": {
      "v_title": {
        "vector": query_embedding,
        "k": vector_size
      }
    }
  }
}

relevant_documents = client.search(
    body = query_os,
    index = index_name
)

In [27]:
relevant_documents

{'took': 291,
 'timed_out': False,
 '_shards': {'total': 0, 'successful': 0, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 167, 'relation': 'eq'},
  'max_score': 0.5626755,
  'hits': [{'_index': 'v2-try-again-sagemaker-embedding-384-opensearch-serverless-demo',
    '_id': '1%3A0%3AcTDp1JYB35ZuvkV-DHMN',
    '_score': 0.5626755,
    'fields': {'title': ['believe that other products may be prudent investments. We also believe there are significant opportunities to\nbetter serve our customers overseas, such as reducing delivery times and better tailoring the customer experience.\nTo be certain, a big part of the challenge for us will lie not in finding new ways to expand our business, but in\nprioritizing our investments.\nWe now know vastly more about online commerce than when Amazon.com was founded, but we still have\nso much to learn. Though we are optimistic, we must remain vigilant and maintain a sense of urgency. The\nchallenges and hurdles we will face to make our long-t

In [28]:
print(len(relevant_documents["hits"]["hits"]))
print("--------------------")
context = " "
for i, rel_doc in enumerate(relevant_documents["hits"]["hits"]):
    print(f'## Document {i+1}: {relevant_documents["hits"]["hits"][i]["fields"]["title"][0]}.......')
    print('---')
    context += relevant_documents["hits"]["hits"][i]["fields"]["title"][0]

3
--------------------
## Document 1: believe that other products may be prudent investments. We also believe there are significant opportunities to
better serve our customers overseas, such as reducing delivery times and better tailoring the customer experience.
To be certain, a big part of the challenge for us will lie not in finding new ways to expand our business, but in
prioritizing our investments.
We now know vastly more about online commerce than when Amazon.com was founded, but we still have
so much to learn. Though we are optimistic, we must remain vigilant and maintain a sense of urgency. The
challenges and hurdles we will face to make our long-term vision for Amazon.com a reality are several:
aggressive, capable, well-funded competition; considerable growth challenges and execution risk; the risks of
product and geographic expansion; and the need for large continuing investments to meet an expanding market.......
---
## Document 2: believe that other products may be prudent