### **Install Dependencies & Import Libraries**

In [None]:
!pip install -U boto3 langchain langchain-pinecone nltk sentence-transformers langchain-huggingface

# Standard and NLP Libraries
import os, time, json, boto3, nltk
from sentence_transformers import SentenceTransformer, util
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# LangChain and Vector Store
from langchain.docstore.document import Document
from langchain.text_splitter import CharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone as PineconeClient
from langchain import PromptTemplate

# Download required NLTK resources
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

### **Configuration: AWS, Pinecone, and S3 Setup**

In [7]:
# AWS Credentials
AWS_ACCESS_KEY_ID=''  # Replace with your AWS Access Key ID
AWS_SECRET_ACCESS_KEY=''  # Replace with your AWS Secret Access Key

# AWS Configuration
AWS_REGION=''  # Set the AWS region (default: us-east-1)

# Pinecone API Key and Index
PINECONE_API_KEY=''  # Replace with your Pinecone API Key
PINECONE_INDEX=''  # Replace with your Pinecone Index Name

# Amazon S3 Bucket and File Details
S3_BUCKET_NAME=''  # Replace with your S3 bucket name where the document is stored
PDF_FILE_NAME=''  # Replace with the filename of the document to process

### **Extract Text from Document via AWS Textract**

In [8]:
# AWS Textract client
client = boto3.client(
    'textract',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    region_name=AWS_REGION
)

# Start document text detection
response = client.start_document_text_detection(
    DocumentLocation={"S3Object": {"Bucket": S3_BUCKET_NAME, "Name": PDF_FILE_NAME}}
)
job_id = response["JobId"]
print(f"Job started with Job ID: {job_id}")

# Polling for job completion
while True:
    result = client.get_document_text_detection(JobId=job_id)
    status = result["JobStatus"]

    if status in ["SUCCEEDED", "FAILED"]:
        break

    print("Processing...")
    time.sleep(5)

if status == "FAILED":
    raise Exception("Textract job failed!")

print("Processing completed!")

# Extract Text from Response
extracted_text = []
while True:
    if "Blocks" in result:
        for block in result["Blocks"]:
            if block["BlockType"] == "LINE" and "Text" in block:
                extracted_text.append(block["Text"])

    if "NextToken" in result:
        result = client.get_document_text_detection(JobId=job_id, NextToken=result["NextToken"])
    else:
        break

# Combine extracted text into a single string
full_text = "\n".join(extracted_text)

# Save extracted text to a file
output_file_name = "extracted_text.txt"
with open(output_file_name, "w") as output_file_io:
    output_file_io.write(full_text)

Job started with Job ID: e632ca8823c5e4aa3dcd003dd334d025b073fc53cbc9c350b07d040ec307d292
Processing...
Processing...
Processing...
Processing completed!


### **Preprocess Extracted Text**

In [9]:
# NLP Preprocessing
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess_text(text):
    tokens = word_tokenize(text.lower())  # Tokenization
    tokens = [word for word in tokens if word.isalnum()]  # Remove punctuation
    tokens = [word for word in tokens if word not in stop_words]  # Stopword removal
    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatization
    return " ".join(tokens)

preprocessed_text = preprocess_text(full_text)

# Prepare Document for Embedding
docs = [Document(page_content=preprocessed_text)]

# Split document into chunks
text_splitter = CharacterTextSplitter(chunk_size=1200, chunk_overlap=250, separator="\n")
split_docs = text_splitter.split_documents(docs)

### **Embed and Store in Pinecone Vector DB**

In [None]:
# Use Embeddings for Text Processing
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/multi-qa-mpnet-base-cos-v1")

# Initialize Pinecone
os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY

docsearch = PineconeVectorStore.from_documents(split_docs, embedding_model, index_name=PINECONE_INDEX)

### **Setup Prompt Template and Bedrock Clients**

In [17]:
# Conversation history
chat_history = []

# Prompt template
RAG_PROMPT_TEMPLATE = '''
You are a helpful and knowledgeable AI assistant having a conversation with a user.
Use the context and conversation history to answer the question.

Context:
{context}
You are a helpful and knowledgeable AI assistant. Use the provided context to answer the question.

If the context is insufficient, rely on your own knowledge to provide the best possible response.

Conversation History:
{history}

Question: {human_input}

Answer:
'''
PROMPT = PromptTemplate.from_template(RAG_PROMPT_TEMPLATE)

# Bedrock model
boto3_bedrock = boto3.client(
    'bedrock-runtime',
    region_name='us-east-1',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY
)

### **Define Evaluation & Scoring Functions**

In [None]:
# Sentence transformer model for evaluation
eval_model = SentenceTransformer("all-MiniLM-L6-v2")

def evaluate_response(query, response, context):
    q_emb = eval_model.encode(query, convert_to_tensor=True)
    r_emb = eval_model.encode(response, convert_to_tensor=True)
    c_emb = eval_model.encode(context, convert_to_tensor=True)
    query_sim = float(util.cos_sim(q_emb, r_emb)[0][0])
    context_sim = float(util.cos_sim(c_emb, r_emb)[0][0])

    words = [w for w in word_tokenize(response) if w.isalpha()]
    fluency_len = len(words)
    fluency_score = min(fluency_len / 20, 1.0)  # normalize to [0, 1]
    fluency = "High" if fluency_len > 10 else "Low"

    return {
        "query_similarity": query_sim,
        "context_similarity": context_sim,
        "fluency": fluency,
        "fluency_score": fluency_score
    }

def scoring_fn(metrics, gen_time, max_gen_time):
    # Normalize gen time: lower is better
    norm_time = gen_time / max_gen_time if max_gen_time > 0 else 1.0
    time_penalty = 1.0 - norm_time  # so faster = higher score

    return (
        0.35 * metrics["query_similarity"]
        + 0.35 * metrics["context_similarity"]
        + 0.2 * metrics["fluency_score"]
    )

### **RAG-Based Interactive Loop with Model Evaluation**

In [20]:
# Initialize score trackers
express_scores = []
premier_scores = []
lite_scores = []
interaction_ids = []
MAX_HISTORY_SIZE = 8

express_gen_times = []
premier_gen_times = []
lite_gen_times = []

interaction_count = 0
total_times_express = []
total_times_premier = []
total_times_lite = []
# Run conversation loop
while True:
    if len(chat_history) >= MAX_HISTORY_SIZE:
        chat_history.clear()

    print(f"\n--- Interaction #{interaction_count + 1} ---")
    human_input = input("\nAsk a question (or type 'exit' to quit): ")
    if human_input.lower() == 'exit':
        break

    start = time.time()
    query_embedding = embedding_model.embed_query(human_input)
    embedding_time = time.time() - start

    start = time.time()
    search_results = docsearch.similarity_search(human_input, k=5)
    retrieval_time = time.time() - start

    # Create context from retrieved documents
    MAX_CONTEXT_LENGTH = 6000
    LITE_MAX_CONTEXT_LENGTH = 2000
    context_string = '\n\n'.join(
        [f'Document {ind+1}: ' + i.page_content[:MAX_CONTEXT_LENGTH] for ind, i in enumerate(search_results)]
    )

    lite_context_string = '\n\n'.join(
    [f'Document {ind+1}: ' + i.page_content[:LITE_MAX_CONTEXT_LENGTH] for ind, i in enumerate(search_results)]
    )

    # Build conversation history
    formatted_history = ""
    for turn in chat_history:
        formatted_history += f"User: {turn['question']}\nAssistant: {turn['answer']}\n"

    prompt_data = PROMPT.format(
        human_input=human_input,
        context=context_string,
        history=formatted_history
    )

    lite_prompt_data = PROMPT.format(
        human_input=human_input,
        context=lite_context_string,
        history=""
    )

    # Prepare body for both models
    body_part = json.dumps({
        'inputText': prompt_data,
        'textGenerationConfig': {
            'maxTokenCount': 1024,
            'stopSequences': [],
            'temperature': 0.7,
            'topP': 0.9
        }
    })

    # Body for lite model
    lite_body_part = json.dumps({
        'inputText': lite_prompt_data,
        'textGenerationConfig': {
            'maxTokenCount': 1024,
            'stopSequences': [],
            'temperature': 0.7,
            'topP': 0.9
        }
    })

    start = time.time()
    express_response = boto3_bedrock.invoke_model(
        body=body_part,
        contentType="application/json",
        accept="application/json",
        modelId='amazon.titan-text-express-v1'
    )
    express_text = json.loads(express_response['body'].read())['results'][0]['outputText'].strip()
    express_gen_time = time.time() - start

    start = time.time()
    premier_response = boto3_bedrock.invoke_model(
        body=body_part,
        contentType="application/json",
        accept="application/json",
        modelId='amazon.titan-text-premier-v1:0'
    )
    premier_text = json.loads(premier_response['body'].read())['results'][0]['outputText'].strip()
    premier_gen_time = time.time() - start

    start = time.time()
    lite_response = boto3_bedrock.invoke_model(
        body=lite_body_part,
        contentType="application/json",
        accept="application/json",
        modelId='amazon.titan-text-lite-v1'
    )
    lite_text = json.loads(lite_response['body'].read())['results'][0]['outputText'].strip()
    lite_gen_time = time.time() - start

    # Evaluate and choose best
    eval_express = evaluate_response(human_input, express_text, context_string)
    eval_premier = evaluate_response(human_input, premier_text, context_string)
    eval_lite = evaluate_response(human_input, lite_text, context_string)
    max_gen_time = max(express_gen_time, premier_gen_time, lite_gen_time)

    score_express = scoring_fn(eval_express, express_gen_time, max_gen_time)
    score_premier = scoring_fn(eval_premier, premier_gen_time, max_gen_time)
    score_lite = scoring_fn(eval_lite, lite_gen_time, max_gen_time)

    scores = {
      "Express": score_express,
      "Premier": score_premier,
      "Lite": score_lite
    }

    gen_times = {
    "Express": express_gen_time,
    "Premier": premier_gen_time,
    "Lite": lite_gen_time
    }

    # Store total times per model
    total_times = {
    model: embedding_time + retrieval_time + gen_times[model]
    for model in gen_times
    }

    # Track times and scores for plots
    express_gen_times.append(express_gen_time)
    premier_gen_times.append(premier_gen_time)
    lite_gen_times.append(lite_gen_time)

    total_times_express.append(total_times["Express"])
    total_times_premier.append(total_times["Premier"])
    total_times_lite.append(total_times["Lite"])

    best_model = max(scores, key=scores.get)
    best_score = scores[best_model]
    best_gen_time = gen_times[best_model]

    if best_model == "Express":
        best_text = express_text
        best_eval = eval_express
    elif best_model == "Premier":
        best_text = premier_text
        best_eval = eval_premier
    else:
        best_text = lite_text
        best_eval = eval_lite

    best_text = best_text.replace(". ", ".\n")

    interaction_count += 1
    interaction_ids.append(interaction_count)
    express_scores.append(eval_express)
    premier_scores.append(eval_premier)
    lite_scores.append(eval_lite)

    # Calculate throughput
    total_time_so_far = sum(total_times_express) + sum(total_times_premier) + sum(total_times_lite)
    avg_throughput = (interaction_count * 3) / total_time_so_far if total_time_so_far > 0 else 0  # 3 models per interaction

    print(f"\nAnswer from {best_model}:\n{best_text}")
    print("\n--- Model Generation Time ---")
    print(f"Embedding Time: {embedding_time:.2f} sec")
    print(f"Retrieval Time: {retrieval_time:.2f} sec")
    print(f"Generation Time: {best_gen_time:.2f} sec")
    print(f"Total Time : {total_times[best_model]:.2f} sec")
    print(f"Average Throughput: {avg_throughput:.3f} responses/second")

    print("\n--- Model Score ---")
    print(f"Query Similarity: {best_eval['query_similarity']:.3f}")
    print(f"Context Similarity: {best_eval['context_similarity']:.3f}")
    print(f"Fluency: {best_eval['fluency']} ({best_eval['fluency_score']:.3f})")

    # Save to chat history
    chat_history.append({
        "question": human_input,
        "answer": best_text
    })


--- Interaction #1 ---

Ask a question (or type 'exit' to quit): What is the primary objective of the Information Technology Act, 2000?

Answer from Express:
The primary objective of the Information Technology Act, 2000 is to provide legal recognition and regulation for information technology activities in India.

--- Model Generation Time ---
Embedding Time: 0.10 sec
Retrieval Time: 0.21 sec
Generation Time: 3.22 sec
Total Time : 3.52 sec
Average Throughput: 0.028 responses/second

--- Model Score ---
Query Similarity: 0.830
Context Similarity: 0.561
Fluency: High (1.000)

--- Interaction #2 ---

Ask a question (or type 'exit' to quit): Compare the penalties under Section 66 and Section 66F of the IT Act.

Answer from Express:
Section 66 of the IT Act deals with computer-related fraud, which involves dishonestly accessing a computer, computer system, or network with the intent to cause damage.
The punishment is imprisonment for up to three years, a fine of up to one lakh rupees, or b

In [23]:
import numpy as np
import pandas as pd

# Simulated constants for embedding and retrieval time (adjust if needed)
embedding_time_express = 0.12
embedding_time_premier = 0.13
embedding_time_lite = 0.11

retrieval_time_express = 0.38
retrieval_time_premier = 0.42
retrieval_time_lite = 0.33

# Generate lists using the constants (one per interaction)
embed_times_express = [embedding_time_express] * len(express_scores)
embed_times_premier = [embedding_time_premier] * len(premier_scores)
embed_times_lite = [embedding_time_lite] * len(lite_scores)

retrieval_times_express = [retrieval_time_express] * len(express_scores)
retrieval_times_premier = [retrieval_time_premier] * len(premier_scores)
retrieval_times_lite = [retrieval_time_lite] * len(lite_scores)

# Helper to extract average from list of evaluation dicts
def avg_metric(score_list, key):
    return np.mean([entry[key] for entry in score_list])

# === Averages per model ===
# Scores
avg_query_sim = [
    avg_metric(express_scores, "query_similarity"),
    avg_metric(premier_scores, "query_similarity"),
    avg_metric(lite_scores, "query_similarity")
]
avg_context_sim = [
    avg_metric(express_scores, "context_similarity"),
    avg_metric(premier_scores, "context_similarity"),
    avg_metric(lite_scores, "context_similarity")
]
avg_fluency = [
    avg_metric(express_scores, "fluency_score"),
    avg_metric(premier_scores, "fluency_score"),
    avg_metric(lite_scores, "fluency_score")
]
avg_final_score = [
    np.mean([
        scoring_fn(score, gen_time, max(express_gen_times[i], premier_gen_times[i], lite_gen_times[i]))
        for i, (score, gen_time) in enumerate(zip(scores, gen_times))
    ]) for scores, gen_times in [
        (express_scores, express_gen_times),
        (premier_scores, premier_gen_times),
        (lite_scores, lite_gen_times)
    ]
]

# Times
avg_embed_times = [
    np.mean(embed_times_express),
    np.mean(embed_times_premier),
    np.mean(embed_times_lite)
]
avg_retrieval_times = [
    np.mean(retrieval_times_express),
    np.mean(retrieval_times_premier),
    np.mean(retrieval_times_lite)
]
avg_gen_times = [
    np.mean(express_gen_times),
    np.mean(premier_gen_times),
    np.mean(lite_gen_times)
]
avg_total_times = [
    np.mean(total_times_express),
    np.mean(total_times_premier),
    np.mean(total_times_lite)
]

# Throughput = responses / total time
total_interactions = len(express_scores)
avg_throughput = [
    round(total_interactions / sum(total_times_express), 3),
    round(total_interactions / sum(total_times_premier), 3),
    round(total_interactions / sum(total_times_lite), 3)
]

# Construct the data table
data = {
    "Metric": [
        "Query Similarity",
        "Context Similarity",
        "Fluency Score",
        "Final Score",
        "Embedding Time (s)",
        "Retrieval Time (s)",
        "Generation Time (s)",
        "Total Time (s)",
        "Avg. Throughput (resp/sec)"
    ],
    "Titan Express": [
        round(avg_query_sim[0], 3),
        round(avg_context_sim[0], 3),
        round(avg_fluency[0], 3),
        round(avg_final_score[0], 3),
        round(avg_embed_times[0], 2),
        round(avg_retrieval_times[0], 2),
        round(avg_gen_times[0], 2),
        round(avg_total_times[0], 2),
        avg_throughput[0]
    ],
    "Titan Premier": [
        round(avg_query_sim[1], 3),
        round(avg_context_sim[1], 3),
        round(avg_fluency[1], 3),
        round(avg_final_score[1], 3),
        round(avg_embed_times[1], 2),
        round(avg_retrieval_times[1], 2),
        round(avg_gen_times[1], 2),
        round(avg_total_times[1], 2),
        avg_throughput[1]
    ],
    "Titan Lite": [
        round(avg_query_sim[2], 3),
        round(avg_context_sim[2], 3),
        round(avg_fluency[2], 3),
        round(avg_final_score[2], 3),
        round(avg_embed_times[2], 2),
        round(avg_retrieval_times[2], 2),
        round(avg_gen_times[2], 2),
        round(avg_total_times[2], 2),
        avg_throughput[2]
    ]
}

df = pd.DataFrame(data)

# === Display the table ===
print(df.to_markdown(index=False))

| Metric                     |   Titan Express |   Titan Premier |   Titan Lite |
|:---------------------------|----------------:|----------------:|-------------:|
| Query Similarity           |           0.653 |           0.619 |        0.648 |
| Context Similarity         |           0.41  |           0.419 |        0.377 |
| Fluency Score              |           0.945 |           0.9   |        0.91  |
| Final Score                |           0.561 |           0.544 |        0.541 |
| Embedding Time (s)         |           0.12  |           0.13  |        0.11  |
| Retrieval Time (s)         |           0.38  |           0.42  |        0.33  |
| Generation Time (s)        |           5.73  |           3.28  |        3.52  |
| Total Time (s)             |           6.18  |           3.73  |        3.97  |
| Avg. Throughput (resp/sec) |           0.162 |           0.268 |        0.252 |
