In [60]:
# !pip install pdfplumber
# !pip install langchain-text-splitter
# !pip install sentence_transformers
# !pip install faiss-cpu
# !pip install groq


Conversion to plain text

In [61]:
import os
import pdfplumber

# Ensure the folder exists
output_folder = "textconversion"
os.makedirs(output_folder, exist_ok=True)

# Define the output file path
output_path = os.path.join(output_folder, "output.txt")

# Extract text and write to the file
with pdfplumber.open("Company-Policy-and-Procedure-June-1.18-V6.0.pdf") as pdf, open(output_path, "w", encoding="utf-8") as f:
    for page in pdf.pages:
        t = page.extract_text()
        if t:
            f.write(t + '\n')


In [62]:
with open("/content/textconversion/output.txt", "r", encoding="utf-8") as document:
    text = document.read()

In [63]:
print(len(text))

186479


Text Splitting

In [64]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

texts = text_splitter.split_text(text)



Conversion of Chunks to Embeddings

In [65]:
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer('all-MiniLM-L6-v2')
vectors = model.encode(texts)

vector_folder = "vectors"
os.makedirs(vector_folder, exist_ok=True)

output_path = os.path.join(vector_folder, "embeddings.npy")
np.save(output_path, vectors)




Visualize chunks

In [None]:
# ✅ VISUALIZE CHUNKS 
print("\n🔍 Sample Chunks and Their Lengths:\n")
for i, chunk in enumerate(texts):
    print(f"Chunk {i+1}:")
    print(chunk)
    print(f"Token Length: {len(chunk.split())} words\n{'-'*40}")



🔍 Sample Chunks and Their Lengths:

Chunk 1:
Company Policy
and Procedure
Manual
TriageLogic, LLC
Initial Version November 2013
Last update June 4, 2018
Version 6.0
Approved By:
Charu G. Raheja, PhD
Chair/CEO
TriageLogic, LL
TABLE OF CONTENTS
Note: this covers:
https://accreditnet.urac.org/Application/3271/Evidence
Core Standards 1 to 40
I. Welcome Core 2
1. About Our Company ............................................................................................................. 5
Token Length: 49 words
----------------------------------------
Chunk 2:
2. Mission Statement ................................................................................................................... 5
3. Organizational Structure Core1 ………………………………………… .................. ………6
a. Company Demographic ....................................................................................... 6
b. Diagram Oversight Management Process ............................................................ 9
Token

Store Embeddings

In [67]:
import faiss

dimension = vectors.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(vectors))

embedding_folder = "embeddings"
os.makedirs(embedding_folder, exist_ok=True)


index_path = os.path.join(embedding_folder, "faiss_index.index")
faiss.write_index(index, index_path)



Query Search

In [68]:
query = "Age to acess the TL Website?"
query_embedding = model.encode([query])
D, I = index.search(np.array(query_embedding), k=1)

for idx in I[0]:
    print(f"Match: {texts[idx]}")

Match: 18
Our Commitment to Data Security
Access to your data is limited to authorized TL staff or approved vendors. Although total security does not
exist on the Internet, TL shall make commercially reasonable efforts to safeguard the information that you
submit to TL or that TL collects.
Use of the TL Website by Children
The TL Website is not intended for use by children under the age of 13.
Your Privacy Preferences


Importing GROQ

In [None]:
import os
from groq import Groq

client = Groq(api_key="")


Query Searching and LLM Generation

In [70]:
query = "who makes the complains?"
query_embedding = model.encode([query])
D, I = index.search(np.array(query_embedding), k=3)

context = texts[I[0][0]]

rag_prompt = f"""
Use the context below to answer the question.

Context:
{context}

Question:
{query}

Answer:
"""


In [71]:
response = client.chat.completions.create(
    model="llama-3.1-8b-instant",
    messages=[
        {"role": "system", "content": "You are a helpful assistant using provided context."},
        {"role": "user", "content": rag_prompt}
    ],
    temperature=0.2
)

print(response.choices[0].message.content)

The context doesn't explicitly mention who makes the complaints. However, based on the information provided, it can be inferred that the complaints are likely made by patients or their representatives, as the process involves reviewing a call and a note, which suggests communication with patients.


Query Searching with Reranking chunks

In [72]:
from sentence_transformers import SentenceTransformer, CrossEncoder
import numpy as np
import faiss

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

query = "summarize this"

query_embedding = embedding_model.encode([query])
D, I = index.search(np.array(query_embedding), k=10)

retrieved_chunks = [texts[i] for i in I[0]]

print("\n🔍 Top 10 Retrieved Chunks (Pre-Rerank):")
for i, chunk in enumerate(retrieved_chunks, 1):
    print(f"\n[{i}] {chunk[:200]}...")

rerank_inputs = [(query, chunk) for chunk in retrieved_chunks]
scores = reranker.predict(rerank_inputs)

scored_chunks = list(zip(scores, retrieved_chunks))
scored_chunks.sort(reverse=True, key=lambda x: x[0])

print("\n📊 Reranked Chunks (with Scores):")
for i, (score, chunk) in enumerate(scored_chunks, 1):
    print(f"\n[{i}] Score: {score:.4f}")
    print(f"{chunk[:200]}...")

top_chunks = [chunk for _, chunk in scored_chunks[:3]]
context = "\n\n".join(top_chunks)

print("\n🧩 Final Chunks Used in Context:")
for i, chunk in enumerate(top_chunks, 1):
    print(f"\n[{i}] {chunk[:200]}...")

rag_prompt = f"""
Use the context below to answer the question.

Context:
{context}

Question:
{query}

Answer:
"""

print("\n🧠 Final RAG Prompt:")
print(rag_prompt)



🔍 Top 10 Retrieved Chunks (Pre-Rerank):

[1] Is ready to perform at the beginning of their shift or if on a flexible
schedule, keeps enough common hours to facilitate teamwork.
Collaboration
Communicates with others to achieve the team’s common ...

[2] within own scope of work. Applies rules and standards to decisions.
Weighs alternatives and selects the best solution, and asks managers or
supervisors when a decision is beyond own scope of work.
Man...

[3] is communicated to staff and encourage staff to report quality and safety/risk concerns. They collect
data, report measur the Quality Committee and prepare any other summaries required.
Additionally, ...

[4] opportunity to give positive feedback and improve morale. See for example, TriageLogic’s Management
meetings.
Team Briefing. This is a way of passing information from the top of the business down to a...

[5] explained under the clinical section.
4.) Quality of Marketing and Advertisement Materials. Currently this is the resp

In [73]:
response = client.chat.completions.create(
    model="llama-3.1-8b-instant",
    messages=[
        {"role": "system", "content": "You are a helpful assistant using provided context."},
        {"role": "user", "content": rag_prompt}
    ],
    temperature=0.2
)

print(response.choices[0].message.content)

The provided text appears to be a job description for a Quality Committee Assistant or a similar role. The main responsibilities include:

1. Collecting and reporting data on quality and safety/risk concerns.
2. Investigating and documenting complaints.
3. Solving interdepartmental problems related to quality and patient care.
4. Establishing a system for identifying and correcting internal and external department problems.

The ideal candidate is expected to possess skills such as:

1. Collaboration and teamwork
2. Communication and problem-solving
3. Initiative and a willingness to learn and grow
4. Organization and technical literacy
5. Flexibility and adaptability

Overall, the role requires a detail-oriented and proactive individual who can effectively communicate and collaborate with others to achieve quality and safety goals.


Text splitting with meta data

Rag with text-splitting and meta data

In [74]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_text(text)

# Wrap each chunk with metadata
texts = [
    {
        "id": i,
        "text": chunk,
        "metadata": {
            "chunk_id": i,
            "start_char": text.find(chunk),
            "end_char": text.find(chunk) + len(chunk),
            "source": "your_file_name_or_path.pdf"
        }
    }
    for i, chunk in enumerate(chunks)
]


Conversion of Chunks with meta data to Embeddings

In [75]:
# for meta data encoding we need to extarct text from dictionary it wont work directly
model = SentenceTransformer('all-MiniLM-L6-v2')
text_contents = [chunk["text"] for chunk in texts]
vectors = model.encode(text_contents)

vector_folder = "vectors"
os.makedirs(vector_folder, exist_ok=True)
output_path = os.path.join(vector_folder, "embeddings.npy")
np.save(output_path, vectors)

For visuals of chunks with meta data

In [None]:
# ✅ VISUALIZE CHUNKS 
print("\n🔍 Sample Chunks and Their Lengths:\n")
for i, chunk in enumerate(texts):
    print(f"Chunk {i+1}:")
    print(chunk)
    print(f"Token Length: {len(chunk['text'].split())} words\n{'-'*40}")



🔍 Sample Chunks and Their Lengths:

Chunk 1:
{'id': 0, 'text': 'Company Policy\nand Procedure\nManual\nTriageLogic, LLC\nInitial Version November 2013\nLast update June 4, 2018\nVersion 6.0\nApproved By:\nCharu G. Raheja, PhD\nChair/CEO\nTriageLogic, LL\nTABLE OF CONTENTS\nNote: this covers:\nhttps://accreditnet.urac.org/Application/3271/Evidence\nCore Standards 1 to 40\nI. Welcome Core 2\n1. About Our Company ............................................................................................................. 5', 'metadata': {'chunk_id': 0, 'start_char': 0, 'end_char': 445, 'source': 'your_file_name_or_path.pdf'}}
Token Length: 49 words
----------------------------------------
Chunk 2:
{'id': 1, 'text': '2. Mission Statement ................................................................................................................... 5\n3. Organizational Structure Core1 ………………………………………… .................. ………6\na. Company Demographic .....................................

Store Embeddings

In [77]:
import faiss

dimension = vectors.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(vectors))

embedding_folder = "embeddings"
os.makedirs(embedding_folder, exist_ok=True)


index_path = os.path.join(embedding_folder, "faiss_index.index")
faiss.write_index(index, index_path)



Query Searching with Reranking chunks with meta data

In [None]:
from sentence_transformers import SentenceTransformer, CrossEncoder
import numpy as np
import faiss

embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
reranker = CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")

query = "who makes the complains?"

query_embedding = embedding_model.encode([query])
D, I = index.search(np.array(query_embedding), k=10)

retrieved_chunks = [texts[i] for i in I[0]]

print("\n🔍 Top 10 Retrieved Chunks (Pre-Rerank):")
for i, chunk in enumerate(retrieved_chunks, 1):
    print(f"\n[{i}] {chunk['text'][:200]}...")  # Access text part only

rerank_inputs = [(query, chunk["text"]) for chunk in retrieved_chunks]
scores = reranker.predict(rerank_inputs)

scored_chunks = list(zip(scores, retrieved_chunks))
scored_chunks.sort(reverse=True, key=lambda x: x[0])  # Sort by score descending

print("\n📊 Reranked Chunks (with Scores):")
for i, (score, chunk) in enumerate(scored_chunks, 1):
    print(f"\n[{i}] Score: {score:.4f}")
    print(f"{chunk['text'][:200]}...")

top_chunks = [chunk["text"] for _, chunk in scored_chunks[:3]]
context = "\n\n".join(top_chunks)

print("\n🧩 Final Chunks Used in Context:")
for i, chunk in enumerate(top_chunks, 1):
    print(f"\n[{i}] {chunk[:200]}...")

rag_prompt = f"""
Use the context below to answer the question.

Context:
{context}

Question:
{query}

Answer:
"""

print("\n🧠 Final RAG Prompt:")
print(rag_prompt)



🔍 Top 10 Retrieved Chunks (Pre-Rerank):

[1] staff.
4. The manager of the non-clinical staff or the nurse manager reviews the complaint, listens to the
call, reviews the note and talks to the agent or nurse involved.
5. Then manager will write a...

[2] is communicated to staff and encourage staff to report quality and safety/risk concerns. They collect
data, report measur the Quality Committee and prepare any other summaries required.
Additionally, ...

[3] Policy #: 13 Most Recent Review: 9/27/16
I. Client/Consumer Complaints
1. All complaints come through the provider (client) who is contracting for services.
2. Each client has a designated client rela...

[4] that they come in contact with as representatives of the Company. If an employee has an equal employment
opportunity related question, problem, or complaint, they must first discuss it with their imme...

[5] committee).
The Medical Director and Sales & Accounts Manager telephones and sets up meetings with clients on
an as ne

In [79]:
response = client.chat.completions.create(
    model="llama-3.1-8b-instant",
    messages=[
        {"role": "system", "content": "You are a helpful assistant using provided context."},
        {"role": "user", "content": rag_prompt}
    ],
    temperature=0.2
)

print(response.choices[0].message.content)

According to the provided context, all complaints come through the provider (client) who is contracting for services.
