In [1]:
import getpass
import os
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import openai
from langchain.chains import retrieval_qa
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import FAISS
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [17]:
# Accessing Variables
openai_key = os.getenv("OPENAI_API_KEY")
langchain_key = os.getenv("LANGCHAIN_API_KEY")
langchain_tracing = os.getenv("LANGCHAIN_TRACING_V2")

### Converting document to a text file for RAG

In [3]:
# Converting document to a text file to perform RAG on
import pdfplumber

def extract_text_from_pdf(file_path):
    """Extracts text from a PDF and cleans it."""
    text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            # Extract text from each page
            page_text = page.extract_text()
            if page_text:
                text += page_text + "\n\n"
    
    # Perform basic cleaning
    cleaned_text = clean_text(text)
    return cleaned_text

def clean_text(text):
    """Cleans the extracted text for better readability."""
    # Remove extra whitespace and newlines
    text = "\n".join([line.strip() for line in text.splitlines() if line.strip()])
    # Additional cleaning steps can be added here
    return text

def save_to_text_file(text, output_path):
    """Saves the cleaned text to a .txt file."""
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(text)

# Directory containing the PDF files
input_folder = r"C:\Users\justu\OneDrive\Documents\Data_science\Portfolio\OHP LLM\Documents"
output_folder = r"C:\Users\justu\OneDrive\Documents\Data_science\Portfolio\OHP LLM\RAG Docs"

# Create output folder if it doesn't exist
os.makedirs(output_folder, exist_ok=True)

# Process each PDF in the input folder
for file_name in os.listdir(input_folder):
    if file_name.endswith(".pdf"):
        input_pdf = os.path.join(input_folder, file_name)
        output_txt = os.path.join(output_folder, f"{os.path.splitext(file_name)[0]}.txt")
        
        # Extract, clean, and save the text
        extracted_text = extract_text_from_pdf(input_pdf)
        save_to_text_file(extracted_text, output_txt)
        
        print(f"Text successfully extracted and saved to {output_txt}")


Text successfully extracted and saved to C:\Users\justu\OneDrive\Documents\Data_science\Portfolio\OHP LLM\RAG Docs\Benefit-Coverage-Summary.txt
Text successfully extracted and saved to C:\Users\justu\OneDrive\Documents\Data_science\Portfolio\OHP LLM\RAG Docs\New to OHP Doc.txt


In [4]:
# Due to UTF-8 error, cleaning .txt file programatically
def clean_text(text):
    """Remove problematic characters from the text."""
    replacements = {
        "\u2013": "-",  # Replace en dash with hyphen
        "\u2022": "*",  # Replace bullet points with asterisks
        "\u25A0": "",   # Remove black squares
    }
    for original, replacement in replacements.items():
        text = text.replace(original, replacement)
    return text

# Apply cleaning before saving
with open("./RAG Docs/Benefit-Coverage-Summary.txt", "r", encoding="utf-8") as f:
    content = f.read()

cleaned_content = clean_text(content)

# Save back to file
with open("./RAG Docs/Benefit-Coverage-Summary-cleaned.txt", "w", encoding="utf-8") as f:
    f.write(cleaned_content)


### Splitting Text

In [5]:
from langchain_community.document_loaders.text import TextLoader
from langchain_community.document_loaders.directory import DirectoryLoader

loader = DirectoryLoader(
    "./RAG Docs/",
    glob="./*.txt",
    loader_cls=lambda file_path: TextLoader(file_path, encoding="utf-8")
)

documents = loader.load()


In [6]:
# Text Splitting
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 200)
texts = text_splitter.split_documents(documents)


In [7]:
len(texts)

45

In [8]:
texts[6]

Document(metadata={'source': 'RAG Docs\\Benefit-Coverage-Summary-cleaned.txt'}, page_content='Physician/surgeon fees No charge Not covered None\nEmergency room care No charge No charge\nIf you need immediate\nEmergency medical None\nmedical attention No charge No charge\ntransportation\n*For more information about limitations and exceptions, see the plan or policy document at www.oregon.gov/oha/HSD/OHP/Pages/Handbooks.aspx. Page 2 of 5\nWhat You Will Pay\nLimitations, Exceptions, & Other Important\nCommon Medical Event Services You May Need Network Provider Out-of-Network Provider\nInformation\n(You will pay the least) (You will pay the most)\nUrgent care No charge Not covered\nFacility fee (e.g., hospital\nNo charge Not covered None\nIf you have a hospital\nroom)\nstay\nPhysician/surgeon fees No charge Not covered None\nIf you need mental\nOutpatient services No charge Not covered\nhealth, behavioral\nNone\nhealth, or substance\nInpatient services No charge Not covered\nabuse services

### Creating the Database

In [9]:
os.environ['OPENAI_API_KEY'] = "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"
# Embedding & storing text
persist_directory = 'db'

# Using OpenAI embeddings
embedding = OpenAIEmbeddings()

# Vectorizing db
vector_db = Chroma.from_documents(documents = texts,
                                  embedding = embedding,
                                  persist_directory = persist_directory)


In [10]:
vector_db = None

In [11]:
# Loading the vectorized database
vector_db = Chroma(embedding_function= embedding,
                   persist_directory = persist_directory)

  vector_db = Chroma(embedding_function= embedding,


### RAG

In [12]:
retriever = vector_db.as_retriever()
docs = retriever.invoke("What is my benefit coverage?")

In [13]:
docs

[Document(metadata={'source': 'RAG Docs\\Benefit-Coverage-Summary.txt'}, page_content='Summary of Benefits and Coverage: What this Plan Covers & What You Pay for Covered Services Coverage Period: 01/01/2024-12/31/2024\nOregon Health Plan Coverage for: Individual and Family | Plan Type: Coordinated Care Organization\nThe Summary of Benefits and Coverage (SBC) document will help you choose a health plan. The SBC shows you how you and the plan\nwould share the cost for covered health care services. NOTE: Information about the cost of this plan (called the premium) will be provided\nseparately. This is only a summary. For more information about your coverage, or to get a copy of the complete terms of coverage, go\nto https://www.oregon.gov/oha/HSD/OHP/Pages/Splash.aspx or call the Oregon Health Plan at 1-800-273-0557. For information for your CCO, please go\nhere: https://www.oregon.gov/oha/hsd/ohp/pages/coordinated-care-organizations.aspx. For general definitions of common terms, such as 

In [14]:
# Setting the amount of docs returned via rag
retriever = vector_db.as_retriever(search_kwargs = {"k":2})

In [15]:
retriever.search_type

'similarity'

### Making the RAG Chain

In [112]:
llm = ChatOpenAI(model = "gpt-4o-mini")
qa_chain = RetrievalQA.from_chain_type(llm = llm,
                                       chain_type = "stuff",
                                       retriever = retriever,
                                       return_source_documents = True)

In [114]:
# Creating a function to process the llm output
def process_llm_response(llm_response):
    print(llm_response['result'])
    print('\n\n Sources:')
    for source in llm_response['source_documents']:
        print(source.metadata['source'])

In [118]:
query = "What benefits are offered by OHP? What is my copay?"
response = qa_chain(query)
process_llm_response(response)

Most Oregon Health Plan (OHP) members have OHP Plus benefits, which cover a range of medical, dental, and behavioral health care services. Examples of benefits include:

- Medical care, such as checkups, shots, and X-rays
- Mental health services, such as counseling
- Dental care, including cleaning, fluoride, fillings, and extractions
- Birth control and family planning
- Urgent care
- Hospital stays
- Prescriptions
- Physical, occupational, and speech therapy
- Rides to health care appointments
- Vision care for children through age 21 and pregnant women

Regarding copays, the information provided does not specify details about copays for OHP. You may want to check the OHP Handbook or contact them directly at 800-273-0557 for more information on copay amounts.


 Sources:
RAG Docs\New to OHP Doc.txt
RAG Docs\New to OHP Doc.txt
