# Packages

In [None]:
pip install langchain-openai "unstructured==0.13.7" pypandoc langchain-core langchain-community scikit-learn rank-bm25 faiss-cpu langchain-text-splitters

In [None]:
from typing import List
from langchain_core.documents.base import Document
from langchain_community.document_loaders import UnstructuredRTFLoader
import os 
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableSequence, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
import pandas as pd
from sklearn.metrics import accuracy_score
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Loading the documents 

In [3]:
folder_path = "Unrelated"

unrelated_langchain_docs = []
unrelated_labels = []
unrelated_example_labels = []
# Load every file in the folder as a LangChain document and record its
# ground-truth labels. The listing is sorted because os.listdir returns entries
# in arbitrary order, which would otherwise make the document (and output CSV
# row) order differ between runs and machines.
for filename in sorted(os.listdir(folder_path)):
    loader = UnstructuredRTFLoader(os.path.join(folder_path, filename))
    docs = loader.load()
    # Skip files for which the loader produced no document or no text.
    if docs and docs[0].page_content:
        unrelated_langchain_docs.append(docs[0])
        unrelated_labels.append("Not Relevant")
        unrelated_example_labels.append("Does not contain example")




In [None]:
# Bare expression: in a notebook cell this displays the loaded "Unrelated" documents.
unrelated_langchain_docs

In [None]:
folder_path = "Related-withExample"

related_with_example_langchain_docs = []
related_with_example_labels = []
related_with_example_examples = []
# Load every file in the folder as a LangChain document and record its
# ground-truth labels. The listing is sorted because os.listdir returns entries
# in arbitrary order, which would otherwise make the document (and output CSV
# row) order differ between runs and machines.
for filename in sorted(os.listdir(folder_path)):
    loader = UnstructuredRTFLoader(os.path.join(folder_path, filename))
    docs = loader.load()
    # Skip files for which the loader produced no document or no text.
    if docs and docs[0].page_content:
        print(docs[0])
        related_with_example_langchain_docs.append(docs[0])
        related_with_example_labels.append("Relevant")
        related_with_example_examples.append("Contains example")

In [6]:
folder_path = "Related-withoutExample"

related_no_example_langchain_docs = []
related_no_example_labels = []
related_no_example_examples = []
# Load every file in the folder as a LangChain document and record its
# ground-truth labels. The listing is sorted because os.listdir returns entries
# in arbitrary order, which would otherwise make the document (and output CSV
# row) order differ between runs and machines.
for filename in sorted(os.listdir(folder_path)):
    loader = UnstructuredRTFLoader(os.path.join(folder_path, filename))
    docs = loader.load()
    # Skip files for which the loader produced no document or no text.
    if docs and docs[0].page_content:
        related_no_example_langchain_docs.append(docs[0])
        related_no_example_labels.append("Relevant")
        related_no_example_examples.append("Does not contain example")


In [None]:
# Merge the three folders in one fixed order (unrelated, related with example,
# related without example) so documents and both label lists stay index-aligned.
langchain_docs = [
    *unrelated_langchain_docs,
    *related_with_example_langchain_docs,
    *related_no_example_langchain_docs,
]
related_labels_truth = [
    *unrelated_labels,
    *related_with_example_labels,
    *related_no_example_labels,
]
example_labels_truth = [
    *unrelated_example_labels,
    *related_with_example_examples,
    *related_no_example_examples,
]
# Echo the combined lists for a visual check.
for combined in (langchain_docs, related_labels_truth, example_labels_truth):
    print(combined)

# Embeddings for Retrieval Augmented Generation (RAG) Testing

In [8]:
# Embedding model passed to the retrieval (RAG) chain further down.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
# Build a FAISS index over every loaded document (whole documents, not chunks).
# NOTE(review): run_rag_chain builds its own per-document store, so this
# corpus-wide index looks unused in the visible cells — confirm before relying on it.
vector_store = FAISS.from_documents(langchain_docs, embeddings)

# Classifying the documents 

## Structured outputs - https://python.langchain.com/v0.1/docs/modules/model_io/chat/structured_output/

LLMs tend to be more accurate when you give them structure ("program" them), so now we can get back a class whose fields we can use later without parsing the raw output ourselves.

In [9]:
from langchain_core.pydantic_v1 import BaseModel, Field


# Structured-output schema for the two-part classification: a relevance label
# and a "contains a specific example" label, both returned as plain strings.
# NOTE(review): this class is handed to `with_structured_output`, so the class
# docstring and the Field descriptions are presumably sent to the model as part
# of the schema — treat them as prompt text and confirm before rewording them.
class ClassificationResponse(BaseModel):
    """Class to parse the output of the LLM"""
    # Relevance label; later compared verbatim with the ground-truth strings
    # "Relevant" / "Not Relevant".
    relevant_or_not: str = Field(description="""Answer with ‘Relevant’ if it is about this topic or ‘Not Relevant’ if it is not.""") # NOTE(review): a leftover fragment here read "but does not meet all three conditions, answer ‘Unclear’." — no ‘Unclear’ option exists in the current description.
    # Example label; later compared verbatim with "Contains example" /
    # "Does not contain example".
    example_or_not: str = Field(description="Answer with ‘Contains example’ if it meets all three criteria or ‘Does not contain example’ if it does not.   ")


In [10]:
def split_into_chunks(doc: Document, chunk_size: int = 200, chunk_overlap: int = 40) -> List[Document]:
    """Split a single document into overlapping chunks.

    Accepts:
        doc (Document): The langchain document to split.
        chunk_size (int): Chunk size forwarded to RecursiveCharacterTextSplitter.
        chunk_overlap (int): Overlap between consecutive chunks, forwarded to
            RecursiveCharacterTextSplitter.

    Returns:
        List[Document]: The chunks produced by the splitter for ``doc``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents([doc])

In [11]:
def run_rag_chain(doc: Document, llm: ChatOpenAI, embeddings: OpenAIEmbeddings, chat_prompt:ChatPromptTemplate, question: str = "", k: int = 5) -> ClassificationResponse:
    """Classify a document from its most relevant chunks instead of its full text.

    Accepts:
        doc (Document): The langchain document to classify.
        llm (ChatOpenAI): The model that ends the chain; expected to be configured for
            structured output (e.g. via ``with_structured_output``).
        embeddings (OpenAIEmbeddings): The embeddings used to index the document's chunks.
        chat_prompt (ChatPromptTemplate): The prompt template; it is fed the keys
            ``question`` and ``source``.
        question (str): Text used both as the retrieval query and as the ``question``
            prompt variable. Defaults to the empty string, which is what this function
            always sent before the parameter existed.
        k (int): Number of chunks the retriever returns. Defaults to 5.

    Returns:
        Whatever ``llm`` returns. With the classification model this is a
        ClassificationResponse containing:
            - relevant_or_not (str): 'Relevant' or 'Not Relevant'.
            - example_or_not (str): 'Contains example' or 'Does not contain example'.

    Description:
    The document is split into chunks, the chunks are indexed in a FAISS vector store built
    for this call only, and the chunks returned by a semantic (vector) search for ``question``
    are passed to the prompt as ``source``. No keyword search is involved.
    NOTE(review): with the default empty ``question`` the similarity search has no meaningful
    query to rank against; pass a real query to get targeted retrieval.
    """
    chunks = split_into_chunks(doc)
    # Per-document index used for semantic (vector) search.
    vector_store = FAISS.from_documents(chunks, embeddings)
    # The number of results has to be passed through search_kwargs; assigning
    # `vector_store.k` (as this function used to) is ignored by the retriever.
    vector_retriever = vector_store.as_retriever(search_kwargs={"k": k})
    # The same input string feeds both the prompt's {question} and the retriever's query.
    chatting_chain = RunnableSequence({"question": RunnablePassthrough(), "source": vector_retriever} | chat_prompt | llm)
    return chatting_chain.invoke(question)

In [12]:
def run_model(doc: Document, llm: ChatOpenAI, chat_prompt:ChatPromptTemplate) -> BaseModel:
    """Run a prompt over a document's full text and return the model's structured response.

    Accepts:
        doc (Document): The langchain document to analyze; its ``page_content`` is inserted
            into the prompt as ``source``.
        llm (ChatOpenAI): The model that ends the chain; expected to be configured for
            structured output (e.g. via ``with_structured_output``).
        chat_prompt (ChatPromptTemplate): The prompt template; it is fed the keys
            ``question`` (always the empty string) and ``source``.

    Returns:
        Whatever ``llm`` returns: the pydantic object of the schema the model was configured
        with. In this notebook that is a ClassificationResponse (``relevant_or_not``,
        ``example_or_not``) for the classification model and an ExtractionResponse for the
        extraction model.

    Description:
    Unlike the retrieval-based chain, no chunking or vector search happens here; the whole
    document text is sent to the model in a single call.
    """
    chatting_chain = RunnableSequence(
        {
            "question": RunnablePassthrough(),
            # The chain input is ignored; the source is always this document's full text.
            "source": lambda _: doc.page_content,
        }
        | chat_prompt
        | llm
    )
    return chatting_chain.invoke("")

In [13]:
# Prompt for the two-step classification (relevance, then specific-example check).
# `{source}` and `{question}` are the template variables filled in by the chains
# that use this prompt.
classify_chat_prompt = ChatPromptTemplate.from_template(template="""
We are searching for specific examples of pharmaceutical refusals, or the refusal to fulfill a prescription medication at a pharmacy by a pharmacist based on religious or moral objections. Our current corpus contains news articles or legal cases.
Read the each of  the attached documents and determine the following:
Determine if the article is about pharmaceutical refusals. Answer with ‘Relevant’ if it is about this topic or ‘Not Relevant’ if it is not.
For the articles marked as ‘Relevant’, determine whether the article talks about a specific example of a pharmaceutical refusal. To qualify as a specific example of a pharmaceutical refusal, the news article or legal case must have all three conditions:
1. Involve a specific person who was refused a prescription at a pharmacy (name not necessary).
2. Mention the drug or type of drug that was refused (e.g. emergency contraception, birth control, abortion medication, hormones, HIV medication, etc.).
3. State that the refusal was based on moral or religious grounds. It can also relate to an alternative conscientious objection.  
    Answer based on the following document:{source} Do not include any other information in your answer.
    {question}""")
# Base chat model; it is wrapped for structured output here and again for extraction.
openai_model = ChatOpenAI(model = "gpt-4o")
# Model variant whose replies follow the ClassificationResponse schema.
classify_structured_llm = openai_model.with_structured_output(ClassificationResponse)
# Smoke test: classify the first document through the retrieval-based chain.
response = run_rag_chain(langchain_docs[0], classify_structured_llm, embeddings, classify_chat_prompt)
print(response)

relevant_or_not='Not Relevant' example_or_not='Does not contain example'


In [None]:
# Classify every document from its full text and collect the two predicted
# labels together with the file each document was loaded from.
relevant_predicted, example_predicted, document_names = [], [], []
for doc in langchain_docs:
    # Print the document length before each model call.
    print(len(doc.page_content))
    response = run_model(doc, classify_structured_llm, classify_chat_prompt)
    relevant_predicted += [response.relevant_or_not]
    example_predicted += [response.example_or_not]
    document_names += [doc.metadata["source"]]

In [15]:
# One row per document: predicted vs. ground-truth label for both tasks.
classify_df = pd.DataFrame(
    {
        "document_name": document_names,
        "Predicted Relevance": relevant_predicted,
        "Actual Relevance": related_labels_truth,
        "Predicted Example": example_predicted,
        "Actual Example": example_labels_truth,
    }
)

In [16]:
# Share of documents whose predicted relevance label equals the ground truth.
accuracy = accuracy_score(
    y_true=related_labels_truth,
    y_pred=relevant_predicted,
)
print(f"Accuracy: {accuracy:.2%}")

Accuracy: 68.18%


In [17]:
# Share of documents whose predicted example label equals the ground truth.
accuracy = accuracy_score(
    y_true=example_labels_truth,
    y_pred=example_predicted,
)
print(f"Accuracy: {accuracy:.2%}")

Accuracy: 90.91%


In [18]:
# Persist the per-document classification results (no index column).
classification_csv_path = "Train_Test_classification_gpt4o_relevant_and_example1.csv"
classify_df.to_csv(classification_csv_path, index=False)

# Extracting the information from the documents 

In [19]:
# Structured-output schema for the extraction step: one string field per fact to
# pull out of a document. Most descriptions tell the model to answer "None"
# when the fact is not mentioned.
# NOTE(review): this class is handed to `with_structured_output`, so the class
# docstring and the Field descriptions are presumably sent to the model as part
# of the schema — treat them as prompt text and confirm before rewording them.
class ExtractionResponse(BaseModel):
    """Class to parse the output of the LLM"""
    # When and where the refusal itself happened (not publication or ruling dates).
    date: str = Field(description="The date when the incident occurred. Only list the date if it refers to when a specific pharmacist refused a prescription, not legal case timelines or rulings, or the date the article was published or uploaded. Answer with None if not mentioned.")
    location: str = Field(description="The state, city, or county where a specific pharmacist refused a prescription. Answer with None if not mentioned.")
    # What was refused, and by which pharmacy.
    pharmacy_name: str = Field(description="The pharmacy that originally refused the medication. Answer with None if not mentioned.")
    drug_or_classification: str = Field(description="The drug, item, or broad drug category that was refused. Answer with None if not mentioned.")
    # Who was refused.
    patient_name: str = Field(description="The name of the patient who was refused medication. Answer with None if not mentioned.")
    patient_demographics: str = Field(description="The demographics of the patient (e.g. Age, Race, Gender, Sexuality, etc.). Answer with None if not mentioned.")
    # Why the refusal happened and what followed for each party.
    refusal_reason: str = Field(description="The reason the pharmacist refused to provide the desired medication. Answer with None if not mentioned.")
    patient_outcome: str = Field(description="The outcome for the patient. Did they eventually receive the drug? If yes, indicate if it was the same pharmacy or a different one. Answer with None if not mentioned.")
    pharmacist_outcome: str = Field(description="The outcome for the pharmacist. Was legal action brought against the pharmacist or pharmacy, and if so, what was the result? Answer with None if not mentioned.")
    # Provenance of the story and a free-text catch-all.
    news_source: str = Field(description="Where the story was reported (name of newspaper, publication, headline, and date published). Answer with None if not mentioned.")
    additional_information: str = Field(description="Any important additional information about the refusal.")

In [20]:
# Model variant whose replies follow the ExtractionResponse schema.
extraction_structured_llm = openai_model.with_structured_output(ExtractionResponse)
# Extraction prompt; `{source}` is its only template variable (there is no `{question}` slot).
extraction_chat_prompt = ChatPromptTemplate.from_template(template="""Answer the following questions based on the following document:{source}. From the document, which clarifies specific instances of pharmaceutical refusals based upon moral or religious grounds, extract the following information. If the information is not available, return ‘None’""")

In [21]:
# NOTE(review): extraction_responses is never populated in the visible cells;
# it is kept so that any later reference to the name still resolves.
extraction_responses = []
# One result list per extracted field. Each ends up with exactly one entry per
# document in langchain_docs, so the lists can be used directly as DataFrame columns.
date = []
location = []
pharmacy_name = []
drug_or_classification = []
patient_name = []
patient_demographics = []
refusal_reason = []
patient_outcome = []
pharmacist_outcome = []
news_source = []
additional_information = []

# ExtractionResponse attribute name -> list that collects that attribute's values.
extraction_columns = {
    "date": date,
    "location": location,
    "pharmacy_name": pharmacy_name,
    "drug_or_classification": drug_or_classification,
    "patient_name": patient_name,
    "patient_demographics": patient_demographics,
    "refusal_reason": refusal_reason,
    "patient_outcome": patient_outcome,
    "pharmacist_outcome": pharmacist_outcome,
    "news_source": news_source,
    "additional_information": additional_information,
}
# Placeholder stored for documents that are not sent to the extraction model;
# it is the same string the field descriptions ask the model to use for missing data.
NOT_EXTRACTED = "None"

# The label is bound to its own name rather than `bool`, which would shadow the
# builtin for every later notebook cell.
for doc, example_label in zip(langchain_docs, example_predicted):
    if example_label == "Contains example":
        # Only documents predicted to contain a specific example are extracted.
        response = run_model(doc, extraction_structured_llm, extraction_chat_prompt)
        for field_name, values in extraction_columns.items():
            values.append(getattr(response, field_name))
    else:
        # Keep every column aligned with langchain_docs by padding skipped documents.
        for values in extraction_columns.values():
            values.append(NOT_EXTRACTED)

In [22]:
# Put the extracted fields in their own frame, then attach them column-wise to
# the classification results.
extracted_fields_df = pd.DataFrame(
    {
        "date": date,
        "location": location,
        "pharmacy_name": pharmacy_name,
        "drug_or_classification": drug_or_classification,
        "patient_name": patient_name,
        "patient_demographics": patient_demographics,
        "refusal_reason": refusal_reason,
        "patient_outcome": patient_outcome,
        "pharmacist_outcome": pharmacist_outcome,
        "news_source": news_source,
        "additional_information": additional_information,
    }
)
extraction_df = pd.concat([classify_df, extracted_fields_df], axis=1)

In [23]:
# Persist the combined classification + extraction table (no index column).
extraction_csv_path = "TT_classification_and_extraction_gpt4o_prompt2_relevant_and_example.csv"
extraction_df.to_csv(extraction_csv_path, index=False)
