# Packages

In [None]:
pip install langchain-openai "unstructured==0.13.7" pypandoc langchain-core langchain-community scikit-learn faiss-cpu langchain-text-splitters ipykernel pandas

In [None]:
pip install langchain-ollama ollama

In [10]:
import os 
import chardet
import csv

from typing import List
from langchain_core.documents.base import Document
from langchain_community.document_loaders import UnstructuredRTFLoader, TextLoader
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableSequence, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
import pandas as pd
from sklearn.metrics import accuracy_score
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain_ollama import ChatOllama

## Test with Ollama model (current ollama version is 0.5.4)


#### Useful classes and methods definition

In [30]:
class TestDataRowInfo:
    """"Parses individual rows from the manual label CSV file"""
    def __init__(self, about_pharmacy_refusals, additional_info=""):
        self.about_pharmacy_refusals = about_pharmacy_refusals
        self.additional_info = additional_info 

    def not_labeled(self):
        return self.about_pharmacy_refusals == ""

In [41]:
class ClassificationResponse(BaseModel):
    """Class to parse the output of the LLM"""
    about_pharmaceutical_refusals: str = Field(description="""Answer with Yes’ if it is about this topic or No’ if it is not.If it is about pharmaceutical refusals, but does not meet all three conditions, answer ‘Unclear’.""") #  but does not meet all three conditions, answer ‘Unclear’.
    additional_information: str = Field(description="Any extra context or details about the classification.")

In [42]:
def run_structured_llm(doc: Document, llm: ChatOpenAI, chat_prompt:ChatPromptTemplate) -> ClassificationResponse:
    """Take in a document, use a language model and chat prompt and return a structured response.
    Accepts:
        doc (Document): The langchain document to classify. Contains the text content to be analyzed.
        llm (ChatOpenAI): The language model to use for classification. Configured for structured output.
        chat_prompt (ChatPromptTemplate): The chat prompt template to use for generating the classification question.

    Returns:
        Response: A Structured Response class defined with pydantic, containing:
            - true_or_false (str): "Yes" if the document relates to pharmacy refusals, "No" otherwise.
            - additional_information (str): Any extra context or details about the classification.

    Description:
    This function takes a document, uses a specified language model and chat prompt to analyze
    the document's content, and determines whether it's related to pharmacy refusals.
    The result is returned in a structured format for easy parsing and further processing.
    """
    chatting_chain = RunnableSequence({"question": RunnablePassthrough(), "source": lambda x: doc.page_content} | chat_prompt | llm)
    response = chatting_chain.invoke("")
    return response

In [None]:
classify_chat_prompt = ChatPromptTemplate.from_template(template="""
    We are searching for specific examples of pharmaceutical refusals, or the refusal to fulfill a prescription medication at a pharmacy by a pharmacist based on religious or moral objections. Our current corpus contains news articles or legal cases.
    To qualify as a specific example of a pharmaceutical refusal, the news article or legal case must have all three conditions:
    1. Involve a specific person who was refused a prescription at a pharmacy (name not necessary).
    2. Mention the drug or type of drug that was refused (e.g. emergency contraception, birth control, abortion medication, hormones, HIV medication, etc.).
    3. State that the refusal was based on moral or religious grounds. It can also relate to an alternative conscientious objection.
    Based on these conditions, read each of the attached documents and determine if it mentions specific instances of prescriptions being refused on moral or religious grounds.    
    Answer based on the following document:{source} Do not include any other information in your answer.
    {question}""")

#### Ollama model setup

In [None]:
ollama_model = ChatOllama(model="llama3.2").with_structured_output(ClassificationResponse)

#### Read data from manual label CSV file

In [31]:
relative_manual_label_csv_file_path = "../TestData/QSIDE Pharmacy Refusal Data Label - Web_based_2014-Present1-500byRelevence_cleaned.csv"

current_directory = os.getcwd()
    
full_manual_label_csv_file_path = os.path.join(current_directory, relative_manual_label_csv_file_path)

manual_label_data_by_name = dict()

with open(full_manual_label_csv_file_path, 'r', newline='', encoding='utf-8') as csvfile:
    csv_reader = csv.DictReader(csvfile)
    
    for row in csv_reader:
        document_name = row['Document name'].strip()
        
        about_pharmacy_refusals = row.get('about_pharmacy_refusals', '').strip()
        additional_notes = row.get('Additional notes', '').strip()
        
        test_data_row_info = TestDataRowInfo(
            about_pharmacy_refusals=about_pharmacy_refusals, 
            additional_info=additional_notes
        )
        
        manual_label_data_by_name[document_name] = test_data_row_info

#### Validation process

In [37]:
def get_predicted_label_for_file(file_path, llm_model):
    loader = TextLoader(file_path, encoding='utf-8')
    docs = loader.load()
    response = run_structured_llm(docs[0], llm_model, classify_chat_prompt)

    if not response:
        return None 

    return response.about_pharmaceutical_refusals

In [43]:
folder_paths = ["../FullData/Web_based_2014-Present1-500byRelevence_cleaned"]

result_csv_file_name = "validation_result.csv"

with open(result_csv_file_name, 'w', newline='', encoding='utf-8') as result_csv_file:
    result_csv_file_writer = csv.writer(result_csv_file)
    
    result_csv_file_writer.writerow(['document_name', 'actual_label', 'model_predicted_label'])

    for folder_path in folder_paths:
        for filename in os.listdir(folder_path):
            file_path = os.path.join(folder_path, filename)

            filename_without_extension = os.path.splitext(filename)[0].strip()
        
            # Some file names contain a strange leading apostrophe ('), which for some reasons is omitted when copy to CSV file so this is a "hack" to handle those documents 
            if filename_without_extension[0] == "'":
                filename_without_extension = filename_without_extension[1:].strip()

            actual_data = manual_label_data_by_name[filename_without_extension]
            if actual_data.not_labeled():
                continue 

            actual_label = actual_data.about_pharmacy_refusals
            try:
                model_predicted_label = get_predicted_label_for_file(file_path, ollama_model)
                result_csv_file_writer.writerow([filename_without_extension, actual_label, model_predicted_label])
                
            except Exception as e:
                print(f"Error: {e}")

# Loading the documents 

## Test Documents (for making sure the model is working positive cases)

In [3]:
test_langchain_docs = []
test_document_type = []
folder_path = "/Users/kelley/qdth/TrainTestData/Relevant_Positive"
for filename in os.listdir(folder_path): # Finds every file in the folder
    file_path = os.path.join(folder_path, filename)
    loader = TextLoader(file_path) # Loads a txt file (RTF Loader is always works )
    docs = loader.load()
    test_langchain_docs.append(docs[0])
    test_document_type.append(filename.split(".")[0])

## Magazine articles 


In [4]:
document_type = []
magazine_langchain_docs = []

folder_path = "/Users/kelley/qdth/FullData/Magazines_Journals2014-Present1-500byRelevence_cleaned"

# Loop through each document and load it into a langchain document 
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    loader = TextLoader(file_path)
    docs = loader.load()
    if len(docs[0].page_content) > 0:
        magazine_langchain_docs.append(docs[0])
        document_type.append("Magazine")

In [5]:
folder_path = "/Users/kelley/qdth/FullData/Magazines_Journals2014-Present501-808byRelevence_cleaned"
# Loop through each document and load it into a langchain document 
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    loader = TextLoader(file_path)
    docs = loader.load()
    if len(docs[0].page_content) > 0:
        magazine_langchain_docs.append(docs[0])
        document_type.append("Magazine")

## Newpaper

In [6]:
newspaper_langchain_docs = []
folder_path = "/Users/kelley/qdth/FullData/Magazines_Journals2014-Present1-500byRelevence_cleaned"
# Loop through each document and load it into a langchain document  
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    loader = TextLoader(file_path)
    docs = loader.load()
    if len(docs[0].page_content) > 0:
        newspaper_langchain_docs.append(docs[0])
        document_type.append("Newspaper")

In [7]:
folder_path = "/Users/kelley/qdth/FullData/Newspapers2014-Present501-1000byRelevence_cleaned"
# Loop through each document and load it into a langchain document  
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    loader = TextLoader(file_path)
    docs = loader.load()
    if len(docs[0].page_content) > 0:
        newspaper_langchain_docs.append(docs[0])
        document_type.append("Newspaper")

## News Transcripts

In [8]:
news_transcript_langchain_docs = []
folder_path = "/Users/kelley/qdth/FullData/NewsTranscripts2014-Present1-500byRelevence_cleaned"
# Loop through each document and load it into a langchain document  
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    loader = TextLoader(file_path)
    docs = loader.load()
    if len(docs[0].page_content) > 0:
        news_transcript_langchain_docs.append(docs[0])
        document_type.append("News Transcript")

In [9]:
folder_path = "/Users/kelley/qdth/FullData/NewsTranscripts2014-Present501-1000byRelevence_cleaned"
# Loop through each document and load it into a langchain document  
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    loader = TextLoader(file_path)
    docs = loader.load()
    if len(docs[0].page_content) > 0:
        news_transcript_langchain_docs.append(docs[0])
        document_type.append("News Transcript")
# Related with example 

## Newswires

In [10]:
newswire_langchain_docs = []
folder_path = "/Users/kelley/qdth/FullData/Newswires_Press_Releases2014-Present1-500byRelevence_cleaned"
# Loop through each document and load it into a langchain document  
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    loader = TextLoader(file_path)
    docs = loader.load()
    if len(docs[0].page_content) > 0:
        newswire_langchain_docs.append(docs[0])
        document_type.append("Newswire")

In [11]:
folder_path = "/Users/kelley/qdth/FullData/Newswires_Press_Releases2014-Present501-1000byRelevence_cleaned-1"
# Loop through each document and load it into a langchain document  
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    loader = TextLoader(file_path)
    docs = loader.load()
    if len(docs[0].page_content) > 0:
        newswire_langchain_docs.append(docs[0])
        document_type.append("Newswire")

## Web Based

In [12]:
web_langchain_docs = []
folder_path = "/Users/kelley/qdth/FullData/Web_based_2014-Present1-500byRelevence_cleaned"
# Loop through each document and load it into a langchain document  
for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    loader = TextLoader(file_path)
    docs = loader.load()
    if len(docs[0].page_content) > 0:
        web_langchain_docs.append(docs[0])
        document_type.append("Web Based")

In [13]:
langchain_docs = magazine_langchain_docs + newspaper_langchain_docs + news_transcript_langchain_docs + newswire_langchain_docs + web_langchain_docs
print(len(langchain_docs))
print(len(document_type))

3828
3828


In [31]:
filtered_langchain_docs = langchain_docs[:400]
filtered_document_type = document_type[:400]

# Embeddings for Retrieval Augmented Generation (RAG) Testing

In [15]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-large", openai_api_key=os.getenv("OPENAI_API_KEY"))

# Classifying the documents 

## Structured outputs - https://python.langchain.com/v0.1/docs/modules/model_io/chat/structured_output/

LLMs really like structure and tend to be more accurate when you "program" them, so know we can get back a class with variables that we can use later without parsing the output. 

## Rag Chain for Classification

In [17]:
def split_into_chunks(doc: Document, chunk_size: int = 200, chunk_overlap: int = 40) -> List[Document]:
    """Take in a document, split it into chunks of a specified size and overlap."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = text_splitter.split_documents([doc])
    return chunks

In [18]:
def run_rag_chain(doc: Document, llm: ChatOpenAI, embeddings: OpenAIEmbeddings, chat_prompt:ChatPromptTemplate) -> ClassificationResponse:
    """Take in a document, use a language model, embeddings, and chat prompt and return a retrieval based classification response.
    Accepts:
        doc (Document): The langchain document to classify. Contains the text content to be analyzed.
        llm (ChatOpenAI): The language model to use for classification. Configured for structured output.
        chat_prompt (ChatPromptTemplate): The chat prompt template to use for generating the classification question.
        embeddings (OpenAIEmbeddings): The embeddings to use for the vector store.
    Returns:
        Response: A Structured Response class defined with pydantic, containing:
            - true_or_false (str): "True" if the document relates to pharmacy refusals, "False" otherwise.
            - additional_information (str): Any extra context or details about the classification.

    Description:
    This function takes a document, uses semantic search and keyword search to classify the document, and returns a structured response.
    """
    print(doc.page_content)
    chunks = split_into_chunks(doc)
    vector_store = FAISS.from_documents(chunks, embeddings) # for semantic search (Vector Search)
    vector_store.k = 10
    vector_retriever = vector_store.as_retriever()
    rag_chain = RunnableSequence({"question": RunnablePassthrough(), "source": vector_retriever} | chat_prompt | llm)
    response = rag_chain.invoke(f"")
    return response

## Run the model for a single document

In [None]:
openai_model = ChatOpenAI(model = "gpt-4o", api_key= os.getenv("OPENAI_API_KEY"))
classify_structured_llm = openai_model.with_structured_output(ClassificationResponse)
response = run_structured_llm(test_langchain_docs[0], classify_structured_llm, classify_chat_prompt)
print(response)

In [22]:
predicted_pharmaceutical_refusals = []
document_names = []
for i, doc in enumerate(filtered_langchain_docs):
    print(len(doc.page_content))
    response = run_structured_llm(doc, classify_structured_llm, classify_chat_prompt)
    predicted_pharmaceutical_refusals.append(response.about_pharmaceutical_refusals)
    document_names.append(f'{doc.metadata["source"].split("/")[-2]}/{doc.metadata["source"].split("/")[-1]}')

44155
32268
29092


In [23]:
classify_df = pd.DataFrame({"document_name": document_names, "document_type": filtered_document_type, "Pharmaceutical Refusal": predicted_pharmaceutical_refusals})
classify_df

Unnamed: 0,document_name,document_type,Pharmaceutical Refusal
0,Relevant_Positive/Michigan Pharmacist Refused ...,Michigan Pharmacist Refused to Dispense Miscar...,True
1,Relevant_Positive/Morning-after pill denied; s...,Morning-after pill denied; suit follows,True
2,Relevant_Positive/San Diego woman says CVS pha...,San Diego woman says CVS pharmacist refused to...,Unclear


# Extracting the information from the documents 

## Structured Output Blueprint  


In [24]:
class ExtractionResponse(BaseModel):
    """Class to parse the output of the LLM"""
    date: str = Field(description="The date when the incident occurred. Only list the date if it refers to when a specific pharmacist refused a prescription, not legal case timelines or rulings, or the date the article was published or uploaded. Answer with None if not mentioned.")
    location: str = Field(description="The state, city, or county where a specific pharmacist refused a prescription. Answer with None if not mentioned.")
    pharmacy_name: str = Field(description="The pharmacy that originally refused the medication. Answer with None if not mentioned.")
    drug_or_classification: str = Field(description="The drug, item, or broad drug category that was refused. Answer with None if not mentioned.")
    patient_name: str = Field(description="The name of the patient who was refused medication. Answer with None if not mentioned.")
    patient_demographics: str = Field(description="The demographics of the patient (e.g. Age, Race, Gender, Sexuality, etc.). Answer with None if not mentioned.")
    refusal_reason: str = Field(description="The reason the pharmacist refused to provide the desired medication. Answer with None if not mentioned.")
    patient_outcome: str = Field(description="The outcome for the patient. Did they eventually receive the drug? If yes, indicate if it was the same pharmacy or a different one. Answer with None if not mentioned.")
    pharmacist_outcome: str = Field(description="The outcome for the pharmacist. Was legal action brought against the pharmacist or pharmacy, and if so, what was the result? Answer with None if not mentioned.")
    news_source: str = Field(description="Where the story was reported (name of newspaper, publication, headline, and date published). Answer with None if not mentioned.")
    additional_information: str = Field(description="Any important additional information about the refusal.")

In [25]:
extraction_structured_llm = openai_model.with_structured_output(ExtractionResponse)
extraction_chat_prompt = ChatPromptTemplate.from_template(template="""Answer the following questions based on the following document:{source}. From the document, which clarifies specific instances of pharmaceutical refusals based upon moral or religious grounds, extract the following information. If the information is not available, return ‘None’""")

## Run the extraction process for all pharmaceutical refusals documents

In [26]:
extraction_responses = []
date = []
location = []
pharmacy_name = []
drug_or_classification = []
patient_name = []
patient_demographics = []
refusal_reason = []
patient_outcome = []
pharmacist_outcome = []
news_source = []
additional_information = []
for i, doc in enumerate(filtered_langchain_docs):
    bool = predicted_pharmaceutical_refusals[i]
    if bool == "True":
        response = run_structured_llm(doc, extraction_structured_llm, extraction_chat_prompt)
        date.append(response.date)
        location.append(response.location)
        pharmacy_name.append(response.pharmacy_name)
        drug_or_classification.append(response.drug_or_classification)
        patient_name.append(response.patient_name)
        patient_demographics.append(response.patient_demographics)
        refusal_reason.append(response.refusal_reason)
        patient_outcome.append(response.patient_outcome)
        pharmacist_outcome.append(response.pharmacist_outcome)
        news_source.append(response.news_source)
        additional_information.append(response.additional_information)
    else:
        date.append("None")
        location.append("None")
        pharmacy_name.append("None")
        drug_or_classification.append("None")
        patient_name.append("None")
        patient_demographics.append("None")
        refusal_reason.append("None")
        patient_outcome.append("None")
        pharmacist_outcome.append("None")
        news_source.append("None")
        additional_information.append("None")

## Saving info to a dataframe and exporting it 

In [28]:
extraction_df = pd.DataFrame({
    "date": date,
    "location": location,
    "pharmacy_name": pharmacy_name,
    "drug_or_classification": drug_or_classification,
    "patient_name": patient_name,
    "patient_demographics": patient_demographics,
    "refusal_reason": refusal_reason,
    "patient_outcome": patient_outcome,
    "pharmacist_outcome": pharmacist_outcome,
    "news_source": news_source,
    "additional_information": additional_information
})
# Add new columns from the earlier DataFrame
final_df = pd.concat([classify_df, extraction_df], axis=1)

In [29]:
final_df.to_csv("Classification_and_Extraction_Final_400_Documents.csv", index=False)
## Filtering for only the pharmaceutical refusals 

In [None]:
final_df[final_df['Pharmaceutical Refusal'] == "True"]