# Packages

In [None]:
pip install langchain-openai "unstructured==0.13.7" pypandoc langchain-core langchain-community

In [7]:
from typing import List
from langchain_core.documents.base import Document
from langchain_community.document_loaders import UnstructuredRTFLoader
import os 
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableSequence, RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
import pandas as pd

# Loading the documents 

In [None]:
# Folder containing the RTF documents to classify.
folder_path = "15"

# One langchain Document per successfully loaded file, in file-name order.
langchain_docs = []
# os.listdir() returns entries in arbitrary order; sorting makes the processing
# order (and therefore the row order of any output built from this list)
# reproducible between runs and machines.
for filename in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, filename)
    # Only regular files with an .rtf extension (any letter case) are handed to
    # the RTF loader; sub-directories and stray entries such as .DS_Store or
    # .ipynb_checkpoints are skipped instead of crashing the whole run.
    if not os.path.isfile(file_path) or not filename.lower().endswith(".rtf"):
        continue
    loader = UnstructuredRTFLoader(file_path)
    docs = loader.load()
    print(docs)
    # Guard against a file that yields no documents, which would otherwise
    # raise IndexError on docs[0].
    if not docs:
        print(f"Skipping {file_path}: loader returned no documents")
        continue
    # Only the first Document of each file is kept.
    # NOTE(review): presumably the loader returns exactly one Document per file
    # in its default mode - confirm before relying on this.
    langchain_docs.append(docs[0])

# Classifying the documents 

## Structured outputs - https://python.langchain.com/v0.1/docs/modules/model_io/chat/structured_output/

LLMs really like structure and tend to be more accurate when you "program" them, so now we can get back a class with typed fields that we can use later without having to parse the raw text output.

In [33]:
from typing import Literal

from langchain_core.pydantic_v1 import BaseModel, Field


class Response(BaseModel):
    """Class to parse the output of the LLM"""

    # NOTE(review): the class docstring and the field descriptions below are
    # presumably forwarded to the model as part of the structured-output
    # schema, so their text is deliberately left unchanged - confirm before
    # editing any of it.

    # Classification verdict. The description allows exactly three answers, so
    # the type is restricted to those three strings instead of a free-form
    # `str`; any other value now fails validation rather than silently ending
    # up in the results.
    true_or_false: Literal["True", "False", "Unclear"] = Field(description="""Answer with ‘True’ if it is about this topic or ‘False’ if it is not.If it is about pharmaceutical refusals, but does not meet all three conditions, answer ‘Unclear’.""")
    # Free-text notes the model wants to attach to its verdict.
    additional_information: str = Field(description="Any additional information")

In [34]:
def classify_doc(doc: Document, llm: ChatOpenAI, chat_prompt: ChatPromptTemplate) -> Response:
    """Run one document through the prompt and the model and return the result.

    Args:
        doc: The langchain document to classify; its ``page_content`` is
            inserted into the prompt as ``source``.
        llm: The model (or model pipeline) that receives the rendered prompt.
            The visible caller passes a model wrapped with
            ``with_structured_output(Response)``.
        chat_prompt: Prompt template with ``source`` and ``question``
            placeholders.

    Returns:
        Whatever ``llm`` produces for the rendered prompt. With the structured
        model used by the visible caller this is a ``Response`` carrying:
            - true_or_false: "True", "False" or "Unclear", as instructed by
              the field description on ``Response``.
            - additional_information: any extra context given by the model.
    """
    # The prompt needs two variables. "question" is simply whatever is passed
    # to invoke(); "source" ignores that input and always yields the text of
    # the document being classified.
    prompt_inputs = {
        "question": RunnablePassthrough(),
        "source": lambda _: doc.page_content,
    }
    pipeline = prompt_inputs | chat_prompt | llm
    # No real question is asked: the instructions live in the template, so the
    # question slot is filled with a single space.
    return pipeline.invoke(" ")

In [None]:
# Instructions sent with every document. {source} is replaced by the document
# text and {question} by the (blank) question passed in by classify_doc.
classification_template = """
    We are searching for specific examples of pharmaceutical refusals, or the refusal to fulfill a prescription medication at a pharmacy by a pharmacist based on religious or moral objections. Our current corpus contains news articles or legal cases.
    To qualify as a specific example of a pharmaceutical refusal, the news article or legal case must have all three conditions:
    1. Involve a specific person who was refused a prescription at a pharmacy (name not necessary).
    2. Mention the drug or type of drug that was refused (e.g. emergency contraception, birth control, abortion medication, hormones, HIV medication, etc.).
    3. State that the refusal was based on moral or religious grounds. It can also relate to an alternative conscientious objection.
    Based on these conditions, read each of the attached documents and determine if it mentions specific instances of prescriptions being refused on moral or religious grounds.              
    {source} 
    {question}"""
chat_prompt = ChatPromptTemplate.from_template(classification_template)

# Model wrapped so that its answers come back as `Response` objects.
# NOTE(review): no temperature is set here, so the library default applies;
# consider pinning it if reproducible classifications matter.
openai_model = ChatOpenAI(model="gpt-4o")
structured_llm = openai_model.with_structured_output(Response)

# Smoke test on the first loaded document before running the whole corpus.
response = classify_doc(langchain_docs[0], structured_llm, chat_prompt)
print(response)

In [36]:
# Classify every loaded document, collecting one
# (verdict, extra information, source path) tuple per document.
classified_rows = []
for doc in langchain_docs:
    response = classify_doc(doc, structured_llm, chat_prompt)
    classified_rows.append(
        (
            response.true_or_false,
            response.additional_information,
            doc.metadata["source"],
        )
    )

# Split the tuples into the three parallel lists used to build the results
# table; all three share the order of langchain_docs.
true_or_false = [row[0] for row in classified_rows]
additional_information = [row[1] for row in classified_rows]
document_names = [row[2] for row in classified_rows]

In [37]:
# One row per classified document: its source path, the model's verdict and
# any extra information the model returned.
classification_columns = {
    "document_name": document_names,
    "Pharmacy Refusal true_or_false": true_or_false,
    "additional_information": additional_information,
}
df = pd.DataFrame(classification_columns)
# Written without the index column so the CSV contains only the three fields.
df.to_csv("15_classifications_promptteam1.csv", index=False)