# Advanced Patient Review Theme Categorization

This notebook demonstrates advanced theme categorization using multiple LLM APIs, LangChain, and various prompt engineering techniques.

In [None]:
!pip install openai groq pandas tqdm langchain python-dotenv

In [None]:
import os
import pandas as pd
from typing import Dict, Any, List
import json
from tqdm import tqdm
import time
from dotenv import load_dotenv
from langchain.chat_models import ChatOpenAI, ChatGroq
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain.chains import LLMChain
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field
from typing import List

# Load environment variables from a local .env file, if one is present (python-dotenv).
load_dotenv()

In [None]:
# Closed vocabulary of themes a review may be tagged with; the list is joined
# into the system prompt, so order here is the order the model sees.
KEY_THEMES = [
    "general comment",
    "laboratory",
    "discharge",
    "parking/transport",
    "infection prevention & control",
    "emergency",
    "medication/prescription",
    "access/coord of care",
    "unknown",
    "social services",
    "respect to patient",
    "admit/registration",
    "nurse/nurse aide",
    "continuity/transition",
    "icu/ccu",
    "religion",
    "housekeeping/room",
    "families/friends",
    "emotional support",
    "physical comfort",
    "information/education",
    "billing/accounting",
    "cardiology",
    "dietary/service",
    "radiology",
    "positive recognition",
]

In [None]:
# Structured LLM output for one patient review.
# The reporting code in this notebook reads the 'theme' and 'description' keys of
# every entry, so the field description spells those keys out instead of leaving
# the dict shape up to the model.
class ThemeResponse(BaseModel):
    themes: List[Dict[str, str]] = Field(
        description=(
            "List of identified themes with descriptions. Each item is an object "
            "with exactly two string keys: 'theme' (the theme name, taken from the "
            "provided list) and 'description' (a brief explanation of why the "
            "theme applies)."
        )
    )

# Shared parser: provides the format instructions inserted into the prompt and
# is attached to each chain as its output parser.
parser = PydanticOutputParser(pydantic_object=ThemeResponse)

In [None]:
# API credentials are read from the environment (load_dotenv() has already run,
# so values from a local .env file are included) rather than typed into the notebook.
# An unset variable falls back to "" so both names are always strings.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY", "")
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")

In [None]:
def create_prompt_templates():
    """Build the two-message chat prompt used for theme categorization.

    The system message carries the analyst instructions and expects the
    ``themes`` and ``format_instructions`` variables; the human message expects
    ``review``.

    Returns:
        A ``ChatPromptTemplate`` made of the system message followed by the
        human message.
    """
    # Spelled out line by line so the whitespace inside the prompt is explicit.
    system_text = (
        "You are an expert healthcare analyst specializing in patient experience analysis. \n"
        "    Your task is to analyze patient reviews and identify key themes from the following list: {themes}\n"
        "    \n"
        "    Guidelines:\n"
        "    1. Multiple themes may be present in a single review\n"
        "    2. If no theme matches, use 'unknown'\n"
        "    3. Provide a brief description for each identified theme\n"
        "    4. Consider both explicit and implicit mentions of themes\n"
        "    5. Focus on patient experience and satisfaction aspects\n"
        "    \n"
        "    {format_instructions}"
    )

    human_text = (
        "Analyze this patient review and identify relevant themes:\n"
        "    \n"
        "    {review}"
    )

    messages = [
        SystemMessagePromptTemplate.from_template(system_text),
        HumanMessagePromptTemplate.from_template(human_text),
    ]
    return ChatPromptTemplate.from_messages(messages)

In [None]:
def initialize_llm_chains(
    temperature: float = 0.7,
    openai_model: str = "gpt-3.5-turbo",
    groq_model: str = "llama-3.1-70b-versatile",
):
    """Create one LLMChain per provider, sharing the same prompt and output parser.

    Args:
        temperature: Sampling temperature passed to both chat models.
        openai_model: Model name passed to ``ChatOpenAI``.
        groq_model: Model name passed to ``ChatGroq``.

    Returns:
        A tuple ``(openai_chain, groq_chain)``.
    """
    # NOTE(review): provider model IDs get retired over time; confirm the
    # defaults above are still served before relying on them.
    prompt_template = create_prompt_templates()

    llms = (
        ChatOpenAI(
            temperature=temperature,
            model_name=openai_model,
            openai_api_key=OPENAI_API_KEY,
        ),
        ChatGroq(
            temperature=temperature,
            model_name=groq_model,
            groq_api_key=GROQ_API_KEY,
        ),
    )

    # Both chains are wired identically; only the underlying chat model differs.
    openai_chain, groq_chain = (
        LLMChain(llm=llm, prompt=prompt_template, output_parser=parser)
        for llm in llms
    )

    return openai_chain, groq_chain

In [None]:
def load_reviews(file_path: str, limit: int = None) -> List[str]:
    """Read review texts from the ``review_text`` column of a CSV file.

    Args:
        file_path: Path to the CSV file.
        limit: Maximum number of reviews to return. ``None`` (the default)
            returns every review; ``0`` returns an empty list.

    Returns:
        The review texts as strings, in file order. Rows whose ``review_text``
        cell is empty are skipped.

    Raises:
        ValueError: If the CSV has no ``review_text`` column.
    """
    df = pd.read_csv(file_path)
    if 'review_text' not in df.columns:
        raise ValueError("CSV file must contain a 'review_text' column")
    # Drop missing cells and coerce to str so callers really get List[str]
    # (pandas would otherwise hand back float NaN / numeric values).
    reviews = df['review_text'].dropna().astype(str).tolist()
    # Compare against None explicitly: a truthiness test would treat limit=0
    # as "no limit" and return everything.
    if limit is None:
        return reviews
    return reviews[:limit]

In [None]:
def process_reviews(reviews: List[str], chain: LLMChain, delay: float = 1.0) -> List[Dict[str, Any]]:
    """Run every review through ``chain`` and collect the parsed theme dicts.

    Args:
        reviews: Review texts to categorize.
        chain: Chain invoked once per review with the ``review``, ``themes``
            and ``format_instructions`` prompt variables.
        delay: Seconds to pause after each review (successful or not) as a
            simple rate limit. Values <= 0 disable the pause.

    Returns:
        One dict per review, in input order. A review whose call or parsing
        fails is reported on stdout and recorded as ``{"themes": []}`` so the
        output stays aligned with ``reviews``.
    """
    # These prompt inputs are identical for every review, so build them once.
    themes_text = ", ".join(KEY_THEMES)
    format_instructions = parser.get_format_instructions()

    results = []
    for review in tqdm(reviews, desc="Processing reviews"):
        try:
            result = chain.run(
                review=review,
                themes=themes_text,
                format_instructions=format_instructions
            )
            # If the chain hands back raw text instead of a parsed object,
            # parse it here so .dict() is always called on a ThemeResponse.
            if isinstance(result, str):
                result = parser.parse(result)
            results.append(result.dict())
        except Exception as e:
            # Deliberate best-effort: keep going and keep positions aligned.
            print(f"Error processing review: {e}")
            results.append({"themes": []})
        finally:
            # Throttle after failures too; an error is often a rate-limit
            # response, and retrying immediately would make that worse.
            if delay > 0:
                time.sleep(delay)
    return results

In [None]:
def _print_theme_list(label: str, result: Dict[str, Any]) -> None:
    """Print the themes of one model's result under a '<label> Analysis:' heading."""
    print(f"\n{label} Analysis:")
    for theme in result.get('themes', []):
        # The model output is only typed as Dict[str, str], so either key may
        # be absent; fall back to placeholders instead of raising KeyError.
        name = theme.get('theme', 'unknown')
        description = theme.get('description', '(no description)')
        print(f"- {name}: {description}")


def analyze_results(openai_results: List[Dict[str, Any]], groq_results: List[Dict[str, Any]], reviews: List[str]):
    """Print a per-review, side-by-side listing of the themes from both models.

    Args:
        openai_results: One result dict per review from the OpenAI chain.
        groq_results: One result dict per review from the Groq chain.
        reviews: The review texts, in the same order as the results.

    The three sequences are walked in parallel with ``zip``, so output stops at
    the shortest one.
    """
    print("\nDetailed Analysis:")
    for i, (review, openai_result, groq_result) in enumerate(zip(reviews, openai_results, groq_results)):
        print(f"\nReview {i+1}:")
        print(f"Text: {review}")
        _print_theme_list("OpenAI", openai_result)
        _print_theme_list("Groq", groq_result)
        print("-" * 80)

In [None]:
def calculate_theme_statistics(results: List[Dict[str, Any]], model_name: str):
    """Print how often each theme occurs, with up to three sample descriptions.

    Args:
        results: Result dicts, each expected to hold a ``'themes'`` list of
            dicts with ``'theme'`` and ``'description'`` keys.
        model_name: Label shown in the report heading.

    Themes are listed by descending count; ties keep first-seen order. Sample
    descriptions are the first three distinct ones encountered, so the report
    is the same on every run.
    """
    theme_counts = {}
    # Per theme: a dict used as an insertion-ordered set of descriptions.
    # A plain set would make the "first three" samples vary between runs.
    theme_descriptions = {}

    for result in results:
        for theme in result.get('themes', []):
            # Entries are only typed as Dict[str, str]; tolerate missing keys.
            theme_name = theme.get('theme', 'unknown')
            theme_counts[theme_name] = theme_counts.get(theme_name, 0) + 1
            seen = theme_descriptions.setdefault(theme_name, {})
            description = theme.get('description')
            if description is not None:
                seen[description] = None

    print(f"\nTheme Statistics ({model_name}):")
    for theme, count in sorted(theme_counts.items(), key=lambda x: x[1], reverse=True):
        print(f"\n{theme} (Count: {count}):")
        print("Sample Descriptions:")
        for desc in list(theme_descriptions[theme])[:3]:
            print(f"- {desc}")

In [None]:
def main(
    input_path: str = "patient_reviews.csv",
    limit: int = 5,
    output_path: str = 'theme_categorization_results.csv',
):
    """Run the full comparison: load reviews, categorize with both models, report, save.

    Args:
        input_path: CSV file with a ``review_text`` column.
        limit: Maximum number of reviews to process (forwarded to ``load_reviews``).
        output_path: Where the per-review results CSV is written.

    The output CSV has one row per review with the columns ``review``,
    ``openai_themes`` and ``groq_themes``; the theme lists are JSON-encoded.
    """
    openai_chain, groq_chain = initialize_llm_chains()

    reviews = load_reviews(input_path, limit=limit)
    print(f"Loaded {len(reviews)} reviews")

    print("\nProcessing with OpenAI...")
    openai_results = process_reviews(reviews, openai_chain)

    print("\nProcessing with Groq...")
    groq_results = process_reviews(reviews, groq_chain)

    analyze_results(openai_results, groq_results, reviews)

    calculate_theme_statistics(openai_results, "OpenAI")
    calculate_theme_statistics(groq_results, "Groq")

    # Theme lists are serialized to JSON so each fits in a single CSV cell.
    results_df = pd.DataFrame({
        'review': reviews,
        'openai_themes': [json.dumps(result['themes']) for result in openai_results],
        'groq_themes': [json.dumps(result['themes']) for result in groq_results]
    })

    results_df.to_csv(output_path, index=False)
    print(f"\nResults saved to '{output_path}'")

if __name__ == "__main__":
    main()