## Splitting & Categorization

#### Import Packages & Models

In [0]:
import ast
import os
import re
from datetime import datetime

import Levenshtein
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.preprocessing import StandardScaler

from soundex import get_matched_names_and_unmatched_caregivers
from llm_models import load_llambda_model, llama_prompting, load_categorization_models
import config

2025-05-07 13:26:54.218131: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [0]:
# Load LLaMA model and tokenizer
# NOTE(review): `load_llambda_model` is defined in the project-local `llm_models` module, not in this notebook.
# The returned pair is what every `llama_prompting` call below receives as its tokenizer and model arguments.
llambda_tokenizer, llambda_model = load_llambda_model(config)

Downloading artifacts:   0%|          | 0/51 [00:00<?, ?it/s]

Downloading /local_disk0/repl_tmp_data/ReplId-196aa-df91c-b/tmpffuj4bft/llama_model_pipeline/model/model-00001…

Downloading /local_disk0/repl_tmp_data/ReplId-196aa-df91c-b/tmpffuj4bft/llama_model_pipeline/model/model-00034…

2025/05/07 13:27:15 INFO mlflow.transformers: 'runs:/daf088558bfc421e8ca57735f6a74460/llama_model_pipeline' resolved as 'dbfs:/databricks/mlflow-tracking/4367849868595085/daf088558bfc421e8ca57735f6a74460/artifacts/llama_model_pipeline'


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]



cannot access local variable 'model' where it is not associated with a value
cannot access local variable 'model' where it is not associated with a value


Loading checkpoint shards:   0%|          | 0/34 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [0]:
# Load the tokenizer and the three classifiers in one call; the result is indexed by name below
models = load_categorization_models(config)

# Unpack the entries this notebook uses into module-level names
roberta_tokenizer, concerned_team_model, subcategory_model, rating_model = (
    models[model_key]
    for model_key in ('roberta_tokenizer', 'concerned_team_model', 'subcategory_model', 'rating_model')
)

Downloading artifacts:   0%|          | 0/16 [00:00<?, ?it/s]

2025/05/07 13:27:36 INFO mlflow.transformers: 'runs:/5ae7654725434ed39eb18d98e7d01b99/ConcernedTeam' resolved as 'dbfs:/databricks/mlflow-tracking/4367849868595085/5ae7654725434ed39eb18d98e7d01b99/artifacts/ConcernedTeam'


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]



cannot access local variable 'model' where it is not associated with a value
cannot access local variable 'model' where it is not associated with a value


Downloading artifacts:   0%|          | 0/16 [00:00<?, ?it/s]

2025/05/07 13:27:44 INFO mlflow.transformers: 'runs:/9a93c1e54b914c538e0faeb71b5ecf47/SubCategory' resolved as 'dbfs:/databricks/mlflow-tracking/4367849868595085/9a93c1e54b914c538e0faeb71b5ecf47/artifacts/SubCategory'


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]



cannot access local variable 'model' where it is not associated with a value
cannot access local variable 'model' where it is not associated with a value


Downloading artifacts:   0%|          | 0/16 [00:00<?, ?it/s]

2025/05/07 13:27:51 INFO mlflow.transformers: 'runs:/a1aae205bf254638ae560f5a009ba6e2/SentimentAnalysis' resolved as 'dbfs:/databricks/mlflow-tracking/4367849868595085/a1aae205bf254638ae560f5a009ba6e2/artifacts/SentimentAnalysis'


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]



cannot access local variable 'model' where it is not associated with a value
cannot access local variable 'model' where it is not associated with a value


Downloading artifacts:   0%|          | 0/16 [00:00<?, ?it/s]

2025/05/07 13:27:58 INFO mlflow.transformers: 'runs:/5ae7654725434ed39eb18d98e7d01b99/ConcernedTeam' resolved as 'dbfs:/databricks/mlflow-tracking/4367849868595085/5ae7654725434ed39eb18d98e7d01b99/artifacts/ConcernedTeam'


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]



cannot access local variable 'model' where it is not associated with a value
cannot access local variable 'model' where it is not associated with a value


#### Read Input Data & Identify New Data To Be Categorized

In [0]:
# Set Spark configurations from the config file
# NOTE(review): `spark` is never created in this notebook; presumably it is the session injected by the
# Databricks runtime — confirm before running in another environment.
for key, value in config.spark_config.items():
    spark.conf.set(key, value)

In [0]:
# Use the file paths and mappings from the config file
# Location of the feedback table; it is loaded with the Delta reader in the "Read the table" cell below.
feedback_input_path = config.feedback_input_path

In [0]:
# Define the columns to keep in output (used to select columns right before the CSV export)
selected_output_columns = config.categorization_output_columns

# Column mapping between input data and trained model (applied as a rename on the input table)
column_mapping_for_original_columns = config.column_mapping_for_original_columns

# Column renaming for the AI-generated columns (applied as a rename on the grouped predictions)
column_renaming_for_ai_created_columns = config.column_renaming_for_ai_created_columns


In [0]:
# Team & subcategory descriptions
# Both dicts are looked up by name when the LLM prompts are built (team name -> text, subcategory name -> text).
teams_description_dict = config.teams_description_dict
subcategory_description_dict = config.subcategory_description_dict

In [0]:
# Label to name mapping for Teams, subcategories & ratings
# These are passed to Series.map() in `label_mapping` to turn predicted class labels into names.

label_mapping_team = config.label_mapping_team
label_mapping_subcategory = config.label_mapping_subcategory
label_mapping_rating = config.label_mapping_rating


In [0]:
# Read the whole Delta table into a pandas DataFrame
# NOTE(review): the name `input` shadows Python's built-in input(); it is kept because later cells refer to it.
input = spark.read.format("delta").load(feedback_input_path).toPandas()

# Parse the two date columns
for date_column in ('Last_Load_Date', 'Visit_Date'):
    input[date_column] = pd.to_datetime(input[date_column])

# Keep only the comments that have not gone through the inference pipeline yet (Inference_Flag == 0),
# then rename the columns to the names used by the rest of the notebook
not_yet_inferred = input['Inference_Flag'] == 0
input = input[not_yet_inferred].rename(columns=column_mapping_for_original_columns)

#### Split Feedback

In [0]:
# Instruction prompt that `split_feedback` passes to `llama_prompting`: it asks the model to break one
# patient comment into bullet points, one per person/team/role and per sentiment.
# The text is part of the model's input, so any edit here changes inference behaviour.
splitting_prompt = """Task: Extract and summarize the key points from the patient's feedback enclosed in < and >.

Requirements:

1. Extract the key points as bullet point.
2. Combine related ideas about the same person or team into a single bullet point to avoid oversplitting.
3. Ensure each bullet point is Concise, specific, and retains the patient's original tone.
4. Include **all mentioned names** in the feedback and ensure they appear in the corresponding bullet points.
5. Do not:
    a) Combine feedback about two or more different persons or teams, even if they are related or similar.
    b) Combine positive and negative feedback into one point. If the feedback includes both sentiments about the same person or team, split them into separate points.
    c) Rephrase or alter the intent, tone, or keywords.
    d) Add opinions, explanations, or context.
6. Only output the bullet points without any additional text or descriptions.
7. Avoid unnecessary splitting: If the feedback contains only one idea, it should remain a single bullet point.
8. Separate feedback based on roles (e.g., Receptionist, Nurses, Doctors, Assistants, Anesthesiologists, Therapists, Pharmacists, Technicians etc.), ensuring each role has a separate bullet point.

Guidelines for Combining Ideas:

1. If multiple statements relate to the same person, team, or topic, combine them into a single coherent point.
2. Ensure the combined point captures the essence of all related feedback without leaving out any details.
3. If a person or team receives both positive and negative feedback, separate them into two distinct points.
4. Verify that every name mentioned in the feedback is included in the output.
5. Do not merge feedback about different individuals or teams, even if they are mentioned in the same sentence.


Input Format:
Here is the Feedback: <Patient Feedback>

Output Format:

Bullet points summarizing key points."""

In [0]:
# Function to process a single row and return a DataFrame of split comments
def split_feedback(row):
    """Split one feedback row into individual key points using the LLaMA splitting prompt.

    Parameters:
    row (pandas.Series or mapping): Must provide 'Comment_ID' and 'Comment'.

    Returns:
    pandas.DataFrame: One row per extracted point with columns 'Comment_ID' and 'Comment'.
        When no usable point is extracted, a single row with an empty 'Comment' is
        returned so that the Comment_ID is not lost.
    """
    feedback_id = row['Comment_ID']
    feedback = row['Comment']

    # Call the LLaMA prompting function
    output = llama_prompting(feedback, splitting_prompt, llambda_tokenizer, llambda_model, max_new_tokens=4096)

    # Clean and split the output into individual bullet points
    feedback_points = []
    for line in output.split("\n"):
        stripped = line.strip()
        lowered = stripped.lower()
        # Skip blank lines and header lines such as "Here are the key points:"
        if not stripped or "key points" in lowered or "bullet" in lowered:
            continue
        # Whitespace is removed BEFORE the marker so indented bullets are handled too.
        # Accepted markers: one or more '•', or '-', '*', '–' followed by whitespace.
        point = re.sub(r'^(?:•+\s*|[-*–]\s+)', '', stripped).strip('•').strip()
        # A line that held only a marker carries no feedback
        if point:
            feedback_points.append(point)

    # Create a DataFrame for the current row's split comments
    if not feedback_points:
        feedback_points = [""]
    return pd.DataFrame({
        'Comment_ID': [feedback_id] * len(feedback_points),
        'Comment': feedback_points
    })

In [0]:
# Run `split_feedback` on every input row (one LLM call per row) and stack the per-row frames into a
# single table with a fresh 0..n-1 index.
# NOTE(review): this fails when `input` has no rows (nothing to concatenate) — confirm that case is handled upstream.
split_comments = pd.concat(input.apply(split_feedback, axis=1).tolist(), ignore_index=True)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end gene

#### Identify Caregiver Name

In [0]:
# Instruction prompt that `extract_caregiver_names` passes to `llama_prompting`: the model is asked to
# return the person names found in one comment as a comma-separated list, or the literal NULL.
# NOTE(review): the instruction list is numbered 1, 2, 2, 3, 4, 6, 7 — left untouched here because the
# prompt text is model input; renumber only together with a re-validation of the extraction quality.
extract_cg_names_prompt = """You are a Named Entity Recognition (NER) engine specialized in identifying person names from patient feedback.

Input: Patient feedback enclosed in < and >

Instructions:
1. Analyze the text within < and > to identify all person names.
2. Extract names without including any titles (e.g., exclude "Dr.," "Nurse," "Mr.," "Ms.").
2. Output only the identified names as a comma-separated list.
3. Include both first and last names when available.
4. Recognize variations in name formats (e.g., Dr. Smith, Nurse Johnson).
6. Ignore general titles without specific names like doctor, nurse, operator, anesthesiologist, receptionist, Gynecologist etc.
7. If no names are found, output "NULL"

Examples:
Input: <Dr. Sarah Johnson provided excellent care during my stay.>
Output: Sarah Johnson

Input: <Nurse Thompson and Dr. Emily Roberts were very attentive.>
Output: Thompson, Emily Roberts

Input: <Thank you Cleveland Clinic >
Output: NULL

Input: <The Filipino therapist and Sudanese doctor were wonderful.>
Output: NULL

Input: <Daman insurance is not good. >
Output: NULL

Input: <Male Nurse was rude at the clinic >
Output: NULL

Input Format: Here is the feedback received from a Patient:<Feedback>

Note: Provide only the requested output without explanations or additional text
"""


# Function to extract caregiver names
def extract_caregiver_names(row, comment_col_name = 'Comment'):
    """Ask the LLM for the person names mentioned in one comment and clean the answer.

    Parameters:
    row (pandas.Series or mapping): Row holding the comment text.
    comment_col_name (str): Key of the comment text inside `row`.

    Returns:
    list[str]: Cleaned names in the order the model returned them; an empty list when
        the model answers NULL (any case / surrounding whitespace) or nothing survives
        the clean-up.
    """
    feedback = row[comment_col_name]

    llama_output_string = llama_prompting(feedback, extract_cg_names_prompt, llambda_tokenizer, llambda_model, max_new_tokens=100, do_sample=False, temperature=0)

    # The prompt asks for the literal NULL when there is no name; tolerate case and whitespace noise
    if llama_output_string is None or llama_output_string.strip().upper() in ('', 'NULL'):
        return []

    # All entries are lowercase because words are lowercased before the lookup
    # (capitalised entries such as 'The' could never match).
    exclude_words = frozenset({
        # Gender pronouns and general terms
        'he', 'she', 'her', 'they', 'everyone', 'somebody', 'sir', 'madam',

        # Family and relational terms
        'mother', 'brother', 'husband', 'wife', 'daughter', 'son', 'father', 'sister',

        # Titles and honorifics
        'mr', 'ms', 'mrs', 'dr',

        # Professional roles
        'nurse', 'nurses', 'operator', 'therapist', 'receptionist', 'physician', 'physicians',
        'gynecologist', 'gynecologists', 'anesthesiologists', 'doctor', 'doctors',

        # Gendered terms
        'male', 'female', 'man', 'woman',

        # Specific medical terms
        'orthopedic', 'orthopedics', 'foreign', 'cleveland',

        # Miscellaneous ('null' is the prompt's no-name sentinel)
        'the', 'null', '*******'
    })

    caregiver_names = []
    for candidate in llama_output_string.split(","):
        # Compare without surrounding periods so "Dr." is recognised as "dr";
        # the kept word itself is left untouched (e.g. the initial "J.").
        kept_words = [word for word in candidate.split()
                      if word.strip('.').lower() not in exclude_words]
        cleaned_name = ' '.join(kept_words)
        if cleaned_name:
            caregiver_names.append(cleaned_name)

    return caregiver_names


In [0]:
# One LLM call per split comment; each cell of 'Caregiver Names' receives the list returned by
# `extract_caregiver_names` (an empty list when no name is found).
split_comments['Caregiver Names'] = split_comments.apply(extract_caregiver_names, axis=1)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end gene

#### Combine Comments About the Same Caregiver

In [0]:
# Lists are unhashable, so 'Caregiver Names' is turned into its string form to serve as a group key
split_comments['Caregiver Names'] = split_comments['Caregiver Names'].apply(lambda x: str(x) if isinstance(x, list) else x)

# Remember every row's position: groupby + reset_index renumbers the grouped rows, so the original
# index can no longer be used to restore the order afterwards
_ordered_comments = split_comments.assign(_original_position=np.arange(len(split_comments)))
_has_caregiver = _ordered_comments['Caregiver Names'] != '[]'

# Rows that name a caregiver: merge the comments sharing the same Comment_ID and caregiver list.
# The merged row takes the position of its first member.
grouped_comments = (
    _ordered_comments[_has_caregiver]
    .groupby(['Comment_ID', 'Caregiver Names'], sort=False)
    .agg({'Comment': ' '.join, '_original_position': 'min'})
    .reset_index()
)

# Rows without any caregiver name are kept unchanged
ungrouped_comments = _ordered_comments[~_has_caregiver]

# Combine both parts, restore the original order and rebuild a clean 0..n-1 index
split_comments = (
    pd.concat([grouped_comments, ungrouped_comments])
    .sort_values('_original_position', kind='stable')
    .drop(columns='_original_position')
    .reset_index(drop=True)
)

# Convert 'Caregiver Names' back to list; literal_eval only parses Python literals, so text that
# originated from patient feedback is never executed as code
split_comments['Caregiver Names'] = split_comments['Caregiver Names'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)


#### Categorize on Split Comments

In [0]:
#Tokenization
def tokenize_function(examples):
    """Tokenize the 'Comment' field of a (batched) dataset example with the RoBERTa tokenizer.

    Sequences are padded to the tokenizer's maximum length and truncated when longer.
    """
    comments = examples["Comment"]
    return roberta_tokenizer(comments, padding="max_length", truncation=True)

# Collate function for the PyTorch DataLoader built on the tokenized dataset
def collate_fn(batch):
    """Stack the model-input fields of a list of tokenized examples into tensors.

    Only "input_ids" and "attention_mask" are kept; every other field of the
    examples is ignored.
    """
    collated = {}
    for field in ("input_ids", "attention_mask"):
        collated[field] = torch.tensor([example[field] for example in batch])
    return collated

In [0]:
#Model Inference logic
def model_inference(dataloader, model, device):
    """Run `model` over every batch of `dataloader` and return all logits as one CPU tensor.

    The model is moved to `device` and switched to eval mode; gradients are disabled
    for the forward passes. Batches are dicts of tensors that are forwarded to the
    model as keyword arguments.
    """
    model.to(device)
    model.eval()

    with torch.no_grad():
        per_batch_logits = [
            model(**{name: tensor.to(device) for name, tensor in batch.items()}).logits.cpu()
            for batch in dataloader
        ]

    # Concatenate along the example dimension
    return torch.cat(per_batch_logits, dim=0)

In [0]:
#Postprocessing Function
def postprocess_logits(logits, top_k):
    """Turn logits into the `top_k` most probable classes per row.

    Returns:
    tuple: (class indices, probabilities), both NumPy arrays with one row per input
        row, ordered from most to least probable.
    """
    class_probs = torch.softmax(logits, dim=-1).numpy()

    # Rank the classes by descending probability, then keep the first top_k columns
    ranked_classes = np.argsort(-class_probs, axis=-1)
    top_classes = ranked_classes[:, :top_k]
    top_probs = np.take_along_axis(class_probs, top_classes, axis=-1)
    return top_classes, top_probs


#Organize Results Function
def organize_results(tokenized_datasets, concerned_team_preds, concerned_team_probs, subcategory_preds, subcategory_probs, rating_preds, rating_probs, top_k):
    """Assemble identifiers, comment text and model outputs into one DataFrame.

    Column order: 'Comment_ID', 'Comment', 'Caregiver Names', then for each rank
    1..top_k the concerned-team prediction and probability, then for each rank the
    subcategory prediction and probability, and finally the top-1 rating
    prediction and probability.
    """
    columns = {
        name: tokenized_datasets[name]
        for name in ("Comment_ID", "Comment", "Caregiver Names")
    }

    # Concerned team: rank 1 is the most probable class
    for rank in range(1, top_k + 1):
        columns[f"Concerned Team {rank}"] = concerned_team_preds[:, rank - 1]
        columns[f"Concerned Team Probability {rank}"] = concerned_team_probs[:, rank - 1]

    # Subcategory: same layout as the concerned team columns
    for rank in range(1, top_k + 1):
        columns[f"Subcategory Prediction {rank}"] = subcategory_preds[:, rank - 1]
        columns[f"Subcategory Probability {rank}"] = subcategory_probs[:, rank - 1]

    # Rating: only the single best class is reported
    columns["Rating Prediction"] = rating_preds[:, 0]
    columns["Rating Probability"] = rating_probs[:, 0]

    return pd.DataFrame(columns)

In [0]:
def predict(df, top_k=4, batch_size=16):
    """Run the concerned-team, subcategory and rating classifiers on every comment in `df`.

    Parameters:
    df (pandas.DataFrame): Needs the columns 'Comment_ID', 'Comment' and 'Caregiver Names'.
    top_k (int): Number of ranked predictions kept for concerned team and subcategory
        (the rating always keeps only the best class).
    batch_size (int): Number of comments per forward pass. Defaults to 16, the value
        that used to be hard-coded; lower it if the device runs out of memory.

    Returns:
    pandas.DataFrame: One row per input comment, laid out by `organize_results`.
    """
    # Prepare dataset
    dataset = Dataset.from_pandas(df)
    tokenized_datasets = dataset.map(tokenize_function, batched=True)

    # Device setup
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Prepare dataloader (no shuffling, so the logits stay aligned with the dataset rows)
    dataloader = torch.utils.data.DataLoader(tokenized_datasets, batch_size=batch_size, collate_fn=collate_fn)

    # Run inference: the same batches are fed to each of the three classifiers
    concerned_team_logits = model_inference(dataloader, concerned_team_model, device)
    subcategory_logits = model_inference(dataloader, subcategory_model, device)
    rating_logits = model_inference(dataloader, rating_model, device)

    # Post-process predictions
    concerned_team_preds, concerned_team_probs = postprocess_logits(concerned_team_logits, top_k)
    subcategory_preds, subcategory_probs = postprocess_logits(subcategory_logits, top_k)
    rating_preds, rating_probs = postprocess_logits(rating_logits, 1)

    # Organize results
    results_df = organize_results(
        tokenized_datasets, concerned_team_preds, concerned_team_probs,
        subcategory_preds, subcategory_probs,
        rating_preds, rating_probs, top_k
    )

    return results_df

In [0]:
# Classify every split comment (concerned team, subcategory, rating) with the default top_k of 4
split_comment_prediction = predict(split_comments)

Map:   0%|          | 0/2138 [00:00<?, ? examples/s]

In [0]:
def label_mapping(df, top_k=4):
    """Replace predicted class labels with their names, in place.

    The 'Concerned Team i' and 'Subcategory Prediction i' columns (i = 1..top_k) and
    'Rating Prediction' are passed through their label->name mapping with Series.map,
    so a label missing from a mapping becomes NaN. The same `df` object is returned.
    """
    team_columns = [f'Concerned Team {rank}' for rank in range(1, top_k + 1)]
    subcategory_columns = [f'Subcategory Prediction {rank}' for rank in range(1, top_k + 1)]

    # (column, mapping) pairs in the order they are applied: teams, subcategories, rating
    column_lookups = (
        [(column, label_mapping_team) for column in team_columns]
        + [(column, label_mapping_subcategory) for column in subcategory_columns]
        + [('Rating Prediction', label_mapping_rating)]
    )

    for column, lookup in column_lookups:
        df[column] = df[column].map(lookup)

    return df


In [0]:
# Turn predicted class labels into team / subcategory / rating names.
# NOTE(review): `label_mapping` maps the columns in place, so running this cell twice maps the already
# mapped names again — presumably that yields NaN; re-run the prediction cell first.
split_comment_prediction = label_mapping(split_comment_prediction)

In [0]:
def filter_comments_by_probability(df, min_probability=0.90):
    """
    Filter the split-comment predictions, keeping a row when ANY of these holds:
    1. It has a non-empty 'Caregiver Names' list (always kept)
    2. It has the highest 'Concerned Team Probability 1' within its Comment_ID
    3. Its 'Concerned Team Probability 1' is >= min_probability

    Parameters:
    df (pandas.DataFrame): DataFrame with columns 'Comment_ID', 'Caregiver Names'
                         and 'Concerned Team Probability 1'. It is not modified.
    min_probability (float): Probability from which a row is kept regardless of
                         the other two rules. Defaults to 0.90.

    Returns:
    pandas.DataFrame: Filtered copy, sorted by 'Comment_ID' (ascending) and
                      'Concerned Team Probability 1' (descending)
    """
    # Create a copy to avoid modifying the original
    result_df = df.copy()

    # Create mask for non-empty Caregiver Names
    # Anything that is not a list (e.g. None) counts as "no caregiver"
    has_caregivers = result_df['Caregiver Names'].apply(lambda x: bool(x) if isinstance(x, list) else False)

    # Get the index labels of the rows with the highest probability for each Comment_ID
    max_prob_idx = result_df.groupby('Comment_ID')['Concerned Team Probability 1'].idxmax()

    # Get rows that meet the probability threshold
    high_prob_mask = result_df['Concerned Team Probability 1'] >= min_probability

    # Combine all conditions:
    # - Either has caregivers
    # - Or is the max probability row
    # - Or meets probability threshold
    final_mask = (has_caregivers |
                 result_df.index.isin(max_prob_idx) |
                 high_prob_mask)

    # Apply the filter
    filtered_df = result_df[final_mask]

    # Sort by Comment_ID and probability (descending) for cleaner output
    return filtered_df.sort_values(['Comment_ID', 'Concerned Team Probability 1'],
                                 ascending=[True, False])

In [0]:
# Drop low-confidence split comments; see `filter_comments_by_probability` for the three keep rules
split_comment_prediction = filter_comments_by_probability(split_comment_prediction)

#### Identify Subcategories using Llama

In [0]:
# Function to create the subcategory prompt
def create_subcategory_prompt(feedback, team, candidate_subcategories):
    """Build the LLM instruction prompt for picking subcategories of one team.

    Parameters:
    feedback (str): Not used in the prompt text; the feedback itself is handed to
        `llama_prompting` separately. Kept for interface compatibility.
    team (str): Team name; must be a key of `teams_description_dict`.
    candidate_subcategories (list[str]): Candidate category names, each a key of
        `subcategory_description_dict`. Any number of candidates is accepted; with
        three candidates the prompt is identical to the former fixed three-item layout.

    Returns:
    str: The prompt text.
    """
    # One numbered "name: description" line per candidate
    category_descriptions = "\n".join(
        f"{position}. {subcategory}: {subcategory_description_dict[subcategory]}"
        for position, subcategory in enumerate(candidate_subcategories, start=1)
    )

    return f"""A patient's feedback is enclosed in < and >. The team under consideration is {team}. Below is the team definition and category descriptions.

Team Definition:
{team}: {teams_description_dict[team]} 

Task: Identify relevant categories for this team from the following list: {candidate_subcategories}.

Category Descriptions:

{category_descriptions}

Instructions:
1. Feedback will be provided as follows: Here is the Feedback: <Feedback>.
2. Identify all relevant categories based STRICTLY on the descriptions provided above. DO NOT use any external knowledge or assumptions.
3. Output ONLY the category names as a comma-separated string. DO NOT include any additional explanations or text.
4. If none of the categories apply, return NULL.
5. Only choose categories that directly align with the descriptions. If uncertain, default to NULL—do not try to find a category just to match."""

In [0]:
# Function to generate subcategories for a single row
def generate_subcategories(row):
    """Ask the LLM which of the rank 2-4 subcategory predictions apply to the row's comment.

    The prompt is built for the row's top-1 concerned team; the raw model answer
    (a string) is returned unchanged.
    """
    comment_text = row['Comment']
    top_team = row['Concerned Team 1']

    # The classifier's top-1 subcategory is not offered here; only ranks 2 to 4 are candidates
    candidate_columns = ['Subcategory Prediction 2','Subcategory Prediction 3',
                         'Subcategory Prediction 4']
    candidates = row[candidate_columns].values.tolist()

    prompt = create_subcategory_prompt(comment_text, top_team, candidates)
    return llama_prompting(comment_text, prompt, llambda_tokenizer, llambda_model, max_new_tokens=50)


In [0]:
# One LLM call per remaining comment; the column stores the model's raw answer string
split_comment_prediction['Llama Predicted Subcategories'] = split_comment_prediction.apply(generate_subcategories, axis=1)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end gene

Correct Spelling

In [0]:
# Function to find matches based on Levenshtein distance
def correct_spelling(input_string, correct_spelling_group, max_distance=2):
    """Map the comma-separated names in `input_string` onto the known spellings.

    Parameters:
    input_string (str): Comma-separated names, e.g. the raw LLM answer. Anything that
        is not a string (None, NaN) yields an empty result.
    correct_spelling_group (list): Accepted spellings. Entries that are not strings
        (e.g. NaN from an unmapped label) are ignored.
    max_distance (int): Largest case-insensitive Levenshtein distance that still
        counts as a match. Defaults to 2.

    Returns:
    list[str]: Matched spellings from `correct_spelling_group`, without duplicates,
        in the order they were first matched.
    """
    if not isinstance(input_string, str):
        return []

    matched_groups = []

    # Check Levenshtein distance between each part of the input string and the correct spelling group
    for part in (piece.strip() for piece in input_string.split(",")):
        if not part:
            continue
        for group in correct_spelling_group:
            # Skip unusable candidates and ones that are already matched
            if not isinstance(group, str) or group in matched_groups:
                continue
            if Levenshtein.distance(part.lower(), group.lower()) <= max_distance:
                matched_groups.append(group)

    return matched_groups

# Row-wise adapter around `correct_spelling`
def apply_correct_spelling(row):
    """Match the row's LLM-predicted subcategories against its rank 2-4 subcategory predictions."""
    accepted_spellings = row[['Subcategory Prediction 2', 'Subcategory Prediction 3', 'Subcategory Prediction 4']].values.tolist()
    return correct_spelling(row['Llama Predicted Subcategories'], accepted_spellings)


In [0]:
# Snap the LLM's free-text category names onto the row's candidate subcategories (ranks 2-4)
split_comment_prediction['Llama Predicted Subcategories - Corrected'] = split_comment_prediction.apply(apply_correct_spelling, axis=1)

# Working list per row: the classifier's top-1 subcategory followed by the LLM-confirmed candidates
split_comment_prediction['Predicted Subcategories before overall classification'] = split_comment_prediction.apply(lambda row: [row['Subcategory Prediction 1']] + row['Llama Predicted Subcategories - Corrected'], axis=1)

Overall vs Care & Treatment Classification

In [0]:
#Create prompt to classify between Overall and Care & Treatment
def create_overall_vs_care_treatment_prompt(feedback, team):
    # Builds the instruction prompt that asks the LLM to label a comment for `team` as either
    # "Overall" or "Care & Treatment".
    #   feedback: not used in the prompt text (the feedback is passed to `llama_prompting` separately)
    #   team:     team name; must be a key of `teams_description_dict`
    # Returns the prompt string. The text below is model input, so it is reproduced verbatim.
    return f"""A patient's feedback is enclosed in < and >. 
Task: Categorize Feedback for the team named - {team}

Team Definition:
{team}: {teams_description_dict[team]} 

Categories: [Overall, Care & Treatment] 

1. Overall: Overall is used as a category for very general comments that don’t mention anything specific. 
Examples:

    - "Everything was smooth and easy."
    - "One of the best hospitals."
    - “Everything was excellent.”
 

2. Care & Treatment: Care & Treatment is used as a category when the comment specifically mentions care or treatment, or when referring to any care provider (nurses, physicians, therapists etc because their role inherently involves caring for or treating patients. 
Examples:

    - "I would like to thank the doctor, he was very good and managed to treat me. He listens to the patient's needs and is very professional."
    - "The treatment was excellent."
    - "Some of the nursing staff were good."
    - "Therapist Nivala is very good."
    - "Dr. Osama was great."

Instructions:
1. Feedback will be provided as: Here is the Feedback: <Feedback>
2. Based STRICTLY on the descriptions above, identify the most relevant category from [Overall, Care & Treatment] 
3. Output ONLY the category name: Overall or Care & Treatment
4. DO NOT include explanations, assumptions, or extra text
"""

In [0]:
# Function to resolve 'General Feedback' into 'Overall' or 'Care & Treatment' for a single row
def overall_vs_subcategory(row):
    """Return the row's subcategory list with 'General Feedback' resolved by the LLM.

    Rows without 'General Feedback' are returned untouched and cause no LLM call.
    Otherwise the LLM chooses between 'Overall' and 'Care & Treatment'; every
    'General Feedback' entry is replaced by the first recognised answer, or removed
    when the answer matches neither category.
    """
    feedback = row['Comment']
    team = row['Concerned Team 1']
    subcategories = row['Predicted Subcategories before overall classification']

    # Nothing to resolve
    if 'General Feedback' not in subcategories:
        return subcategories

    prompt = create_overall_vs_care_treatment_prompt(feedback, team)
    llama_output = llama_prompting(feedback, prompt, llambda_tokenizer, llambda_model, max_new_tokens=50)

    # Tolerate small spelling deviations in the model's answer
    recognised = correct_spelling(llama_output,['Overall', 'Care & Treatment'])

    if not recognised:
        # The answer matched neither category: drop the placeholder
        return [item for item in subcategories if item != 'General Feedback']

    return [recognised[0] if item == 'General Feedback' else item for item in subcategories]

In [0]:
# Final subcategory list per row; the LLM is only called for rows that contain 'General Feedback'
split_comment_prediction['Predicted Subcategories'] = split_comment_prediction.apply(overall_vs_subcategory, axis=1)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128009 for open-end gene

#### Post Processing

Group comments that have the same subcategories

In [0]:
# Ensure 'Caregiver Names' are hashable for grouping by converting lists to strings
split_comment_prediction['Caregiver Names'] = split_comment_prediction['Caregiver Names'].apply(lambda x: str(x) if isinstance(x, list) else x)

# Flatten and deduplicate 'Predicted Subcategories' while grouping.
# The merged list is sorted (by string form, so mixed/NaN entries cannot break the comparison) because
# plain set iteration order is not guaranteed: equal sets could otherwise come out in different orders,
# and the next cell uses the string form of this list as a grouping key.
subcategories_grouped = split_comment_prediction.groupby(
                        ['Comment_ID', 'Caregiver Names', 'Concerned Team 1', 'Rating Prediction']).agg(
                        {'Predicted Subcategories': lambda x: sorted(set(item for sublist in x for item in sublist), key=str),
                        'Concerned Team Probability 1': 'mean',
                        'Comment': lambda x: ' '.join(x)}).reset_index()


Group comments that have the same concerned teams

In [0]:
# Ensure 'Predicted Subcategories' are hashable for grouping by converting lists to strings
subcategories_grouped['Predicted Subcategories'] = subcategories_grouped['Predicted Subcategories'].apply(lambda x: str(x) if isinstance(x, list) else x)

# Perform groupby operation again to aggregate 'Concerned Team 1' into lists
teams_and_subcategories_grouped = subcategories_grouped.groupby(
                                    ['Comment_ID', 'Caregiver Names', 'Predicted Subcategories', 'Rating Prediction']).agg(
                                    {'Concerned Team 1': lambda x: list(x),
                                    'Concerned Team Probability 1': 'mean',
                                    'Comment': lambda x: ' '.join(x)}).reset_index()

# Convert 'Predicted Subcategories' back to list.
# ast.literal_eval only parses Python literals, so text that originated from the LLM or from patient
# feedback is never executed as code (unlike eval).
teams_and_subcategories_grouped['Predicted Subcategories'] = teams_and_subcategories_grouped['Predicted Subcategories'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Convert 'Caregiver Names' back to list (it was stringified before the first grouping)
teams_and_subcategories_grouped['Caregiver Names'] = teams_and_subcategories_grouped['Caregiver Names'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

In [0]:
# Rename the generated columns
# NOTE(review): later cells read 'Predicted Concerned Teams', 'Extracted Comment', 'Predicted Rating' and
# 'Confidence Score' — presumably those names are introduced by this mapping; confirm in config.
teams_and_subcategories_grouped = teams_and_subcategories_grouped.rename(columns=column_renaming_for_ai_created_columns)

In [0]:
# Attach the original input columns to every aggregated prediction row (right join: one output row per
# prediction row), then order the result by 'Date' and 'Comment_ID'
categorization_output = (
    input
    .merge(teams_and_subcategories_grouped, how='right', on='Comment_ID')
    .sort_values(by=['Date', 'Comment_ID'], ascending=True)
)

#### Apply rules after Categorization

In [0]:
def correct_clinical_ops_vs_surgery_scheduling(row):
    """Make the predicted teams consistent with the visit setting.

    For the 'Ambulatory Surgery' setting every 'Clinical Operations' entry becomes
    'Surgery Scheduling'; for any other setting the replacement goes the other way.
    When nothing has to be replaced the original list object is returned.
    """
    teams = row['Predicted Concerned Teams']

    if row['Setting'] == 'Ambulatory Surgery':
        misplaced, expected = 'Clinical Operations', 'Surgery Scheduling'
    else:
        misplaced, expected = 'Surgery Scheduling', 'Clinical Operations'

    if misplaced in teams:
        return [expected if team == misplaced else team for team in teams]
    return teams

In [0]:
# Apply the setting-based rule row by row: 'Clinical Operations' and 'Surgery Scheduling' are swapped
# depending on whether the row's 'Setting' is 'Ambulatory Surgery'
categorization_output['Predicted Concerned Teams'] = categorization_output.apply(correct_clinical_ops_vs_surgery_scheduling, axis=1)

In [0]:
# Rows where the split step produced no text
empty_extraction = categorization_output['Extracted Comment'] == ""

# For those rows fall back to the rating supplied with the input ...
categorization_output.loc[empty_extraction, 'Predicted Rating'] = categorization_output['Experience_Rating']

# ... and to the original comment text
categorization_output.loc[empty_extraction, 'Extracted Comment'] = categorization_output['Comment']


#### Calibrate Confidence Scores

In [0]:
def caliberate_confidence_score(df, column_name, new_min=30, new_max=98):
    """Linearly rescale a score column to whole numbers between new_min and new_max.

    The smallest score maps to `new_min`, the largest to `new_max`; values in between
    are interpolated and rounded. The column is overwritten in place and the same
    `df` object is returned.

    The former StandardScaler step was dropped: min-max scaling gives the same result
    on standardized and on raw values, and StandardScaler raised on an empty frame.

    Parameters:
    df (pandas.DataFrame): Frame holding the score column.
    column_name (str): Name of the numeric column to rescale.
    new_min (int): Lower bound of the output range.
    new_max (int): Upper bound of the output range.

    Returns:
    pandas.DataFrame: `df`, with `column_name` replaced by integer scores.

    Raises:
    ValueError: From the integer conversion when the column contains missing values.
    """
    scores = df[column_name].astype(float)

    # Calculate the min and max of the original column
    original_min = scores.min()
    original_max = scores.max()
    score_range = original_max - original_min

    if score_range > 0:
        # Apply Min-Max scaling to the column
        rescaled = (scores - original_min) / score_range * (new_max - new_min) + new_min
    else:
        # No spread (single row, constant column, or empty frame): the ratio would be 0/0.
        # Use the middle of the target range; missing values stay missing.
        rescaled = scores.where(scores.isna(), (new_min + new_max) / 2)

    df[column_name] = rescaled.round().astype(int)

    return df

In [0]:
# Rescale 'Confidence Score' to whole numbers in the default 30-98 range.
# The scaling is relative to the min/max of THIS batch, so scores are not comparable across runs.
categorization_output = caliberate_confidence_score(categorization_output, 'Confidence Score')

In [0]:
# Write the selected output columns to CSV.
# to_csv does not create missing directories, so make sure the target folder exists first; otherwise
# the notebook would fail at the very last step, after all model inference has already run.
output_csv_path = 'CSVs/categorization_output.csv'
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
categorization_output[selected_output_columns].to_csv(output_csv_path)