In [17]:
from transformers import pipeline

# Hugging Face model ids that can serve as the zero-shot classifier, keyed by a short alias.
models = dict(
    bart="facebook/bart-large-mnli",
    distilBERT="typeform/distilbert-base-uncased-mnli",
)

In [18]:
from transformers import pipeline
# Build the zero-shot classification pipeline on the checkpoint registered as "bart".
classifier = pipeline(
    task="zero-shot-classification",
    model=models["bart"],
)




Device set to use cpu


In [19]:
# Sentence-style ("long") labels are used as candidate hypotheses because they
# improved the model's performance; each one maps back to its short category name.
label_map = dict(
    [
        ("The user wants to place a new order", "new_order"),
        ("The user is asking about the status of an order", "order_status"),
        ("The message is a general question about products or services", "general_inquiry"),
        ("The user is unhappy and is filing a complaint", "complaint"),
        ("The user wants to return or get a refund", "return_refund"),
        ("The user is following up on a previous conversation", "follow_up"),
        ("The user is giving feedback, suggestions, or compliments", "feedback"),
        ("The message doesn’t fit any category above", "others"),
    ]
)

# A more informative wording was also tried, but it lowered the scores and the
# prediction was wrong, so it is kept here only for reference:
# label_map = {
#     "The customer wants to place a new order or buy something": "new_order",
#     "The customer is asking about the delivery or status of an order": "order_status",
#     "The customer is asking a general question about products, policies, or services": "general_inquiry",
#     "The customer is unhappy or expressing dissatisfaction": "complaint",
#     "The customer wants to return a product or request a refund": "return_refund",
#     "The customer is following up on a previous message or request": "follow_up",
#     "The customer is providing feedback, suggestions, or compliments": "feedback",
#     "The message does not clearly match any of the categories above": "others"
# }

# Candidate labels passed to the classifier, in declaration order.
long_labels = [*label_map]


In [20]:
def classify_message(text, long_labels, label_map, threshold=0.6):
    """Classify one message against the sentence-style labels and print a report.

    Args:
        text: The message to classify.
        long_labels: Candidate hypothesis sentences handed to the classifier.
        label_map: Maps each hypothesis sentence to its short category name.
        threshold: A top score below this value triggers a low-confidence warning.

    Returns:
        A ``(short_label, score)`` tuple for the top-ranked hypothesis.
    """
    # Wrapping the message in an instruction such as
    # f"Classify this customer support message: '{text}'" was also tried as a way
    # to improve performance, but it gave lower scores, so the raw text is used.
    outcome = classifier(text, long_labels)

    # The classifier output lists labels and scores in matching order; the first
    # pair is treated as the prediction.
    ranked = list(zip(outcome["labels"], outcome["scores"]))
    best_hypothesis, best_score = ranked[0]
    short_label = label_map[best_hypothesis]

    print(f"\n📝 Message: {text}")
    print(f"🏷️ Predicted: {short_label} (Confidence: {best_score:.2f})")

    if best_score < threshold:
        print("⚠️ Confidence below threshold. Consider fallback or human review.")

    print("\n🔍 All scores:")
    for hypothesis, prob in ranked:
        print(f"{label_map[hypothesis]}: {prob:.2f}")

    return short_label, best_score


In [21]:
# Try the single-hypothesis classifier on one sample message.
text = "Can I return just one item from the bundle?"
classify_message(text=text, long_labels=long_labels, label_map=label_map)



📝 Message: Can I return just one item from the bundle?
🏷️ Predicted: return_refund (Confidence: 0.45)
⚠️ Confidence below threshold. Consider fallback or human review.

🔍 All scores:
return_refund: 0.45
general_inquiry: 0.20
feedback: 0.13
follow_up: 0.11
order_status: 0.05
new_order: 0.03
complaint: 0.02
others: 0.01


('return_refund', 0.45484107732772827)

In [22]:
# Second sample: a return request that also mentions a past order.
text = "Can I return the yoga mat I ordered last week?"
classify_message(text=text, long_labels=long_labels, label_map=label_map)



📝 Message: Can I return the yoga mat I ordered last week?
🏷️ Predicted: return_refund (Confidence: 0.40)
⚠️ Confidence below threshold. Consider fallback or human review.

🔍 All scores:
return_refund: 0.40
order_status: 0.25
general_inquiry: 0.15
follow_up: 0.12
feedback: 0.05
complaint: 0.03
new_order: 0.01
others: 0.00


('return_refund', 0.3982800543308258)

In [23]:
# Third attempt: several hypothesis phrases per category.
# Maps each short category name to a list of sentence-style hypotheses; the
# classification helpers below flatten these lists and average the scores per category.
multi_label_map = {
    "new_order": [
        "The customer wants to place a new order",
        "The message shows intent to purchase something",
        "The user is ready to buy a product"
    ],
    "order_status": [
        "The customer is asking about the status of an order",
        "The user wants to know where their order is",
        "The message is about tracking or delivery progress"
    ],
    "general_inquiry": [
        "The customer is asking a general question about products or services",
        "The message is a product or service inquiry",
        "The user is requesting information about offerings or policies or brochures"
    ],
    "complaint": [
        "The customer is expressing dissatisfaction with a product or service",
        "The user is reporting a problem or issue",
        "The message contains a complaint or negative experience"
    ],
    "return_refund": [
        "The customer wants to return a product",
        "The user is requesting a refund",
        "The message is about returning or exchanging an item"
    ],
    "follow_up": [
        "The customer is following up on a previous message",
        "The user is checking the status of a prior conversation",
        "The message is a reminder or nudge for a response"
    ],
    "feedback": [
        "The customer is giving feedback or a suggestion",
        "The user is leaving a compliment or review",
        "The message shares an opinion or appreciation"
    ],
    "others": [
        "The message does not fit any specific category",
        "This is a generic message with no clear intent",
        "The content is irrelevant or not related to customer service"
    ]
}

In [26]:
def classify_with_flat_hypotheses(text, multi_label_map, threshold=0.15):
    """Classify a message using several hypothesis phrases per category.

    Every phrase is scored in a single classifier call. The scores are averaged
    per category, rescaled so the category scores sum to 1, and a ranked report
    is printed.

    Args:
        text: The message to classify.
        multi_label_map: Maps each category name to a list of hypothesis phrases.
        threshold: A top normalized score below this value triggers a warning.

    Returns:
        A ``(top_label, top_score)`` tuple, where the score is the normalized one.
    """
    # Flatten the phrases in declaration order and remember each phrase's category.
    phrase_to_label = {
        phrase: label
        for label, phrases in multi_label_map.items()
        for phrase in phrases
    }
    all_phrases = [
        phrase for phrases in multi_label_map.values() for phrase in phrases
    ]

    # A single classifier call covers every hypothesis.
    outcome = classifier(text, all_phrases)

    # Collect the raw scores under the category each phrase belongs to.
    grouped = {}
    for phrase, prob in zip(outcome["labels"], outcome["scores"]):
        grouped.setdefault(phrase_to_label[phrase], []).append(prob)

    # Mean score per category.
    means = {}
    for label, probs in grouped.items():
        means[label] = sum(probs) / len(probs)

    # Rescale so the category scores add up to 1.
    denominator = sum(means.values())
    normalized = {}
    for label, mean in means.items():
        normalized[label] = mean / denominator

    # Highest score first; the sort is stable, so ties keep their existing order.
    ranking = sorted(normalized.items(), key=lambda pair: pair[1], reverse=True)
    top_label, top_score = ranking[0]

    print(f"\n📝 Message: {text}")
    print(f"🏷️ Predicted: {top_label} (Normalized Confidence: {top_score:.2f})")
    if top_score < threshold:
        print("⚠️ Confidence below threshold. Consider fallback or human review.")

    print("\n🔍 Normalized Scores:")
    for label, value in ranking:
        print(f"{label}: {value:.2f}")

    return top_label, top_score


In [27]:
# Same sample message, now scored with the multi-hypothesis classifier.
text = "Can I return just one item from the bundle?"
classify_with_flat_hypotheses(text=text, multi_label_map=multi_label_map)


📝 Message: Can I return just one item from the bundle?
🏷️ Predicted: return_refund (Normalized Confidence: 0.34)

🔍 Normalized Scores:
return_refund: 0.34
feedback: 0.24
complaint: 0.14
follow_up: 0.11
general_inquiry: 0.08
new_order: 0.05
order_status: 0.03
others: 0.01


('return_refund', 0.3414390419113004)

In [28]:
def classify_label_only(text, multi_label_map):
    """Return only the predicted category for a message, printing nothing.

    Every hypothesis phrase is scored in a single classifier call and the
    category with the highest mean phrase score is returned.

    Args:
        text: The message to classify.
        multi_label_map: Maps each category name to a list of hypothesis phrases.

    Returns:
        The category name with the highest average score.
    """
    # Flatten the phrases in declaration order and remember each phrase's category.
    phrase_to_label = {
        phrase: label
        for label, phrases in multi_label_map.items()
        for phrase in phrases
    }
    all_phrases = [
        phrase for phrases in multi_label_map.values() for phrase in phrases
    ]

    # A single classifier call covers every hypothesis.
    outcome = classifier(text, all_phrases)

    # Collect the raw scores under the category each phrase belongs to.
    grouped = {}
    for phrase, prob in zip(outcome["labels"], outcome["scores"]):
        grouped.setdefault(phrase_to_label[phrase], []).append(prob)

    # Mean score per category.
    means = {}
    for label, probs in grouped.items():
        means[label] = sum(probs) / len(probs)

    # max() keeps the first category encountered when several share the top mean.
    best_label, _ = max(means.items(), key=lambda pair: pair[1])
    return best_label


In [29]:
# Same sample message through the label-only variant.
text = "Can I return just one item from the bundle?"
classify_label_only(text=text, multi_label_map=multi_label_map)

'return_refund'

In [34]:
import json

# Load the labelled training dataset using a relative path.
# JSON text is UTF-8, so the encoding is passed explicitly instead of relying on
# the platform default, which can mis-decode non-ASCII characters in the messages.
with open("../data/training.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Each entry is later read through its "message" and "category" keys.
messages = data["messages"]

In [35]:
import random

# Evaluate classify_label_only on a random sample of labelled messages.
SEED = 42  # fixed so the sample, and therefore the reported accuracy, is reproducible
results = []
correct = 0
# random.sample raises ValueError when asked for more items than exist, so the
# sample size is capped at the dataset size.
selected_messages = min(30, len(messages))
# A dedicated generator keeps the draw reproducible without touching the global
# random state.
sampler = random.Random(SEED)
for i, msg in enumerate(sampler.sample(messages, selected_messages), start=1):
    text = msg["message"]
    actual = msg["category"]

    predicted = classify_label_only(text, multi_label_map)
    results.append({
        "message": text,
        "actual": actual,
        "predicted": predicted
    })
    if predicted == actual:
        correct += 1

    print(f"{i:>3}. Text: {text}")
    print(f"   Actual: {actual} | Predicted: {predicted}\n")

# Avoid dividing by zero when the dataset is empty.
accuracy = correct / selected_messages if selected_messages else 0.0
print(f"\n Accuracy: {accuracy:.2%} ({correct}/{selected_messages})")

  1. Text: What are your store hours on weekends?
   Actual: general_inquiry | Predicted: general_inquiry

  2. Text: Can I get a sample before buying?
   Actual: general_inquiry | Predicted: new_order

  3. Text: How do I check the delivery status?
   Actual: order_status | Predicted: order_status

  4. Text: The refund amount is incorrect.
   Actual: return_refund | Predicted: complaint

  5. Text: Testing integration.
   Actual: others | Predicted: feedback

  6. Text: Excellent experience overall.
   Actual: feedback | Predicted: feedback

  7. Text: Need resolution urgently.
   Actual: follow_up | Predicted: complaint

  8. Text: Your return form isn’t working.
   Actual: return_refund | Predicted: complaint

  9. Text: How long does shipping take?
   Actual: general_inquiry | Predicted: order_status

 10. Text: Can someone please check my previous issue?
   Actual: follow_up | Predicted: complaint

 11. Text: How can I get a catalog?
   Actual: general_inquiry | Predicted: genera