# In [None]:  (notebook cell marker left over from the export)
# ==========================================
# MILESTONE 3: URGENCY DETECTION & SCORING
# ==========================================
# Announce the start of this milestone between two 50-character '#' rules.
banner_rule = "#" * 50
print(f"\n{banner_rule}")
print("STARTING MILESTONE 3: URGENCY DETECTION")
print(banner_rule)

# Imports
import random
import re

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
)
from sklearn.model_selection import train_test_split

# -------------------------------------------------------
# STEP 1: PREPARE URGENCY DATA
# -------------------------------------------------------
print("\n--- 1. Preparing Urgency Labels ---")

# Integer encoding of the urgency classes for the ML model:
# low=0, medium=1, high=2.
urgency_map = {'low': 0, 'medium': 1, 'high': 2}
# Derived from urgency_map so the two lookup tables cannot drift apart.
inverse_urgency_map = {label_id: label for label, label_id in urgency_map.items()}

# Normalise case and surrounding whitespace before mapping, so labels such as
# "High" or " low " are encoded instead of silently turning into NaN and being
# dropped below. Missing values become the string "nan", which is not in
# urgency_map and therefore still maps to NaN.
normalized_urgency = df_final['urgency'].astype(str).str.strip().str.lower()
df_final['urgency_id'] = normalized_urgency.map(urgency_map)

# Drop rows whose label is missing or unrecognised, and say how many were lost
# rather than discarding them silently.
rows_before_drop = len(df_final)
# .map() produces floats whenever a NaN is present; cast back to integer ids
# so the model's classes (and its predictions) are plain 0/1/2.
df_final = df_final.dropna(subset=['urgency_id']).astype({'urgency_id': int})
rows_dropped = rows_before_drop - len(df_final)
if rows_dropped:
    print(f"Dropped {rows_dropped} rows with missing/unknown urgency labels")

# Features (x) come from the already-fitted `vectorizer` applied to the
# cleaned text column; the target (y) is the integer urgency id.
x_urgency = vectorizer.transform(df_final['Cleaned_Text'])
y_urgency = df_final['urgency_id']

# 80/20 train/test split dedicated to the urgency task; fixed seed for
# reproducibility. y keeps df_final's index, which later steps use to look up
# the raw text of the test rows.
x_train_u, x_test_u, y_train_u, y_test_u = train_test_split(
    x_urgency, y_urgency, test_size=0.2, random_state=42
)

print(f"Training Samples: {x_train_u.shape[0]}")
print(f"Testing Samples:  {x_test_u.shape[0]}")

# -------------------------------------------------------
# STEP 2: TRAIN URGENCY CLASSIFIER (ML MODEL)
# -------------------------------------------------------
print("\n--- 2. Training Urgency Classification Model ---")

# Logistic Regression baseline.
#   class_weight='balanced' weights classes inversely to their frequency in
#   y_train_u, so rare urgency levels are not ignored.
#   max_iter=1000 raises the solver's iteration cap above the library default.
urgency_model = LogisticRegression(max_iter=1000, class_weight='balanced')
urgency_model.fit(x_train_u, y_train_u)

# Baseline evaluation (ML only): accuracy on the held-out split, reported
# before the keyword rules of the hybrid detector are applied.
# NOTE: accuracy_score must be imported from sklearn.metrics for this line.
y_pred_ml = urgency_model.predict(x_test_u)
ml_accuracy = accuracy_score(y_test_u, y_pred_ml)
print(f"ML Model Accuracy: {ml_accuracy:.4f}")

# -------------------------------------------------------
# STEP 3: HYBRID DETECTION (ML + KEYWORDS)
# -------------------------------------------------------
print("\n--- 3. Implementing Hybrid Detection (ML + Keywords) ---")

# High-priority keywords. Any one of them appearing as a WHOLE WORD forces the
# prediction to HIGH. Because matching is whole-word, inflected forms that
# should also trigger (e.g. "urgently") have to be listed explicitly.
URGENT_KEYWORDS = [
    "urgent", "urgently", "immediately", "asap", "emergency", "critical",
    "now", "hurry", "blocked", "hacked", "fraud", "deadline"
]

# Compiled once at import time from URGENT_KEYWORDS (rebuild it if the list is
# changed afterwards). The \b anchors stop short keywords from firing inside
# unrelated words: "now" must not match "know"/"snow"/"unknown", and "blocked"
# must not match "unblocked".
URGENT_KEYWORD_PATTERN = re.compile(
    r"\b(?:" + "|".join(re.escape(word) for word in URGENT_KEYWORDS) + r")\b",
    re.IGNORECASE,
)

# Integer class id used for 'high' urgency (low=0, medium=1, high=2).
HIGH_URGENCY_ID = 2


def predict_urgency_hybrid(text, vectorizer, model):
    """
    Predict an urgency class id by combining keyword rules with an ML model.

    Rule: if any entry of URGENT_KEYWORDS occurs in `text` as a whole word
    (case-insensitive), return HIGH (2) without consulting the model.
    Otherwise clean the text, vectorize it and return the model's prediction.

    Args:
        text: Raw email text. Non-string values (None, NaN, ...) are treated
            as an empty string, so they fall through to the ML model instead
            of raising.
        vectorizer: Object with a `transform(list_of_str)` method producing
            the features `model` expects.
        model: Object with a `predict(features)` method returning integer-like
            class ids.

    Returns:
        int: the predicted urgency class id (always a plain Python int, so it
        can be used directly as a dictionary key or mixed with other labels).
    """
    # Guard against missing values coming out of a DataFrame column.
    if not isinstance(text, str):
        text = ""

    # 1. Keyword rule: a single whole-word hit is enough to escalate.
    if URGENT_KEYWORD_PATTERN.search(text):
        return HIGH_URGENCY_ID

    # 2. No keyword: defer to the ML model. `clean_email` is defined elsewhere
    #    in this file (the original comment calls it the M1 cleaning function).
    vec_text = vectorizer.transform([clean_email(text)])
    # Normalise the NumPy scalar returned by predict() to a built-in int.
    return int(model.predict(vec_text)[0])

# Score every held-out email with the hybrid (keyword + ML) predictor.
print("Running Hybrid Prediction on Test Set...")

# The keyword rule works on the raw, uncleaned text, so fetch the test rows
# from df_final using the index labels that y_test_u carried through the split.
test_indices = y_test_u.index
test_texts = df_final.loc[test_indices, 'text'].values

# One prediction per test email, in the same order as y_test_u.
y_pred_hybrid = [
    predict_urgency_hybrid(raw_text, vectorizer, urgency_model)
    for raw_text in test_texts
]

# -------------------------------------------------------
# STEP 4: VALIDATION (CONFUSION MATRIX & F1 SCORE)
# -------------------------------------------------------
print("\n--- 4. Validating Hybrid Model ---")

target_names_urgency = ['Low', 'Medium', 'High']
# Class ids in the same order as target_names_urgency (low=0, medium=1, high=2).
# Passing them explicitly keeps the report rows and the matrix axes aligned
# with the names even when a class is absent from the test split or from the
# predictions; without `labels`, a missing class makes target_names mismatch
# the report and shifts the heatmap tick labels onto the wrong classes.
urgency_label_ids = [0, 1, 2]

# Per-class precision / recall / F1. zero_division=0 reports 0 (without a
# warning) for a class that has no predicted or true samples.
print(classification_report(
    y_test_u,
    y_pred_hybrid,
    labels=urgency_label_ids,
    target_names=target_names_urgency,
    zero_division=0,
))

# Weighted F1: per-class F1 averaged with weights proportional to class support.
f1 = f1_score(y_test_u, y_pred_hybrid, average='weighted')
print(f"Weighted F1 Score: {f1:.4f}")

# Confusion matrix, always 3x3 thanks to the explicit label list
# (rows = actual class, columns = predicted class).
cm = confusion_matrix(y_test_u, y_pred_hybrid, labels=urgency_label_ids)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=target_names_urgency,
            yticklabels=target_names_urgency)
plt.xlabel('Predicted Urgency')
plt.ylabel('Actual Urgency')
plt.title('Confusion Matrix: Hybrid Urgency Detection')
plt.show()

# -------------------------------------------------------
# STEP 5: FINAL PREDICTION DEMO (REAL RANDOM DATA)
# -------------------------------------------------------
print("\n" + "="*50)
print("FINAL URGENCY TEST ON 5 RANDOM EMAILS")
print("="*50)

# 1. Pick up to 5 distinct random row positions from the dataframe.
#    The sample size is clamped because random.sample raises ValueError when
#    asked for more items than the population holds (fewer than 5 rows).
demo_sample_size = min(5, len(df_final))
random_indices = random.sample(range(len(df_final)), demo_sample_size)

for idx in random_indices:
    # Positional lookup: idx is a row position, not an index label.
    row = df_final.iloc[idx]
    email_text = row['text']
    actual_urgency_label = row['urgency']

    # Predict using the hybrid system, then translate the class id back to
    # its text label. int() normalises a NumPy scalar before the dict lookup.
    pred_id = predict_urgency_hybrid(email_text, vectorizer, urgency_model)
    predicted_urgency = inverse_urgency_map[int(pred_id)]

    # Display result
    print(f"EMAIL TEXT:      '{email_text}'")
    print(f"ACTUAL URGENCY:   {str(actual_urgency_label).upper()}")
    print(f"PREDICTED URGENCY:{predicted_urgency.upper()}")

    # Case-insensitive comparison of the stored label with the prediction.
    if str(actual_urgency_label).lower() == predicted_urgency:
        print(" MATCH")
    else:
        print(" MISMATCH")