In [1]:
# Chattea Intent Classifier - MLP + Sentence Transformers
# Complete Working Pipeline in Notebook Format
# Ready to run cell by cell!

"""
JUPYTER NOTEBOOK STRUCTURE:
Run each cell in order (Shift+Enter)

Required Files:
- chatbot_dataset.csv (text, intent columns)
- responses.json (your bilingual responses)

Installation:
!pip install torch sentence-transformers scikit-learn pandas difflib
"""

# ============================================================================
# CELL 1: IMPORTS AND SETUP
# ============================================================================

import json
import pandas as pd
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer, util
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from difflib import get_close_matches
import warnings
warnings.filterwarnings('ignore')

print("=" * 80)
print("CHATTEA INTENT CLASSIFIER - MLP + SENTENCE TRANSFORMERS")
print("=" * 80)
print(f"PyTorch Version: {torch.__version__}")
print("=" * 80)

  from .autonotebook import tqdm as notebook_tqdm


CHATTEA INTENT CLASSIFIER - MLP + SENTENCE TRANSFORMERS
PyTorch Version: 2.9.1+cu130


In [2]:
# ============================================================================
# CELL 2: DEVICE CONFIGURATION
# ============================================================================

# Set device (GPU if available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"üñ•Ô∏è  Using device: {device}")

if torch.cuda.is_available():
    print(f"üéÆ GPU: {torch.cuda.get_device_name(0)}")
    print(f"üíæ GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("‚ö†Ô∏è  No GPU detected - training will use CPU (slower)")


üñ•Ô∏è  Using device: cuda
üéÆ GPU: NVIDIA GeForce RTX 4060 Laptop GPU
üíæ GPU Memory: 8.59 GB


In [3]:
# ============================================================================
# CELL 3: LOAD DATA
# ============================================================================

print("\n" + "=" * 80)
print("üìÇ LOADING DATA")
print("=" * 80)

# Load training dataset
df = pd.read_csv("chatbot_dataset.csv")
print(f"‚úì Loaded dataset: {len(df)} samples")
print(f"‚úì Columns: {list(df.columns)}")
print(f"‚úì Unique intents: {df['intent'].nunique()}")

# Show sample data
print("\nüìä Sample data:")
print(df.head(10))

# Intent distribution
print("\nüìà Intent distribution:")
intent_counts = df['intent'].value_counts()
print(intent_counts.head(15))

# Load responses
with open("responses.json", "r", encoding="utf-8") as f:
    RESPONSES = json.load(f)
print(f"\n‚úì Loaded responses for {len(RESPONSES)} intents")


üìÇ LOADING DATA
‚úì Loaded dataset: 2102 samples
‚úì Columns: ['text', 'intent']
‚úì Unique intents: 14

üìä Sample data:
                                              text        intent
0                          how do i send a message  send_message
1            can you send a message to my contacts  send_message
2                    i want to send a bulk message  send_message
3                                     send message  send_message
4                                 how to broadcast  send_message
5      i need to send a message to multiple people  send_message
6                          can i send messages now  send_message
7                          send a whatsapp message  send_message
8  how do i send bulk messages to my customer list  send_message
9                i want to message all my contacts  send_message

üìà Intent distribution:
intent
send_message        160
schedule_message    160
filter_number       160
what_for            160
pricing             160
creat

In [4]:
# ============================================================================
# CELL 4: BUILD VOCABULARY FOR FUZZY MATCHING
# ============================================================================

print("\n" + "=" * 80)
print("üìö BUILDING VOCABULARY FOR FUZZY MATCHING")
print("=" * 80)

# Extract all unique words from training data for typo correction
all_words = set()
for text in df['text'].str.lower():
    all_words.update(re.findall(r'\w+', text))

VOCAB = all_words
print(f"‚úì Vocabulary size: {len(VOCAB)} unique words")
print(f"‚úì Sample words: {list(VOCAB)[:20]}")

def fuzzy_correct(text: str, cutoff: float = 0.8) -> str:
    """
    Typo correction using difflib (Fuzzy String Matching)
    
    Algorithm: Levenshtein Distance
    - Finds closest matching words from vocabulary
    - Corrects typos while preserving sentence structure
    
    Example: "blst mesage" ‚Üí "blast message"
    """
    words = re.findall(r'\w+', text.lower())
    corrected = []
    
    for word in words:
        # Find closest match in vocabulary
        matches = get_close_matches(word, VOCAB, n=1, cutoff=cutoff)
        corrected.append(matches[0] if matches else word)
    
    # Reconstruct sentence preserving original punctuation
    result = text
    for orig, corr in zip(words, corrected):
        if orig != corr:
            result = re.sub(rf'\b{orig}\b', corr, result, count=1, flags=re.IGNORECASE)
    
    return result

# Test fuzzy correction
print("\nüß™ Testing Fuzzy Correction:")
test_cases = [
    "blst mesage",
    "chek number",
    "craete instance",
    "shedule mesage"
]

for test in test_cases:
    corrected = fuzzy_correct(test)
    print(f"   '{test}' ‚Üí '{corrected}'")


üìö BUILDING VOCABULARY FOR FUZZY MATCHING
‚úì Vocabulary size: 1037 unique words
‚úì Sample words: ['offer', 'paying', 'bonjour', 'every', 'fresh', 'without', 'paid', 'anymore', 'hasn', 'recognize', 'were', 'thursday', 'reflect', 'lol', 'when', 'creating', 'follow', '1000', 'starting', 'does']

üß™ Testing Fuzzy Correction:
   'blst mesage' ‚Üí 'blast message'
   'chek number' ‚Üí 'check number'
   'craete instance' ‚Üí 'create instance'
   'shedule mesage' ‚Üí 'schedule message'


In [5]:
# ============================================================================
# CELL 5: LABEL ENCODING
# ============================================================================

print("\n" + "=" * 80)
print("üè∑Ô∏è  ENCODING LABELS")
print("=" * 80)

# Encode intent labels to numeric values
le = LabelEncoder()
df['label'] = le.fit_transform(df['intent'])

num_classes = len(le.classes_)
intent_map = dict(enumerate(le.classes_))

print(f"‚úì Number of classes: {num_classes}")
print(f"\nüìã Intent mapping (first 10):")
for idx, intent in list(intent_map.items())[:10]:
    print(f"   {idx}: {intent}")


üè∑Ô∏è  ENCODING LABELS
‚úì Number of classes: 14

üìã Intent mapping (first 10):
   0: contact
   1: create_group
   2: create_instance
   3: delete_group
   4: delete_instance
   5: edit_group
   6: edit_instance
   7: filter_number
   8: greeting
   9: pricing


In [6]:

# ============================================================================
# CELL 6: SENTENCE EMBEDDINGS (WORD2VEC ALTERNATIVE)
# ============================================================================

print("\n" + "=" * 80)
print("üß† GENERATING SENTENCE EMBEDDINGS")
print("=" * 80)
print("Using: Sentence Transformers (all-MiniLM-L6-v2)")
print("This is a neural embedding model (similar to Word2Vec but sentence-level)")
print("=" * 80)

# Load pre-trained sentence embedding model
embedder = SentenceTransformer('all-MiniLM-L6-v2')
print("‚úì Loaded embedding model")
print(f"‚úì Embedding dimension: 384")

# Generate embeddings for all training samples
print("\nüìä Encoding training data...")
sentence_embeddings = embedder.encode(
    df['text'].tolist(), 
    convert_to_tensor=True,
    show_progress_bar=True
).to(device)

print(f"‚úì Generated embeddings: {sentence_embeddings.shape}")
print(f"   - Shape: (num_samples, embedding_dim)")
print(f"   - Device: {sentence_embeddings.device}")


üß† GENERATING SENTENCE EMBEDDINGS
Using: Sentence Transformers (all-MiniLM-L6-v2)
This is a neural embedding model (similar to Word2Vec but sentence-level)
‚úì Loaded embedding model
‚úì Embedding dimension: 384

üìä Encoding training data...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 66/66 [00:01<00:00, 53.68it/s]


‚úì Generated embeddings: torch.Size([2102, 384])
   - Shape: (num_samples, embedding_dim)
   - Device: cuda:0


In [7]:

# ============================================================================
# CELL 7: FEEDFORWARD NETWORK
# ============================================================================

print("\n" + "=" * 80)
print("üèóÔ∏è  FEEDFORWARD CLASSIFIER ARCHITECTURE")
print("=" * 80)

class EmbeddingClassifier(nn.Module):
    """
    Simple Feedforward Neural Network for Sentence Embeddings
    
    Why not CNN?
    - CNNs are for sequential data (words in sentence)
    - We already have holistic sentence embeddings (384-dim vectors)
    - Feedforward network is the right architecture for this!
    
    Architecture:
    1. Input: 384-dim sentence embedding
    2. Hidden Layer 1: 384 ‚Üí 256 (ReLU + Dropout)
    3. Hidden Layer 2: 256 ‚Üí 128 (ReLU + Dropout)
    4. Output Layer: 128 ‚Üí num_classes
    """
    
    def __init__(self, embed_dim=384, num_classes=num_classes):
        super().__init__()
        
        self.network = nn.Sequential(
            # Layer 1
            nn.Linear(embed_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            
            # Layer 2
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            
            # Output layer
            nn.Linear(128, num_classes)
        )
    
    def forward(self, x):
        # x shape: (batch_size, 384)
        return self.network(x)  # (batch_size, num_classes)

# Print model architecture
print("\nüìê Model Architecture:")
model = EmbeddingClassifier()
print(model)
print(f"\n‚úì Total parameters: {sum(p.numel() for p in model.parameters()):,}")
print("‚úì Model defined successfully")


üèóÔ∏è  FEEDFORWARD CLASSIFIER ARCHITECTURE

üìê Model Architecture:
EmbeddingClassifier(
  (network): Sequential(
    (0): Linear(in_features=384, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=256, out_features=128, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.3, inplace=False)
    (6): Linear(in_features=128, out_features=14, bias=True)
  )
)

‚úì Total parameters: 133,262
‚úì Model defined successfully


In [8]:

# ============================================================================
# CELL 8: TRAIN OR LOAD MLP MODEL
# ============================================================================

print("\n" + "=" * 80)
print("üéØ TRAINING MLP MODEL")
print("=" * 80)

model_path = "chattea.pth"

# Force retrain
import os
if os.path.exists(model_path):
    os.remove(model_path)
    print("‚ö†Ô∏è  Deleted old model - retraining from scratch!")

print("\n" + "=" * 80)
print("üìö PREPARING TRAINING DATA")
print("=" * 80)

# Use the pre-computed embeddings from Cell 6
X = sentence_embeddings.to(device)
y = torch.tensor(df['label'].values, dtype=torch.long).to(device)

print(f"‚úì X shape: {X.shape}")
print(f"‚úì y shape: {y.shape}")

# Train/validation split (stratified)
from sklearn.model_selection import train_test_split

train_idx, val_idx = train_test_split(
    list(range(len(X))),
    test_size=0.2,
    random_state=42,
    stratify=y.cpu()
)

X_train = X[train_idx]
X_val = X[val_idx]
y_train = y[train_idx]
y_val = y[val_idx]

print(f"\n‚úì Training samples: {len(X_train)}")
print(f"‚úì Validation samples: {len(X_val)}")

print("\n" + "=" * 80)
print("üèãÔ∏è  TRAINING LOOP (WITH PROPER BATCHING!)")
print("=" * 80)

# Initialize model
model = EmbeddingClassifier().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Lower LR
criterion = nn.CrossEntropyLoss()

# Training parameters
BATCH_SIZE = 32
EPOCHS = 50  # More epochs

# Create mini-batches
def create_batches(X, y, batch_size):
    """Create mini-batches for training"""
    indices = torch.randperm(len(X))
    for i in range(0, len(X), batch_size):
        batch_idx = indices[i:i+batch_size]
        yield X[batch_idx], y[batch_idx]

# Training loop
print("\nEpoch | Train Acc | Train Loss | Val Acc | Val Loss")
print("-" * 65)

best_val_acc = 0

for epoch in range(EPOCHS):
    # ========== TRAINING ==========
    model.train()
    train_loss = 0
    train_correct = 0
    train_total = 0
    
    for batch_X, batch_y in create_batches(X_train, y_train, BATCH_SIZE):
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        # Track metrics
        train_loss += loss.item()
        train_correct += (outputs.argmax(1) == batch_y).sum().item()
        train_total += len(batch_y)
    
    train_acc = train_correct / train_total
    train_loss = train_loss / (len(X_train) // BATCH_SIZE + 1)
    
    # ========== VALIDATION ==========
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val)
        val_loss = criterion(val_outputs, y_val).item()
        val_acc = (val_outputs.argmax(1) == y_val).float().mean().item()
    
    # Print progress
    if epoch % 5 == 0 or epoch == EPOCHS - 1:
        print(f"{epoch:5d} | {train_acc:9.4f} | {train_loss:10.4f} | {val_acc:7.4f} | {val_loss:8.4f}")
    
    # Save best model
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), model_path)

print("\n" + "=" * 80)
print(f"‚úì Training Complete!")
print(f"‚úì Best Validation Accuracy: {best_val_acc:.4f} ({best_val_acc*100:.2f}%)")
print(f"‚úì Model saved to: {model_path}")
print("=" * 80)

# Load best model
model.load_state_dict(torch.load(model_path))
model.eval()

# Final check on training data
with torch.no_grad():
    final_train_pred = model(X_train).argmax(1)
    final_train_acc = (final_train_pred == y_train).float().mean().item()
    
    final_val_pred = model(X_val).argmax(1)
    final_val_acc = (final_val_pred == y_val).float().mean().item()

print(f"\nüìä Final Performance:")
print(f"   Training Accuracy:   {final_train_acc:.4f} ({final_train_acc*100:.2f}%)")
print(f"   Validation Accuracy: {final_val_acc:.4f} ({final_val_acc*100:.2f}%)")

print("\n‚úì Model ready for inference!")


üéØ TRAINING MLP MODEL

üìö PREPARING TRAINING DATA
‚úì X shape: torch.Size([2102, 384])
‚úì y shape: torch.Size([2102])

‚úì Training samples: 1681
‚úì Validation samples: 421

üèãÔ∏è  TRAINING LOOP (WITH PROPER BATCHING!)

Epoch | Train Acc | Train Loss | Val Acc | Val Loss
-----------------------------------------------------------------
    0 |    0.4878 |     2.3190 |  0.8741 |   1.4435
    5 |    0.9780 |     0.0710 |  0.9857 |   0.0426
   10 |    0.9958 |     0.0262 |  0.9905 |   0.0232
   15 |    0.9976 |     0.0130 |  0.9952 |   0.0222
   20 |    0.9994 |     0.0086 |  0.9952 |   0.0194
   25 |    0.9988 |     0.0054 |  0.9952 |   0.0164
   30 |    1.0000 |     0.0030 |  0.9952 |   0.0191
   35 |    0.9988 |     0.0039 |  0.9929 |   0.0191
   40 |    0.9994 |     0.0026 |  0.9952 |   0.0178
   45 |    1.0000 |     0.0022 |  0.9952 |   0.0182
   49 |    0.9994 |     0.0018 |  0.9952 |   0.0183

‚úì Training Complete!
‚úì Best Validation Accuracy: 0.9952 (99.52%)
‚úì Model s

In [9]:
# ============================================================================
# CELL 10: MAIN CHAT FUNCTION (INFERENCE)
# ============================================================================

print("\n" + "=" * 80)
print("üí¨ CHAT INFERENCE FUNCTION")
print("=" * 80)

def get_chattea_reply(user_input: str) -> str:
    """
    Main chatbot inference function
    
    Pipeline:
    1. Rule-based filters (greetings, goodbyes)
    2. Phone number extraction (if applicable)
    3. CNN classification with confidence check
    4. Retrieval fallback (if low confidence)
    5. Response generation
    
    Args:
        user_input: User's message
        
    Returns:
        Bot's response
    """
    text = user_input.strip().lower()
    
    # ==================== RULE-BASED FILTERS ====================
    # Quick responses for common greetings
    if any(g in text for g in ["hai", "halo", "hello", "hi", "hey", "pagi", "siang", "malam"]):
        return RESPONSES["greeting"]["en"]
    
    if any(g in text for g in ["bye", "goodbye", "dadah", "sampai jumpa"]):
        return RESPONSES["unknown"]["en"]
    
    # ==================== EMBEDDING + PREDICTION ====================
    with torch.no_grad():
        # Encode user input
        user_emb = embedder.encode(user_input, convert_to_tensor=True).to(device)
        user_emb = user_emb.unsqueeze(0)  # (1, 384)
        
        # MLP prediction
        logits = model(user_emb)
        probs = logits.softmax(1)
        confidence = probs.max().item()
        intent = intent_map[logits.argmax(1).item()]
        
        # Retrieval fallback (semantic similarity)
        cos_scores = util.cos_sim(user_emb, sentence_embeddings)[0]
        best_match_idx = cos_scores.argmax().item()
        retrieval_intent = df.iloc[best_match_idx]['intent']
        retrieval_score = cos_scores[best_match_idx].item()
        
        # Choose final intent based on confidence
        if confidence > 0.90:
            final_intent = intent
            source = "MLP"
        else:
            final_intent = retrieval_intent
            source = "Retrieval"
    
    # ==================== SPECIAL CASES ====================

    # ==================== RESPONSE GENERATION ====================
    response = RESPONSES.get(final_intent, RESPONSES.get("help", "I'm not sure how to help with that."))
    
    # Handle both dict (bilingual) and string responses
    if isinstance(response, dict):
        return response.get("en", response.get("id", "I'm not sure how to help with that."))
    
    return response

print("‚úì Chat function ready!")


üí¨ CHAT INFERENCE FUNCTION
‚úì Chat function ready!


In [10]:

# ============================================================================
# CELL 11: TEST INFERENCE
# ============================================================================

print("\n" + "=" * 80)
print("üß™ TESTING INFERENCE")
print("=" * 80)

test_queries = [
    "hello",
    "what is chattea",
    "how to blast message",
    "check 08123456789",
    "create instance",
    "schedule message",
    "thanks",
    "goodbye"
]

print("\nRunning test queries:\n")
for query in test_queries:
    print(f"üë§ User: {query}")
    response = get_chattea_reply(query)
    print(f"ü§ñ Bot: {response[:100]}{'...' if len(response) > 100 else ''}")
    print("-" * 80)



üß™ TESTING INFERENCE

Running test queries:

üë§ User: hello
ü§ñ Bot: Hello! üëã Welcome to Chattea.

I'm here to help you navigate features like sending messages, managing...
--------------------------------------------------------------------------------
üë§ User: what is chattea
ü§ñ Bot: Chattea is a WhatsApp marketing automation platform designed for businesses.

üéØ Key features:
‚Ä¢ Send...
--------------------------------------------------------------------------------
üë§ User: how to blast message
ü§ñ Bot: To send a message:

1. Choose an active instance
2. Enter the destination number (e.g., 628123456789...
--------------------------------------------------------------------------------
üë§ User: check 08123456789
ü§ñ Bot: To filter/check phone numbers:

1. Open **Tools** ‚Üí **Phone Checker**
2. Enter a phone number or upl...
--------------------------------------------------------------------------------
üë§ User: create instance
ü§ñ Bot: To create a new What

In [11]:

# ============================================================================
# CELL 13: MLP MODEL EVALUATION
# ============================================================================

print("\n" + "=" * 80)
print("üìä MODEL EVALUATION")
print("=" * 80)

# Evaluate on validation set
with torch.no_grad():
    # Get embeddings
    X_all = embedder.encode(df['text'].tolist(), convert_to_tensor=True).to(device)
    y_all = torch.tensor(df['label'].values, dtype=torch.long).to(device)
    
    # Split
    train_idx, val_idx = train_test_split(
        torch.arange(len(X_all)),
        test_size=0.2,
        random_state=42,
        stratify=y_all.cpu()
    )
    
    X_val = X_all[val_idx].to(device)
    y_val = y_all[val_idx].to(device)
    
    # Predict
    val_outputs = model(X_val)
    val_preds = val_outputs.argmax(1)
    
    # Accuracy
    val_acc = (val_preds == y_val).float().mean().item()
    
    print(f"‚úì Validation Accuracy: {val_acc:.4f} ({val_acc*100:.2f}%)")
    
    # Per-class accuracy
    print("\nüìã Per-Intent Performance:")
    for intent_id in range(num_classes):
        intent_name = intent_map[intent_id]
        mask = y_val == intent_id
        if mask.sum() > 0:
            intent_acc = (val_preds[mask] == y_val[mask]).float().mean().item()
            count = mask.sum().item()
            print(f"   {intent_name:30s}: {intent_acc:.3f} ({count:2d} samples)")

# Test on ALL training data (should be near perfect)
with torch.no_grad():
    all_outputs = model(sentence_embeddings.to(device))
    all_preds = all_outputs.argmax(1)
    all_labels = torch.tensor(df['label'].values, dtype=torch.long).to(device)
    
    train_acc = (all_preds == all_labels).float().mean().item()
    print(f"Accuracy on FULL training set: {train_acc:.4f}")

print("\n" + "=" * 80)
print("‚úÖ NOTEBOOK COMPLETE!")
print("=" * 80)
print("\nYour model is ready to use!")

# ============================================================================
# FRESH START - DELETE EVERYTHING AND RETRAIN
# ============================================================================

import os

# 1. Delete saved model
if os.path.exists("chattea.pth"):
    os.remove("chattea.pth")
    print("‚úì Deleted old model")

# 2. Clear GPU cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("‚úì Cleared GPU cache")

# 3. Restart notebook kernel (Kernel ‚Üí Restart & Run All)
print("\n‚ö†Ô∏è  NOW RESTART KERNEL AND RUN ALL CELLS FROM TOP!")


üìä MODEL EVALUATION
‚úì Validation Accuracy: 0.9952 (99.52%)

üìã Per-Intent Performance:
   contact                       : 1.000 (32 samples)
   create_group                  : 1.000 (32 samples)
   create_instance               : 1.000 (32 samples)
   delete_group                  : 1.000 (32 samples)
   delete_instance               : 1.000 (32 samples)
   edit_group                    : 1.000 (32 samples)
   edit_instance                 : 1.000 (32 samples)
   filter_number                 : 1.000 (32 samples)
   greeting                      : 0.923 (13 samples)
   pricing                       : 1.000 (32 samples)
   schedule_message              : 1.000 (32 samples)
   send_message                  : 1.000 (32 samples)
   unknown                       : 1.000 (24 samples)
   what_for                      : 0.969 (32 samples)
Accuracy on FULL training set: 0.9981

‚úÖ NOTEBOOK COMPLETE!

Your model is ready to use!
‚úì Deleted old model
‚úì Cleared GPU cache

‚ö†Ô∏è  NOW RE

In [12]:

# ============================================================================
# CELL 7: CNN MODEL ARCHITECTURE
# ============================================================================

print("\n" + "=" * 80)
print("üèóÔ∏è  CNN MODEL ARCHITECTURE")
print("=" * 80)

class TextCNN(nn.Module):
    """
    Convolutional Neural Network for Text Classification
    
    Architecture:
    1. Input: Sentence embeddings (384-dim vectors)
    2. Multiple Conv1D layers with different kernel sizes (3, 4, 5)
       - Detects patterns of different n-gram lengths
    3. Max pooling: Extract most important features
    4. Dropout: Prevent overfitting (40%)
    5. Fully connected layer: Final classification
    
    Why CNN for text?
    - Detects local patterns (like phrases)
    - Translation invariant (same pattern anywhere in text)
    - Faster than RNN/LSTM
    - Simpler than Transformers
    """
    
    def __init__(self, embed_dim=384, num_classes=num_classes):
        super().__init__()
        
        # Multiple convolution layers with different kernel sizes
        # This captures n-grams of different lengths
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=1, out_channels=128, kernel_size=k) 
            for k in [3, 4, 5]
        ])
        
        # Dropout for regularization
        self.dropout = nn.Dropout(0.4)
        
        # Fully connected output layer
        self.fc = nn.Linear(128 * 3, num_classes)
    
    def forward(self, x):
        # x shape: (batch_size, 384)
        x = x.unsqueeze(1)  # (batch_size, 1, 384)
        
        # Apply convolutions and max pooling
        convs = [F.relu(conv(x)).max(dim=2)[0] for conv in self.convs]
        
        # Concatenate all conv outputs
        x = torch.cat(convs, dim=1)  # (batch_size, 128*3)
        
        # Dropout and classification
        x = self.dropout(x)
        return self.fc(x)

# Print model architecture
print("\nüìê Model Architecture:")
print(TextCNN())
print("\n‚úì Model defined successfully")


üèóÔ∏è  CNN MODEL ARCHITECTURE

üìê Model Architecture:
TextCNN(
  (convs): ModuleList(
    (0): Conv1d(1, 128, kernel_size=(3,), stride=(1,))
    (1): Conv1d(1, 128, kernel_size=(4,), stride=(1,))
    (2): Conv1d(1, 128, kernel_size=(5,), stride=(1,))
  )
  (dropout): Dropout(p=0.4, inplace=False)
  (fc): Linear(in_features=384, out_features=14, bias=True)
)

‚úì Model defined successfully


In [13]:

# ============================================================================
# CELL 8: TRAIN OR LOAD CNN MODEL
# ============================================================================

print("\n" + "=" * 80)
print("üéØ TRAINING CNN MODEL")
print("=" * 80)

model_path = "cnn_chattea.pth"

try:
    # Try to load pre-trained model
    cnn_model = TextCNN().to(device)
    cnn_model.load_state_dict(torch.load(model_path, map_location=device))
    print("‚úì Loaded pre-trained CNN model from", model_path)
    
except FileNotFoundError:
    print("‚ö†Ô∏è  No pre-trained model found. Training from scratch...")
    print("\n" + "=" * 80)
    print("üìö PREPARING TRAINING DATA")
    print("=" * 80)
    
    # Encode all texts
    X = embedder.encode(df['text'].tolist(), convert_to_tensor=True).to(device)
    y = torch.tensor(df['label'].values, dtype=torch.long).to(device)
    
    print(f"‚úì X shape: {X.shape}")
    print(f"‚úì y shape: {y.shape}")
    
    # Train/validation split (stratified)
    train_idx, val_idx = train_test_split(
        torch.arange(len(X)),
        test_size=0.2,
        random_state=42,
        stratify=y.cpu()  # Stratify to maintain class distribution
    )
    
    X_train = X[train_idx].to(device)
    X_val = X[val_idx].to(device)
    y_train = y[train_idx].to(device)
    y_val = y[val_idx].to(device)
    
    print(f"\n‚úì Training samples: {len(X_train)}")
    print(f"‚úì Validation samples: {len(X_val)}")
    
    print("\n" + "=" * 80)
    print("üèãÔ∏è  TRAINING LOOP")
    print("=" * 80)
    
    # Initialize model
    cnn_model = TextCNN().to(device)
    optimizer = torch.optim.Adam(cnn_model.parameters(), lr=0.002)
    criterion = nn.CrossEntropyLoss()
    
    # Training loop
    cnn_model.train()
    print("\nEpoch | Accuracy | Loss")
    print("-" * 40)
    
    for epoch in range(30):
        optimizer.zero_grad()
        
        # Forward pass
        outputs = cnn_model(X_train)
        loss = criterion(outputs, y_train)
        
        # Backward pass
        loss.backward()
        optimizer.step()
        
        # Calculate accuracy
        acc = (outputs.argmax(1) == y_train).float().mean().item()
        
        # Print progress every 5 epochs
        if epoch % 5 == 0:
            print(f"{epoch:5d} | {acc:8.4f} | {loss.item():8.4f}")
    
    # Final training accuracy
    with torch.no_grad():
        outputs = cnn_model(X_train)
        train_acc = (outputs.argmax(1) == y_train).float().mean().item()
        
        # Validation accuracy
        val_outputs = cnn_model(X_val)
        val_acc = (val_outputs.argmax(1) == y_val).float().mean().item()
    
    print("\n" + "=" * 80)
    print(f"‚úì Final Training Accuracy: {train_acc:.4f} ({train_acc*100:.2f}%)")
    print(f"‚úì Final Validation Accuracy: {val_acc:.4f} ({val_acc*100:.2f}%)")
    print("=" * 80)
    
    # Save model
    torch.save(cnn_model.state_dict(), model_path)
    print(f"\n‚úì Model saved to: {model_path}")

# Set model to evaluation mode
cnn_model.eval()
print("\n‚úì Model ready for inference!")



üéØ TRAINING CNN MODEL
‚ö†Ô∏è  No pre-trained model found. Training from scratch...

üìö PREPARING TRAINING DATA
‚úì X shape: torch.Size([2102, 384])
‚úì y shape: torch.Size([2102])

‚úì Training samples: 1681
‚úì Validation samples: 421

üèãÔ∏è  TRAINING LOOP

Epoch | Accuracy | Loss
----------------------------------------
    0 |   0.0714 |   2.6593
    5 |   0.0833 |   2.6337
   10 |   0.0756 |   2.6298
   15 |   0.0892 |   2.6273
   20 |   0.0821 |   2.6222
   25 |   0.0964 |   2.6161

‚úì Final Training Accuracy: 0.0976 (9.76%)
‚úì Final Validation Accuracy: 0.0998 (9.98%)

‚úì Model saved to: cnn_chattea.pth

‚úì Model ready for inference!


In [14]:
# ============================================================================
# CELL 10: MAIN CHAT FUNCTION (INFERENCE)
# ============================================================================

print("\n" + "=" * 80)
print("üí¨ CHAT INFERENCE FUNCTION")
print("=" * 80)

def get_chattea_reply(user_input: str) -> str:
    """
    Main chatbot inference function
    
    Pipeline:
    1. Rule-based filters (greetings, goodbyes)
    2. Phone number extraction (if applicable)
    3. CNN classification with confidence check
    4. Retrieval fallback (if low confidence)
    5. Response generation
    
    Args:
        user_input: User's message
        
    Returns:
        Bot's response
    """
    text = user_input.strip().lower()
    
    # ==================== RULE-BASED FILTERS ====================
    # Quick responses for common greetings
    if any(g in text for g in ["hai", "halo", "hello", "hi", "hey", "pagi", "siang", "malam"]):
        return RESPONSES["greeting"]["en"]
    
    if any(g in text for g in ["bye", "goodbye", "dadah", "sampai jumpa"]):
        return RESPONSES["unknown"]["en"]
    
    # ==================== EMBEDDING + PREDICTION ====================
    with torch.no_grad():
        # Encode user input
        user_emb = embedder.encode(user_input, convert_to_tensor=True).to(device)
        user_emb = user_emb.unsqueeze(0)  # (1, 384)
        
        # CNN prediction
        cnn_logits = cnn_model(user_emb)
        cnn_probs = cnn_logits.softmax(1)
        cnn_confidence = cnn_probs.max().item()
        cnn_intent = intent_map[cnn_logits.argmax(1).item()]
        
        # Retrieval fallback (semantic similarity)
        cos_scores = util.cos_sim(user_emb, sentence_embeddings)[0]
        best_match_idx = cos_scores.argmax().item()
        retrieval_intent = df.iloc[best_match_idx]['intent']
        retrieval_score = cos_scores[best_match_idx].item()
        
        # Choose final intent based on confidence
        if cnn_confidence > 0.90:
            final_intent = cnn_intent
            source = "CNN"
        else:
            final_intent = retrieval_intent
            source = "Retrieval"
    
    # ==================== SPECIAL CASES ====================
        
    # ==================== RESPONSE GENERATION ====================
    response = RESPONSES.get(final_intent, RESPONSES.get("help", "I'm not sure how to help with that."))
    
    # Handle both dict (bilingual) and string responses
    if isinstance(response, dict):
        return response.get("en", response.get("id", "I'm not sure how to help with that."))
    
    return response

print("‚úì Chat function ready!")


üí¨ CHAT INFERENCE FUNCTION
‚úì Chat function ready!


In [15]:

# ============================================================================
# CELL 11: TEST INFERENCE
# ============================================================================

print("\n" + "=" * 80)
print("üß™ TESTING INFERENCE")
print("=" * 80)

test_queries = [
    "hello",
    "what is chattea",
    "how to blast message",
    "check 08123456789",
    "create instance",
    "schedule message",
    "thanks",
    "goodbye"
]

print("\nRunning test queries:\n")
for query in test_queries:
    print(f"üë§ User: {query}")
    response = get_chattea_reply(query)
    print(f"ü§ñ Bot: {response[:100]}{'...' if len(response) > 100 else ''}")
    print("-" * 80)



üß™ TESTING INFERENCE

Running test queries:

üë§ User: hello
ü§ñ Bot: Hello! üëã Welcome to Chattea.

I'm here to help you navigate features like sending messages, managing...
--------------------------------------------------------------------------------
üë§ User: what is chattea
ü§ñ Bot: Chattea is a WhatsApp marketing automation platform designed for businesses.

üéØ Key features:
‚Ä¢ Send...
--------------------------------------------------------------------------------
üë§ User: how to blast message
ü§ñ Bot: To send a message:

1. Choose an active instance
2. Enter the destination number (e.g., 628123456789...
--------------------------------------------------------------------------------
üë§ User: check 08123456789
ü§ñ Bot: To filter/check phone numbers:

1. Open **Tools** ‚Üí **Phone Checker**
2. Enter a phone number or upl...
--------------------------------------------------------------------------------
üë§ User: create instance
ü§ñ Bot: To create a new What

In [16]:

# ============================================================================
# CELL 13: CNN MODEL EVALUATION
# ============================================================================

print("\n" + "=" * 80)
print("üìä MODEL EVALUATION")
print("=" * 80)

# Evaluate on validation set
with torch.no_grad():
    # Get embeddings
    X_all = embedder.encode(df['text'].tolist(), convert_to_tensor=True).to(device)
    y_all = torch.tensor(df['label'].values, dtype=torch.long).to(device)
    
    # Split
    train_idx, val_idx = train_test_split(
        torch.arange(len(X_all)),
        test_size=0.2,
        random_state=42,
        stratify=y_all.cpu()
    )
    
    X_val = X_all[val_idx].to(device)
    y_val = y_all[val_idx].to(device)
    
    # Predict
    val_outputs = cnn_model(X_val)
    val_preds = val_outputs.argmax(1)
    
    # Accuracy
    val_acc = (val_preds == y_val).float().mean().item()
    
    print(f"‚úì Validation Accuracy: {val_acc:.4f} ({val_acc*100:.2f}%)")
    
    # Per-class accuracy
    print("\nüìã Per-Intent Performance:")
    for intent_id in range(num_classes):
        intent_name = intent_map[intent_id]
        mask = y_val == intent_id
        if mask.sum() > 0:
            intent_acc = (val_preds[mask] == y_val[mask]).float().mean().item()
            count = mask.sum().item()
            print(f"   {intent_name:30s}: {intent_acc:.3f} ({count:2d} samples)")

# Test on ALL training data (should be near perfect)
with torch.no_grad():
    all_outputs = cnn_model(sentence_embeddings.to(device))
    all_preds = all_outputs.argmax(1)
    all_labels = torch.tensor(df['label'].values, dtype=torch.long).to(device)
    
    train_acc = (all_preds == all_labels).float().mean().item()
    print(f"Accuracy on FULL training set: {train_acc:.4f}")

print("\n" + "=" * 80)
print("‚úÖ NOTEBOOK COMPLETE!")
print("=" * 80)
print("\nYour model is ready to use!")

# ============================================================================
# FRESH START - DELETE EVERYTHING AND RETRAIN
# ============================================================================

import os

# 1. Delete saved model
if os.path.exists("cnn_chattea.pth"):
    os.remove("cnn_chattea.pth")
    print("‚úì Deleted old model")

# 2. Clear GPU cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("‚úì Cleared GPU cache")

# 3. Restart notebook kernel (Kernel ‚Üí Restart & Run All)
print("\n‚ö†Ô∏è  NOW RESTART KERNEL AND RUN ALL CELLS FROM TOP!")


üìä MODEL EVALUATION
‚úì Validation Accuracy: 0.1758 (17.58%)

üìã Per-Intent Performance:
   contact                       : 0.094 (32 samples)
   create_group                  : 0.531 (32 samples)
   create_instance               : 0.000 (32 samples)
   delete_group                  : 0.250 (32 samples)
   delete_instance               : 0.094 (32 samples)
   edit_group                    : 0.906 (32 samples)
   edit_instance                 : 0.031 (32 samples)
   filter_number                 : 0.000 (32 samples)
   greeting                      : 0.000 (13 samples)
   pricing                       : 0.406 (32 samples)
   schedule_message              : 0.000 (32 samples)
   send_message                  : 0.000 (32 samples)
   unknown                       : 0.000 (24 samples)
   what_for                      : 0.000 (32 samples)
Accuracy on FULL training set: 0.1912

‚úÖ NOTEBOOK COMPLETE!

Your model is ready to use!
‚úì Deleted old model
‚úì Cleared GPU cache

‚ö†Ô∏è  NOW RE

In [27]:
import json
import pandas as pd
import re
import os
import sys
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from difflib import get_close_matches, SequenceMatcher
import warnings
import time

# ============================================================================
# CONFIGURATION
# ============================================================================

class Config:
    # File paths
    DATASET_PATH = "chatbot_dataset.csv"
    RESPONSES_PATH = "responses.json"
    MODEL_PATH = "cnn_chattea.pth"
    WORD2VEC_PATH = "word2vec.model"
    
    # Word2Vec parameters
    EMBEDDING_DIM = 100        # Embedding dimension
    WORD2VEC_WINDOW = 5        # Context window
    WORD2VEC_MIN_COUNT = 1     # Minimum word frequency
    WORD2VEC_SG = 1            # Skip-gram (better for small datasets)

    # CNN parameters
    NUM_FILTERS = 128          # Filters per kernel
    KERNEL_SIZES = [2, 3, 4]   # Includes 2-word phrases!
    DROPOUT = 0.5              # Higher regularization
    MAX_SEQ_LENGTH = 20        # Shorter = more efficient

    # Training parameters
    BATCH_SIZE = 32
    EPOCHS = 30
    LEARNING_RATE = 0.001
    TEST_SIZE = 0.2
    RANDOM_SEED = 42

    # Inference parameters
    FUZZY_CUTOFF = 0.8
    CONFIDENCE_THRESHOLD = 0.75

    # Device
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

config = Config()

# Reproducibility
torch.manual_seed(Config.RANDOM_SEED)
np.random.seed(Config.RANDOM_SEED)

In [31]:
# ============================================================================
# DATASET SANITY CHECK & EXPLORATORY ANALYSIS
# ============================================================================

print("=" * 80)
print("üìä DATASET SANITY CHECK & ANALYSIS")
print("=" * 80)

# Load dataset
df = pd.read_csv(Config.DATASET_PATH)

print("\n1Ô∏è‚É£  BASIC STATISTICS")
print("-" * 80)
print(f"Total samples: {len(df)}")
print(f"Total intents: {df['intent'].nunique()}")
print(f"Columns: {list(df.columns)}")

# Check for missing values
print("\n2Ô∏è‚É£  DATA QUALITY")
print("-" * 80)
missing = df.isnull().sum()
print("Missing values:")
for col in df.columns:
    print(f"  {col}: {missing[col]} ({missing[col]/len(df)*100:.2f}%)")

# Check for duplicates
duplicates = df.duplicated().sum()
print(f"\nDuplicate rows: {duplicates} ({duplicates/len(df)*100:.2f}%)")

# Intent distribution
print("\n3Ô∏è‚É£  INTENT DISTRIBUTION")
print("-" * 80)
intent_counts = df['intent'].value_counts()
print(intent_counts)
print(f"\nMost common: {intent_counts.index[0]} ({intent_counts.iloc[0]} samples)")
print(f"Least common: {intent_counts.index[-1]} ({intent_counts.iloc[-1]} samples)")
print(f"Class imbalance ratio: {intent_counts.iloc[0] / intent_counts.iloc[-1]:.2f}x")

# Text length analysis
print("\n4Ô∏è‚É£  TEXT LENGTH ANALYSIS")
print("-" * 80)
df['text_length'] = df['text'].str.len()
df['word_count'] = df['text'].str.split().str.len()

print(f"Character length:")
print(f"  Min: {df['text_length'].min()}")
print(f"  Max: {df['text_length'].max()}")
print(f"  Mean: {df['text_length'].mean():.2f}")
print(f"  Median: {df['text_length'].median():.0f}")

print(f"\nWord count:")
print(f"  Min: {df['word_count'].min()}")
print(f"  Max: {df['word_count'].max()}")
print(f"  Mean: {df['word_count'].mean():.2f}")
print(f"  Median: {df['word_count'].median():.0f}")

# Find longest sentences
print("\n5Ô∏è‚É£  LONGEST SENTENCES (Top 5)")
print("-" * 80)
longest = df.nlargest(5, 'word_count')[['text', 'intent', 'word_count']]
for idx, row in longest.iterrows():
    print(f"\n[{row['word_count']} words] Intent: {row['intent']}")
    print(f"Text: {row['text']}")

# Find shortest sentences
print("\n6Ô∏è‚É£  SHORTEST SENTENCES (Top 5)")
print("-" * 80)
shortest = df.nsmallest(5, 'word_count')[['text', 'intent', 'word_count']]
for idx, row in shortest.iterrows():
    print(f"\n[{row['word_count']} words] Intent: {row['intent']}")
    print(f"Text: {row['text']}")

# Vocabulary analysis
print("\n7Ô∏è‚É£  VOCABULARY STATISTICS")
print("-" * 80)
all_words = []
for text in df['text']:
    all_words.extend(str(text).lower().split())

unique_words = set(all_words)
print(f"Total words (with repetition): {len(all_words)}")
print(f"Unique words: {len(unique_words)}")
print(f"Vocabulary richness: {len(unique_words)/len(all_words):.4f}")

# Most common words
from collections import Counter
word_freq = Counter(all_words)
print(f"\nMost common words (Top 10):")
for word, count in word_freq.most_common(10):
    print(f"  '{word}': {count} times")

# Justification for hyperparameters
print("\n8Ô∏è‚É£  HYPERPARAMETER JUSTIFICATION")
print("-" * 80)
max_words = df['word_count'].max()
mean_words = df['word_count'].mean()
percentile_95 = df['word_count'].quantile(0.95)

print(f"‚úì MAX_SEQ_LENGTH = {Config.MAX_SEQ_LENGTH}")
print(f"  Rationale: 95th percentile = {percentile_95:.0f} words")
print(f"  Only {(df['word_count'] > Config.MAX_SEQ_LENGTH).sum()} samples ({(df['word_count'] > Config.MAX_SEQ_LENGTH).sum()/len(df)*100:.2f}%) exceed this length")

print(f"\n‚úì EMBEDDING_DIM = {Config.EMBEDDING_DIM}")
print(f"  Rationale: Vocabulary size = {len(unique_words)}")
print(f"  Rule of thumb: embedding_dim ‚âà vocab_size^0.25 = {len(unique_words)**0.25:.0f}")
print(f"  100 dimensions provides good balance for vocab of ~1000 words")

print(f"\n‚úì KERNEL_SIZES = {Config.KERNEL_SIZES}")
print(f"  Rationale: Mean sentence length = {mean_words:.1f} words")
print(f"  Kernels [2,3,4] capture 2-4 word phrases (n-grams)")
print(f"  Examples: 'send message' (2), 'how to send' (3), 'create new instance now' (4)")

print(f"\n‚úì BATCH_SIZE = {Config.BATCH_SIZE}")
print(f"  Rationale: Dataset size = {len(df)} samples")
print(f"  {len(df)//Config.BATCH_SIZE} batches per epoch")
print(f"  Provides good gradient estimation without excessive memory usage")

print(f"\n‚úì DROPOUT = {Config.DROPOUT}")
print(f"  Rationale: Small dataset ({len(df)} samples) ‚Üí high overfitting risk")
print(f"  Higher dropout (0.5) provides aggressive regularization")

# Class balance visualization
print("\n9Ô∏è‚É£  CLASS BALANCE CHECK")
print("-" * 80)
min_samples = intent_counts.min()
max_samples = intent_counts.max()
imbalance = max_samples / min_samples

if imbalance < 1.5:
    print("‚úì Classes are WELL BALANCED (ratio < 1.5x)")
elif imbalance < 3:
    print("‚ö†Ô∏è  Classes are MODERATELY IMBALANCED (ratio 1.5-3x)")
else:
    print("‚ùå Classes are SEVERELY IMBALANCED (ratio > 3x)")
    print("   Consider: class weighting, oversampling minority, or undersampling majority")

print(f"   Imbalance ratio: {imbalance:.2f}x")

# Sample queries per intent
print("\nüîü SAMPLE QUERIES PER INTENT (3 examples each)")
print("-" * 80)
for intent in df['intent'].unique()[:5]:  # Show first 5 intents
    print(f"\nüìå Intent: {intent}")
    samples = df[df['intent'] == intent]['text'].head(3).tolist()
    for i, sample in enumerate(samples, 1):
        print(f"   {i}. {sample}")

print("\n" + "=" * 80)
print("‚úÖ DATASET SANITY CHECK COMPLETE!")
print("=" * 80)

# Clean up temporary columns
df = df.drop(['text_length', 'word_count'], axis=1)

üìä DATASET SANITY CHECK & ANALYSIS

1Ô∏è‚É£  BASIC STATISTICS
--------------------------------------------------------------------------------
Total samples: 2102
Total intents: 14
Columns: ['text', 'intent']

2Ô∏è‚É£  DATA QUALITY
--------------------------------------------------------------------------------
Missing values:
  text: 0 (0.00%)
  intent: 0 (0.00%)

Duplicate rows: 47 (2.24%)

3Ô∏è‚É£  INTENT DISTRIBUTION
--------------------------------------------------------------------------------
intent
send_message        160
schedule_message    160
filter_number       160
what_for            160
pricing             160
create_instance     159
edit_instance       159
delete_instance     159
edit_group          159
create_group        159
delete_group        159
contact             159
unknown             124
greeting             65
Name: count, dtype: int64

Most common: send_message (160 samples)
Least common: greeting (65 samples)
Class imbalance ratio: 2.46x

4Ô∏è‚É£  TEXT LE

In [28]:
# ============================================================================
# DEVICE SETUP
# ============================================================================

print("=" * 80)
print("CHATTEA INTENT CLASSIFIER - CNN + WORD2VEC")
print("=" * 80)
print(f"Device: {config.DEVICE}")
if torch.cuda.is_available():
    try:
        print(f"GPU: {torch.cuda.get_device_name(0)}")
    except Exception:
        pass
print("=" * 80)

CHATTEA INTENT CLASSIFIER - CNN + WORD2VEC
Device: cuda
GPU: NVIDIA GeForce RTX 4060 Laptop GPU


In [19]:
# ============================================================================
# TEXT PROCESSING
# ============================================================================

def clean_text(text):
    """Clean and normalize text"""
    text = str(text).lower()
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"\s+", " ", text).strip()  # Normalize whitespace
    return text

def tokenize(text):
    """Tokenize text into words"""
    return clean_text(text).split()

def build_vocabulary(texts):
    """Extract all unique words from texts"""
    vocab = set()
    for text in texts:
        vocab.update(re.findall(r'\w+', str(text).lower()))
    return vocab

def fuzzy_correct(text, vocab, cutoff=Config.FUZZY_CUTOFF):
    """Correct typos using difflib.get_close_matches"""
    words = re.findall(r'\w+', text.lower())
    corrected = []
    for word in words:
        matches = get_close_matches(word, vocab, n=1, cutoff=cutoff)
        corrected.append(matches[0] if matches else word)
    return ' '.join(corrected)

In [20]:
# ============================================================================
# WORD2VEC EMBEDDER
# ============================================================================

class Word2VecEmbedder:
    """Word2Vec embedding wrapper with proper initialization"""

    def __init__(self):
        self.model = None
        self.word2idx = {"<PAD>": 0, "<UNK>": 1}
        self.idx2word = {}
        self.embedding_matrix = None
        self.vocab_size = 0
        self.embed_dim = Config.EMBEDDING_DIM

    def train(self, sentences):
        """Train Word2Vec on tokenized sentences"""
        print("\nüß† Training Word2Vec...")
        self.model = Word2Vec(
            sentences=sentences,
            vector_size=Config.EMBEDDING_DIM,
            window=Config.WORD2VEC_WINDOW,
            min_count=Config.WORD2VEC_MIN_COUNT,
            sg=Config.WORD2VEC_SG,
            seed=Config.RANDOM_SEED,
            workers=4
        )

        idx = 2
        for word in self.model.wv.index_to_key:
            self.word2idx[word] = idx
            idx += 1

        self.idx2word = {idx: word for word, idx in self.word2idx.items()}
        self.vocab_size = len(self.word2idx)

        self.embedding_matrix = np.zeros((self.vocab_size, self.embed_dim), dtype=np.float32)

        for word, idx in self.word2idx.items():
            if word in ['<PAD>', '<UNK>']:
                if word == '<UNK>':
                    self.embedding_matrix[idx] = np.random.randn(self.embed_dim) * 0.01
            else:
                try:
                    self.embedding_matrix[idx] = self.model.wv[word]
                except KeyError:
                    self.embedding_matrix[idx] = np.random.randn(self.embed_dim) * 0.01

        print(f"‚úì Word2Vec trained: vocab={self.vocab_size}, dim={self.embed_dim}")
        return self

    def save(self, path=Config.WORD2VEC_PATH):
        if self.model:
            self.model.save(path)
            print(f"‚úì Word2Vec saved to {path}")

    def load(self, path=Config.WORD2VEC_PATH):
        print(f"\nüß† Loading Word2Vec from {path}...")
        self.model = Word2Vec.load(path)

        self.word2idx = {"<PAD>": 0, "<UNK>": 1}
        idx = 2
        for word in self.model.wv.index_to_key:
            self.word2idx[word] = idx
            idx += 1

        self.idx2word = {idx: word for word, idx in self.word2idx.items()}
        self.vocab_size = len(self.word2idx)

        self.embedding_matrix = np.zeros((self.vocab_size, self.embed_dim), dtype=np.float32)

        for word, idx in self.word2idx.items():
            if word in ['<PAD>', '<UNK>']:
                if word == '<UNK>':
                    self.embedding_matrix[idx] = np.random.randn(self.embed_dim) * 0.01
            else:
                try:
                    self.embedding_matrix[idx] = self.model.wv[word]
                except KeyError:
                    self.embedding_matrix[idx] = np.random.randn(self.embed_dim) * 0.01

        print(f"‚úì Word2Vec loaded: vocab={self.vocab_size}, dim={self.embed_dim}")
        return self

    def encode_sequence(self, tokens, max_length=Config.MAX_SEQ_LENGTH):
        indices = [self.word2idx.get(token, self.word2idx["<UNK>"]) for token in tokens[:max_length]]
        while len(indices) < max_length:
            indices.append(self.word2idx["<PAD>"])
        return indices

    def sentence_vector(self, tokens):
        vectors = []
        for token in tokens:
            if token in self.word2idx and token not in ("<PAD>", "<UNK>"):
                idx = self.word2idx[token]
                if idx < len(self.embedding_matrix):
                    vectors.append(self.embedding_matrix[idx])
        if len(vectors) == 0:
            return np.zeros(self.embed_dim, dtype=np.float32)
        return np.mean(vectors, axis=0)

In [21]:
# ============================================================================
# CNN MODEL ARCHITECTURE
# ============================================================================

class TextCNN(nn.Module):
    """CNN for Text Classification (Kim, 2014)"""

    def __init__(self, vocab_size, embedding_dim, num_classes, embedding_matrix=None):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        if embedding_matrix is not None:
            self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
            print("‚úì CNN initialized with Word2Vec embeddings!")
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim, out_channels=Config.NUM_FILTERS, kernel_size=k)
            for k in Config.KERNEL_SIZES
        ])
        self.dropout = nn.Dropout(Config.DROPOUT)
        self.fc = nn.Linear(Config.NUM_FILTERS * len(Config.KERNEL_SIZES), num_classes)

    def forward(self, x):
        embedded = self.embedding(x)                       # (batch, seq_len, embed_dim)
        embedded = embedded.transpose(1, 2)                # (batch, embed_dim, seq_len)
        conv_outputs = []
        for conv in self.convs:
            conv_out = F.relu(conv(embedded))              # (batch, num_filters, L)
            pooled = F.max_pool1d(conv_out, conv_out.size(2)).squeeze(2)
            conv_outputs.append(pooled)
        concatenated = torch.cat(conv_outputs, dim=1)
        dropped = self.dropout(concatenated)
        logits = self.fc(dropped)
        return logits

In [22]:
# ============================================================================
# DATASET
# ============================================================================

class IntentDataset(Dataset):
    """Simple dataset wrapper"""

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [23]:
# ============================================================================
# TRAINING CNN
# ============================================================================

def train_model_pretty(model, X_train, y_train, X_val, y_val):
    """Train the CNN classifier with MLP-style formatted output"""
    model = model.to(config.DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=Config.LEARNING_RATE)
    criterion = nn.CrossEntropyLoss()

    train_dataset = IntentDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=Config.BATCH_SIZE, shuffle=True)

    best_val_acc = 0.0

    print("\n" + "=" * 80)
    print("üèãÔ∏è  TRAINING LOOP (WITH PROPER BATCHING!)")
    print("=" * 80)
    print()
    print("Epoch | Train Acc | Train Loss | Val Acc | Val Loss")
    print("-" * 65)

    for epoch in range(Config.EPOCHS):
        model.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0

        for batch_X, batch_y in train_loader:
            batch_X = batch_X.to(config.DEVICE)
            batch_y = batch_y.to(config.DEVICE)

            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()
            train_correct += (outputs.argmax(1) == batch_y).sum().item()
            train_total += len(batch_y)

        train_acc = train_correct / (train_total + 1e-12)
        avg_train_loss = train_loss / (len(train_loader) + 1e-12)

        # Validation
        model.eval()
        with torch.no_grad():
            X_val_device = X_val.to(config.DEVICE)
            y_val_device = y_val.to(config.DEVICE)
            val_outputs = model(X_val_device)
            val_loss = criterion(val_outputs, y_val_device).item()
            val_acc = (val_outputs.argmax(1) == y_val_device).float().mean().item()

        # Print progress (every 5 epochs + last)
        if epoch % 5 == 0 or epoch == Config.EPOCHS - 1:
            print(f"{epoch:5d} | {train_acc:9.4f} | {avg_train_loss:10.4f} | {val_acc:7.4f} | {val_loss:8.4f}")

        # Save best model
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            torch.save(model.state_dict(), Config.MODEL_PATH)

    print("\n" + "=" * 80)
    print(f"‚úì Best Validation Accuracy: {best_val_acc:.4f} ({best_val_acc*100:.2f}%)")
    print(f"‚úì Model saved to: {Config.MODEL_PATH}")
    print("=" * 80)

    # Load best model
    model.load_state_dict(torch.load(Config.MODEL_PATH, map_location=config.DEVICE))
    model.eval()

    # Final evaluation on train and val
    with torch.no_grad():
        final_train_pred = model(X_train.to(config.DEVICE)).argmax(1).cpu()
        final_train_acc = (final_train_pred == y_train).float().mean().item()

        final_val_pred = model(X_val.to(config.DEVICE)).argmax(1).cpu()
        final_val_acc = (final_val_pred == y_val).float().mean().item()

    print("\n   Training Accuracy:   {:.4f} ({:.2f}%)".format(final_train_acc, final_train_acc*100))
    print("   Validation Accuracy: {:.4f} ({:.2f}%)".format(final_val_acc, final_val_acc*100))
    print("\n‚úì Model ready for inference!")

    return model

In [None]:
# ============================================================================
# CHATBOT CLASS
# ============================================================================

class ChatteaBot:
    """Main chatbot class with hybrid classification"""

    def __init__(self, model, embedder, label_encoder, responses,
                 df, sentence_vectors, vocab):
        self.model = model
        self.embedder = embedder
        self.le = label_encoder  # sklearn LabelEncoder instance
        self.responses = responses
        self.df = df.reset_index(drop=True)
        self.sentence_vectors = sentence_vectors.astype(np.float32) if sentence_vectors is not None else np.zeros((len(self.df), embedder.embed_dim))
        self.vocab = vocab

        # Intent mapping
        if hasattr(self.le, "classes_"):
            self.intent_map = {i: label for i, label in enumerate(self.le.classes_)}
        elif isinstance(self.le, dict):
            self.intent_map = {v: k for k, v in self.le.items()}
        else:
            self.intent_map = {}

        self.model.eval()

    def _get_response(self, intent):
        """Get response for intent"""
        response = self.responses.get(intent, self.responses.get("help", "I'm not sure how to help with that."))
        if isinstance(response, dict):
            return response.get("en", response.get("id", next(iter(response.values()))))
        return response

    def get_reply(self, user_input):
        """Get chatbot response"""
        text = str(user_input).strip()

        if text == "":
            return "Say something :)"

        # Rule-based greeting
        if any(g in text.lower() for g in ["hai", "halo", "hello", "hi", "hey", "pagi", "siang", "malam"]):
            return self._get_response("greeting")

        # Fuzzy correction
        corrected = fuzzy_correct(text, self.vocab, Config.FUZZY_CUTOFF)
        tokens = tokenize(corrected)

        # Model prediction
        sequence = self.embedder.encode_sequence(tokens, Config.MAX_SEQ_LENGTH)
        x = torch.LongTensor([sequence]).to(config.DEVICE)

        with torch.no_grad():
            logits = self.model(x)
            probs = F.softmax(logits, dim=1).cpu().numpy()[0]
            model_conf = float(probs.max())
            model_idx = int(np.argmax(probs))
            # map to label string
            try:
                model_intent = self.intent_map[model_idx]
            except Exception:
                model_intent = str(model_idx)

        # Retrieval fallback (sentence vectors from embedder average)
        user_vec = self.embedder.sentence_vector(tokens).reshape(1, -1)

        if self.sentence_vectors is None or len(self.sentence_vectors) == 0:
            retrieval_intent = model_intent
            retrieval_score = 0.0
        else:
            similarities = cosine_similarity(user_vec, self.sentence_vectors)[0]
            best_idx = int(np.argmax(similarities))
            retrieval_score = float(similarities[best_idx])
            retrieval_intent = str(self.df.iloc[best_idx]["intent"])

        # Decision
        if model_conf >= Config.CONFIDENCE_THRESHOLD:
            final_intent = model_intent
            decision = "MODEL"
        else:
            final_intent = retrieval_intent
            decision = "RETRIEVAL"

        return self._get_response(final_intent)

In [None]:
# ============================================================================
# UTILS: Pretty evaluation & inference output
# ============================================================================

def pretty_inference_tests(bot, test_queries=None):
    if test_queries is None:
        test_queries = [
            "hello",
            "what is chattea",
            "how to blast message",
            "create instance",
            "send bulk messages"
        ]

    print("\n" + "=" * 80)
    print("üß™ TESTING INFERENCE")
    print("=" * 80)
    print("\nRunning test queries:\n")

    for query in test_queries:
        print(f"üë§ User: {query}")
        try:
            response = bot.get_reply(query)
            # truncate like your example
            out = response if isinstance(response, str) else str(response)
            print(f"ü§ñ Bot: {out[:200]}{'...' if len(out) > 200 else ''}")
        except Exception as e:
            print("Error during inference:", e)
        print("-" * 80)

def pretty_evaluation(model, X_val, y_val, le, df):
    with torch.no_grad():
        outputs = model(X_val.to(config.DEVICE))
        preds = outputs.argmax(1).cpu().numpy()
        labels = y_val.numpy()

    val_acc = accuracy_score(labels, preds)
    print("\n" + "=" * 80)
    print("üìä MODEL EVALUATION")
    print("=" * 80)
    print(f"‚úì Validation Accuracy: {val_acc:.4f} ({val_acc*100:.2f}%)\n")
    print("üìã Per-Intent Performance:")

    intent_names = list(le.classes_)
    for i, intent_name in enumerate(intent_names):
        mask = labels == i
        count = int(mask.sum())
        if count == 0:
            continue
        intent_acc = (preds[mask] == labels[mask]).mean()
        print(f"   {intent_name:30s}: {intent_acc:.3f} ({count:2d} samples)")

    # Full training set accuracy if available as X_all global
    try:
        if 'X' in globals():
            with torch.no_grad():
                all_outputs = model(X.to(config.DEVICE))
                all_preds = all_outputs.argmax(1).cpu().numpy()
                all_labels = np.array([int(x) for x in df['label'].values])
                train_acc = (all_preds == all_labels).mean()
                print(f"\nAccuracy on FULL training set: {train_acc:.4f}")
    except Exception:
        pass

    print("\n" + "=" * 80)
    print("‚úÖ EVALUATION COMPLETE")
    print("=" * 80)

In [29]:
# ============================================================================
# MAIN PIPELINE
# ============================================================================

def main():
    print("\nüìÇ Loading data...")
    if not os.path.exists(Config.DATASET_PATH):
        raise FileNotFoundError(f"Dataset not found: {Config.DATASET_PATH}")

    df = pd.read_csv(Config.DATASET_PATH)
    if "text" not in df.columns or "intent" not in df.columns:
        raise ValueError("Dataset must have 'text' and 'intent' columns")

    print(f"‚úì Loaded {len(df)} samples, {df['intent'].nunique()} intents")

    # Load responses
    if not os.path.exists(Config.RESPONSES_PATH):
        raise FileNotFoundError(f"Responses file not found: {Config.RESPONSES_PATH}")

    with open(Config.RESPONSES_PATH, "r", encoding="utf-8") as f:
        responses = json.load(f)

    # Build vocabulary (for fuzzy)
    print("\nüìö Building vocabulary...")
    vocab = build_vocabulary(df['text'].tolist())
    print(f"‚úì Vocabulary: {len(vocab)} words")

    # Label encoding
    print("\nüè∑Ô∏è  Encoding labels...")
    le = LabelEncoder()
    df['label'] = le.fit_transform(df['intent'].astype(str))
    num_classes = len(le.classes_)
    print(f"‚úì Classes: {num_classes}")

    # Tokenize
    print("\n‚úÇÔ∏è  Tokenizing...")
    df['tokens'] = df['text'].apply(lambda t: tokenize(str(t)))

    # Word2Vec
    embedder = Word2VecEmbedder()
    if os.path.exists(Config.WORD2VEC_PATH):
        embedder.load(Config.WORD2VEC_PATH)
    else:
        embedder.train(df['tokens'].tolist())
        embedder.save(Config.WORD2VEC_PATH)

    # Prepare sequences
    print("\nüìä Preparing sequences...")
    sequences = np.array([embedder.encode_sequence(tokens, Config.MAX_SEQ_LENGTH) for tokens in df['tokens']], dtype=np.int64)

    X = torch.tensor(sequences, dtype=torch.long)
    y = torch.tensor(df['label'].values, dtype=torch.long)

    # Train/val split
    train_idx, val_idx = train_test_split(
        range(len(df)),
        test_size=Config.TEST_SIZE,
        random_state=Config.RANDOM_SEED,
        stratify=df['label']
    )
    
    X_train = X[train_idx]
    y_train = y[train_idx]
    X_val = X[val_idx]
    y_val = y[val_idx]

    # Build model
    model = TextCNN(vocab_size=embedder.vocab_size, embedding_dim=Config.EMBEDDING_DIM, num_classes=num_classes, embedding_matrix=embedder.embedding_matrix)

    # Train or load model
    if os.path.exists(Config.MODEL_PATH):
        print(f"\n‚úì Found existing model: {Config.MODEL_PATH}")
        try:
            model.load_state_dict(torch.load(Config.MODEL_PATH, map_location=config.DEVICE))
            model.eval()
            print("‚úì Model loaded!")
        except Exception as e:
            print("Failed to load model, will retrain:", e)
            model = train_model_pretty(model, X_train, y_train, X_val, y_val)
    else:
        print("\n‚ö†Ô∏è  No pre-trained model found. Training from scratch...")
        model = train_model_pretty(model, X_train, y_train, X_val, y_val)

    # Prepare sentence vectors for retrieval
    print("\nüìê Preparing sentence vectors for retrieval...")
    sent_vecs = np.stack([embedder.sentence_vector(tokens) for tokens in df['tokens']])
    norms = np.linalg.norm(sent_vecs, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    sent_vecs_normalized = sent_vecs / norms

    # Create bot
    print("\nü§ñ Initializing chatbot...")
    bot = ChatteaBot(model, embedder, le, responses, df, sent_vecs_normalized, vocab)
    print("‚úì Chatbot ready!")

    # Inference tests
    pretty_inference_tests(bot)

    # Evaluation
    pretty_evaluation(model, X_val, y_val, le, df)

if __name__ == "__main__":
    main()


üìÇ Loading data...
‚úì Loaded 2102 samples, 14 intents

üìö Building vocabulary...
‚úì Vocabulary: 1037 words

üè∑Ô∏è  Encoding labels...
‚úì Classes: 14

‚úÇÔ∏è  Tokenizing...

üß† Training Word2Vec...
‚úì Word2Vec trained: vocab=1043, dim=100
‚úì Word2Vec saved to word2vec.model

üìä Preparing sequences...
‚úì CNN initialized with Word2Vec embeddings!

‚ö†Ô∏è  No pre-trained model found. Training from scratch...

üèãÔ∏è  TRAINING LOOP (WITH PROPER BATCHING!)

Epoch | Train Acc | Train Loss | Val Acc | Val Loss
-----------------------------------------------------------------
    0 |    0.0833 |     2.5948 |  0.1544 |   2.5428
    5 |    0.9358 |     0.2424 |  0.9549 |   0.1789
   10 |    0.9970 |     0.0301 |  0.9786 |   0.0754
   15 |    1.0000 |     0.0075 |  0.9786 |   0.0668
   20 |    0.9994 |     0.0062 |  0.9834 |   0.0651
   25 |    0.9988 |     0.0051 |  0.9810 |   0.0616
   29 |    0.9988 |     0.0062 |  0.9786 |   0.0629

‚úì Best Validation Accuracy: 0.9834 (98.34

In [30]:
# ============================================================================
# FRESH START - DELETE EVERYTHING AND RETRAIN
# ============================================================================

import os

# 1. Delete saved model
if os.path.exists("chattea.pth"):
    os.remove("chattea.pth")
    print("‚úì Deleted old model")

# 2. Clear GPU cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("‚úì Cleared GPU cache")

# 3. Delete saved word2vec.model
if os.path.exists("word2vec.model"):
    os.remove("word2vec.model")
    print("Deleted Old Word2Vec Model")

‚úì Cleared GPU cache
Deleted Old Word2Vec Model
