In [4]:
# -*- coding: utf-8 -*-
"""
Changi Virtual Assist Triage - RNN Text Classification
Using ATIS Dataset from Hugging Face

Fixed version using: from datasets import load_dataset
"""

# ============================================================================
# CELL 1: Setup and Imports
# ============================================================================

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

import numpy as np
from collections import Counter
import re
from datasets import load_dataset
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support
import matplotlib.pyplot as plt
import seaborn as sns

torch.manual_seed(42)
np.random.seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

# ============================================================================
# CELL 2: Load ATIS Dataset
# ============================================================================

print("Loading ATIS dataset from Hugging Face...")
ds = load_dataset("tuetschek/atis")

# Extract data from dataset and convert to lists
train_texts = list(ds['train']['text'])
train_labels_raw = list(ds['train']['intent'])

test_texts = list(ds['test']['text'])
test_labels_raw = list(ds['test']['intent'])

# Create validation split from training data (10%)
val_size = int(0.1 * len(train_texts))
val_texts = train_texts[:val_size]
val_labels_raw = train_labels_raw[:val_size]
train_texts = train_texts[val_size:]
train_labels_raw = train_labels_raw[val_size:]

print(f"\nDataset loaded successfully!")
print(f"  Train: {len(train_texts)} samples")
print(f"  Val:   {len(val_texts)} samples")
print(f"  Test:  {len(test_texts)} samples")

# Get unique intents and create mapping
all_intents = sorted(list(set(train_labels_raw + test_labels_raw + val_labels_raw)))
intent_to_idx = {intent: idx for idx, intent in enumerate(all_intents)}
idx_to_intent = {idx: intent for intent, idx in intent_to_idx.items()}

NUM_CLASSES = len(intent_to_idx)

# Convert intent strings to indices
train_labels = [intent_to_idx[intent] for intent in train_labels_raw]
val_labels = [intent_to_idx[intent] for intent in val_labels_raw]
test_labels = [intent_to_idx[intent] for intent in test_labels_raw]

print(f"\nNumber of unique intents: {NUM_CLASSES}")
print(f"\nTop 15 most common intents:")
intent_counts = Counter(train_labels_raw)
for i, (intent, count) in enumerate(intent_counts.most_common(15), 1):
    print(f"  {i:2d}. {intent:<35} {count:>4} samples")

# Display distribution
print("\nTrain label distribution:")
train_dist = Counter(train_labels)
for i in sorted(train_dist.keys())[:10]:
    intent = idx_to_intent[i]
    count = train_dist[i]
    print(f"  {intent:<35} {count:>4}")

# Show sample queries
print("\nSample queries (first 10):")
for i in range(min(10, len(train_texts))):
    print(f"  {i+1:2d}. [{idx_to_intent[train_labels[i]]:<30}] '{train_texts[i][:60]}...'")

# ============================================================================
# CELL 3: Text Preprocessing
# ============================================================================

class TextPreprocessor:
    """Text preprocessing and vocabulary management"""
    def __init__(self):
        self.vocab = {'<PAD>': 0, '<UNK>': 1}
        self.word_to_idx = self.vocab.copy()

    def clean_text(self, text):
        """Clean and normalize text"""
        text = text.lower()
        text = re.sub(r'[^a-z0-9\s]', ' ', text)
        return re.sub(r'\s+', ' ', text).strip()

    def tokenize(self, text):
        """Tokenize text into words"""
        return text.split()

    def build_vocab(self, texts, min_freq=2):
        """Build vocabulary from texts"""
        word_freq = Counter()
        for text in texts:
            word_freq.update(self.tokenize(self.clean_text(text)))

        for word, freq in word_freq.items():
            if freq >= min_freq and word not in self.word_to_idx:
                self.word_to_idx[word] = len(self.word_to_idx)

        print(f"\nVocabulary built!")
        print(f"  Vocab size: {len(self.word_to_idx)} (min_freq={min_freq})")
        return self.word_to_idx

    def text_to_indices(self, text):
        """Convert text to indices"""
        tokens = self.tokenize(self.clean_text(text))
        return [self.word_to_idx.get(t, 1) for t in tokens]  # 1 is <UNK>

# Build vocabulary from training data
preprocessor = TextPreprocessor()
preprocessor.build_vocab(train_texts, min_freq=2)

# Analyze sequence lengths
lengths = [len(preprocessor.text_to_indices(t)) for t in train_texts]
print(f"\nSequence length statistics:")
print(f"  Mean: {np.mean(lengths):.1f}")
print(f"  Median: {np.median(lengths):.0f}")
print(f"  Max: {np.max(lengths)}")
print(f"  Min: {np.min(lengths)}")
print(f"  95th percentile: {np.percentile(lengths, 95):.0f}")

# ============================================================================
# CELL 4: PyTorch Dataset
# ============================================================================

class IntentDataset(Dataset):
    """PyTorch Dataset for intent classification"""
    def __init__(self, texts, labels, preprocessor):
        self.texts = texts
        self.labels = labels
        self.preprocessor = preprocessor

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        indices = self.preprocessor.text_to_indices(self.texts[idx])
        return {
            'indices': torch.LongTensor(indices),
            'label': torch.LongTensor([self.labels[idx]]),
            'length': len(indices)
        }

def collate_fn(batch):
    """Collate function for DataLoader"""
    indices = pad_sequence([b['indices'] for b in batch], batch_first=True)
    labels = torch.cat([b['label'] for b in batch])
    lengths = torch.LongTensor([b['length'] for b in batch])
    return {'indices': indices, 'labels': labels, 'lengths': lengths}

# Create datasets and dataloaders
BATCH_SIZE = 32

train_dataset = IntentDataset(train_texts, train_labels, preprocessor)
val_dataset = IntentDataset(val_texts, val_labels, preprocessor)
test_dataset = IntentDataset(test_texts, test_labels, preprocessor)

train_loader = DataLoader(train_dataset, BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, BATCH_SIZE, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, BATCH_SIZE, collate_fn=collate_fn)

print(f"\nDataLoaders created:")
print(f"  Train batches: {len(train_loader)}")
print(f"  Val batches:   {len(val_loader)}")
print(f"  Test batches:  {len(test_loader)}")

# ============================================================================
# CELL 5: RNN Model
# ============================================================================

class IntentRNN(nn.Module):
    """Bidirectional LSTM for Intent Classification"""
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes,
                 num_layers=2, dropout=0.4, bidirectional=True):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.bidirectional = bidirectional

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.emb_dropout = nn.Dropout(0.2)

        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers,
                           dropout=dropout if num_layers > 1 else 0,
                           bidirectional=bidirectional, batch_first=True)

        fc_input = hidden_dim * (2 if bidirectional else 1)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(fc_input, num_classes)

    def forward(self, x, lengths):
        # Embedding layer
        embedded = self.embedding(x)
        embedded = self.emb_dropout(embedded)

        # Pack padded sequence for efficient LSTM processing
        packed = pack_padded_sequence(embedded, lengths.cpu(),
                                     batch_first=True, enforce_sorted=False)
        _, (hidden, _) = self.lstm(packed)

        # Concatenate forward and backward hidden states
        if self.bidirectional:
            hidden = torch.cat((hidden[-2], hidden[-1]), dim=1)
        else:
            hidden = hidden[-1]

        # Classification layer
        out = self.dropout(hidden)
        return self.fc(out)

# Initialize model
VOCAB_SIZE = len(preprocessor.word_to_idx)
model = IntentRNN(
    vocab_size=VOCAB_SIZE,
    embed_dim=128,           # Increased from 64
    hidden_dim=256,          # Increased from 128
    num_classes=NUM_CLASSES,
    num_layers=2,
    dropout=0.3,             # Reduced from 0.4
    bidirectional=True
).to(device)

total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"\nModel Architecture:")
print(model)
print(f"\nModel Statistics:")
print(f"  Total parameters: {total_params:,}")
print(f"  Trainable parameters: {trainable_params:,}")

# ============================================================================
# CELL 6: Training Functions
# ============================================================================

def train_epoch(model, loader, criterion, optimizer):
    """Train for one epoch"""
    model.train()
    total_loss, correct, total = 0, 0, 0

    for batch in loader:
        indices = batch['indices'].to(device)
        labels = batch['labels'].to(device)
        lengths = batch['lengths']

        optimizer.zero_grad()
        outputs = model(indices, lengths)
        loss = criterion(outputs, labels)
        loss.backward()

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        total_loss += loss.item()
        _, preds = torch.max(outputs, 1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    return total_loss / len(loader), 100 * correct / total

def evaluate(model, loader, criterion):
    """Evaluate the model"""
    model.eval()
    total_loss, correct, total = 0, 0, 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in loader:
            indices = batch['indices'].to(device)
            labels = batch['labels'].to(device)
            lengths = batch['lengths']

            outputs = model(indices, lengths)
            loss = criterion(outputs, labels)

            total_loss += loss.item()
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return total_loss / len(loader), 100 * correct / total, all_preds, all_labels

# ============================================================================
# CELL 7: Training Loop
# ============================================================================

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=3, factor=0.5)

EPOCHS = 20
history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}
best_val_acc = 0

print(f"\n{'='*70}")
print("STARTING TRAINING")
print(f"{'='*70}\n")

for epoch in range(EPOCHS):
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer)
    val_loss, val_acc, _, _ = evaluate(model, val_loader, criterion)

    scheduler.step(val_loss)

    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)

    print(f"Epoch {epoch+1:2d}/{EPOCHS}")
    print(f"  Train Loss: {train_loss:.4f} | Train Acc: {train_acc:6.2f}%")
    print(f"  Val Loss:   {val_loss:.4f} | Val Acc:   {val_acc:6.2f}%")

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), 'best_model_atis.pth')
        print(f"  âœ“ New best model saved! (Val Acc: {val_acc:.2f}%)")
    print()

print(f"{'='*70}")
print(f"Training completed! Best validation accuracy: {best_val_acc:.2f}%")
print(f"{'='*70}\n")

# ============================================================================
# CELL 8: Test Evaluation
# ============================================================================

# Load best model
model.load_state_dict(torch.load('best_model_atis.pth'))

# Evaluate on test set
test_loss, test_acc, y_pred, y_true = evaluate(model, test_loader, criterion)

print(f"\n{'='*70}")
print("TEST SET EVALUATION")
print(f"{'='*70}\n")
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.2f}%")

# Detailed classification report
# Only include intents that appear in test set
unique_test_labels = sorted(list(set(y_true)))
intent_names_in_test = [idx_to_intent[i] for i in unique_test_labels]

print(f"\n{'='*70}")
print("CLASSIFICATION REPORT")
print(f"{'='*70}\n")
print(classification_report(y_true, y_pred, labels=unique_test_labels,
                           target_names=intent_names_in_test, zero_division=0))

# ============================================================================
# CELL 9: Confusion Matrix
# ============================================================================

# Get top 15 most common intents for visualization
top_15_intents = [intent for intent, _ in Counter([idx_to_intent[i] for i in y_true]).most_common(15)]
top_15_indices = [intent_to_idx[intent] for intent in top_15_intents]

# Filter predictions and labels for top 15
filtered_true = [y for y in y_true if y in top_15_indices]
filtered_pred = [y_pred[i] for i, y in enumerate(y_true) if y in top_15_indices]

# Create confusion matrix
cm = confusion_matrix(filtered_true, filtered_pred, labels=top_15_indices)

# Plot
plt.figure(figsize=(14, 12))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=[intent[:25] for intent in top_15_intents],
            yticklabels=[intent[:25] for intent in top_15_intents])
plt.title('Confusion Matrix - Top 15 Most Common Intents')
plt.ylabel('True Intent')
plt.xlabel('Predicted Intent')
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.savefig('confusion_matrix_atis.png', dpi=300, bbox_inches='tight')
print("\nâœ“ Confusion matrix saved as 'confusion_matrix_atis.png'")
plt.close()

# ============================================================================
# CELL 10: Training Curves
# ============================================================================

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Loss plot
ax1.plot(history['train_loss'], label='Train Loss', marker='o', markersize=4)
ax1.plot(history['val_loss'], label='Val Loss', marker='s', markersize=4)
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
ax1.set_title('Training and Validation Loss')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Accuracy plot
ax2.plot(history['train_acc'], label='Train Acc', marker='o', markersize=4)
ax2.plot(history['val_acc'], label='Val Acc', marker='s', markersize=4)
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Accuracy (%)')
ax2.set_title('Training and Validation Accuracy')
ax2.legend()
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('training_curves_atis.png', dpi=300, bbox_inches='tight')
print("âœ“ Training curves saved as 'training_curves_atis.png'\n")
plt.close()

# ============================================================================
# CELL 11: Prediction Function
# ============================================================================

def predict_intent(text, model=model, preprocessor=preprocessor, top_k=3):
    """
    Predict intent for a given text query

    Args:
        text: Input query string
        model: Trained model
        preprocessor: Text preprocessor
        top_k: Number of top predictions to return

    Returns:
        List of (intent, confidence) tuples
    """
    model.eval()

    # Preprocess text
    indices = preprocessor.text_to_indices(text)
    indices_tensor = torch.LongTensor([indices]).to(device)
    length = torch.LongTensor([len(indices)])

    # Predict
    with torch.no_grad():
        outputs = model(indices_tensor, length)
        probs = torch.softmax(outputs, dim=1)[0]
        top_probs, top_indices = torch.topk(probs, min(top_k, NUM_CLASSES))

    # Format results
    results = []
    for prob, idx in zip(top_probs.cpu().numpy(), top_indices.cpu().numpy()):
        results.append({
            'intent': idx_to_intent[int(idx)],
            'confidence': float(prob) * 100
        })

    return results

# ============================================================================
# CELL 12: Test on Real Airport Queries
# ============================================================================

test_queries = [
    "What time does my flight to Tokyo depart?",
    "I can't find my luggage",
    "Where is gate A15?",
    "I need wheelchair assistance to my gate",
    "How do I get to downtown Singapore from here?",
    "What items can I bring in my carry-on?",
    "My bag was damaged during the flight",
    "Where is the immigration counter in Terminal 2?",
    "Can I get special assistance for traveling with an infant?",
    "How much does a taxi cost to Marina Bay Sands?",
    "Do I need to declare alcohol purchases?",
    "Where can I find a restroom near gate B12?",
    "What time does boarding start for my flight?",
    "I need to report a lost bag",
    "Show me flights to New York",
    "Where is the SilverKris lounge?",
    "I need help finding the MRT station",
    "Can I bring batteries in my luggage?",
    "My flight has been delayed, what should I do?",
    "Where can I exchange currency?"
]

print(f"\n{'='*70}")
print("REAL AIRPORT QUERY PREDICTIONS")
print(f"{'='*70}\n")

for query in test_queries:
    print(f"Query: '{query}'")
    print("-" * 70)
    preds = predict_intent(query, top_k=3)
    for i, pred in enumerate(preds, 1):
        bar = "â–ˆ" * int(pred['confidence'] / 3)
        print(f"  {i}. {pred['intent'][:45]:.<50} {pred['confidence']:>5.1f}%  {bar}")
    print()

# ============================================================================
# CELL 13: Test on Dataset Examples
# ============================================================================

print(f"{'='*70}")
print("PREDICTIONS ON TEST SET EXAMPLES")
print(f"{'='*70}\n")

# Show 15 random test examples
indices = np.random.choice(len(test_texts), min(15, len(test_texts)), replace=False)

correct_predictions = 0
total_predictions = 0

for idx in indices:
    text = test_texts[idx]
    true_label = test_labels[idx]
    true_intent = idx_to_intent[true_label]

    print(f"Query: '{text}'")
    print(f"True: {true_intent}")
    print("-" * 70)

    preds = predict_intent(text, top_k=3)

    for i, pred in enumerate(preds, 1):
        marker = "âœ“" if pred['intent'] == true_intent else " "
        bar = "â–ˆ" * int(pred['confidence'] / 4)
        print(f"{marker} {i}. {pred['intent'][:45]:.<50} {pred['confidence']:>5.1f}%  {bar}")

        if i == 1 and pred['intent'] == true_intent:
            correct_predictions += 1

    total_predictions += 1
    print()

print(f"Accuracy on shown examples: {100 * correct_predictions / total_predictions:.1f}%\n")

# ============================================================================
# CELL 14: Error Analysis
# ============================================================================

print(f"{'='*70}")
print("ERROR ANALYSIS")
print(f"{'='*70}\n")

# Find misclassified examples
misclassified = []
for text, true_label, pred_label in zip(test_texts, y_true, y_pred):
    if true_label != pred_label:
        misclassified.append({
            'text': text,
            'true': idx_to_intent[true_label],
            'predicted': idx_to_intent[pred_label]
        })

print(f"Total misclassifications: {len(misclassified)} out of {len(test_texts)}")
print(f"Test accuracy: {100 * (len(test_texts) - len(misclassified)) / len(test_texts):.2f}%")

if misclassified:
    print(f"\nMisclassified Examples (showing up to 15):")
    print("-" * 70)
    for i, example in enumerate(misclassified[:15], 1):
        print(f"\n{i:2d}. Query: '{example['text']}'")
        print(f"    True: {example['true'][:50]}")
        print(f"    Pred: {example['predicted'][:50]}")

# ============================================================================
# CELL 15: Model Summary
# ============================================================================

print(f"\n{'='*70}")
print("MODEL SUMMARY & PERFORMANCE INSIGHTS")
print(f"{'='*70}\n")

print("Model Architecture:")
print(f"  Type: Bidirectional LSTM")
print(f"  Embedding dim: 128")
print(f"  Hidden dim: 256")
print(f"  LSTM layers: 2")
print(f"  Dropout: 0.3")
print(f"  Total parameters: {sum(p.numel() for p in model.parameters()):,}")

print(f"\nDataset Statistics:")
print(f"  Training samples: {len(train_texts)}")
print(f"  Validation samples: {len(val_texts)}")
print(f"  Test samples: {len(test_texts)}")
print(f"  Vocabulary size: {VOCAB_SIZE}")
print(f"  Number of intents: {NUM_CLASSES}")

print(f"\nPerformance Metrics:")
print(f"  Best validation accuracy: {best_val_acc:.2f}%")
print(f"  Test accuracy: {test_acc:.2f}%")
print(f"  Training epochs: {len(history['train_loss'])}")

# Calculate per-class metrics (only for intents in test set)
precision, recall, f1, support = precision_recall_fscore_support(
    y_true, y_pred, labels=unique_test_labels, average=None, zero_division=0
)

# Show top 10 performing intents
print(f"\nTop 10 Best Performing Intents (by F1-Score):")
print("-" * 70)
print(f"{'Intent':<45} {'Precision':>10} {'Recall':>10} {'F1-Score':>10}")
print("-" * 70)

intent_f1 = [(idx_to_intent[unique_test_labels[i]], f1[i], precision[i], recall[i])
             for i in range(len(unique_test_labels))]
intent_f1_sorted = sorted(intent_f1, key=lambda x: x[1], reverse=True)

for intent, f1_score, prec, rec in intent_f1_sorted[:10]:
    print(f"{intent[:45]:<45} {prec:>10.2%} {rec:>10.2%} {f1_score:>10.2%}")

print(f"\n{'='*70}")

# ============================================================================
# CELL 16: Save Complete Model Checkpoint
# ============================================================================

checkpoint = {
    'model_state': model.state_dict(),
    'vocab': preprocessor.word_to_idx,
    'intent_to_idx': intent_to_idx,
    'idx_to_intent': idx_to_intent,
    'hyperparameters': {
        'vocab_size': VOCAB_SIZE,
        'embed_dim': 128,
        'hidden_dim': 256,
        'num_layers': 2,
        'dropout': 0.3,
        'bidirectional': True
    },
    'test_acc': test_acc,
    'best_val_acc': best_val_acc,
    'training_history': history
}

torch.save(checkpoint, 'changi_airport_rnn_atis_complete.pth')
print(f"\n{'='*70}")
print("âœ“ Complete model checkpoint saved as 'changi_airport_rnn_atis_complete.pth'")
print(f"  Test Accuracy: {test_acc:.2f}%")
print(f"  Best Val Accuracy: {best_val_acc:.2f}%")
print(f"\nTo load the model later, use:")
print("  checkpoint = torch.load('changi_airport_rnn_atis_complete.pth')")
print("  model.load_state_dict(checkpoint['model_state'])")
print(f"{'='*70}\n")

# ============================================================================
# CELL 17: Interactive Demo Function
# ============================================================================

def interactive_demo():
    """Interactive demo for testing the classifier"""
    print(f"\n{'='*70}")
    print("CHANGI AIRPORT VIRTUAL ASSIST - INTERACTIVE DEMO")
    print(f"{'='*70}")
    print("\nEnter airport service queries to classify (type 'quit' to exit)")
    print("\nExample queries:")
    print("  - Where is my gate?")
    print("  - My bag is missing")
    print("  - How to get downtown?")
    print("  - I need wheelchair help")
    print(f"\n{'='*70}\n")

    while True:
        query = input("Enter query: ").strip()

        if query.lower() in ['quit', 'exit', 'q']:
            print("\nThank you for using Changi Virtual Assist!")
            break

        if not query:
            continue

        print("\nPredicted Intents:")
        print("-" * 70)

        preds = predict_intent(query, top_k=3)

        for i, pred in enumerate(preds, 1):
            bar = "â–ˆ" * int(pred['confidence'] / 2.5)
            print(f"{i}. {pred['intent'][:45]:.<50} {pred['confidence']:>6.2f}% {bar}")

        print()

# Uncomment to run interactive demo:
# interactive_demo()

# ============================================================================
# CELL 18: Usage Instructions
# ============================================================================

print(f"{'='*70}")
print("HOW TO USE THIS MODEL")
print(f"{'='*70}\n")

print("1. Make predictions on new queries:")
print("   predictions = predict_intent('where is my luggage?')")
print()

print("2. Get top-k predictions:")
print("   predictions = predict_intent('gate for my flight', top_k=3)")
print()

print("3. Run interactive demo:")
print("   interactive_demo()")
print()

print("4. Access model components:")
print("   - model: The trained RNN model")
print("   - preprocessor: Text preprocessing utilities")
print("   - idx_to_intent: Intent label mapping")
print()

print(f"{'='*70}\n")

print("âœ… All cells completed successfully!")
print("ðŸ“Š Model ready for deployment at Changi Airport Virtual Assist!")
print(f"\n{'='*70}")

Device: cuda
Loading ATIS dataset from Hugging Face...

Dataset loaded successfully!
  Train: 4481 samples
  Val:   497 samples
  Test:  893 samples

Number of unique intents: 26

Top 15 most common intents:
   1. flight                              3289 samples
   2. airfare                              388 samples
   3. ground_service                       231 samples
   4. airline                              140 samples
   5. abbreviation                         135 samples
   6. aircraft                              74 samples
   7. flight_time                           47 samples
   8. quantity                              43 samples
   9. flight+airfare                        21 samples
  10. distance                              18 samples
  11. airport                               18 samples
  12. city                                  17 samples
  13. ground_fare                           16 samples
  14. capacity                              15 samples
  15. flight_no       