# Call Centre Ticket Classification - ML Model Architecture Design

## 📋 Project Overview

**Objective**: Design and implement a text classification system to automatically categorize call centre tickets into 6 predefined categories with ≥85% accuracy.

**Categories**: 
- **BILLING**: Account, payment, billing issues
- **TECHNICAL**: Network, connectivity, service problems  
- **SALES**: New services, upgrades, product inquiries
- **COMPLAINTS**: Service complaints, escalations
- **NETWORK**: Infrastructure and network-related issues
- **ACCOUNT**: Account management, profile changes

**Success Criteria**:
- Classification accuracy ≥85% on test data
- Processing speed <2 seconds per ticket
- Production-ready FastAPI deployment
- ≥80% test coverage

---

## 🎯 Experiment Documentation

**Hypothesis**: Transformer-based models (BERT/DistilBERT) will outperform traditional ML approaches for this multi-class text classification task due to better contextual understanding of telecoms terminology.

**Candidate Models to Test** (model 1 is the baseline):
1. **Traditional**: Logistic Regression + TF-IDF
2. **Ensemble**: Random Forest + TF-IDF
3. **Transformer**: DistilBERT fine-tuned
4. **Hybrid**: Ensemble of traditional + transformer

In [None]:
# Import Required Libraries
import pandas as pd  # type: ignore
import matplotlib.pyplot as plt  # type: ignore
import seaborn as sns  # type: ignore
from sklearn.model_selection import train_test_split  # type: ignore
from sklearn.feature_extraction.text import TfidfVectorizer  # type: ignore
from sklearn.linear_model import LogisticRegression  # type: ignore
from sklearn.metrics import classification_report, accuracy_score  # type: ignore
from sklearn.pipeline import Pipeline  # type: ignore
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including any convergence or deprecation warnings raised later.
warnings.filterwarnings('ignore')

# Experiment-wide configuration
RANDOM_STATE = 42            # seed shared by the data generator, splits and models
TARGET_ACCURACY = 0.85       # minimum acceptable classification accuracy
MAX_PROCESSING_TIME = 2.0    # per-ticket latency budget, in seconds

# Plot appearance
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("📚 Libraries imported successfully!")
print("🔧 Environment setup complete")


## 📊 Data Analysis & Preparation

First, let's generate our mock dataset and perform initial exploratory data analysis.

In [None]:
import sys
sys.path.append('../src')

from data.mock_data_generator import TelecomsTicketGenerator

# Build the mock ticket dataset with a fixed seed so runs are repeatable
generator = TelecomsTicketGenerator(seed=RANDOM_STATE)
dataset = generator.generate_dataset()

# Headline numbers: size, column names and per-category counts
print("📈 Dataset Overview:")
print(f"   Total samples: {len(dataset):,}")
print(f"   Features: {list(dataset.columns)}")
print("   Target distribution:")
print(dataset['category'].value_counts().sort_index())

# Show the first ticket (truncated to 100 characters) for the first three
# categories, in the order those categories first appear in the data
print("\n📝 Sample Tickets:")
for category in dataset['category'].unique()[:3]:
    sample = dataset.loc[dataset['category'] == category, 'ticket_text'].iloc[0]
    print(f"\n{category}: {sample[:100]}...")

# Column dtypes / null counts, then numeric summary statistics
dataset.info()
dataset.describe()


In [None]:
# Exploratory Data Analysis: a 2x2 grid of overview charts
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
ax_category, ax_length = axes[0, 0], axes[0, 1]
ax_priority, ax_customer = axes[1, 0], axes[1, 1]

# Top-left: number of tickets per category
category_counts = dataset['category'].value_counts()
category_counts.plot(kind='bar', ax=ax_category, color='skyblue')
ax_category.set_title('Ticket Category Distribution')
ax_category.tick_params(axis='x', rotation=45)

# Top-right: histogram of raw ticket length in characters
# (this also adds a 'text_length' column to the dataset)
dataset['text_length'] = dataset['ticket_text'].str.len()
dataset['text_length'].hist(bins=50, ax=ax_length, color='lightgreen')
ax_length.set_title('Ticket Text Length Distribution')
ax_length.set_xlabel('Characters')

# Bottom-left: stacked priority counts within each category
priority_cat = pd.crosstab(dataset['category'], dataset['priority'])
priority_cat.plot(kind='bar', stacked=True, ax=ax_priority)
ax_priority.set_title('Priority Distribution by Category')
ax_priority.tick_params(axis='x', rotation=45)

# Bottom-right: share of tickets per customer type
customer_counts = dataset['customer_type'].value_counts()
customer_counts.plot(kind='pie', ax=ax_customer, autopct='%1.1f%%')
ax_customer.set_title('Customer Type Distribution')

plt.tight_layout()
plt.show()

# Mean / median / standard deviation of ticket length for each category
print("📊 Text Length Statistics by Category:")
length_by_category = dataset.groupby('category')['text_length']
print(length_by_category.agg(['mean', 'median', 'std']).round(2))

## 🔧 Feature Engineering & Text Preprocessing

In [None]:
import re

def preprocess_text(text):
    """Normalise raw ticket text before vectorisation.

    The text is lower-cased, every character that is not a word character
    (letters, digits, underscore), whitespace, or one of ``. , ! ?`` is
    replaced by a space, and runs of whitespace are collapsed to a single
    space with leading/trailing whitespace removed.

    Args:
        text: Raw ticket text. Non-string values (``None``, float NaN as
            pandas uses for missing data, ...) are treated as missing.

    Returns:
        The cleaned string, or ``''`` when the input is not a string.
    """
    # Missing values would otherwise fail on .lower() with AttributeError
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Replace special characters with a space. Digits are kept (\w matches
    # them), as is basic sentence punctuation for context.
    text = re.sub(r'[^\w\s\.\,\!\?]', ' ', text)

    # Collapse the whitespace left behind by the substitution above
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Clean every raw ticket and store the result next to the original text
raw_ticket_text = dataset['ticket_text']
dataset['processed_text'] = raw_ticket_text.apply(preprocess_text)

# Feature Engineering: Extract additional features
def extract_features(df):
    """Derive simple hand-crafted features from preprocessed ticket text.

    Args:
        df: DataFrame with a ``processed_text`` column. It is not modified;
            the features are added to a copy.

    Returns:
        A copy of ``df`` with these extra columns:

        * ``word_count`` - number of whitespace-separated tokens.
        * ``char_count`` - length of the text in characters (spaces and
          punctuation included).
        * ``avg_word_length`` - ``char_count / word_count``; ``0.0`` when
          the text has no words or is missing.
        * ``has_billing_keywords``, ``has_technical_keywords``,
          ``has_urgency`` - booleans, True when any keyword of the group
          occurs anywhere in the text (substring match, so "overcharged"
          counts as "charge"). Missing text yields False.
    """
    features = df.copy()
    text = features['processed_text']

    # Text features
    features['word_count'] = text.str.split().str.len()
    features['char_count'] = text.str.len()
    # Blank out zero word counts before dividing so empty text gives 0.0
    # rather than NaN (0/0) or inf (n/0).
    usable_word_count = features['word_count'].where(features['word_count'] > 0)
    features['avg_word_length'] = (
        features['char_count'] / usable_word_count
    ).fillna(0.0)

    # Telecoms-specific keywords
    billing_keywords = ['bill', 'payment', 'charge', 'cost', 'money', 'rand', 'debit', 'account']
    technical_keywords = ['internet', 'connection', 'speed', 'wifi', 'router', 'signal', 'slow']
    # Urgency indicators
    urgency_keywords = ['urgent', 'asap', 'immediately', 'emergency', 'critical']

    # na=False keeps the flag columns strictly boolean when text is missing
    features['has_billing_keywords'] = text.str.contains('|'.join(billing_keywords), na=False)
    features['has_technical_keywords'] = text.str.contains('|'.join(technical_keywords), na=False)
    features['has_urgency'] = text.str.contains('|'.join(urgency_keywords), na=False)

    return features

# Add the engineered columns; `dataset` itself is left as it was
dataset_enhanced = extract_features(dataset)

# Report which columns the feature step contributed
print("🔧 Feature Engineering Complete:")
print(f"   New features added: {set(dataset_enhanced.columns) - set(dataset.columns)}")
print(f"   Total features: {len(dataset_enhanced.columns)}")

# Preview the first rows of a few engineered columns next to the label
enhanced_sample = dataset_enhanced.head()[
    ['category', 'word_count', 'char_count', 'has_billing_keywords', 'has_technical_keywords']
]
print("\n📊 Enhanced Feature Sample:")
print(enhanced_sample)


## 🤖 Model Architecture & Training Pipeline

Now let's design and implement multiple model architectures to find the best performing approach.

In [None]:
# Model input is the cleaned text; the label is the ticket category
X, y = dataset_enhanced['processed_text'], dataset_enhanced['category']

# Step 1: hold out 30% of the data, stratified on the label
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Step 2: halve the hold-out into validation and test sets (15% each overall)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=RANDOM_STATE,
)

print("📊 Data Split Summary:")
print(f"   Training: {len(X_train):,} samples ({len(X_train)/len(X)*100:.1f}%)")
print(f"   Validation: {len(X_val):,} samples ({len(X_val)/len(X)*100:.1f}%)")
print(f"   Test: {len(X_test):,} samples ({len(X_test)/len(X)*100:.1f}%)")

# Per-class counts (sorted by class label) for each split, to confirm the
# stratification kept the class proportions
print("\n🎯 Class Distribution Verification:")
for split_label, split_target in (
    ("Training set:", y_train),
    ("Validation set:", y_val),
    ("Test set:", y_test),
):
    print(split_label, split_target.value_counts().sort_index().tolist())


In [None]:
import time

# Baseline pipeline: unigram + bigram TF-IDF features feeding a logistic
# regression classifier.
tfidf_lr_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        max_features=10000,
        ngram_range=(1, 2),
        stop_words='english',
        min_df=2,      # drop terms seen in fewer than 2 documents
        max_df=0.95    # drop terms seen in more than 95% of documents
    )),
    ('classifier', LogisticRegression(
        random_state=RANDOM_STATE,
        max_iter=1000,
        class_weight='balanced'
    ))
])

# Train, timing the fit with a monotonic high-resolution clock
print("🚀 Training Baseline Model (Logistic Regression + TF-IDF)...")
start_time = time.perf_counter()
tfidf_lr_pipeline.fit(X_train, y_train)
training_time = time.perf_counter() - start_time

# Time only the label prediction, so the per-sample latency reflects a single
# pass over the validation set rather than predict + predict_proba combined.
start_time = time.perf_counter()
y_pred_lr = tfidf_lr_pipeline.predict(X_val)
inference_time = (time.perf_counter() - start_time) / len(X_val)

# Class probabilities are computed outside the timed region
y_pred_proba_lr = tfidf_lr_pipeline.predict_proba(X_val)

# Performance metrics (validation set; the test set stays untouched here)
lr_accuracy = accuracy_score(y_val, y_pred_lr)

print("✅ Baseline Results:")
print(f"   Accuracy: {lr_accuracy:.4f}")
print(f"   Training time: {training_time:.2f} seconds")
print(f"   Avg inference time: {inference_time*1000:.2f} ms per sample")
print(f"   Target achieved: {'✅ YES' if lr_accuracy >= TARGET_ACCURACY else '❌ NO'}")

# Detailed classification report
print("\n📊 Detailed Classification Report:")
print(classification_report(y_val, y_pred_lr))
