# Text Classification Demo

This notebook demonstrates the text classification capabilities of the NLP toolkit, including:
- Loading and preprocessing data
- Training a transformer-based classifier
- Evaluating model performance
- Visualizing results
- Making predictions on new data

In [None]:
# Setup path to allow importing from the src directory
import sys
import os
from pathlib import Path

# Add parent directory to path.
# This assumes the kernel's working directory is a direct child of the project
# root (e.g. a notebooks/ folder); adjust if the notebook is launched elsewhere.
project_root = Path().resolve().parent
_root_entry = str(project_root)
# Remove any earlier copy first so that re-running this cell leaves exactly one
# entry, still at the front of sys.path where it takes import priority.
sys.path[:] = [entry for entry in sys.path if entry != _root_entry]
sys.path.insert(0, _root_entry)

# Import toolkit modules
from src.data.preprocessing import TextPreprocessor
from src.data.data_loader import get_text_classification_loader
from src.models.classifier import TransformerClassifier
from src.training.metrics import classification_report
from src.utils.visualization import plot_confusion_matrix, plot_classification_metrics, plot_training_history

# Import standard libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from transformers import AutoTokenizer, set_seed

## 1. Configuration and Setup

In [None]:
# Configuration: the values a reader is most likely to tune live in this cell.
TASK = "classification"  # not referenced by the other cells shown in this notebook
MODEL_NAME = "distilbert-base-uncased"  # Smaller model for faster execution
DATASET_NAME = "imdb"  # Movie reviews sentiment dataset
MAX_LENGTH = 128  # forwarded as max_length to the dataset loader
BATCH_SIZE = 16
NUM_EPOCHS = 1  # Using just 1 epoch for demonstration purposes
SEED = 42

# Seed the Python, NumPy and PyTorch random number generators up front so that
# repeated Restart & Run All executions start from the same random state.
set_seed(SEED)

# Output directory for model and results
OUTPUT_DIR = os.path.join(project_root, "models", "demo_classifier")
os.makedirs(OUTPUT_DIR, exist_ok=True)

## 2. Data Loading and Preprocessing

In [None]:
# Tokenizer is loaded from the same checkpoint name that the classifier uses later
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Text preprocessor with the toolkit's default settings
preprocessor = TextPreprocessor()

# Bundle tokenizer, preprocessor and sequence length into a single loader object
loader_options = {
    "tokenizer": tokenizer,
    "preprocessor": preprocessor,
    "max_length": MAX_LENGTH,
}
dataset_loader = get_text_classification_loader(**loader_options)

In [None]:
# Load the dataset
dataset = dataset_loader.load_huggingface_dataset(
    dataset_name=DATASET_NAME,
    text_column="text",
    label_column="label"
)

# Display dataset information
print(f"Dataset: {DATASET_NAME}")
print(f"Number of splits: {len(dataset.keys())}")
for split in dataset.keys():
    print(f"  {split}: {len(dataset[split])} examples")

# Show example data.
# Rows are fetched one at a time by integer index. Slicing a Hugging Face
# Dataset (dataset["train"][:3]) returns a dict of columns, so iterating over
# the slice would yield column names instead of examples.
# NOTE(review): assumes integer indexing on a split returns a mapping with
# "text" and "label" keys — confirm against load_huggingface_dataset.
print("\nExample data:")
train_split = dataset["train"]
for i in range(min(3, len(train_split))):
    example = train_split[i]
    print(f"Example {i+1}:")
    print(f"  Text: {example['text'][:100]}...")
    print(f"  Label: {example['label']}")

In [None]:
# Run the loader's preprocessing over the dataset loaded above
preprocessed_dataset = dataset_loader.preprocess_dataset(dataset)

# Wrap the preprocessed splits in PyTorch DataLoaders, keyed by split name
dataloaders = dataset_loader.create_torch_dataloaders(preprocessed_dataset, batch_size=BATCH_SIZE)

# NOTE(review): the "test" split doubles as the validation set here, so metrics
# computed on it later are not an unbiased estimate — consider a held-out split.
train_dataloader, val_dataloader = dataloaders["train"], dataloaders["test"]

# Batch counts per loader
print(f"Training batches: {len(train_dataloader)}")
print(f"Validation batches: {len(val_dataloader)}")

## 3. Model Training

In [None]:
# Build the classifier from the same checkpoint name used for the tokenizer;
# two output labels because IMDB sentiment is a binary task
classifier = TransformerClassifier(model_name=MODEL_NAME, num_labels=2)

# Report which checkpoint was used and the size reported by the classifier
parameter_count = classifier.get_model_size()
print(f"Model: {MODEL_NAME}")
print(f"Number of parameters: {parameter_count:,}")

In [None]:
# Optimisation and checkpointing settings, gathered in one place for easy tweaking
train_settings = dict(
    num_epochs=NUM_EPOCHS,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_steps=0,
    output_dir=OUTPUT_DIR,
    save_best=True,
)

# Fine-tune on the training loader, validating against val_dataloader;
# the returned history is plotted in the next cell
training_history = classifier.train(
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    **train_settings,
)

In [None]:
# Visualize training history.
# `training_history` is the value returned by classifier.train(...) in the
# previous cell; its structure is defined by the toolkit, not by this notebook.
plot_training_history(training_history)

## 4. Model Evaluation

In [None]:
# Score the trained classifier on the validation loader
eval_results = classifier.evaluate(val_dataloader)

# Report each metric with four decimals: (display label, key in eval_results)
print("Model Evaluation Results:")
metric_rows = (
    ("Loss", "loss"),
    ("Accuracy", "accuracy"),
    ("Precision", "precision"),
    ("Recall", "recall"),
    ("F1 Score", "f1"),
)
for metric_label, metric_key in metric_rows:
    print(f"  {metric_label}: {eval_results[metric_key]:.4f}")

In [None]:
# Compute predictions and true labels for every validation example.
# (torch is imported in the notebook's top import cell rather than here, so all
# dependencies are declared in one place.)
all_predictions = []  # predicted class id per example
all_labels = []       # ground-truth class id per example
all_probas = []       # per-class softmax probabilities per example

device = classifier.device
model = classifier.model
model.eval()  # switch the underlying model to evaluation mode before inference

# Gradients are not needed for inference; disabling them saves memory and time
with torch.no_grad():
    for batch in val_dataloader:
        # Each batch must be a dict of tensors that includes a "labels" entry
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits

        # dim=1 is the class axis of the (batch, classes) logits
        probas = torch.softmax(logits, dim=1).cpu().numpy()
        predictions = torch.argmax(logits, dim=1).cpu().numpy()
        labels = batch["labels"].cpu().numpy()

        # extend() appends one entry per example, flattening across batches
        all_predictions.extend(predictions)
        all_labels.extend(labels)
        all_probas.extend(probas)

In [None]:
# Plot confusion matrix
# Display names indexed by label id; also reused by the prediction cells below.
# NOTE(review): assumes id 0 = negative and id 1 = positive in the loaded
# dataset — confirm against its label encoding.
class_names = ["Negative", "Positive"]
plot_confusion_matrix(
    y_true=all_labels,
    y_pred=all_predictions,
    class_names=class_names
)

In [None]:
# Plot classification metrics (ROC curve, precision-recall curve)
# y_proba stacks the per-example softmax rows collected in the prediction loop
# into a single 2-D array: one row per example, one column per class.
plot_classification_metrics(
    y_true=all_labels,
    y_pred=all_predictions,
    y_proba=np.array(all_probas),
    class_names=class_names
)

## 5. Making Predictions on New Data

In [None]:
# Hand-written reviews: two clearly positive, two clearly negative, one lukewarm
sample_texts = [
    "This movie was fantastic! The acting was superb and the plot kept me engaged throughout.",
    "I really enjoyed this film. Great performances by the cast.",
    "What a waste of time. The story made no sense and the special effects were terrible.",
    "This is one of the worst movies I've ever seen. Boring and predictable.",
    "The movie was just okay. Some good moments but overall pretty average."
]

# Predicted class ids and per-class probabilities for the same inputs
predictions = classifier.predict(sample_texts)
probabilities = classifier.predict_proba(sample_texts)

# Report each text (truncated to 50 characters) with its predicted sentiment
print("Prediction Results:")
results = zip(sample_texts, predictions, probabilities)
for number, (text, pred, proba) in enumerate(results, start=1):
    sentiment = class_names[pred]
    # Confidence is the probability of the predicted class, as a percentage
    confidence = proba[pred] * 100
    print(f"\nText {number}: {text[:50]}...")
    print(f"Prediction: {sentiment} (Confidence: {confidence:.2f}%)")
    print(f"Class probabilities: Negative={proba[0]:.4f}, Positive={proba[1]:.4f}")

## 6. Save and Load Model

In [None]:
# Save the model
# The target lives under OUTPUT_DIR, the same directory passed to
# classifier.train(...) as output_dir; save_path is reused by the load cell below.
save_path = os.path.join(OUTPUT_DIR, "final_model")
classifier.save(save_path)
print(f"Model saved to {save_path}")

In [None]:
# Load the model
# Restores a classifier from the path written by the save cell above.
loaded_classifier = TransformerClassifier.load(save_path)
print("Model loaded successfully")

# Verify with a prediction
# NOTE(review): this only shows that the reloaded model can run inference; it
# does not compare the result against the in-memory classifier's prediction.
test_text = "I absolutely loved this movie, would watch it again!"
# predict() takes a list of texts; [0] picks the result for the single input
pred = loaded_classifier.predict([test_text])[0]
print(f"Test prediction: {class_names[pred]}")

## 7. Conclusion

In this notebook, we demonstrated the text classification capabilities of the NLP toolkit:

1. We loaded and preprocessed the IMDB dataset for sentiment analysis
2. We trained a DistilBERT classifier on the dataset
3. We evaluated model performance and visualized the results
4. We made predictions on new text samples
5. We saved and loaded the model for future use

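Even with a single training epoch, the metrics reported in Section 4 give a first indication of how effective transformer models are for text classification tasks. Note that this demo uses the IMDB test split both as the validation set during training and for the final evaluation, so those metrics are not an unbiased estimate of generalization. For production use, hold out a separate validation split, and consider training for more epochs and trying different model architectures or hyperparameters to optimize performance.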