# Text Classification Demo

This notebook demonstrates the text classification capabilities of the NLP toolkit, including:
- Loading and preprocessing data
- Training a transformer-based classifier
- Evaluating model performance
- Visualizing results
- Making predictions on new data

In [None]:
# Setup path to allow importing from the src directory
import sys
import os
from pathlib import Path

# Add parent directory to path.
# Path().resolve() is the absolute current working directory, so the directory
# one level above it is treated as the project root.
# NOTE(review): presumably the notebook is launched from a subfolder of the
# repository (e.g. notebooks/) so that the `src` package becomes importable —
# confirm; started from elsewhere, the `src.*` imports below may not resolve.
project_root = Path().resolve().parent
# Insert at index 0 so this directory is searched before other sys.path entries.
sys.path.insert(0, str(project_root))

# Import toolkit modules
from src.data.preprocessing import TextPreprocessor
from src.data.data_loader import get_text_classification_loader
from src.models.classifier import TransformerClassifier
from src.training.metrics import classification_report
from src.utils.visualization import plot_confusion_matrix, plot_classification_metrics, plot_training_history

# Import standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer
from datasets import load_dataset

## 1. Configuration and Setup

In [None]:
# Experiment settings shared by the cells below
TASK = "classification"
DATASET_NAME = "imdb"  # movie-review sentiment corpus
MODEL_NAME = "distilbert-base-uncased"  # compact checkpoint keeps the demo quick
NUM_EPOCHS = 1  # a single pass is enough for demonstration purposes
BATCH_SIZE = 16
MAX_LENGTH = 128

# Directory for the model and results; kept as a plain string and created
# (including missing parents) if it does not exist yet
OUTPUT_DIR = str(project_root / "models" / "demo_classifier")
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

## 2. Data Loading and Preprocessing

In [None]:
# Initialize tokenizer for the pretrained checkpoint named by MODEL_NAME
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Initialize preprocessor with its default settings (no arguments are passed)
preprocessor = TextPreprocessor()

# Create dataset loader; it is given the tokenizer, the preprocessor and the
# MAX_LENGTH setting.
# NOTE(review): presumably max_length is the token truncation/padding length —
# confirm in src.data.data_loader.
dataset_loader = get_text_classification_loader(
    tokenizer=tokenizer,
    preprocessor=preprocessor,
    max_length=MAX_LENGTH
)

In [None]:
# Load the dataset directly using Hugging Face datasets; the result maps each
# split name to that split's data.
raw_dataset = load_dataset(DATASET_NAME)

# Display dataset information
print(f"Dataset: {DATASET_NAME}")
print(f"Number of splits: {len(raw_dataset)}")
for split_name, split_data in raw_dataset.items():
    print(f"  {split_name}: {len(split_data)} examples")

# Show example data.
# Slicing a split (`[:3]`) returns a dict of columns
# ({"text": [...], "label": [...]}), not a list of rows, so iterating the
# slice directly would yield the column names. Pair the columns back up
# row by row instead.
print("\nExample data:")
preview = raw_dataset["train"][:3]
for i, (text, label) in enumerate(zip(preview["text"], preview["label"]), start=1):
    print(f"Example {i}:")
    print(f"  Text: {text[:100]}...")
    print(f"  Label: {label}")

# The raw splits are passed on unchanged; the next cell hands `dataset` to the
# loader to build the PyTorch DataLoaders.
dataset = raw_dataset

In [None]:
# Create PyTorch DataLoaders for training and evaluation.
# The result is used below as a mapping from split name to dataloader.
# NOTE(review): presumably shuffle_train=True shuffles only the training
# split — confirm in the loader implementation.
dataloaders = dataset_loader.create_dataloaders(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle_train=True
)

# Summarize how many batches each split produced.
# NOTE(review): the message reports the configured BATCH_SIZE; the final batch
# of a split may be smaller depending on the loader's settings — confirm.
print("DataLoaders created:")
for split, dataloader in dataloaders.items():
    print(f"  {split}: {len(dataloader)} batches of size {BATCH_SIZE}")

## 3. Model Training

In [None]:
# Initialize the classifier
# NOTE(review): the classifier is built with task="binary", while the TASK
# constant printed below is "classification" — confirm which label is intended
# so the printed summary matches the actual configuration.
classifier = TransformerClassifier(
    model_name=MODEL_NAME,
    num_labels=2,  # Binary classification for sentiment
    task="binary"
)

# Display model information (the `:,` format spec adds thousands separators).
# NOTE(review): presumably get_model_size() returns the parameter count as a
# number — confirm.
print(f"Model: {MODEL_NAME}")
print(f"Task: {TASK}")
print(f"Number of parameters: {classifier.get_model_size():,}")

In [None]:
# Train the model for NUM_EPOCHS epochs and keep the returned history, which
# is plotted in the next cell.
# NOTE(review): the "test" split is passed as the validation set here and is
# also the split evaluated in section 4, so those scores do not come from data
# that was held out from training-time validation — consider carving out a
# separate validation split.
# NOTE(review): presumably training artifacts are written under OUTPUT_DIR —
# confirm in TransformerClassifier.train.
training_history = classifier.train(
    train_dataloader=dataloaders["train"],
    val_dataloader=dataloaders["test"],
    num_epochs=NUM_EPOCHS,
    output_dir=OUTPUT_DIR
)

In [None]:
# Plot training history returned by classifier.train() in the previous cell.
# NOTE(review): with NUM_EPOCHS = 1 there is presumably only one point per
# curve — confirm the plot is still informative.
plot_training_history(training_history)

## 4. Model Evaluation

In [None]:
# Evaluate on test set; the result is iterated below as metric name -> value
test_results = classifier.evaluate(dataloaders["test"])

# Display results, one metric per line with four decimal places.
# NOTE(review): the `:.4f` format assumes every value is a plain number —
# confirm evaluate() returns no non-scalar entries.
print("Test Results:")
for metric, value in test_results.items():
    print(f"  {metric}: {value:.4f}")

In [None]:
# Generate predictions and true labels for the test split
predictions, true_labels = classifier.predict(dataloaders["test"])

# Calculate and display classification report, naming the two classes
# "Negative" and "Positive". The same `report` object is passed to the
# metrics plot in a later cell.
# NOTE(review): assumes class index 0 is negative and 1 is positive — verify
# against the dataset's label encoding.
report = classification_report(true_labels, predictions, target_names=["Negative", "Positive"])
print(report)

In [None]:
# Plot confusion matrix of the test-split labels against the predictions,
# using the same class names as the classification report.
# Relies on `true_labels` and `predictions` from the test-set prediction cell.
plot_confusion_matrix(true_labels, predictions, class_names=["Negative", "Positive"])

In [None]:
# Plot classification metrics from the `report` produced by
# classification_report() above.
# NOTE(review): `report` is also printed directly; confirm it is in the form
# this plotting helper expects.
plot_classification_metrics(report)

## 5. Making Predictions on New Data

In [None]:
# Three hand-written reviews to classify: one positive, one negative, one mixed
sample_reviews = [
    "This movie was fantastic! The acting was great and the plot was engaging.",
    "I was disappointed with this film. The story was predictable and the characters were one-dimensional.",
    "A decent movie, but nothing special. Some parts were good, others were mediocre."
]

# Run every review through the preprocessor before it reaches the model
preprocessed_reviews = list(map(preprocessor.preprocess_text, sample_reviews))

# Class index -> human-readable sentiment name
sentiment_labels = ["Negative", "Positive"]

# Classify the preprocessed reviews
predictions = classifier.predict_text(preprocessed_reviews)

# Report each original (un-preprocessed) review next to its predicted sentiment
print("Prediction Results:")
for number, (review, prediction) in enumerate(zip(sample_reviews, predictions), start=1):
    print(f"\nReview {number}: {review[:100]}...")
    print(f"Prediction: {sentiment_labels[prediction]} (class {prediction})")

## 6. Saving and Loading the Model

In [None]:
# Save the model and config to a "final_model" location under OUTPUT_DIR.
# The same `save_path` is used by the next cell to load the model back.
# NOTE(review): presumably save() writes both weights and configuration —
# confirm in TransformerClassifier.save.
save_path = os.path.join(OUTPUT_DIR, "final_model")
classifier.save(save_path)
print(f"Model saved to {save_path}")

In [None]:
# Load the model back from the path written by the save cell
loaded_classifier = TransformerClassifier.load(save_path)
print(f"Model loaded from {save_path}")

# Verify loaded model works by re-running it on the same preprocessed reviews
loaded_predictions = loaded_classifier.predict_text(preprocessed_reviews)

# Check if predictions match. zip() stops at the shorter input, so compare the
# lengths first; otherwise a truncated result could be reported as a match.
match = len(predictions) == len(loaded_predictions) and all(
    p1 == p2 for p1, p2 in zip(predictions, loaded_predictions)
)
print(f"Loaded model predictions match original: {match}")