# Text Classification Demo (Robust Version)

This notebook demonstrates the text classification capabilities of the NLP toolkit, with special handling to ensure compatibility with different dataset structures.

In [None]:
# Setup path to allow importing from the src directory
import sys
import os
from pathlib import Path

# Make the project root importable so the `src` package used below resolves.
# The root is taken to be the parent of the current working directory, so the
# notebook is expected to run from a direct subfolder of the project.
project_root = Path().resolve().parent
_root_entry = str(project_root)
# Remove any earlier copy first: notebook cells get re-run, and each run of a
# bare insert() would stack another duplicate entry onto sys.path. Re-inserting
# at index 0 keeps the project root at highest import priority.
sys.path[:] = [entry for entry in sys.path if entry != _root_entry]
sys.path.insert(0, _root_entry)

# Import toolkit modules
from src.data.preprocessing import TextPreprocessor
from src.data.data_loader import get_text_classification_loader
from src.models.classifier import TransformerClassifier
from src.training.metrics import classification_report
from src.utils.visualization import plot_confusion_matrix, plot_classification_metrics, plot_training_history

# Import standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset, DatasetDict
import torch

## 1. Configuration and Setup

In [None]:
# --- Run settings -----------------------------------------------------------
TASK = "classification"
DATASET_NAME = "imdb"  # movie-review sentiment corpus
MODEL_NAME = "distilbert-base-uncased"  # compact checkpoint, quicker to run
BATCH_SIZE = 16
MAX_LENGTH = 128
NUM_EPOCHS = 1  # a single pass is enough for a demonstration

# Folder that receives the model and results; created up front if absent
OUTPUT_DIR = str(project_root / "models" / "demo_classifier")
os.makedirs(OUTPUT_DIR, exist_ok=True)

## 2. Data Loading and Preprocessing

In [None]:
# Initialize tokenizer
# Loaded from the same checkpoint name (MODEL_NAME) that is later handed to the
# classifier, so text is encoded with the tokenizer belonging to that model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Initialize preprocessor
# Constructed with its defaults. In this notebook it is only applied to the
# hand-written sample reviews near the end, not to the training data.
preprocessor = TextPreprocessor()

In [None]:
# Load the dataset directly using the Hugging Face datasets library
print(f"Loading dataset: {DATASET_NAME}")
hf_dataset = load_dataset(DATASET_NAME)

# Summarise the available splits and how many examples each one holds
print(f"Dataset splits: {list(hf_dataset.keys())}")
for split, split_data in hf_dataset.items():
    print(f"  {split}: {len(split_data)} examples")

# Inspect the first training example to learn how records are laid out.
# This is purely diagnostic, so any failure (missing "train" split, empty
# split, unexpected record type) is reported rather than raised.
print("\nAnalyzing dataset structure:")
try:
    example = hf_dataset["train"][0]
    print(f"Example type: {type(example)}")

    if isinstance(example, dict):
        print(f"Example fields: {list(example.keys())}")

        # Accept either text field name this notebook knows about. Checking
        # only for 'text' made the 'sentence' fallback unreachable.
        text_key = next((k for k in ('text', 'sentence') if k in example), None)
        if text_key is None:
            print("No 'text' or 'sentence' field found")
        else:
            text = example[text_key]
            print(f"Text field type: {type(text)}")
            if isinstance(text, str):
                print(f"Text preview: {text[:50]}...")

        # The label is optional; show its value and type when present
        if 'label' in example:
            label = example['label']
            print(f"Label: {label} (type: {type(label)})")
    else:
        # Not a mapping: show whatever it is as text
        print(f"Example might be raw text. First 50 chars: {str(example)[:50]}...")
except Exception as e:
    print(f"Error analyzing dataset: {type(e).__name__}: {str(e)}")

In [None]:
# Dataset-level adapter: hands out wrapped splits so examples share one shape
class RobustDatasetWrapper:
    """Thin adapter around a container of named dataset splits.

    Looking up a split by its string name yields a ``RobustSplitWrapper``
    around that split; every other lookup is forwarded unchanged to the
    wrapped object.
    """

    def __init__(self, dataset):
        self.dataset = dataset

    def keys(self):
        """Return the keys of the wrapped dataset."""
        return self.dataset.keys()

    def __getitem__(self, key):
        """Return a wrapped split for a known split name, else the raw item."""
        names_a_split = isinstance(key, str) and key in self.dataset
        if not names_a_split:
            # Anything that is not a known split name goes straight through
            return self.dataset[key]
        return RobustSplitWrapper(self.dataset[key])
        
class RobustSplitWrapper:
    """Present one dataset split as a sequence of ``{'text', 'label'}`` dicts.

    Whatever the wrapped split returns for an index is normalized:

    * a dict gives its text field (first match in ``TEXT_KEYS``) and its
      ``'label'`` (0 when absent);
    * a string becomes the text, with label 0;
    * anything else is converted with ``str()``, with label 0.
    """

    # Field names tried, in order, when looking for the text of a dict record.
    # 'sentence' is the alternative name used elsewhere in this notebook.
    TEXT_KEYS = ('text', 'sentence')

    def __init__(self, split):
        self.split = split

    def __getitem__(self, idx):
        """Return the example at ``idx`` as a ``{'text': ..., 'label': ...}`` dict."""
        item = self.split[idx]

        if isinstance(item, dict):
            return {
                'text': self._extract_text(item),
                'label': item.get('label', 0),
            }
        if isinstance(item, str):
            return {'text': item, 'label': 0}
        # Unknown record type: fall back to its string form
        return {'text': str(item), 'label': 0}

    def _extract_text(self, item):
        """Return the text of a dict record, stringifying it only as a last resort."""
        for key in self.TEXT_KEYS:
            if key in item:
                return item[key]
        # Evaluated only when no text field exists. Passing str(item) as a
        # .get() default would build the string on every access, even when
        # the field is present.
        return str(item)

    def __len__(self):
        return len(self.split)

    def select(self, indices):
        """Return a wrapper around ``self.split.select(indices)``.

        Only usable when the wrapped split itself provides ``select``.
        Nothing in the visible notebook code calls this; it is kept so the
        wrapper can stand in where a split's ``select`` is expected.
        """
        return RobustSplitWrapper(self.split.select(indices))

# Put the adapter in front of the raw Hugging Face dataset
robust_dataset = RobustDatasetWrapper(hf_dataset)

# Sanity check: list the split names as seen through the adapter
print("\nTesting robust dataset wrapper:")
print(f"Splits: {list(robust_dataset.keys())}")

# Preview the first three training examples as the adapter returns them
print("\nExample data with wrapper:")
for i in range(3):
    example = robust_dataset["train"][i]
    print(f"Example {i+1}:")
    preview = example.get('text', example.get('sentence', str(example)))[:50]
    print(f"  Text: {preview}...")
    shown_label = example.get('label', 0)
    print(f"  Label: {shown_label}")

In [None]:
# Torch-facing dataset: tokenizes one example at a time, on demand
class CustomTextDataset(torch.utils.data.Dataset):
    """Map-style dataset turning ``{'text', 'label'}`` records into model inputs.

    Args:
        dataset_split: Indexable, sized collection whose items expose
            ``'text'`` and ``'label'`` keys.
        tokenizer: Callable applied to each text with ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors`` keywords; its
            result must offer ``.items()`` yielding tensors with a leading
            batch dimension.
        max_length: Length every example is truncated or padded to.
    """

    def __init__(self, dataset_split, tokenizer, max_length=128):
        self.dataset = dataset_split
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the tokenizer outputs for example ``idx`` plus a ``'labels'`` tensor."""
        record = self.dataset[idx]
        text = record['text']
        label = record['label']

        # Pad to a fixed length so every sample has identical tensor shapes
        tokenizer_options = dict(
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )
        batch_of_one = self.tokenizer(text, **tokenizer_options)

        # The tokenizer returns a batch of size one; drop that leading axis
        sample = {}
        for name, tensor in batch_of_one.items():
            sample[name] = tensor.squeeze(0)

        sample['labels'] = torch.tensor(label)
        return sample

# Wrap each split for PyTorch first, then batch it
train_dataset = CustomTextDataset(robust_dataset["train"], tokenizer, max_length=MAX_LENGTH)
test_dataset = CustomTextDataset(robust_dataset["test"], tokenizer, max_length=MAX_LENGTH)

# Only the training loader shuffles; the test loader keeps dataset order
train_dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
)

# Keyed by split name, for the cells below that look loaders up that way
dataloaders = dict(train=train_dataloader, test=test_dataloader)

print("DataLoaders created:")
for split, dataloader in dataloaders.items():
    batch_count = len(dataloader)
    print(f"  {split}: {batch_count} batches of size {BATCH_SIZE}")

## 3. Model Training

In [None]:
# Initialize the classifier
# num_labels=2 lines up with the two class names ("Negative", "Positive") used
# when results are reported further down.
classifier = TransformerClassifier(
    model_name=MODEL_NAME,
    num_labels=2,  # Binary classification for sentiment
    problem_type="single_label_classification"
)

# Display model information
# NOTE(review): get_model_size() is printed with a thousands separator, so it is
# presumably an integer parameter count — confirm against TransformerClassifier.
print(f"Model: {MODEL_NAME}")
print(f"Task: {TASK}")
print(f"Number of parameters: {classifier.get_model_size():,}")

In [None]:
# Train the model
# The return value is kept so it can be passed to plot_training_history().
# NOTE(review): the "test" loader doubles as the validation loader because this
# notebook builds no separate validation split; the evaluation cells below score
# the same split again, so those numbers are not from data held out of training-time
# validation.
training_history = classifier.train(
    train_dataloader=dataloaders["train"],
    val_dataloader=dataloaders["test"],
    num_epochs=NUM_EPOCHS,
    output_dir=OUTPUT_DIR
)

In [None]:
# Plot training history
# Visualizes whatever classifier.train() returned above.
# NOTE(review): with NUM_EPOCHS = 1 the history presumably holds a single epoch,
# so the plot may show only one point per curve — confirm.
plot_training_history(training_history)

## 4. Model Evaluation

In [None]:
# Evaluate on test set
test_results = classifier.evaluate(dataloaders["test"])

# Display results, one "name: value" line per returned metric
print("Test Results:")
for metric, value in test_results.items():
    try:
        rendered = f"{value:.4f}"
    except (TypeError, ValueError):
        # Entries that are not plain numbers (arrays, nested dicts, strings)
        # reject the float format spec. Print them unformatted instead of
        # aborting the report halfway through.
        rendered = str(value)
    print(f"  {metric}: {rendered}")

In [None]:
# Generate predictions and true labels
# NOTE: the name `predictions` is rebound by the sample-review cell in section 5.
# If cells are run out of order, re-run this cell before using `predictions`
# together with `true_labels` again.
predictions, true_labels = classifier.predict(dataloaders["test"])

# Calculate and display classification report
# target_names follows class-index order: 0 -> "Negative", 1 -> "Positive"
# (the same order as sentiment_labels in section 5).
report = classification_report(true_labels, predictions, target_names=["Negative", "Positive"])
print(report)

In [None]:
# Plot confusion matrix
# Uses the test-set `predictions` / `true_labels` produced by the previous cell;
# class_names is given in class-index order (0 -> "Negative", 1 -> "Positive").
plot_confusion_matrix(true_labels, predictions, class_names=["Negative", "Positive"])

## 5. Making Predictions on New Data

In [None]:
# A positive, a negative and a lukewarm review to try the model on
sample_reviews = [
    "This movie was fantastic! The acting was great and the plot was engaging.",
    "I was disappointed with this film. The story was predictable and the characters were one-dimensional.",
    "A decent movie, but nothing special. Some parts were good, others were mediocre.",
]

# Run every review through the toolkit preprocessor before classification
preprocessed_reviews = list(map(preprocessor.preprocess_text, sample_reviews))

# Classify the cleaned reviews
predictions = classifier.predict_text(preprocessed_reviews)

# Position in this list == predicted class index
sentiment_labels = ["Negative", "Positive"]

# Show each original (unprocessed) review next to its predicted sentiment
print("Prediction Results:")
for number, (review, prediction) in enumerate(zip(sample_reviews, predictions), start=1):
    print(f"\nReview {number}: {review[:100]}...")
    sentiment = sentiment_labels[prediction]
    print(f"Prediction: {sentiment} (class {prediction})")

## 6. Saving and Loading the Model

In [None]:
# Save the model and config
# The target is a "final_model" subfolder of OUTPUT_DIR; the same path is used
# by the next cell to load the model back.
save_path = os.path.join(OUTPUT_DIR, "final_model")
classifier.save(save_path)
print(f"Model saved to {save_path}")

In [None]:
# Load the model back from the path it was just saved to
loaded_classifier = TransformerClassifier.load(save_path)
print(f"Model loaded from {save_path}")

# Verify loaded model works by classifying the same preprocessed sample reviews
loaded_predictions = loaded_classifier.predict_text(preprocessed_reviews)

# Check if predictions match.
# `predictions` must still hold the sample-review predictions from section 5.
# zip() stops at the shorter sequence, so the lengths are compared first;
# otherwise a truncated or empty result would be reported as a match.
same_length = len(predictions) == len(loaded_predictions)
match = same_length and all(p1 == p2 for p1, p2 in zip(predictions, loaded_predictions))
print(f"Loaded model predictions match original: {match}")