# Multilingual Content Safety Guardrail
This notebook fine-tunes a multilingual binary classifier that separates harmful requests from ordinary e-commerce queries, then wraps it in a guardrail function that blocks text classified as harmful.

In [None]:
# Install required packages
!pip install transformers torch pandas scikit-learn datasets accelerate>=0.26.0

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np

## Load Pre-trained Model
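We'll start from a multilingual BERT checkpoint that was originally fine-tuned for sentiment analysis and replace its classification head with a new two-class head (e-commerce vs. harmful). The new head starts untrained, so the model's predictions are only meaningful after the fine-tuning step below.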

In [None]:
# Load pre-trained multilingual model
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"

# Seed PyTorch before loading. The checkpoint's classification head does not
# match num_labels=2, so a new head is randomly initialised; without a fixed
# seed every run starts from different weights and results are not reproducible.
torch.manual_seed(42)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
    # Keep readable class names in the model config so they are stored with
    # the model when it is saved (0: e-commerce, 1: harmful).
    id2label={0: "E-commerce", 1: "Harmful"},
    label2id={"E-commerce": 0, "Harmful": 1},
    ignore_mismatched_sizes=True,  # Required when changing number of labels
)

## Sample Dataset
We'll create a small dataset with examples of e-commerce and harmful content in multiple languages.

In [None]:
# Sample dataset, grouped by class. Texts are in English, Bengali, Spanish and French.

# E-commerce examples (label 0)
ecommerce_texts = [
    "where can I find the best deals on laptops",
    "shopping cart checkout process",
    "best online shopping deals",
    "অনলাইনে জুতা কিনতে চাই",  # Want to buy shoes online (Bengali)
    "comprar zapatos en línea",  # Buy shoes online (Spanish)
    "acheter des vêtements en ligne",  # Buy clothes online (French)
]

# Harmful content examples (label 1)
harmful_texts = [
    "how to hack into someone's account",
    "ways to steal credit card information",
    "কিভাবে সিস্টেম হ্যাক করা যায়",  # How to hack system (Bengali)
    "ডেটা চুরি করার উপায়",  # Ways to steal data (Bengali)
    "cómo hackear una cuenta",  # How to hack an account (Spanish)
    "comment pirater un système",  # How to hack a system (French)
]

# E-commerce texts come first, then harmful ones; the labels are derived from
# the group sizes so texts and labels stay aligned. 0: e-commerce, 1: harmful
train_texts = ecommerce_texts + harmful_texts
train_labels = [0] * len(ecommerce_texts) + [1] * len(harmful_texts)

## Fine-tune the Model
We'll fine-tune the model on our dataset to classify content.

In [None]:
from transformers import Trainer, TrainingArguments
from datasets import Dataset

# Prepare dataset: tokenize all training texts in one call (truncated, and
# padded to a common length) and pair the encodings with their labels.
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels
})

# Training schedule
NUM_TRAIN_EPOCHS = 5
TRAIN_BATCH_SIZE = 4

# Size the warmup from the actual number of optimizer steps. With this tiny
# dataset there are only about 15 steps in total, so a fixed warmup of 100
# steps would leave the learning rate still ramping up from zero when training
# ends and the model would barely learn. Warm up for ~10% of the run instead.
# (Step count assumes a single device and no gradient accumulation.)
steps_per_epoch = -(-len(train_dataset) // TRAIN_BATCH_SIZE)  # ceiling division
total_train_steps = steps_per_epoch * NUM_TRAIN_EPOCHS
warmup_steps = max(1, total_train_steps // 10)

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    warmup_steps=warmup_steps,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

## Create a Simple Classifier Function
This function will classify new text as either e-commerce or harmful.

In [None]:
def classify_content(text):
    """Classify a text as e-commerce or harmful using the fine-tuned model.

    Uses the notebook-level ``model`` and ``tokenizer``. As a side effect the
    model is switched to evaluation mode.

    Args:
        text: The text to classify.

    Returns:
        dict: ``text`` (the input), ``classification`` ("E-commerce" for class 0,
        "Harmful" otherwise) and ``confidence`` (softmax probability of the
        predicted class, formatted as a percentage string).
    """
    # Training leaves the model in train mode; switch to eval so dropout is
    # disabled and repeated calls on the same text give the same prediction.
    model.eval()

    # Prepare the text for the model. The tokenizer returns CPU tensors, while
    # the Trainer may have moved the model to a GPU, so move the inputs to the
    # model's device to avoid a device-mismatch error.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    # Get model prediction
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(predictions, dim=-1).item()

    # Confidence is the probability assigned to the predicted class
    confidence = predictions[0][predicted_class].item()

    # Return classification and confidence
    return {
        "text": text,
        "classification": "E-commerce" if predicted_class == 0 else "Harmful",
        "confidence": f"{confidence:.2%}"
    }

## Test the Classifier
Let's test our classifier on some examples.

In [None]:
# Sample queries (English, Bengali, Spanish) mixing shopping and harmful intent
test_examples = [
    "I want to buy a new smartphone",
    "How to hack into a bank account",
    "Best deals on electronics",
    "আমি অনলাইনে কিনতে চাই",  # I want to buy online (Bengali)
    "কিভাবে পাসওয়ার্ড হ্যাক করা যায়",  # How to hack passwords (Bengali)
    "quiero comprar zapatos",  # I want to buy shoes (Spanish)
    "cómo robar información",  # How to steal information (Spanish)
]

# Classify each query and print a short report followed by a separator line
separator = "-" * 50
for query in test_examples:
    outcome = classify_content(query)
    print(
        f"Text: {outcome['text']}",
        f"Classification: {outcome['classification']}",
        f"Confidence: {outcome['confidence']}",
        separator,
        sep="\n",
    )

## Create a Content Safety Guardrail
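This function acts as a guardrail: it blocks a text when the model's probability for the harmful class reaches a configurable threshold (0.7 by default) and allows it otherwise.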

In [None]:
def content_safety_guardrail(text, threshold=0.7):
    """Filter content based on safety classification

    Uses the notebook-level ``model`` and ``tokenizer``. As a side effect the
    model is switched to evaluation mode.

    Args:
        text: The text to classify
        threshold: Confidence threshold for blocking content (default: 0.7).
            The text is blocked when the harmful-class probability is greater
            than or equal to this value.

    Returns:
        dict: ``text`` (the input), ``allowed`` (bool), ``reason`` (explanation
        string) and ``harmful_score`` (harmful-class probability formatted as
        a percentage string)
    """
    # Training leaves the model in train mode; switch to eval so dropout is
    # disabled and the score for a given text is stable across calls.
    model.eval()

    # The tokenizer returns CPU tensors, while the Trainer may have moved the
    # model to a GPU; move the inputs to the model's device before the forward pass.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.softmax(outputs.logits, dim=-1)
        harmful_score = predictions[0][1].item()  # Score for harmful class

    # Determine if content should be blocked
    blocked = harmful_score >= threshold
    if blocked:
        reason = f"Content classified as harmful with {harmful_score:.2%} confidence"
    else:
        reason = "Content appears to be safe"

    return {
        "text": text,
        "allowed": not blocked,
        "reason": reason,
        "harmful_score": f"{harmful_score:.2%}"
    }

# Run the guardrail on a few safe and harmful English queries
guardrail_examples = [
    "I want to buy a new phone online",
    "How to hack into someone's email",
    "Best deals on summer clothing",
    "How to steal credit card information"
]

# Print the decision for each query, followed by a separator line
divider = "-" * 50
for query in guardrail_examples:
    verdict = content_safety_guardrail(query)
    print(
        f"Text: {verdict['text']}",
        f"Allowed: {verdict['allowed']}",
        f"Reason: {verdict['reason']}",
        f"Harmful Score: {verdict['harmful_score']}",
        divider,
        sep="\n",
    )

## Save the Model
Save the fine-tuned model for future use.

In [None]:
# Write the fine-tuned model and then the tokenizer into the same directory
save_dir = './content_safety_model'
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)
print("Model saved successfully!")