In [30]:
!pip install transformers
!pip install torch





[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [34]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch
import pandas as pd
import numpy as np

# Training data: one comment per row plus the six toxicity label columns
dataset_path = 'demo.csv'
dataset = pd.read_csv(dataset_path)

# Lookup table of toxic words and their disguised spellings
toxic_words_path = 'toxic_words_with_variations.csv'
toxic_words_df = pd.read_csv(toxic_words_path)

# Build {variation -> original word}. Every column after the first is treated
# as a variation column; empty cells are skipped. Rows are visited in file
# order, so a variation listed twice keeps the mapping from its last row.
variation_columns = toxic_words_df.columns[1:]
euphemism_dict = {
    record[column]: record['Original Word']
    for _, record in toxic_words_df.iterrows()
    for column in variation_columns
    if pd.notna(record[column])
}

# Tokenizer matching the pretrained checkpoint used for the model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class ToxicDataset(Dataset):
    """Map-style dataset yielding tokenized comments with six float labels.

    Args:
        dataframe: Frame with a 'comment_text' column and the columns listed
            in ``TARGET_COLUMNS``. It is copied, never modified.
        tokenizer: Object exposing ``encode_plus(text, **kwargs)`` that returns
            a mapping with 'input_ids' and 'attention_mask' tensors.
        max_len: Length every encoded sequence is padded/truncated to.
        euphemisms: Optional ``{variation: base_word}`` mapping applied to each
            comment before tokenization. When omitted, the module-level
            ``euphemism_dict`` is looked up at item-access time.
    """

    TARGET_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    def __init__(self, dataframe, tokenizer, max_len, euphemisms=None):
        # reset_index returns a new frame, so the label coercion below does not
        # touch the caller's DataFrame, and rows become addressable by position
        # even if the incoming frame was filtered, shuffled or split.
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.euphemisms = euphemisms

        # Ensure the target columns are numeric; unparseable or missing values become 0.0
        columns = self.TARGET_COLUMNS
        self.data[columns] = (
            self.data[columns].apply(pd.to_numeric, errors='coerce').fillna(0).astype(float)
        )

    def __len__(self):
        """Return the number of rows (comments) in the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the encoded comment and label vector at position ``index``.

        Returns:
            dict with 1-D 'input_ids' and 'attention_mask' tensors (flattened
            tokenizer output) and a float32 'labels' tensor with one entry per
            target column.
        """
        # Positional lookup: samplers hand out 0..len-1, not index labels.
        row = self.data.iloc[index]

        # A missing comment is read by pandas as NaN (a float); treat it as empty text
        # instead of failing on str.replace below.
        comment_text = row['comment_text']
        comment_text = '' if pd.isna(comment_text) else str(comment_text)

        # Labels as float32, the dtype produced for the 'labels' entry
        targets = torch.tensor(
            row[self.TARGET_COLUMNS].to_numpy(dtype=np.float32),
            dtype=torch.float32,
        )

        # Replace euphemisms with the base toxic word (plain substring replacement,
        # applied in mapping order).
        mapping = euphemism_dict if self.euphemisms is None else self.euphemisms
        for euphemism, base_word in mapping.items():
            comment_text = comment_text.replace(euphemism, base_word)

        # Tokenize the text
        encoding = self.tokenizer.encode_plus(
            comment_text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,  # Ensure that text is truncated to max_len
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': targets  # Named 'labels' to match Trainer expectations
        }

# Configuration
MAX_LEN = 128
SEED = 42

# Seed torch before the model is built: the classifier head is not part of the
# 'bert-base-uncased' checkpoint and is freshly initialised, so without a seed
# its starting weights differ on every run.
torch.manual_seed(SEED)

# Initialize dataset
train_dataset = ToxicDataset(dataset, tokenizer, MAX_LEN)

# Model initialization. The six labels are independent float targets, so the
# multi-label objective is stated explicitly instead of being inferred from
# the label dtype on the first forward pass.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=6,
    problem_type='multi_label_classification',
)

# Training arguments
# NOTE(review): warmup_steps=500 is a large share of the run when the total
# step count is small -- confirm this is intended for this dataset size.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    seed=SEED,
)


def collate_batch(features):
    """Stack per-example tensors from ToxicDataset into batch tensors.

    Args:
        features: List of dicts, each with 'input_ids', 'attention_mask' and
            'labels' tensors of identical shape across examples.

    Returns:
        dict with the same three keys, each tensor gaining a leading batch dimension.
    """
    return {
        key: torch.stack([example[key] for example in features])
        for key in ('input_ids', 'attention_mask', 'labels')
    }


# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=collate_batch,
)

# Start training
trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,0.7547
20,0.728
30,0.658
40,0.5907
50,0.5442
60,0.5295
70,0.4974
80,0.468
90,0.4158
100,0.3303


TrainOutput(global_step=939, training_loss=0.12139153880433153, metrics={'train_runtime': 40204.0592, 'train_samples_per_second': 0.373, 'train_steps_per_second': 0.023, 'total_flos': 986504552741376.0, 'train_loss': 0.12139153880433153, 'epoch': 3.0})

In [60]:
# Persist the fine-tuned model and its tokenizer side by side in one directory
saved_model_dir = './saved_model'
model.save_pretrained(saved_model_dir)
tokenizer.save_pretrained(saved_model_dir)

# Reload both from disk so the inference code below runs against the saved
# artifacts rather than the in-memory training objects
model = BertForSequenceClassification.from_pretrained(saved_model_dir)
tokenizer = BertTokenizer.from_pretrained(saved_model_dir)

def preprocess_input(text, tokenizer, euphemism_dict, max_len):
    """Normalise euphemisms in ``text`` and encode it with ``tokenizer``.

    Args:
        text: Raw input string.
        tokenizer: Object exposing ``encode_plus(text, **kwargs)``.
        euphemism_dict: ``{variation: base_word}`` mapping; each variation is
            substring-replaced by its base word, in mapping order.
        max_len: Length the encoded sequence is padded/truncated to.

    Returns:
        Whatever ``tokenizer.encode_plus`` returns for the normalised text.
    """
    normalized = text
    for variant, canonical in euphemism_dict.items():
        normalized = normalized.replace(variant, canonical)

    # Fixed-length encoding without token type ids, returned as PyTorch tensors
    encode_options = dict(
        add_special_tokens=True,
        max_length=max_len,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    return tokenizer.encode_plus(normalized, **encode_options)

def predict_toxicity(text, model, tokenizer, euphemism_dict, max_len):
    """Classify ``text`` and describe which toxicity labels were detected.

    Args:
        text: Raw input string.
        model: Callable taking ``input_ids`` and ``attention_mask`` keyword
            tensors and returning an object with a ``logits`` tensor holding
            one score per label.
        tokenizer: Tokenizer passed through to ``preprocess_input``.
        euphemism_dict: ``{variation: base_word}`` mapping used to normalise
            the text and to report which variations were found.
        max_len: Length the encoded sequence is padded/truncated to.

    Returns:
        A message string: either the detected labels (followed by the
        euphemism replacements, if any were made) or a non-toxic notice.
    """
    model.eval()
    reasons = []  # Human-readable notes on each euphemism that was replaced

    # Replace euphemisms with the base toxic word, recording each hit
    for euphemism, base_word in euphemism_dict.items():
        if euphemism in text:
            reasons.append(f"Euphemism '{euphemism}' detected, replaced with '{base_word}'")
            text = text.replace(euphemism, base_word)

    # The text is already normalised, so pass an empty mapping: a second
    # substitution pass could rewrite freshly inserted base words and would
    # not match the single pass applied to the training data.
    encoding = preprocess_input(text, tokenizer, {}, max_len)

    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    # If the model object has a `device` attribute, run the inputs on that device.
    device = getattr(model, 'device', None)
    if device is not None:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

    # Independent per-label probabilities
    predictions = torch.sigmoid(outputs.logits).flatten().tolist()

    # Threshold for binary classification
    threshold = 0.5
    labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    detected_labels = [
        label for label, probability in zip(labels, predictions) if probability >= threshold
    ]

    if not detected_labels:
        return "The input text is classified as non-toxic."

    result = f"Detected toxicity: {', '.join(detected_labels)}"
    if reasons:
        result += f"\nReasons for toxicity: {', '.join(reasons)}"
    return result

# Example usage: classify one sentence with the reloaded model and print the verdict
user_input = "Jack is a bad boy "
result = predict_toxicity(
    text=user_input,
    model=model,
    tokenizer=tokenizer,
    euphemism_dict=euphemism_dict,
    max_len=MAX_LEN,
)
print(result)


Detected toxicity: toxic
