In [None]:

import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch

# 1. Load the dataset.
# Train on a random subset of at most 5000 rows. Capping at len(df) avoids the
# ValueError that DataFrame.sample raises when asked for more rows than exist,
# and the fixed random_state makes the chosen subset reproducible across runs.
df = pd.read_csv('train.csv')  # Ensure file name is correct
df = df.sample(n=min(5000, len(df)), random_state=42)
cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Collapse the per-category columns into one list of floats per row
# (multi-label classification: each row carries one value per entry in `cols`).
df['labels'] = df[cols].to_numpy(dtype=float).tolist()

# 2. Prepare the dataset.
# preserve_index=False keeps the shuffled pandas index from being stored as an
# extra `__index_level_0__` column in the resulting Dataset.
dataset = Dataset.from_pandas(df[['comment_text', 'labels']], preserve_index=False)
tokenizer = AutoTokenizer.from_pretrained("unitary/toxic-bert")

def tokenize_func(examples):
    """Tokenize the ``comment_text`` entries of a batch of examples.

    Every sequence is truncated and padded to exactly 128 tokens, using the
    module-level ``tokenizer``.
    """
    texts = examples["comment_text"]
    encoding_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(texts, **encoding_options)

# Apply the tokenizer to the whole dataset, one batch of rows at a time.
tokenized_dataset = dataset.map(tokenize_func, batched=True)

# 3. Load the model.
# The size of the classification head is derived from `cols` so that it always
# matches the length of the label vectors built from those columns.
model = AutoModelForSequenceClassification.from_pretrained(
    "unitary/toxic-bert",
    num_labels=len(cols),
    problem_type="multi_label_classification",
)

# 4. Training configuration (deliberately small values for a quick demo run).
training_args = TrainingArguments(
    # Where the Trainer writes its outputs; no checkpoints are saved during the run.
    output_dir="./results",
    save_strategy="no",
    report_to="none",
    # Force CPU execution; pinned host memory is switched off as well.
    use_cpu=True,
    dataloader_pin_memory=False,
    # One pass over the data, one example per step, with gradients
    # accumulated over 4 steps before each optimizer update.
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
)

# 5. Start training on the tokenized dataset with the settings defined above.
trainer = Trainer(model=model, args=training_args, train_dataset=tokenized_dataset)
print("Training starting...")
trainer.train()

# 6. Save the model.
# Model and tokenizer are written to the same folder so the directory can be
# loaded again as a single self-contained checkpoint.
model.save_pretrained("./my_moderation_model")
tokenizer.save_pretrained("./my_moderation_model")
# The check-mark emoji was stored as mis-decoded bytes; restored to the intended character.
print("✅ Model trained and saved in folder: ./my_moderation_model")

Map: 100%|██████████| 5000/5000 [00:00<00:00, 5102.16 examples/s]
Loading weights: 100%|██████████| 201/201 [00:00<00:00, 272.64it/s, Materializing param=classifier.weight]
BertForSequenceClassification LOAD REPORT from: unitary/toxic-bert
Key                          | Status     |  | 
-----------------------------+------------+--+-
bert.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


🚀 Training starting...


Step,Training Loss
500,0.279925
1000,0.256127


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.27it/s]

✅ Model trained and saved in folder: ./my_moderation_model





In [4]:
pip install torch

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip
