In [14]:
!pip install -U transformers datasets peft




In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("/content/cellula toxic data (1).csv")
df.drop_duplicates().shape



(2027, 3)

In [2]:
#converting each Toxic Category into a number
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df["label"] = le.fit_transform(df["Toxic Category"])
label_map = {label: idx for idx, label in enumerate(le.classes_)}
label_map



{'Child Sexual Exploitation': 0,
 'Elections': 1,
 'Non-Violent Crimes': 2,
 'Safe': 3,
 'Sex-Related Crimes': 4,
 'Suicide & Self-Harm': 5,
 'Unknown S-Type': 6,
 'Violent Crimes': 7,
 'unsafe': 8}

In [3]:
#Splitting the dataset into 80% training data and 20% test data
from sklearn.model_selection import train_test_split
df["text"] = df["query"] + " " + df["image descriptions"]
x_train, x_test, y_train, y_test = train_test_split(
    df["text"] ,
    df["label"],
    test_size=0.2,
    random_state=42,
    stratify=df["label"] #splitting the data while preserving the class distribution instead of splitting randomly (for F1 score)
)


In [4]:
#converting pandas dataframes into dataset for the transformer model
from datasets import Dataset

train_dataset = Dataset.from_dict({
    "text": x_train.tolist(), #Dataset expects pure python lists
    "label": y_train.tolist()
})

test_dataset = Dataset.from_dict({
    "text": x_test.tolist(),
    "label": y_test.tolist()
})


In [8]:
from transformers import AutoTokenizer

model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name) #loading the same tokenizer it was trained with

def tokenize_function(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)
#transformers use PyTorch tensors (converting the lists to tensors)
train_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"]
)

test_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "label"]
)

# sample = train_dataset[0]
# sample



Map:   0%|          | 0/2400 [00:00<?, ? examples/s]

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

In [9]:
#Adding a classification Head for pretrained DistilBERT
from transformers import AutoModelForSequenceClassification

num_labels = len(label_map)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels
)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_transform.bias    | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
pre_classifier.weight   | MISSING    | 
classifier.bias         | MISSING    | 
classifier.weight       | MISSING    | 
pre_classifier.bias     | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


In [10]:
#LoRA (freezes BERT, trains small adapters)
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=8, #rank(size of LoRA adapter)
    lora_alpha=16, #how strongly the LoRA update affects the model
    target_modules=["q_lin", "v_lin"],  # DistilBERT attention(attach adapters inside self-attention)
    lora_dropout=0.1, #applies only on LoRA adapters
    bias="none", #only train LoRA weights(do not train bias terms)
    task_type="SEQ_CLS" #(Sequence Classification task)
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 744,969 || all params: 67,705,362 || trainable%: 1.1003


In [11]:
import numpy as np
from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred #tuple provided by the trainer
    predictions = np.argmax(logits, axis=1) #to find the index of the largest logit for each sample
    return {
        "macro_f1": f1_score(labels, predictions, average="macro")
    }


In [27]:
from transformers import TrainingArguments
from transformers import Trainer
training_args = TrainingArguments(
    output_dir="./bonus_results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=15,
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",
    greater_is_better=True,
    report_to="none"
)

#Training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

Epoch,Training Loss,Validation Loss,Macro F1
1,No log,0.143396,0.947809
2,No log,0.134816,0.947809
3,No log,0.143424,0.947809
4,0.112349,0.13294,0.947809
5,0.112349,0.138876,0.947809
6,0.112349,0.131326,0.947809
7,0.110692,0.13722,0.947809
8,0.110692,0.142725,0.947809
9,0.110692,0.137442,0.947809
10,0.105999,0.140899,0.947809


TrainOutput(global_step=2250, training_loss=0.10748051876491971, metrics={'train_runtime': 299.1825, 'train_samples_per_second': 120.328, 'train_steps_per_second': 7.52, 'total_flos': 1212952320000000.0, 'train_loss': 0.10748051876491971, 'epoch': 15.0})

In [28]:
results = trainer.evaluate()
print(results)


{'eval_loss': 0.14339616894721985, 'eval_macro_f1': 0.9478090482858236, 'eval_runtime': 2.1433, 'eval_samples_per_second': 279.945, 'eval_steps_per_second': 17.73, 'epoch': 15.0}
