<a href="https://colab.research.google.com/github/Kent-mak/Anti-Hate-dashboard/blob/filter_model/Civil%20Regression%20BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch datasets transformers peft pickle

Define Model with LoRA for regression

In [None]:
import torch
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device_name = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using {}.".format(device_name))

Using cuda.


In [None]:

import torch.nn as nn


class CustomRegressionModel(nn.Module):
    """Encoder followed by a linear head that predicts ``num_labels`` scores.

    Args:
        base_model: Encoder module exposing ``config.hidden_size``. It is called
            with ``input_ids`` and ``attention_mask``, and its first output is
            indexed with ``[:, 0]`` to get one vector per example.
        num_labels: Number of regression outputs produced by the head.
    """

    def __init__(self, base_model, num_labels=7):
        super().__init__()
        self.model = base_model
        self.dropout = nn.Dropout(0.1)
        hidden_size = self.model.config.hidden_size
        self.regressor = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Compute the regression outputs and, when labels are given, the MSE loss.

        Returns:
            ``logits`` alone when ``labels`` is None, otherwise the tuple
            ``(loss, logits)`` where ``loss`` is the mean squared error
            between ``logits`` and ``labels``.
        """
        encoder_outputs = self.model(input_ids, attention_mask=attention_mask)
        # Position 0 along the second axis of the first encoder output.
        first_position = encoder_outputs[0][:, 0]
        logits = self.regressor(self.dropout(first_position))

        if labels is None:
            return logits

        mse = nn.MSELoss()
        return mse(logits, labels), logits


In [None]:
from peft import LoraConfig, get_peft_model
from transformers import AutoModel

# Pretrained encoder that receives the LoRA adapters.
model_name = "distilbert-base-uncased"
base_model = AutoModel.from_pretrained(model_name)

# LoRA hyperparameters and the modules the adapters are applied to.
lora_config = LoraConfig(
    target_modules=[
        "attention.q_lin",
        "attention.k_lin",
        "attention.v_lin",
    ],
    r=4,               # rank of the low-rank matrices
    lora_alpha=32,     # scaling factor
    lora_dropout=0.1,  # dropout probability for the LoRA layers
)

# NOTE: get_peft_model injects the adapters into `base_model` in place.
model_with_lora = get_peft_model(base_model, lora_config)


In [None]:
# Tokenizer for the checkpoint named by `model_name`
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Put the regression head on top of the LoRA-adapted encoder
model = CustomRegressionModel(base_model=model_with_lora)

Work with the dataset

In [None]:
from datasets import load_dataset

# Load only the first 1% of each civil_comments split.
training_set, validation_set, test_set = (
    load_dataset('google/civil_comments', split=split_spec)
    for split_spec in ('train[:1%]', 'validation[:1%]', 'test[:1%]')
)
print(training_set)

Dataset({
    features: ['text', 'toxicity', 'severe_toxicity', 'obscene', 'threat', 'insult', 'identity_attack', 'sexual_explicit'],
    num_rows: 18049
})


In [None]:
# Define the preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of comments and attach its regression targets.

    Args:
        examples: Batch mapping holding a 'text' column plus the seven score
            columns listed in ``score_columns``.

    Returns:
        The tokenizer output with an added "labels" entry: a float32 tensor
        with one row per example and one column per score, in the order of
        ``score_columns``.
    """
    score_columns = (
        'toxicity',
        'severe_toxicity',
        'obscene',
        'threat',
        'insult',
        'identity_attack',
        'sexual_explicit',
    )
    encoded = tokenizer(examples['text'], padding='max_length', truncation=True)
    # Transpose the per-column lists into one tuple of scores per example.
    per_example_scores = list(zip(*(examples[column] for column in score_columns)))
    encoded["labels"] = torch.tensor(per_example_scores, dtype=torch.float32)
    return encoded

# Tokenize every split in batches, dropping the original columns so that only
# the tokenizer outputs and the "labels" entry remain.
encoded_training_set, encoded_validation_set, encoded_test_set = (
    split.map(preprocess_function, batched=True, remove_columns=split.column_names)
    for split in (training_set, validation_set, test_set)
)
print(encoded_training_set)



Map:   0%|          | 0/18049 [00:00<?, ? examples/s]

Map:   0%|          | 0/973 [00:00<?, ? examples/s]

Map:   0%|          | 0/973 [00:00<?, ? examples/s]

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 18049
})


Train the model

In [None]:
from transformers import Trainer, TrainingArguments

class CustomTrainer(Trainer):
    """Trainer whose loss comes straight from the model's own forward pass.

    The model is expected to return a ``(loss, logits)`` tuple when called
    with labels, as ``CustomRegressionModel.forward`` does.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run one forward pass and return the regression loss.

        Args:
            model: Module called with ``input_ids``, ``attention_mask`` and
                ``labels``; must return ``(loss, logits)``.
            inputs: Batch mapping with "input_ids", "attention_mask" and
                "labels". The Trainer has already moved it to its device.
            return_outputs: When True, also return the model outputs.
            num_items_in_batch: Unused. Accepted because newer transformers
                releases pass this keyword to ``compute_loss``.

        Returns:
            ``loss`` alone, or ``(loss, outputs)`` with
            ``outputs == (loss, logits)`` when ``return_outputs`` is True.
        """
        # No per-step .to(device): the Trainer places the model on its device
        # at construction and moves each batch there before calling this hook.
        outputs = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            labels=inputs["labels"],
        )
        loss = outputs[0]
        # Trainer.prediction_step slices non-dict outputs with `outputs[1:]` to
        # drop the loss. Returning the bare logits tensor here would instead
        # drop the first example of every evaluation batch.
        return (loss, outputs) if return_outputs else loss


In [None]:


# Hyperparameters for the fine-tuning run; evaluation happens after every epoch.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
)

# Train on the encoded training split, validating on the validation split.
trainer = CustomTrainer(
    args=training_args,
    model=model,
    eval_dataset=encoded_validation_set,
    train_dataset=encoded_training_set,
)

# Fine-tune, then score the held-out test split.
trainer.train()
test_results = trainer.evaluate(eval_dataset=encoded_test_set)
print(f"Test Results: {test_results}")

Epoch,Training Loss,Validation Loss
1,0.0165,0.01135
2,0.011,0.010805
3,0.0106,0.010633


Test Results: {'eval_loss': 0.012133462354540825, 'eval_runtime': 9.7315, 'eval_samples_per_second': 99.984, 'eval_steps_per_second': 3.186, 'epoch': 3.0}


In [None]:
# Baseline: an untrained regression head on a freshly loaded pretrained encoder.
# get_peft_model() injects the LoRA layers into `base_model` in place, so by now
# `base_model` carries the fine-tuned adapters. Reusing it would compare the
# fine-tuned encoder against itself instead of against the pretrained one.
pristine_base_model = AutoModel.from_pretrained(model_name)
baseRegressionModel = CustomRegressionModel(pristine_base_model)

training_args = TrainingArguments(
    output_dir="./base_results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Evaluation-only trainer: no train/eval datasets are attached because the
# baseline is never trained; the test split is passed to evaluate() directly.
base_trainer = CustomTrainer(
    model=baseRegressionModel,
    args=training_args
)

base_test_results = base_trainer.evaluate(encoded_test_set)
print(f"Test Results: {base_test_results}")

Test Results: {'eval_loss': 0.04703187197446823, 'eval_runtime': 9.7189, 'eval_samples_per_second': 100.114, 'eval_steps_per_second': 3.19}


In [None]:
# Persist the parameters of the fine-tuned model (encoder, adapters and head).
torch.save(obj=model.state_dict(), f="model.pt")

cp: -r not specified; omitting directory '/content/results'
