In [1]:
import numpy as np
import pandas as pd
import os

import datasets
from datasets import load_dataset
from datasets import Dataset

In [2]:
# two installations required for hugging face implementation
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# importing the csv file
df = pd.read_csv("data/bias_data.csv")

# convert to hugging face dataset

dataset = Dataset.from_pandas(df)

In [4]:
# splitting into testing and training datasets
train_test_split = dataset.train_test_split(test_size=0.2)

In [5]:
import datasets
from datasets import load_dataset
# we are going to pull from the repository specific to this project that contains the testing and training datasets

train_test_split["train"][100]

{'Unnamed: 0': 2088,
 'attribute_in_window': True,
 'comment': 'my mom is a nurse. sometimes shes worked sixs in a week!',
 'phrase': 'my mom is a nurse. sometimes shes worked sixs',
 'bias_sent': '1',
 'bias_phrase': 1.0,
 'bias_type': 'gender'}

In [6]:
# using a tokenizer to process the text and include a padding and truncation strategy to handle any variable sequence
# is this necessary? Tokenization is a good method but not sure if this is the best function to utilize
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    return tokenizer(examples["comment"], padding="max_length", truncation=True)


tokenized_datasets = train_test_split.map(tokenize_function, batched=True)

Map:   0%|          | 0/9101 [00:00<?, ? examples/s]

Map:   0%|          | 0/2276 [00:00<?, ? examples/s]

In [7]:
# repeating the same for phrase as it is also a 'text' type
def tokenize_function(examples):
    return tokenizer(examples["phrase"], padding="max_length", truncation=True)


tokenized_datasets = train_test_split.map(tokenize_function, batched=True)

Map:   0%|          | 0/9101 [00:00<?, ? examples/s]

Map:   0%|          | 0/2276 [00:00<?, ? examples/s]

In [8]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

In [9]:
# the actual finetuning process
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# training hyperparameters
# using the default hyperparameters for now, but can change them depending on how well it works with our data
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./output",
    evaluation_strategy= "epoch",
    learning_rate = 9e-5,
    per_device_eval_batch_size= 8,
    per_device_train_batch_size= 8,
    num_train_epochs= 1,
    weight_decay= 0.01,
    #push_to_hub=True,
    max_steps=10
)



In [21]:
training_args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_use_gather_object=False,
evaluation_s

In [22]:
# evaluation metric

import numpy as np
import evaluate

metric = evaluate.load("accuracy")

In [23]:
# converting predictions to logits
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [29]:
# creating a custom compute loss function
import torch

def compute_loss(model, inputs):
    outputs = model(**inputs)
    logits = outputs.get("logits")
    labels = inputs.get("labels")
    # Assuming CrossEntropyLoss for classification
    loss_fn = torch.nn.CrossEntropyLoss()
    loss = loss_fn(logits, labels)
    return loss

In [31]:
# Training process
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_loss_func = compute_loss,
    compute_metrics=compute_metrics,
)

max_steps is given, it will override any value given in num_train_epochs


In [32]:
model.train()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [33]:
trainer.train()

  0%|          | 0/10 [00:00<?, ?it/s]

ValueError: The model did not return a loss from the inputs, only the following keys: logits. For reference, the inputs it received are input_ids,token_type_ids,attention_mask.