In [1]:
import numpy as np
import pandas as pd
import os

import datasets
from datasets import load_dataset
from datasets import Dataset

In [2]:
# two installations required for hugging face implementation
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
# importing the csv file
df = pd.read_csv("data/bias_data.csv")

# convert to hugging face dataset

dataset = Dataset.from_pandas(df)

In [5]:
import torch
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
device

device(type='mps')

In [6]:
dataset

Dataset({
    features: ['Unnamed: 0', 'attribute_in_window', 'comment', 'phrase', 'bias_sent', 'bias_phrase', 'bias_type'],
    num_rows: 11377
})

In [7]:
dataset = dataset.rename_column("bias_phrase", "label")

In [8]:
# splitting into testing and training datasets
train_test_split = dataset.train_test_split(test_size=0.2)

In [10]:
import datasets
from datasets import load_dataset
# we are going to pull from the repository specific to this project that contains the testing and training datasets

train_test_split["train"][100]

{'Unnamed: 0': 2012,
 'attribute_in_window': True,
 'comment': 'meanwhile the pa complains that israeli doctors wont treat patients cheaper in their wb towns. no matter what we do or dont do they will make it look like jews are the devil.',
 'phrase': 'do they will make it look like jews are the devil.',
 'bias_sent': '0',
 'label': 0.0,
 'bias_type': 'religion'}

In [11]:
# using a tokenizer to process the text and include a padding and truncation strategy to handle any variable sequence
# is this necessary? Tokenization is a good method but not sure if this is the best function to utilize
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    return tokenizer(examples["comment"], padding="max_length", truncation=True)


tokenized_datasets = train_test_split.map(tokenize_function, batched=True)

Map:   0%|          | 0/9101 [00:00<?, ? examples/s]

Map:   0%|          | 0/2276 [00:00<?, ? examples/s]

In [12]:
# repeating the same for phrase as it is also a 'text' type
def tokenize_function(examples):
    return tokenizer(examples["phrase"], padding="max_length", truncation=True)


tokenized_datasets = train_test_split.map(tokenize_function, batched=True)

Map:   0%|          | 0/9101 [00:00<?, ? examples/s]

Map:   0%|          | 0/2276 [00:00<?, ? examples/s]

In [25]:
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

In [14]:
# the actual finetuning process
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained("bert-base-cased", num_labels=1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# training hyperparameters
# using the default hyperparameters for now, but can change them depending on how well it works with our data
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./output",
    evaluation_strategy= "epoch",
    learning_rate = 9e-5,
    per_device_eval_batch_size= 2,
    per_device_train_batch_size= 1,
    num_train_epochs= 1,
    weight_decay= 0.01,
    #push_to_hub=True,
    max_steps=10
)



In [16]:
training_args


TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_

In [26]:
def model_init():
    return model.to(device)

In [18]:
# evaluation metric

import numpy as np
import evaluate

metric = evaluate.load("accuracy")

In [19]:
# converting predictions to logits
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [20]:


def optuna_hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32, 64, 128]),
    }

In [21]:
import sklearn.ensemble
import sklearn.svm


def objective(trial):
    classifier_name = trial.suggest_categorical("classifier", ["SVC", "RandomForest"])
    if classifier_name == "SVC":
        svc_c = trial.suggest_float("svc_c", 1e-10, 1e10, log=True)
        classifier_obj = sklearn.svm.SVC(C=svc_c)
    else:
        rf_max_depth = trial.suggest_int("rf_max_depth", 2, 32, log=True)
        classifier_obj = sklearn.ensemble.RandomForestClassifier(max_depth=rf_max_depth)

In [27]:
# Training process
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

max_steps is given, it will override any value given in num_train_epochs


In [28]:
# Using the hyperparameter training class
best_trials = trainer.hyperparameter_search(
    direction="maximize",
    #backend="optuna",
    hp_space=optuna_hp_space,
   # n_trials=20,
    #compute_objective=objective,
)

[I 2024-11-30 15:44:57,920] A new study created in memory with name: no-name-b3e47b36-6b47-4b7c-bfc6-95a06638e470


  0%|          | 0/10 [00:00<?, ?it/s]

[W 2024-11-30 15:45:00,022] Trial 0 failed with parameters: {'learning_rate': 2.3999314086754196e-05, 'per_device_train_batch_size': 64} because of the following error: RuntimeError('MPS backend out of memory (MPS allocated: 17.02 GB, other allocations: 1.02 GB, max allowed: 18.13 GB). Tried to allocate 96.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).').
Traceback (most recent call last):
  File "/Users/sanikadeshmukh/Desktop/VSCodeProjects/Meta1b-Project/venv/lib/python3.11/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/Users/sanikadeshmukh/Desktop/VSCodeProjects/Meta1b-Project/venv/lib/python3.11/site-packages/transformers/integrations/integration_utils.py", line 248, in _objective
    trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
  File "/Users/sanikadeshmukh/Desktop/VSCodeProjects/

RuntimeError: MPS backend out of memory (MPS allocated: 17.02 GB, other allocations: 1.02 GB, max allowed: 18.13 GB). Tried to allocate 96.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [18]:
model.train()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [19]:
trainer.train()

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/125 [00:00<?, ?it/s]

TrainOutput(global_step=10, training_loss=0.34990596771240234, metrics={'train_runtime': 104.4975, 'train_samples_per_second': 0.766, 'train_steps_per_second': 0.096, 'total_flos': 21048695439360.0, 'train_loss': 0.34990596771240234, 'epoch': 0.08})

In [30]:
# evaluation of the model
from evaluate import evaluator
task_evaluator = evaluator("text-classification")
# we're using the testing data now
eval_results = task_evaluator.compute(
    model_or_pipeline=model,
    data=small_eval_dataset,
    label_mapping={"LABEL_0": 0, "LABEL_1": 1},
    input_column = "comment",
    tokenizer= tokenizer
)

In [31]:
# passing in an instantiated pipeline
from transformers import pipeline

pipe = pipeline("text-classification",tokenizer=tokenizer, model=model)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [34]:
# computing the results from the evaluation

eval_results = task_evaluator.compute(
    model_or_pipeline=pipe,
    data=small_eval_dataset,
    label_mapping={"LABEL_0": 0, "LABEL_1": 1},
    input_column = "comment",
    tokenizer = tokenizer
)

In [35]:
eval_results

{'accuracy': 0.435,
 'total_time_in_seconds': 29.48703133291565,
 'samples_per_second': 33.913213870523634,
 'latency_in_seconds': 0.02948703133291565}