In [13]:
! pip install datasets transformers accelerate evaluate degender-pronoun



In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
!apt install git-lfs

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [47]:
from datasets import Dataset
import pandas as pd
from degender_pronoun import degenderizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import evaluate
from datasets import load_metric
from torch import nn
import torch
from sklearn.metrics import classification_report
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import balanced_accuracy_score




degender_pronouns = {
    ' mr ': ' mx ',
    ' mrs ': ' mx ',
    ' ms ': ' mx ',
    ' miss ': ' mx ',
    ' mister ': ' mx ',
}

degender_nouns = {
    ' man ': ' person ',
    ' men ': ' persons ',
    ' woman ': ' person ',
    ' women ': ' persons ',
    " man's ": " person's",
    " men's ": " person's",
    " woman's ": " person's",
    " women's ": " person's",
    " gentleman ": " person ",
    " lady ": " person ",
    " gentleman's ": " person's ",
    " lady's ": " person's ",
}

def preprocess(df, data_column, preprocess_type):
    if preprocess_type == 'none':
        return df

    D = degenderizer()
    df[data_column] = df[data_column].apply(lambda x: D.degender(x) if len(x) > 5 else x)

    for k, v in degender_pronouns.items():
        df[data_column] = df[data_column].str.lower().replace(k,v)

    if preprocess_type == 'all':
        for k, v in degender_nouns.items():
            df[data_column] = df[data_column].str.lower().replace(k,v)
    return df


class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss (suppose one has 2 labels with different weights)
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([8.0, 1.0], device=model.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

metric = evaluate.load('f1')

model_checkpoint = "distilbert-base-uncased"
#model_checkpoint = "roberta-base"
batch_size = 16
metric_name = "f1"
data_column = 's1_s2'
#preprocessing_type = 'none'
#preprocessing_type = 'pronouns'
preprocessing_type = 'all'
task = "nlp-letters-{}-{}-class-weighted".format(data_column, preprocessing_type)
labels = ['female','male']



model_name = model_checkpoint.split("/")[-1]
num_labels = 2

dataset_path = "/content/drive/MyDrive/sentence_sets.csv"
df = pd.read_csv(dataset_path, encoding='unicode_escape')
df = preprocess(df, data_column, preprocessing_type)

dataset = Dataset.from_pandas(df).rename_column("APPLICANT_GENDER", "label").class_encode_column("label").train_test_split(test_size=0.2)

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

# DistilBERT params
args = TrainingArguments(
    f"{model_name}-finetuned-{task}",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=True,
)

# RoBERTA params
# args = TrainingArguments(
#     f"{model_name}-finetuned-{task}",
#     evaluation_strategy = "epoch",
#     save_strategy = "epoch",
#     learning_rate=5e-5,
#     weight_decay=0.01,
#     warmup_steps=500,
#     per_device_train_batch_size=batch_size,
#     per_device_eval_batch_size=batch_size,
#     num_train_epochs=10,
#     load_best_model_at_end=True,
#     metric_for_best_model=metric_name,
#     push_to_hub=True,
# )

def preprocess_function(sample):
    return tokenizer(sample[data_column], truncation=True, padding=True)


confusion_metric = evaluate.load("confusion_matrix")

def compute_metrics(eval_pred):
    x, y = eval_pred
    preds = np.argmax(x, -1)
    print(confusion_metric.compute(predictions=preds, references=y))
    print(classification_report(y, preds, target_names=labels))
    print("MCC: {}".format(matthews_corrcoef(y, preds)))
    print("Balanced Accuracy: {}".format(balanced_accuracy_score(y, preds)))
    return metric.compute(predictions=preds, references=y, average="macro")

encoded_dataset = dataset.map(preprocess_function, batched=True)

trainer = CustomTrainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)



Casting to class labels:   0%|          | 0/4411 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3528 [00:00<?, ? examples/s]

Map:   0%|          | 0/883 [00:00<?, ? examples/s]

In [48]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,No log,0.431362,0.576443
2,No log,0.360464,0.705035
3,0.422200,0.383582,0.643549
4,0.422200,0.808269,0.766424
5,0.200700,1.227568,0.780993
6,0.200700,1.911142,0.795374
7,0.069100,1.769652,0.776686
8,0.069100,1.748114,0.775759
9,0.069100,1.745621,0.775462
10,0.017900,1.946121,0.784823


{'confusion_matrix': array([[254,  24],
       [350, 255]])}
              precision    recall  f1-score   support

      female       0.42      0.91      0.58       278
        male       0.91      0.42      0.58       605

    accuracy                           0.58       883
   macro avg       0.67      0.67      0.58       883
weighted avg       0.76      0.58      0.58       883

MCC: 0.33483232506196575
Balanced Accuracy: 0.6675783340269933
{'confusion_matrix': array([[245,  33],
       [221, 384]])}
              precision    recall  f1-score   support

      female       0.53      0.88      0.66       278
        male       0.92      0.63      0.75       605

    accuracy                           0.71       883
   macro avg       0.72      0.76      0.71       883
weighted avg       0.80      0.71      0.72       883

MCC: 0.4800578824195367
Balanced Accuracy: 0.7580028539152149
{'confusion_matrix': array([[263,  15],
       [299, 306]])}
              precision    recall  f1-

TrainOutput(global_step=2210, training_loss=0.16223053004407234, metrics={'train_runtime': 1066.5768, 'train_samples_per_second': 33.078, 'train_steps_per_second': 2.072, 'total_flos': 4673449824583680.0, 'train_loss': 0.16223053004407234, 'epoch': 10.0})

In [49]:
trainer.evaluate()

{'confusion_matrix': array([[162, 116],
       [ 24, 581]])}
              precision    recall  f1-score   support

      female       0.87      0.58      0.70       278
        male       0.83      0.96      0.89       605

    accuracy                           0.84       883
   macro avg       0.85      0.77      0.80       883
weighted avg       0.85      0.84      0.83       883

MCC: 0.6185553268907982
Balanced Accuracy: 0.7715321957310185


{'eval_loss': 1.9111422300338745,
 'eval_f1': 0.7953744901742676,
 'eval_runtime': 8.5649,
 'eval_samples_per_second': 103.095,
 'eval_steps_per_second': 6.538,
 'epoch': 10.0}

In [50]:
trainer.push_to_hub()

events.out.tfevents.1721606211.089fb13e6219.3203.20:   0%|          | 0.00/405 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/ben-yu/distilbert-base-uncased-finetuned-nlp-letters-s1_s2-all-class-weighted/commit/6b52a2c6584586e27443452bd1aee72031849570', commit_message='End of training', commit_description='', oid='6b52a2c6584586e27443452bd1aee72031849570', pr_url=None, pr_revision=None, pr_num=None)