In [None]:
# Mount Google Drive so that files stored there can be read and written from this runtime.
from google.colab import drive

# The Drive contents become available below /content/gdrive.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Shell escape: print the GPU(s) attached to this runtime, with driver/CUDA version and memory usage.
!nvidia-smi

Fri Jun  9 19:48:18 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Install Libraries
%%capture
!pip install transformers
!pip install evaluate
!pip install datasets
!pip install accelerate

In [None]:
# Imports: standard library first, then third-party packages
import os
import sys
import torch
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

In [None]:
# Runtime device and data paths.
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Anchor the project root at the absolute Drive mount point (/content/gdrive) so
# the paths below do not depend on the notebook's current working directory.
root_dir = "/content/gdrive/My Drive/Master_Thesis/"
# Folder below which checkpoints and exported models are written.
model_dir = os.path.join(root_dir, 'models/sexism_classifiers')
# NOTE: despite the *_dir names, these two are CSV file paths, not directories.
train_dir = os.path.join(root_dir, 'data/Custom/Topic-classification_train.csv')
test_dir = os.path.join(root_dir, 'data/Custom/Topic-classification_test.csv')

In [None]:
# Identifier of the pretrained checkpoint that is loaded for the tokenizer and the model.
# Alternative checkpoint (currently disabled):
# model_name = "NLP-LTU/bertweet-large-sexism-detector"
model_name = "cardiffnlp/tweet-topic-21-multi"

In [None]:
# Read the training CSV; df_raw stays untouched, all further processing happens on the copy.
df_raw = pd.read_csv(train_dir)
df_new = df_raw.copy()

In [None]:
# Preview the copied frame before any cleaning or column selection.
df_new

Unnamed: 0,Dataset,Row ID,text,TARGET,MIGRANTS,POC,LGBT+,MUSLIMS,WOMEN,JEWS,other,DISABLED
0,DIALO_CONAN_Counterspeech,944,That intolerance and hatred is one of the bigg...,['MUSLIMS'],0,0,0,1,0,0,0.0,0.0
1,DIALO_CONAN_Counterspeech,5394,Why would you say this about a religion?,['MUSLIMS'],0,0,0,1,0,0,0.0,0.0
2,DIALO_CONAN_Counterspeech,7424,Why do you think this is right? Women are huma...,['WOMEN'],0,0,0,0,1,0,0.0,0.0
3,DIALO_CONAN_Counterspeech,4539,"If you're really worried about women's rights,...",['LGBT+'],0,0,1,0,0,0,0.0,0.0
4,DIALO_CONAN_Counterspeech,4789,All people have a right to life regardless of ...,['POC'],0,1,0,0,0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
11977,DIALO_CONAN_Counterspeech,5731,How would you explain the systematic persecuti...,['JEWS'],0,0,0,0,0,1,0.0,0.0
11978,Multi_CONAN_Counterspeech,359,"To be honest, there are already programs that ...",MIGRANTS,1,0,0,0,0,0,0.0,0.0
11979,DIALO_CONAN_Counterspeech,862,The religion of millions of people is not inco...,['MUSLIMS'],0,0,0,1,0,0,0.0,0.0
11980,DIALO_CONAN_Counterspeech,5415,How can Jews be a curse to the world when they...,['JEWS'],0,0,0,0,0,1,0.0,0.0


In [None]:
# Keep only the text and the eight target-group indicator columns, and cast the
# indicators to float (they are gathered into a float "labels" vector further down).
# Done as one chained expression: no column assignment on a sliced frame, and no
# copy-pasted per-column casts.
target_columns = ["MIGRANTS", "POC", "LGBT+", "MUSLIMS", "WOMEN", "JEWS", "other", "DISABLED"]
df_new = (
    df_new
    .dropna()  # drops every row that has a missing value in ANY column still present
    .loc[:, ["text", *target_columns]]
    .astype({column: float for column in target_columns})
)

In [None]:
# Preview the reduced frame: the text column plus the float indicator columns.
df_new

Unnamed: 0,text,MIGRANTS,POC,LGBT+,MUSLIMS,WOMEN,JEWS,other,DISABLED
0,That intolerance and hatred is one of the bigg...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,Why would you say this about a religion?,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,Why do you think this is right? Women are huma...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,"If you're really worried about women's rights,...",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,All people have a right to life regardless of ...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
11977,How would you explain the systematic persecuti...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
11978,"To be honest, there are already programs that ...",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11979,The religion of millions of people is not inco...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
11980,How can Jews be a curse to the world when they...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [None]:
# Convert the frame to a Hugging Face Dataset and hold out 20% for evaluation.
# preserve_index=False keeps the pandas index out of the dataset, so there is no
# "__index_level_0__" column to strip afterwards. That column is only created for
# a non-RangeIndex, which made the unconditional remove_columns call fragile.
ds = Dataset.from_pandas(df_new, preserve_index=False)
# Fixed seed so the train/test partition is identical on every run.
dataset = ds.train_test_split(test_size=0.2, seed=42)

In [None]:
# Load the tokenizer that belongs to the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Register a padding token when the tokenizer does not define one.
# NOTE(review): if this branch ever runs, the vocabulary grows by one token and the
# model's embedding matrix would presumably have to be resized to match — confirm.
if tokenizer.pad_token is None:
  tokenizer.add_special_tokens({'pad_token': '[PAD]'})

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

In [None]:
# Add a "labels" entry to every example: the values of all non-text columns,
# in column order, collected into one list.
cols = dataset["train"].column_names


def _collect_labels(example):
    """Return the example's non-text column values under the key "labels"."""
    return {"labels": [example[name] for name in cols if name != "text"]}


dataset = dataset.map(_collect_labels)

Map:   0%|          | 0/9584 [00:00<?, ? examples/s]

Map:   0%|          | 0/2397 [00:00<?, ? examples/s]

In [None]:
# Spot-check one training example: the original columns plus the new "labels" list.
dataset["train"][0]

{'text': 'Education is an essential tool to fight for human rights, free thought, and to promote the scientific method. But it is not enough. It has also been shown by the Pew Research Center that people of colour are more likely to report discrimination in universities.',
 'MIGRANTS': 0.0,
 'POC': 1.0,
 'LGBT+': 0.0,
 'MUSLIMS': 0.0,
 'WOMEN': 0.0,
 'JEWS': 0.0,
 'other': 0.0,
 'DISABLED': 0.0,
 'labels': [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of examples.

    Args:
        examples: mapping whose "text" entry holds the raw strings of the batch
            (the function is applied via ``dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        per text and left unpadded.
    """
    # No padding and no tensor conversion here: the Trainer in this notebook uses
    # DataCollatorWithPadding, which pads each training batch on the fly. Padding
    # inside map() would instead pad every map-batch to its longest text and store
    # all of that padding in the dataset.
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=512,
    )

In [None]:
# Tokenize both splits. Every pre-existing column except "labels" is dropped, so
# only the label vector and the tokenizer outputs remain.
cols = dataset["train"].column_names
cols.pop(cols.index("labels"))  # keep "labels" out of the removal list

map_options = {"batched": True, "remove_columns": cols}
tokenized_ds = dataset.map(preprocess_function, **map_options)

Map:   0%|          | 0/9584 [00:00<?, ? examples/s]

Map:   0%|          | 0/2397 [00:00<?, ? examples/s]

In [None]:
# Show the resulting splits with their features and row counts.
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 9584
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 2397
    })
})

In [None]:
from transformers import DataCollatorWithPadding

# Collator that pads the examples of each batch when the batch is assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Label <-> id lookup tables so the model config carries readable class names.
# The order must match the column order used to build the "labels" vectors.
labels = ["MIGRANTS","POC","LGBT+","MUSLIMS","WOMEN","JEWS","other","DISABLED"]
num_labels = len(labels)
# Ids are plain integers (not strings): int -> name and name -> int.
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    # A text can address several groups at once, so state the multi-label setup
    # explicitly instead of relying on the checkpoint config or on the dtype of
    # the labels to select it.
    problem_type="multi_label_classification",
    # The checkpoint's classification head has a different number of outputs;
    # allow it to be re-initialised with num_labels outputs.
    ignore_mismatched_sizes=True,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.88k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/tweet-topic-21-multi and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([19, 768]) in the checkpoint and torch.Size([8, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([19]) in the checkpoint and torch.Size([8]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Print the module tree of the loaded model for inspection.
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [None]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
import numpy as np

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.6):
    """Compute micro-averaged F1, micro-averaged ROC-AUC and accuracy.

    Args:
        predictions: raw model outputs (logits) of shape (batch_size, num_labels).
        labels: ground-truth 0/1 indicators with the same shape.
        threshold: probability at or above which a label counts as predicted.

    Returns:
        dict with the keys 'f1', 'roc_auc' and 'accuracy'.
    """
    # sigmoid maps every logit to an independent per-label probability
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions)).numpy()
    # threshold the probabilities to obtain hard 0/1 predictions
    y_pred = np.zeros(probs.shape)
    y_pred[probs >= threshold] = 1
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC-AUC measures ranking quality, so it is scored on the probabilities;
    # feeding it the thresholded 0/1 predictions collapses the curve to one point.
    roc_auc = roc_auc_score(y_true, probs, average='micro')
    # with 2-D indicator targets this is exact-match accuracy over whole rows
    accuracy = accuracy_score(y_true, y_pred)
    return {'f1': f1_micro_average,
            'roc_auc': roc_auc,
            'accuracy': accuracy}

def compute_metrics(p: EvalPrediction):
    """Metric callback for the Trainer.

    Uses the first element of ``p.predictions`` when it is a tuple, otherwise
    ``p.predictions`` itself, and scores it against ``p.label_ids``.
    """
    logits = p.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    return multi_label_metrics(predictions=logits, labels=p.label_ids)

In [None]:
# Fine-tuning setup: 8 epochs, evaluating and checkpointing after every epoch.
training_args = TrainingArguments(
    # checkpoints are written below <model_dir>/<model_name>
    output_dir=os.path.join(model_dir, model_name),
    # optimisation
    num_train_epochs=8,
    learning_rate=1e-5,
    lr_scheduler_type="cosine",
    optim="adamw_torch",
    weight_decay=0.01,
    auto_find_batch_size=True,
    # evaluation / checkpointing
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    # NOTE(review): no metric_for_best_model is set, so the Trainer's default
    # criterion decides which checkpoint is "best" — presumably the evaluation
    # loss; confirm for the installed transformers version.
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["test"],
    compute_metrics=compute_metrics,
)

# Explicitly do not resume from an earlier checkpoint.
trainer.train(resume_from_checkpoint=False)

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.0256,0.077765,0.929306,0.956765,0.92282
2,0.0251,0.083426,0.927341,0.955248,0.9199
3,0.0252,0.086963,0.923947,0.954206,0.918648
4,0.019,0.085926,0.928676,0.957538,0.923655
5,0.0134,0.087455,0.930991,0.958937,0.926575
6,0.0082,0.085067,0.928452,0.95733,0.923655
7,0.0069,0.085563,0.931994,0.959264,0.927409
8,0.007,0.085645,0.931576,0.959026,0.926575


TrainOutput(global_step=9584, training_loss=0.01643798339844546, metrics={'train_runtime': 2615.4649, 'train_samples_per_second': 29.315, 'train_steps_per_second': 3.664, 'total_flos': 6277839070955520.0, 'train_loss': 0.01643798339844546, 'epoch': 8.0})

# TESTING


In [None]:
# NOTE(review): pred_average is not assigned anywhere in the visible notebook, so
# this cell raises NameError after "Restart & Run All". The cell that computed it
# appears to be missing — restore it or remove this cell.
pred_average

0.9795007091615702

In [None]:
from datetime import datetime

def get_datetime(format):
    """Return the current local date and time rendered with the strftime pattern ``format``."""
    return datetime.now().strftime(format)

# Export tokenizer and model together into a timestamped sub-folder of
# <model_dir>/<model_name>.
pt_save_directory = os.path.join(
    model_dir,
    model_name,
    get_datetime("%d,%m,%Y--%H,%M"),
)

# tokenizer first, then the model, both into the same directory
for artifact in (tokenizer, model):
    artifact.save_pretrained(pt_save_directory)