In [1]:
import ast
import datasets
import json
# import evaluate
import pandas as pd
import numpy as np
import torch

from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, BertTokenizer, DataCollatorWithPadding, EvalPrediction
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(torch.cuda.is_available())
device = "cuda:0" if torch.cuda.is_available() else "cpu"

True


In [3]:
df = pd.read_csv("BERT_input.csv")

In [4]:
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result
     

In [5]:
labels = [label for label in df.columns if label != 'text']
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in enumerate(labels)}
labels

['nyheter',
 'tema',
 'sport',
 'kjendis',
 'kultur',
 'meninger',
 'annonse',
 'bok',
 'magasinet',
 'okonomi']

In [6]:
model = AutoModelForSequenceClassification.from_pretrained(
    "NbAiLab/nb-bert-large",
    problem_type="multi_label_classification",
    num_labels=len(labels), 
    id2label=id2label, 
    label2id=label2id
)

Some weights of the model checkpoint at NbAiLab/nb-bert-large were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not init

In [7]:
tokenizer = BertTokenizer.from_pretrained("NbAiLab/nb-bert-large")

In [8]:
def preprocess_data(examples):
  # take a batch of texts
  text = examples["text"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding

In [9]:
dataset = datasets.Dataset.from_pandas(df).train_test_split(test_size=0.2)
dataset

DatasetDict({
    train: Dataset({
        features: ['nyheter', 'tema', 'sport', 'kjendis', 'kultur', 'meninger', 'annonse', 'bok', 'magasinet', 'okonomi', 'text'],
        num_rows: 13596
    })
    test: Dataset({
        features: ['nyheter', 'tema', 'sport', 'kjendis', 'kultur', 'meninger', 'annonse', 'bok', 'magasinet', 'okonomi', 'text'],
        num_rows: 3399
    })
})

In [10]:
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset['train'].column_names)
encoded_dataset.set_format("torch")
train = encoded_dataset["train"]
test = encoded_dataset["test"]

100%|██████████| 14/14 [00:18<00:00,  1.30s/ba]
100%|██████████| 4/4 [00:04<00:00,  1.16s/ba]


In [11]:
train

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 13596
})

In [12]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
batch_size = 4
metric_name = "f1"

In [14]:
training_args = TrainingArguments(
    output_dir="E:\\Pytorch\\Saved_Models\\CLASSIFICATION_MODEL",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    metric_for_best_model=metric_name,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [15]:
example = encoded_dataset['train'][0]
print(example.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])


In [16]:
tokenizer.decode(example['input_ids'])

'[CLS] overskrift : avslorer rehab opphold matte innlegges. oppsummering : onsdag kom den første episoden av den ferske podkasten [UNK] sophie og fetisha [UNK] på nrk. personer : sophie elise, fetisha williams. nøkkelord : på, nrk, sophie, ferske, podkasten, sophie elise, fetisha williams, kjendis, podkast, influenser. spådd tema : tv, nrk, programmet, viaplay, viaplays, premier, dagbladet, tall, abonnenter, låt, melodi, prix, ukjent. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [17]:
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

['kjendis']

In [18]:
trainer.train()

***** Running training *****
  Num examples = 13596
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 16995
  Number of trainable parameters = 355097610
  3%|▎         | 500/16995 [03:26<1:51:45,  2.46it/s]

{'loss': 0.1093, 'learning_rate': 1.9411591644601354e-05, 'epoch': 0.15}


  6%|▌         | 1000/16995 [06:46<1:45:51,  2.52it/s]

{'loss': 0.0199, 'learning_rate': 1.882318328920271e-05, 'epoch': 0.29}


  9%|▉         | 1500/16995 [10:06<1:44:42,  2.47it/s]

{'loss': 0.0134, 'learning_rate': 1.8234774933804062e-05, 'epoch': 0.44}


 12%|█▏        | 2000/16995 [13:27<1:43:59,  2.40it/s]

{'loss': 0.0093, 'learning_rate': 1.7646366578405414e-05, 'epoch': 0.59}


 15%|█▍        | 2500/16995 [16:46<1:40:38,  2.40it/s]

{'loss': 0.0115, 'learning_rate': 1.7057958223006767e-05, 'epoch': 0.74}


 18%|█▊        | 3000/16995 [20:06<1:30:40,  2.57it/s]

{'loss': 0.0119, 'learning_rate': 1.6469549867608122e-05, 'epoch': 0.88}


 20%|██        | 3399/16995 [22:45<1:32:01,  2.46it/s]***** Running Evaluation *****
  Num examples = 3399
  Batch size = 4
                                                      
 20%|██        | 3399/16995 [23:31<1:32:01,  2.46it/s]Saving model checkpoint to E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\checkpoint-3399
Configuration saved in E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\checkpoint-3399\config.json


{'eval_loss': 0.011122689582407475, 'eval_f1': 0.9879376287143278, 'eval_roc_auc': 0.9932986826190711, 'eval_accuracy': 0.9876434245366285, 'eval_runtime': 45.4272, 'eval_samples_per_second': 74.823, 'eval_steps_per_second': 18.711, 'epoch': 1.0}


Model weights saved in E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\checkpoint-3399\pytorch_model.bin
tokenizer config file saved in E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\checkpoint-3399\tokenizer_config.json
Special tokens file saved in E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\checkpoint-3399\special_tokens_map.json
 21%|██        | 3500/16995 [24:32<1:27:56,  2.56it/s] 

{'loss': 0.0093, 'learning_rate': 1.5881141512209475e-05, 'epoch': 1.03}


 24%|██▎       | 4000/16995 [27:51<1:24:34,  2.56it/s]

{'loss': 0.0065, 'learning_rate': 1.5292733156810827e-05, 'epoch': 1.18}


 26%|██▋       | 4500/16995 [31:11<1:20:28,  2.59it/s]

{'loss': 0.0064, 'learning_rate': 1.4704324801412183e-05, 'epoch': 1.32}


 29%|██▉       | 5000/16995 [34:31<1:22:30,  2.42it/s]

{'loss': 0.0056, 'learning_rate': 1.4115916446013535e-05, 'epoch': 1.47}


 32%|███▏      | 5500/16995 [37:51<1:18:17,  2.45it/s]

{'loss': 0.0097, 'learning_rate': 1.3527508090614887e-05, 'epoch': 1.62}


 35%|███▌      | 6000/16995 [41:11<1:14:38,  2.46it/s]

{'loss': 0.0107, 'learning_rate': 1.293909973521624e-05, 'epoch': 1.77}


 38%|███▊      | 6500/16995 [44:29<1:08:57,  2.54it/s]

{'loss': 0.006, 'learning_rate': 1.2350691379817594e-05, 'epoch': 1.91}


 40%|████      | 6798/16995 [46:28<1:05:08,  2.61it/s]***** Running Evaluation *****
  Num examples = 3399
  Batch size = 4
                                                      
 40%|████      | 6798/16995 [47:12<1:05:08,  2.61it/s]Saving model checkpoint to E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\checkpoint-6798
Configuration saved in E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\checkpoint-6798\config.json


{'eval_loss': 0.00711665628477931, 'eval_f1': 0.9917622830244189, 'eval_roc_auc': 0.9954234905691217, 'eval_accuracy': 0.9917622830244189, 'eval_runtime': 44.2468, 'eval_samples_per_second': 76.819, 'eval_steps_per_second': 19.21, 'epoch': 2.0}


Model weights saved in E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\checkpoint-6798\pytorch_model.bin
tokenizer config file saved in E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\checkpoint-6798\tokenizer_config.json
Special tokens file saved in E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\checkpoint-6798\special_tokens_map.json
 41%|████      | 7000/16995 [48:57<1:00:42,  2.74it/s] 

{'loss': 0.0048, 'learning_rate': 1.1762283024418948e-05, 'epoch': 2.06}


 44%|████▍     | 7500/16995 [52:08<56:33,  2.80it/s]  

{'loss': 0.0047, 'learning_rate': 1.1173874669020302e-05, 'epoch': 2.21}


 47%|████▋     | 8000/16995 [55:16<54:10,  2.77it/s]  

{'loss': 0.005, 'learning_rate': 1.0585466313621656e-05, 'epoch': 2.35}


 50%|█████     | 8500/16995 [58:27<54:04,  2.62it/s]  

{'loss': 0.0056, 'learning_rate': 9.997057958223008e-06, 'epoch': 2.5}


 53%|█████▎    | 9000/16995 [1:01:46<47:46,  2.79it/s]  

{'loss': 0.0033, 'learning_rate': 9.40864960282436e-06, 'epoch': 2.65}


 56%|█████▌    | 9500/16995 [1:04:50<46:15,  2.70it/s]

{'loss': 0.0035, 'learning_rate': 8.820241247425713e-06, 'epoch': 2.79}


 59%|█████▉    | 10000/16995 [1:07:55<43:46,  2.66it/s]

{'loss': 0.0026, 'learning_rate': 8.231832892027067e-06, 'epoch': 2.94}


 60%|██████    | 10197/16995 [1:09:13<40:04,  2.83it/s]***** Running Evaluation *****
  Num examples = 3399
  Batch size = 4
                                                       
 60%|██████    | 10197/16995 [1:09:57<40:04,  2.83it/s]Saving model checkpoint to E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\checkpoint-10197
Configuration saved in E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\checkpoint-10197\config.json


{'eval_loss': 0.0075832842849195, 'eval_f1': 0.9930872187086336, 'eval_roc_auc': 0.9962243797195254, 'eval_accuracy': 0.9929390997352162, 'eval_runtime': 44.2561, 'eval_samples_per_second': 76.803, 'eval_steps_per_second': 19.206, 'epoch': 3.0}


Model weights saved in E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\checkpoint-10197\pytorch_model.bin
tokenizer config file saved in E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\checkpoint-10197\tokenizer_config.json
Special tokens file saved in E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\checkpoint-10197\special_tokens_map.json
 62%|██████▏   | 10500/16995 [1:12:10<39:01,  2.77it/s]   

{'loss': 0.0042, 'learning_rate': 7.643424536628421e-06, 'epoch': 3.09}


 65%|██████▍   | 11000/16995 [1:15:17<38:17,  2.61it/s]

{'loss': 0.003, 'learning_rate': 7.055016181229773e-06, 'epoch': 3.24}


 68%|██████▊   | 11500/16995 [1:18:23<33:16,  2.75it/s]

{'loss': 0.0022, 'learning_rate': 6.466607825831127e-06, 'epoch': 3.38}


 71%|███████   | 12000/16995 [1:20:58<23:45,  3.50it/s]

{'loss': 0.0007, 'learning_rate': 5.8781994704324805e-06, 'epoch': 3.53}


 74%|███████▎  | 12500/16995 [1:23:19<21:01,  3.56it/s]

{'loss': 0.0014, 'learning_rate': 5.2897911150338345e-06, 'epoch': 3.68}


 76%|███████▋  | 13000/16995 [1:25:41<18:36,  3.58it/s]

{'loss': 0.0016, 'learning_rate': 4.701382759635188e-06, 'epoch': 3.82}


 79%|███████▉  | 13500/16995 [1:28:01<16:52,  3.45it/s]

{'loss': 0.0032, 'learning_rate': 4.112974404236541e-06, 'epoch': 3.97}


 80%|████████  | 13596/16995 [1:28:28<15:48,  3.58it/s]***** Running Evaluation *****
  Num examples = 3399
  Batch size = 4
                                                       
 80%|████████  | 13596/16995 [1:29:05<15:48,  3.58it/s]Saving model checkpoint to E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\checkpoint-13596


{'eval_loss': 0.008091443218290806, 'eval_f1': 0.9938217122683142, 'eval_roc_auc': 0.9965676179268412, 'eval_accuracy': 0.9938217122683142, 'eval_runtime': 37.023, 'eval_samples_per_second': 91.808, 'eval_steps_per_second': 22.959, 'epoch': 4.0}


Configuration saved in E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\checkpoint-13596\config.json
Model weights saved in E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\checkpoint-13596\pytorch_model.bin
tokenizer config file saved in E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\checkpoint-13596\tokenizer_config.json
Special tokens file saved in E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\checkpoint-13596\special_tokens_map.json
 82%|████████▏ | 14000/16995 [1:33:13<14:37,  3.41it/s]   

{'loss': 0.0019, 'learning_rate': 3.5245660488378936e-06, 'epoch': 4.12}


 85%|████████▌ | 14500/16995 [1:35:44<15:34,  2.67it/s]

{'loss': 0.0007, 'learning_rate': 2.9361576934392472e-06, 'epoch': 4.27}


 88%|████████▊ | 15000/16995 [1:38:49<12:45,  2.61it/s]

{'loss': 0.001, 'learning_rate': 2.3477493380406004e-06, 'epoch': 4.41}


 91%|█████████ | 15500/16995 [1:41:54<09:34,  2.60it/s]

{'loss': 0.0022, 'learning_rate': 1.7593409826419536e-06, 'epoch': 4.56}


 94%|█████████▍| 16000/16995 [1:44:59<05:58,  2.78it/s]

{'loss': 0.0005, 'learning_rate': 1.170932627243307e-06, 'epoch': 4.71}


 97%|█████████▋| 16500/16995 [1:48:05<02:59,  2.75it/s]

{'loss': 0.0019, 'learning_rate': 5.825242718446603e-07, 'epoch': 4.85}


100%|██████████| 16995/16995 [1:51:11<00:00,  2.60it/s]***** Running Evaluation *****
  Num examples = 3399
  Batch size = 4
                                                       
100%|██████████| 16995/16995 [1:51:55<00:00,  2.60it/s]

{'eval_loss': 0.007677115499973297, 'eval_f1': 0.9938217122683142, 'eval_roc_auc': 0.9965676179268412, 'eval_accuracy': 0.9938217122683142, 'eval_runtime': 44.1675, 'eval_samples_per_second': 76.957, 'eval_steps_per_second': 19.245, 'epoch': 5.0}


Saving model checkpoint to E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\checkpoint-16995
Configuration saved in E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\checkpoint-16995\config.json
Model weights saved in E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\checkpoint-16995\pytorch_model.bin
tokenizer config file saved in E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\checkpoint-16995\tokenizer_config.json
Special tokens file saved in E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\checkpoint-16995\special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\checkpoint-13596 (score: 0.9938217122683142).
100%|██████████| 16995/16995 [1:52:14<00:00,  2.52it/s]

{'train_runtime': 6734.9401, 'train_samples_per_second': 10.094, 'train_steps_per_second': 2.523, 'train_loss': 0.008363515769919777, 'epoch': 5.0}





TrainOutput(global_step=16995, training_loss=0.008363515769919777, metrics={'train_runtime': 6734.9401, 'train_samples_per_second': 10.094, 'train_steps_per_second': 2.523, 'train_loss': 0.008363515769919777, 'epoch': 5.0})

In [19]:
trainer.save_model("E:\\Pytorch\\Saved_Models\\CLASSIFICATION_MODEL\\FINAL")

Saving model checkpoint to E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\FINAL
Configuration saved in E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\FINAL\config.json
Model weights saved in E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\FINAL\pytorch_model.bin
tokenizer config file saved in E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\FINAL\tokenizer_config.json
Special tokens file saved in E:\Pytorch\Saved_Models\CLASSIFICATION_MODEL\FINAL\special_tokens_map.json
