In [53]:
import re
import pandas as pd
from io import StringIO
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")



In [54]:
FILENAME = 'data/train.csv'

with open(FILENAME) as file:
  lines = [re.sub(r'([^,])"(\s*[^\n])', r'\1/"\2', line) for line in file]
  df = pd.read_csv(StringIO(''.join(lines)), escapechar="/")
  df = df.head()

In [55]:
id2label = {
  0: 1,
  1: 2,
  2: 3,
  3: 4,
  4: 5,
  5: 6,
  6: 7,
  7: 8,
  8: 9,
  9: 10,
  10: 11,
  11: 12,
  12: 13,
  13: 14,
  14: 15,
  15: 16,
  16: 17,
  17: 18,
  18: 19
}
label2id = {
  1: 0,
  2: 1,
  3: 2,
  4: 3,
  5: 4,
  6: 5,
  7: 6,
  8: 7,
  9: 8,
  10: 9,
  11: 10,
  12: 11,
  13: 12,
  14: 13,
  15: 14,
  16: 15,
  17: 16,
  18: 17,
  19: 18
}

In [56]:
def oneHotEncodeLabel(labels):
  classes = [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]
  intLabels = [int(label) for label in labels.split(" ")]
  for label in intLabels:
     classes[label - 1] = 1.
  return classes

In [57]:
input_ids = []
token_type_ids = []
attention_mask = []

for idx, row in enumerate(df.itertuples()):
    # df.at[row.Index, 'ImageID'] = normaliseImage(row.ImageID)
    df.at[row.Index, 'Labels'] = oneHotEncodeLabel(row.Labels)
    # df.at[row.Index, 'Caption'] = tokenise(row.Caption)
    encodings = tokenizer(row.Caption, padding="max_length", truncation=True, max_length=20)
    input_ids.append(encodings['input_ids'])
    token_type_ids.append(encodings['token_type_ids'])
    attention_mask.append(encodings['attention_mask'])
    # df['input_ids'].loc[0] = encoding['input_ids']
    # print(df)
    # print(idx)
    # print(tokenizer.decode(encoding['input_ids']))

In [58]:
df.insert(3, 'input_ids', input_ids)
df.insert(4, 'token_type_ids', token_type_ids)
df.insert(5, 'attention_mask', attention_mask)

In [59]:
print(df.head())

  ImageID                                             Labels  \
0   0.jpg  [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
1   1.jpg  [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
2   2.jpg  [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...   
3   3.jpg  [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...   
4   4.jpg  [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, ...   

                                             Caption  \
0   Woman in swim suit holding parasol on sunny day.   
1  A couple of men riding horses on top of a gree...   
2  They are brave for riding in the jungle on tho...   
3  a black and silver clock tower at an intersect...   
4   A train coming to a stop on the tracks out side.   

                                           input_ids  \
0  [101, 2450, 1999, 9880, 4848, 3173, 11498, 194...   
1  [101, 1037, 3232, 1997, 2273, 5559, 5194, 2006...   
2  [101, 2027, 2024, 9191, 2005, 5559, 1999, 1996...   
3  [101, 1037, 2304, 1998, 3165, 5119, 3578, 2012...  

In [60]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
  "bert-base-uncased",
  problem_type="multi_label_classification",
  num_labels=19,
  id2label=id2label,
  label2id=label2id
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [61]:
batch_size = 8
metric_name = "f1"

In [62]:
from transformers import TrainingArguments

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

In [63]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [64]:
df['Labels'][0]

[1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

In [65]:
torch.tensor(df['input_ids'][0]).unsqueeze(0)

tensor([[  101,  2450,  1999,  9880,  4848,  3173, 11498, 19454,  2006, 11559,
          2154,  1012,   102,     0,     0,     0,     0,     0,     0,     0]])

In [66]:
outputs = model(input_ids=torch.tensor(df['input_ids'][0]).unsqueeze(0), labels=torch.tensor(df['Labels'][0]).unsqueeze(0))
outputs

SequenceClassifierOutput(loss=tensor(0.7139, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[-0.2458,  0.0584, -0.8073,  0.0442, -0.5776,  0.6092, -0.4927,  0.3446,
          0.2448, -0.5293, -0.5679, -0.4635, -0.3004,  0.1149,  0.4977,  0.7999,
         -0.1824,  0.2172,  0.5422]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [67]:
param_size = 0
for param in model.parameters():
    param_size += param.nelement() * param.element_size()
buffer_size = 0
for buffer in model.buffers():
    buffer_size += buffer.nelement() * buffer.element_size()

size_all_mb = (param_size + buffer_size) / 1024**2
print('model size: {:.3f}MB'.format(size_all_mb))

model size: 417.705MB
