<a href="https://colab.research.google.com/github/NUMAIRn/NUMAIRn/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q transformers datasets

In [2]:
import pandas as pd
from datasets import Dataset
df = pd.read_csv("/content/Book2.csv", encoding = 'unicode_escape')
dataset = Dataset.from_pandas(df)

In [3]:
dataset

Dataset({
    features: ['Project Abstracts', 'SDG1', 'SDG2', 'SDG3', 'SDG4', 'SDG5', 'SDG6', 'SDG7', 'SDG8', 'SDG9', 'SDG10', 'SDG11', 'SDG12', 'SDG13', 'SDG14', 'SDG15', 'SDG16', 'SDG17'],
    num_rows: 816
})

In [4]:
labels = [label for label in dataset.features.keys() if label not in ['Project Abstracts']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['SDG1',
 'SDG2',
 'SDG3',
 'SDG4',
 'SDG5',
 'SDG6',
 'SDG7',
 'SDG8',
 'SDG9',
 'SDG10',
 'SDG11',
 'SDG12',
 'SDG13',
 'SDG14',
 'SDG15',
 'SDG16',
 'SDG17']

In [5]:
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_data(examples):
  # take a batch of texts
  text = examples["Project Abstracts"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()

  return encoding

In [6]:
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset.column_names)


Map:   0%|          | 0/816 [00:00<?, ? examples/s]

In [7]:
example = encoded_dataset[0]
print(example.keys())


dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])


In [8]:
tokenizer.decode(example['input_ids'])

'[CLS] the objective of this project is to develop a suite of mobile and web applications that shall serve as a platform for food vendors to connect with their customers and enhance their reach. it shall let food critics review different vendors as well. it shall guide food lovers especially tourists about the best eating spots in a city. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [9]:
example['labels']

[0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

In [10]:
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

['SDG2', 'SDG8', 'SDG11']

In [11]:
encoded_dataset.set_format("torch")

In [12]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
batch_size = 8
metric_name = "f1"

In [14]:
!pip install transformers[torch]



In [23]:
!pip install accelerate -U



In [15]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
)

In [16]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

In [18]:
encoded_dataset[0]['labels'].type()

'torch.FloatTensor'

In [19]:
encoded_dataset['input_ids'][0]

tensor([  101,  1996,  7863,  1997,  2023,  2622,  2003,  2000,  4503,  1037,
         7621,  1997,  4684,  1998,  4773,  5097,  2008,  4618,  3710,  2004,
         1037,  4132,  2005,  2833, 17088,  2000,  7532,  2007,  2037,  6304,
         1998, 11598,  2037,  3362,  1012,  2009,  4618,  2292,  2833,  4401,
         3319,  2367, 17088,  2004,  2092,  1012,  2009,  4618,  5009,  2833,
        10205,  2926,  9045,  2055,  1996,  2190,  5983,  7516,  1999,  1037,
         2103,  1012,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [21]:
#forward pass
outputs = model(input_ids=encoded_dataset['input_ids'][0].unsqueeze(0), labels=encoded_dataset[0]['labels'].unsqueeze(0))
outputs

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


SequenceClassifierOutput(loss=tensor(0.7627, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[-0.2443, -0.1421, -0.0524,  0.2627,  0.0724,  0.1057, -0.1435,  0.0481,
         -0.1535,  0.2955, -0.1108, -0.3239,  0.1919, -0.0674,  1.0800,  0.4989,
          0.1738]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [24]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset,
    eval_dataset=encoded_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [25]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.307061,0.144676,0.538529,0.050245
2,No log,0.269206,0.301328,0.590092,0.061275


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.307061,0.144676,0.538529,0.050245
2,No log,0.269206,0.301328,0.590092,0.061275
3,No log,0.243847,0.453125,0.652176,0.185049
4,No log,0.22986,0.512843,0.679657,0.213235
5,0.291500,0.224703,0.534894,0.691281,0.231618


TrainOutput(global_step=510, training_loss=0.2900403195736455, metrics={'train_runtime': 7635.1908, 'train_samples_per_second': 0.534, 'train_steps_per_second': 0.067, 'total_flos': 268409420697600.0, 'train_loss': 0.2900403195736455, 'epoch': 5.0})

In [26]:
text = "The objective of this project is to build a chatbot using Open AI's GPT-3. The chatbot should be able to understand and generate text in natural language and carry out a conversation with users on various topics. The chatbot will be trained on a diverse range of topics and should be able to answer questions, provide recommendations, and engage in small talk."

encoding = tokenizer(text, return_tensors="pt")
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

outputs = trainer.model(**encoding)

In [27]:
logits = outputs.logits
logits.shape

torch.Size([1, 17])

In [28]:
# apply sigmoid + threshold
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())
predictions = np.zeros(probs.shape)
predictions[np.where(probs >= 0.5)] = 1
# turn predicted id's into actual label names
predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
print(predicted_labels)

['SDG9']
