<a href="https://colab.research.google.com/github/NUMAIRn/NUMAIRn/blob/main/BertBasedClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q transformers datasets

In [2]:
import pandas as pd
from datasets import Dataset
df = pd.read_csv("/content/output1.csv", encoding = 'unicode_escape')
dataset = Dataset.from_pandas(df)

In [3]:
dataset

Dataset({
    features: ['Project Abstracts', 'SDG1', 'SDG2', 'SDG3', 'SDG4', 'SDG5', 'SDG6', 'SDG7', 'SDG8', 'SDG9', 'SDG10', 'SDG11', 'SDG12', 'SDG13', 'SDG14', 'SDG15', 'SDG16', 'SDG17'],
    num_rows: 3264
})

In [4]:
labels = [label for label in dataset.features.keys() if label not in ['Project Abstracts']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['SDG1',
 'SDG2',
 'SDG3',
 'SDG4',
 'SDG5',
 'SDG6',
 'SDG7',
 'SDG8',
 'SDG9',
 'SDG10',
 'SDG11',
 'SDG12',
 'SDG13',
 'SDG14',
 'SDG15',
 'SDG16',
 'SDG17']

In [5]:
from transformers import AutoTokenizer
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_data(examples):
  # take a batch of texts
  text = examples["Project Abstracts"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()

  return encoding

In [6]:
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset.column_names)


Map:   0%|          | 0/3264 [00:00<?, ? examples/s]

In [7]:
example = encoded_dataset[0]
print(example.keys())


dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])


In [8]:
tokenizer.decode(example['input_ids'])

'[CLS] the objective of this project is to develop a suite of mobile and web applications that shall serve as a platform for food vendors to connect with their customers and enhance their reach. it shall let food critics review different vendors as well. it shall guide food lovers especially tourists about the best eating spots in a city. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [9]:
example['labels']

[0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

In [10]:
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

['SDG2', 'SDG8', 'SDG11']

In [11]:
encoded_dataset.set_format("torch")

In [12]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased",
                                                           problem_type="multi_label_classification",
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
batch_size = 8
metric_name = "f1"

In [20]:
!pip install transformers[torch]

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.21.0


In [22]:
!pip install accelerate -U



In [14]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
)

In [15]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

In [16]:
encoded_dataset[0]['labels'].type()

'torch.FloatTensor'

In [17]:
encoded_dataset['input_ids'][0]

tensor([  101,  1996,  7863,  1997,  2023,  2622,  2003,  2000,  4503,  1037,
         7621,  1997,  4684,  1998,  4773,  5097,  2008,  4618,  3710,  2004,
         1037,  4132,  2005,  2833, 17088,  2000,  7532,  2007,  2037,  6304,
         1998, 11598,  2037,  3362,  1012,  2009,  4618,  2292,  2833,  4401,
         3319,  2367, 17088,  2004,  2092,  1012,  2009,  4618,  5009,  2833,
        10205,  2926,  9045,  2055,  1996,  2190,  5983,  7516,  1999,  1037,
         2103,  1012,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [18]:
#forward pass
outputs = model(input_ids=encoded_dataset['input_ids'][0].unsqueeze(0), labels=encoded_dataset[0]['labels'].unsqueeze(0))
outputs

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


SequenceClassifierOutput(loss=tensor(0.6636, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[-0.0794, -0.1809, -0.4023,  0.1158, -0.1793,  0.0263, -0.2143, -0.2779,
         -0.3126,  0.6101,  0.7526, -0.7535,  0.1203, -0.2605,  0.3343, -0.3134,
          0.0405]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [19]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset,
    eval_dataset=encoded_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [20]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.21069,0.580386,0.71384,0.241728


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.21069,0.580386,0.71384,0.241728
2,0.277100,0.17379,0.719921,0.791674,0.351409


TrainOutput(global_step=816, training_loss=0.24393237338346593, metrics={'train_runtime': 12118.9102, 'train_samples_per_second': 0.539, 'train_steps_per_second': 0.067, 'total_flos': 429455073116160.0, 'train_loss': 0.24393237338346593, 'epoch': 2.0})

In [37]:
text = "In Pakistan, where Urdu is considered as a National Language and almost 14% of children are born with speech or vocal issues in 2010. Cases of Speech problems in youngsters and adults have expanded complex, however, such therapy is expensive, and many individuals in Pakistan cannot afford it. Recent research has demonstrated that smart applications might serve as a teaching tool for children. So, we developed a mobile application for those who cannot afford these expensive sessions. AI-Speech Therapist app is a software application specially designed to provide speech therapy to people with speech and language impairments. The ideology is to use a variety of entertaining and stimulating activities and games to assist in teaching your child how to pronounce words in the Urdu language. The application is designed using Android Studio and games are designed in Unity. This application would have different activities for the children to learn systematically. It would play a word or a sentence, children would then recurrent the word or a sentence and the application will compare that user spoken word with the stored word and show the accuracy. The precision rate would be set, and the next level would not be unlocked until the children met the accuracy rate. For database storage and functionalities, we have used a cloud database system (Firebase), and then we implemented the backend of the application using Java. The conversion of spoken word into text in Urdu has been done using the Google Speech-to-Text API. For accuracy rate NLP (Cosine Similarity algorithm) has been used. The code for cosine similarity is written in Python and then integrated into Android Studio. If two words are same, Cosine Similarity provides 100% accuracy rate. At the end of every activity a game will be played that will enhance the learning of the user. This application is not only for children but also for adults who are suffering from speech and language disorders"

encoding = tokenizer(text, return_tensors="pt")
encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

outputs = trainer.model(**encoding)

In [38]:
logits = outputs.logits
logits.shape

torch.Size([1, 17])

In [39]:
# apply sigmoid + threshold
sigmoid = torch.nn.Sigmoid()
probs = sigmoid(logits.squeeze().cpu())
predictions = np.zeros(probs.shape)
predictions[np.where(probs >= 0.5)] = 1
# turn predicted id's into actual label names
predicted_labels = [id2label[idx] for idx, label in enumerate(predictions) if label == 1.0]
print(predicted_labels)

['SDG3', 'SDG4', 'SDG9']


In [35]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [36]:
model_save_name = 'Bertclassifier.pt'
path = F"/content/gdrive/My Drive/{model_save_name}"
torch.save(model.state_dict(), path)

In [41]:
!ls /content/gdrive/MyDrive

 Bertclassifier.pt  'Colab Notebooks'


In [42]:
!pip install lime

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283838 sha256=d2954bad948556c9fcd8ad34b0c49755732bc84603cef0b240136371b2b1de14
  Stored in directory: /root/.cache/pip/wheels/fd/a2/af/9ac0a1a85a27f314a06b39e1f492bee1547d52549a4606ed89
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


In [43]:
import lime

In [45]:
from lime import lime_text
from sklearn.pipeline import make_pipeline
c = make_pipeline(model, Trainer)