In [1]:
import torch
import pandas as pd
from tqdm.notebook import tqdm

In [3]:
df = pd.read_excel('/content/CyberBulling_Dataset_Bangla.xlsx', names=['SL', 'Description', 'Label'])
df.set_index('SL', inplace=True)
df.head()

Unnamed: 0_level_0,Description,Label
SL,Unnamed: 1_level_1,Unnamed: 2_level_1
0,ওই হালার পুত এখন কি মদ খাওয়ার সময় রাতের বেলা...,sexual
1,আপনার জন্ম প্রক্রিয়ার সময় আপনার মায়ের ভিতর কি ...,sexual
2,ধজভংগ দের আর ভায়াগ্রা লাগবো না। ধংস হোক এই সব ...,sexual
3,বোকাচোদা একটা।,sexual
4,তোর দেশে ফেরার অপেক্ষায় রইলাম। জেলে একটা কামরা...,sexual


In [4]:
df.Label.value_counts()

Political    1205
troll        1202
sexual       1201
Threat       1201
Neutral      1201
Name: Label, dtype: int64

In [5]:
possible_labels = df.Label.unique()
possible_labels

array(['sexual', 'Threat', 'Political', 'troll', 'Neutral'], dtype=object)

In [6]:
label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
df['label'] = df.Label.replace(label_dict)
df

Unnamed: 0_level_0,Description,Label,label
SL,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,ওই হালার পুত এখন কি মদ খাওয়ার সময় রাতের বেলা...,sexual,0
1,আপনার জন্ম প্রক্রিয়ার সময় আপনার মায়ের ভিতর কি ...,sexual,0
2,ধজভংগ দের আর ভায়াগ্রা লাগবো না। ধংস হোক এই সব ...,sexual,0
3,বোকাচোদা একটা।,sexual,0
4,তোর দেশে ফেরার অপেক্ষায় রইলাম। জেলে একটা কামরা...,sexual,0
...,...,...,...
6005,কপাল পোড়া,Neutral,4
6006,আল্লাহ আপনাকে হেদায়েত দান করুক আমীন।আমিও একজন ...,Neutral,4
6007,"শুধু সুন্দর চেহারার হিরো আপনি নন, সুন্দর মনের ...",Neutral,4
6008,আল্লাহ আপনাকে সুস্থ রাখুক। ভাইরাস থেকে দেশের ম...,Neutral,4


In [7]:
#Train-Test split
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(df.index.values,
                                                  df.label.values,
                                                  test_size=0.3,
                                                  random_state=17,
                                                  stratify=df.label.values)

In [8]:
df['data_type'] = ['not_set']*df.shape[0]
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

In [9]:
df.groupby(['Label', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Description
Label,label,data_type,Unnamed: 3_level_1
Neutral,4,train,841
Neutral,4,val,360
Political,2,train,843
Political,2,val,362
Threat,1,train,841
Threat,1,val,360
sexual,0,train,841
sexual,0,val,360
troll,3,train,841
troll,3,val,361


TOKENIZER AND ENCODING

In [10]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m71.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m105.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m82.3 MB/s[0m eta [36m0:00:00[0m
Co

In [11]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

In [12]:
tokenizer = BertTokenizer.from_pretrained('sagorsarker/bangla-bert-base',
                                          do_lower_case=True)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/2.24M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

In [13]:
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].Description.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].Description.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=256,
    return_tensors='pt'
)


input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label.values)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [14]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
print(len(dataset_train))
print(len(dataset_val))

4207
1803


BERT PRETRAINED

In [16]:
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained("sagorsarker/bangla-bert-base",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Downloading model.safetensors:   0%|          | 0.00/660M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sagorsarker/bangla-bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
#cleaning data loader
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
batch_size = 16

dataloader_train = DataLoader(dataset_train,
                              sampler=RandomSampler(dataset_train),
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val,
                                   sampler=SequentialSampler(dataset_val),
                                   batch_size=batch_size)

In [46]:
#optimizer & scheduler
from transformers import AdamW, get_linear_schedule_with_warmup
optimizer = AdamW(model.parameters(),
                  lr=1e-5,
                  eps=1e-8)
epochs = 6

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)



In [41]:
#Performance Matrix
import numpy as np
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}

    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    class_accuracies = {}
    overall_correct = 0
    overall_total = len(labels_flat)

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat == label]
        y_true = labels_flat[labels_flat == label]
        correct = len(y_preds[y_preds == label])
        total = len(y_true)
        class_accuracy = correct / total

        class_accuracies[label_dict_inverse[label]] = class_accuracy

        overall_correct += correct

        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {correct}/{total}\n')

    overall_accuracy = overall_correct / overall_total
    print(f'Overall Accuracy: {overall_correct}/{overall_total} = {overall_accuracy * 100:.2f}%\n')


In [30]:
#Training Loop
import random

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [31]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cuda


In [32]:
def evaluate(dataloader_val):

    model.eval()

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in dataloader_val:

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total/len(dataloader_val)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals

In [47]:
for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})


    torch.save(model.state_dict(), f'finetuned_BERT_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

  0%|          | 0/6 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/263 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.3977519193302996
Validation loss: 0.7705591504865911
F1 Score (Weighted): 0.7405335808361313


Epoch 2:   0%|          | 0/263 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.23956822100389366
Validation loss: 0.8747365039987572
F1 Score (Weighted): 0.7481933117930959


Epoch 3:   0%|          | 0/263 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.1365226711157157
Validation loss: 0.990530886543107
F1 Score (Weighted): 0.7463293070450652


Epoch 4:   0%|          | 0/263 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.07791659479673967
Validation loss: 1.1386809400025244
F1 Score (Weighted): 0.746621933455564


Epoch 5:   0%|          | 0/263 [00:00<?, ?it/s]


Epoch 5
Training loss: 0.05035817968931772
Validation loss: 1.2271724300900992
F1 Score (Weighted): 0.7524479433693717


Epoch 6:   0%|          | 0/263 [00:00<?, ?it/s]


Epoch 6
Training loss: 0.027645310053704554
Validation loss: 1.2563123658146973
F1 Score (Weighted): 0.7557819824556458


In [48]:
model = BertForSequenceClassification.from_pretrained('sagorsarker/bangla-bert-base',
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sagorsarker/bangla-bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(102025, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [49]:
model.load_state_dict(torch.load('/content/finetuned_BERT_epoch_6.model', map_location=torch.device('cuda')))

<All keys matched successfully>

In [50]:
_, predictions, true_vals = evaluate(dataloader_validation)

In [51]:
accuracy_per_class(predictions, true_vals)

Class: sexual
Accuracy: 260/360

Class: Threat
Accuracy: 294/360

Class: Political
Accuracy: 329/362

Class: troll
Accuracy: 226/361

Class: Neutral
Accuracy: 252/360

Overall Accuracy: 1361/1803 = 75.49%

