In [None]:
!pip install transformers
!pip install datasets
!pip install accelerate
!pip install tensorboard
!pip install seqeval

Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m56.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m60.0 MB/s[0m eta [36m0:00:0

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset
import matplotlib.pyplot as plt
import seaborn as sns

# Define the label to index mapping
label2idx = {"O": 0, "B-short": 1, "I-short": 2, "B-long": 3, "I-long": 4}

# Load the dataset
dataset = load_dataset('json', data_files='/content/drive/MyDrive/Dissertation/Models/train.json')

# Split the dataset into train and test
dataset = dataset['train'].train_test_split(test_size=0.2)

# Use a BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

def tokenize(examples):
    # Tokenize the examples
    tokenized_examples = tokenizer(examples['tokens'], is_split_into_words=True, truncation=True, max_length=512, padding='max_length')

    # Prepare the labels
    labels = []
    for i, label in enumerate(examples['labels']):
        # Convert the labels to integers
        label = [label2idx[l] for l in label]
        padded_label = label + [-100] * (512 - len(label))  # Pad the labels
        labels.append(padded_label)

    tokenized_examples['labels'] = labels

    return tokenized_examples

train_dataset = dataset['train'].map(tokenize, batched=True, batch_size=1000)
test_dataset = dataset['test'].map(tokenize, batched=True, batch_size=1000)

# Load a BERT model for token classification
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=5)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/11204 [00:00<?, ? examples/s]

Map:   0%|          | 0/2802 [00:00<?, ? examples/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    save_total_limit=3,  # Only keep the 3 most recent checkpoints
)

# Define the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [None]:
# Train the model
trainer.train()

Step,Training Loss
500,0.4562
1000,0.2146
1500,0.165
2000,0.1281


TrainOutput(global_step=2103, training_loss=0.23462498919259805, metrics={'train_runtime': 3031.9181, 'train_samples_per_second': 11.086, 'train_steps_per_second': 0.694, 'total_flos': 8782944799150080.0, 'train_loss': 0.23462498919259805, 'epoch': 3.0})

In [None]:

# Save the model
model.save_pretrained("/content/drive/MyDrive/Dissertation/Models/bert_acronym")
tokenizer.save_pretrained("/content/drive/MyDrive/Dissertation/Models/bert_acronym")


('/content/drive/MyDrive/Dissertation/Models/bert_acronym/tokenizer_config.json',
 '/content/drive/MyDrive/Dissertation/Models/bert_acronym/special_tokens_map.json',
 '/content/drive/MyDrive/Dissertation/Models/bert_acronym/vocab.txt',
 '/content/drive/MyDrive/Dissertation/Models/bert_acronym/added_tokens.json',
 '/content/drive/MyDrive/Dissertation/Models/bert_acronym/tokenizer.json')

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from sklearn.metrics import precision_recall_fscore_support
# Load the model
model = AutoModelForTokenClassification.from_pretrained("/content/drive/MyDrive/Dissertation/Models/bert_acronym")

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/Dissertation/Models/bert_acronym")

In [None]:
from itertools import chain
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix

# Evaluate the model
predictions, labels, _ = trainer.predict(test_dataset)
predictions = np.argmax(predictions, axis=2)

# Only predict the real tokens, not the padding tokens
true_predictions = [
    [p for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [l for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

true_labels_flattened = list(chain.from_iterable(true_labels))
true_predictions_flattened = list(chain.from_iterable(true_predictions))


confusion = confusion_matrix(true_labels_flattened, true_predictions_flattened)

label_names = list(label2idx.keys())
for i, row in enumerate(confusion):
    row_str = [label_names[i]] + [str(cell) for cell in row]
    print('\t'.join(row_str))

O	24461	15580	7617	12520	17096
B-short	1004	927	274	880	1675
I-short	81	66	17	55	64
B-long	733	684	235	265	659
I-long	1466	1259	311	1167	1270


In [None]:
import numpy as np
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, f1_score

# Define your confusion matrix
confusion_matrix_data = confusion

# Extract the relevant rows and columns for the classes of interest
confusion_matrix_data = confusion_matrix_data[1:, 1:]
row_sums = confusion_matrix_data.sum(axis=1)
col_sums = confusion_matrix_data.sum(axis=0)

# Calculate precision, recall, and F1-score for each class
precision = np.diag(confusion_matrix_data) / col_sums
recall = np.diag(confusion_matrix_data) / row_sums
f1 = 2 * (precision * recall) / (precision + recall)

# Calculate overall precision, recall, F1-score, and accuracy
overall_precision = np.mean(precision)
overall_recall = np.mean(recall)
overall_f1 = np.mean(f1)

# Calculate overall accuracy
overall_accuracy = np.sum(np.diag(confusion_matrix_data)) / np.sum(confusion_matrix_data)

# Print overall metrics
print(f"Precision: {overall_precision:.4f}")
print(f"Recall: {overall_recall:.4f}")
print(f"F1-Score: {overall_f1:.4f}")
print(f"Accuracy: {overall_accuracy:.4f}")


Precision: 0.1986
Recall: 0.1979
F1-Score: 0.1917
Accuracy: 0.2528


In [None]:
text = 'In this context, the combination of Forward Error Correction (FEC) and Unequal Error Protection (UEP) approaches is known to provide the distribution of video applications for wireless users with Quality of Experience (QoE) assurance.'

In [None]:
import torch
from torch.nn import functional as F

inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt")

# Make the predictions
outputs = model(**inputs)

# Get the predicted labels and their probabilities
logits = outputs.logits
probabilities = F.softmax(logits, dim=-1)
predicted_labels = torch.argmax(logits, dim=-1)

# Convert the predicted labels to their string representations
predicted_label_strings = [model.config.id2label[label] for label in predicted_labels[0].tolist()]

# Print the tokens, their predicted labels, and associated probabilities
for token, label, prob in zip(tokenizer.tokenize(text), predicted_label_strings, probabilities[0]):
    label_index = model.config.label2id[label]  # Get the index of the label
    label_prob = prob[label_index]  # Probability associated with the predicted label
    print(f"{token}: {label} (Probability: {label_prob.item():.4f})")


in: LABEL_0 (Probability: 0.9998)
this: LABEL_0 (Probability: 0.9999)
context: LABEL_0 (Probability: 0.9999)
,: LABEL_0 (Probability: 0.9999)
the: LABEL_0 (Probability: 0.9999)
combination: LABEL_0 (Probability: 0.9998)
of: LABEL_0 (Probability: 0.9998)
forward: LABEL_3 (Probability: 0.9934)
error: LABEL_4 (Probability: 0.9960)
correction: LABEL_4 (Probability: 0.9963)
(: LABEL_0 (Probability: 0.9994)
fe: LABEL_1 (Probability: 0.9968)
##c: LABEL_0 (Probability: 0.9996)
): LABEL_0 (Probability: 0.9993)
and: LABEL_3 (Probability: 0.9941)
une: LABEL_4 (Probability: 0.9955)
##qual: LABEL_4 (Probability: 0.9949)
error: LABEL_0 (Probability: 0.9982)
protection: LABEL_1 (Probability: 0.9949)
(: LABEL_0 (Probability: 0.9995)
u: LABEL_0 (Probability: 0.9997)
##ep: LABEL_0 (Probability: 0.9998)
): LABEL_0 (Probability: 0.9998)
approaches: LABEL_0 (Probability: 0.9999)
is: LABEL_0 (Probability: 0.9999)
known: LABEL_0 (Probability: 0.9999)
to: LABEL_0 (Probability: 0.9999)
provide: LABEL_0 (Probab