<a href="https://colab.research.google.com/github/Li-Tuen/PA2-COMP4211/blob/main/transfomer_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [92]:
!pip install datasets huggingface_hub[hf_xet] evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [93]:
import pandas as pd
import ast
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
    EarlyStoppingCallback,
    TrainerCallback
)
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset
import nltk
from nltk.corpus import wordnet
import random
import torch
import torch.nn as nn
import numpy as np
import evaluate

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [48]:
# Load the datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [49]:
# Synonym replacement method to improve model's generalization power, robustness and semantic understanding
nltk.download('wordnet')

def get_synonyms(word):
    synonyms = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.append(lemma.name())
    return synonyms

def augment_sentence(sentence, replace_ratio=0.1):
    augmented_sentence = []
    num_to_replace = int(len(sentence) * replace_ratio)
    indices_to_replace = random.sample(range(len(sentence)), num_to_replace)
    for i, word in enumerate(sentence):
        if i in indices_to_replace:
            syns = get_synonyms(word)
            if syns:
                augmented_sentence.append(random.choice(syns))
            else:
                augmented_sentence.append(word)
        else:
            augmented_sentence.append(word)
    return augmented_sentence

# Apply augmentation to training data
train_df['Sentence'] = train_df['Sentence'].apply(ast.literal_eval) # Parse the string representations of lists into actual lists
train_df['Sentence'] = train_df['Sentence'].apply(augment_sentence)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [50]:
# Print a few samples
print(train_df['Sentence'].head().tolist())

[['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'indium', 'Iraq', 'and', 'demand', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.'], ['Iranian', 'officials', 'say', 'they', 'expect', 'to', 'get', 'access', 'to', 'sealed', 'sensitive', 'parts', 'of', 'the', 'plant', 'Wednesday', ',', 'after', 'an', 'IAEA', 'surveillance', 'system', 'begin', 'functioning', '.'], ['Helicopter', 'gunships', 'Sat', 'pound', 'militant', 'hideouts', 'in', 'the', 'Orakzai', 'tribal', 'region', ',', 'where', 'many', 'Taliban', 'militants', 'are', 'believed', 'to', 'have', 'fled', 'to', 'avoid', 'an', 'earlier', 'military', 'offensive', 'in', 'nearby', 'south', 'Waziristan', '.'], ['They', 'left', 'after', 'a', 'tense', 'hour-long', 'standoff', 'with', 'howler', 'police', '.'], ['U.N.', 'relief', 'coordinator', 'Jan', 'Egeland', 'said', 'Sunday', ',', 'U.S.', ',', 'Indonesian', 'and', 'Australian', 'military', 'helicopter

In [51]:
train_df['NER Tag'] = train_df['NER Tag'].apply(ast.literal_eval)
test_df['Sentence'] = test_df['Sentence'].apply(ast.literal_eval)

In [37]:
from collections import Counter

# Inspect class distribution
train_labels = [label for sample in train_df["NER Tag"] for label in sample]
label_counts = Counter(train_labels)
print("Class distribution:", label_counts)

Class distribution: Counter({'O': 741576, 'B-geo': 31368, 'B-tim': 16982, 'B-org': 16825, 'I-per': 14456, 'B-per': 14204, 'I-org': 14138, 'B-gpe': 13080, 'I-geo': 6154, 'I-tim': 5510, 'B-art': 333, 'I-art': 257, 'B-eve': 250, 'I-eve': 211, 'B-nat': 170, 'I-gpe': 160, 'I-nat': 35})


In [52]:
# Encode NER tags as integers
unique_tags = sorted(set(tag for tags in train_df['NER Tag'] for tag in tags))
tag2id = {tag: i for i, tag in enumerate(unique_tags)}
id2tag = {i: tag for tag, i in tag2id.items()}
train_df['NER Tag'] = train_df['NER Tag'].apply(lambda tags: [tag2id[tag] for tag in tags])

In [53]:
# Split the training set to training and validation sets
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=4211)

In [54]:
# Convert DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

In [72]:
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('roberta-base', add_prefix_space=True)
model = AutoModelForTokenClassification.from_pretrained('roberta-base', num_labels=len(unique_tags))

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
# Calculate the total number of parameters, ensure not exceed 1 billion
total_params = 0
for param in model.parameters():
    total_params += param.numel()
print("Number of parameters: ", total_params)

Number of parameters:  354327569


In [16]:
# Tokenize the dataset
def tokenize_function(examples, is_test=False):
    tokenized_inputs = tokenizer(
        examples["Sentence"],
        truncation=True,
        is_split_into_words=True,
        padding=True,
        return_tensors=None
    )

    if not is_test:
        labels = []
        for i, label in enumerate(examples["NER Tag"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                if word_idx is None:
                    label_ids.append(-100)
                elif word_idx != previous_word_idx:
                    label_ids.append(label[word_idx])
                else:
                    label_ids.append(-100)
                previous_word_idx = word_idx
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels

    # Store word_ids for later use
    tokenized_inputs["word_ids"] = [tokenized_inputs.word_ids(batch_index=i) for i in range(len(examples["Sentence"]))]

    return tokenized_inputs

In [73]:
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(lambda x: tokenize_function(x, is_test=True), batched=True)

Map:   0%|          | 0/32000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [74]:
# Create data collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [75]:
# Self-defined loss function
class GeneralizedCrossEntropyLoss(nn.Module):
    def __init__(self, q=0.7, weight=None, ignore_index=-100):
        super(GeneralizedCrossEntropyLoss, self).__init__()
        self.q = q                # Hyperparameter for noise robustness
        self.weight = weight      # Class weights tensor
        self.ignore_index = ignore_index  # Label to ignore

    def forward(self, logits, labels):
        # Find non-ignored positions
        mask = labels != self.ignore_index
        # Extract non-ignored logits and labels
        logits_flat = logits[mask]
        labels_flat = labels[mask]

        # Compute softmax probabilities
        probs = torch.softmax(logits_flat, dim=-1)
        # Get predicted probabilities for true labels
        probs_true = probs.gather(dim=-1, index=labels_flat.unsqueeze(-1)).squeeze(-1)

        # Compute generalized cross-entropy loss
        loss = (1 - probs_true ** self.q) / self.q

        # Apply class weights
        weights = self.weight[labels_flat]
        loss = loss * weights

        # Return the mean loss over non-ignored tokens
        return loss.mean()

In [76]:
# Calculate weight for each tags
class_weights = []
total_samples = sum(label_counts.values())
for tag in unique_tags:
    class_weights.append(total_samples / label_counts[tag])

# Convert to tensor
class_weights = torch.FloatTensor(class_weights).to(device)

# Define the generalized cross-entropy loss
criterion = GeneralizedCrossEntropyLoss(q=0.7, weight=class_weights, ignore_index=-100)

In [None]:
metric = evaluate.load('seqeval')
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_labels = [[label for (label, word_id) in zip(label_list, word_ids) if word_id!= -100]
                   for label_list, word_ids in zip(labels, tokenized_val_dataset["word_ids"])]
    true_predictions = [[prediction for (prediction, word_id) in zip(prediction_list, word_ids) if word_id!= -100]
                        for prediction_list, word_ids in zip(predictions, tokenized_val_dataset["word_ids"])]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "eval_f1": results["overall_f1"],
        "eval_precision": results["overall_precision"],
        "eval_recall": results["overall_recall"]
    }

In [80]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="results",
    num_train_epochs=4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    warmup_steps=800,
    weight_decay=0.01,
    fp16=True,
    logging_dir="logs",
    logging_strategy="steps",
    logging_steps=1000,
    eval_strategy="epoch",
    save_strategy="epoch",
    metric_for_best_model="eval_f1_score",
    greater_is_better=True,
    load_best_model_at_end=True,
    report_to="none"
)

In [81]:
# Custom trainer
class CustomTrainer(Trainer):
    def __init__(self, *args, loss_fn=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.loss_fn = loss_fn

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss = self.loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss

In [82]:
# Create a Trainer
early_stopping = EarlyStoppingCallback(early_stopping_patience = 15)
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=data_collator,
    callbacks = [early_stopping],
    loss_fn = criterion,
    compute_metrics=compute_metrics
)

In [83]:
# Fine-tune the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,8.691,9.392341


early stopping required metric_for_best_model, but did not find eval_f1_score so early stopping is disabled


KeyError: "The `metric_for_best_model` training argument is set to 'eval_f1_score', which is not found in the evaluation metrics. The available evaluation metrics are: ['eval_loss']. Consider changing the `metric_for_best_model` via the TrainingArguments."

In [68]:
# Save the model and tokenizer
model.save_pretrained("ner_model")
tokenizer.save_pretrained("ner_model")

('ner_model/tokenizer_config.json',
 'ner_model/special_tokens_map.json',
 'ner_model/vocab.json',
 'ner_model/merges.txt',
 'ner_model/added_tokens.json',
 'ner_model/tokenizer.json')

In [69]:
# Make predictions
predictions = trainer.predict(tokenized_test_dataset)
# Convert numpy array to PyTorch tensor
predictions_tensor = torch.from_numpy(predictions.predictions)
predicted_labels = torch.argmax(predictions_tensor, dim=2)

In [70]:
# Generate submission file
submission_tags = []
for i, prediction in enumerate(predicted_labels):
    word_ids = tokenized_test_dataset[i]['word_ids']
    previous_word_idx = None
    tags = []
    for j, word_idx in enumerate(word_ids):
        if word_idx is not None and word_idx != previous_word_idx:
            tags.append(id2tag[int(prediction[j])])
        previous_word_idx = word_idx
    submission_tags.append(str(tags))

submission_df = pd.DataFrame({
    'id': test_df['id'],
    'NER Tag': submission_tags
})

submission_df.to_csv('submission.csv', index=False)

In [26]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
