<a href="https://colab.research.google.com/github/Li-Tuen/PA2-COMP4211/blob/main/transfomer_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
!pip install datasets huggingface_hub[hf_xet]

Collecting hf-xet>=0.1.4 (from huggingface_hub[hf_xet])
  Downloading hf_xet-1.0.5-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (494 bytes)
Downloading hf_xet-1.0.5-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (54.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.0/54.0 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hf-xet
Successfully installed hf-xet-1.0.5


In [16]:
import pandas as pd
import ast
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    TrainingArguments,
    Trainer,
    DataCollatorForTokenClassification,
    EarlyStoppingCallback,
    TrainerCallback
)
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset
import nltk
from nltk.corpus import wordnet
import random
import torch.nn as nn

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
# Load the datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [8]:
# Synonym replacement method to improve model's generalization power, robustness and semantic understanding
nltk.download('wordnet')

def get_synonyms(word):
    synonyms = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.append(lemma.name())
    return synonyms

def augment_sentence(sentence, replace_ratio=0.2):
    augmented_sentence = []
    num_to_replace = int(len(sentence) * replace_ratio)
    indices_to_replace = random.sample(range(len(sentence)), num_to_replace)
    for i, word in enumerate(sentence):
        if i in indices_to_replace:
            syns = get_synonyms(word)
            if syns:
                augmented_sentence.append(random.choice(syns))
            else:
                augmented_sentence.append(word)
        else:
            augmented_sentence.append(word)
    return augmented_sentence

# Apply augmentation to training data
train_df['Sentence'] = train_df['Sentence'].apply(ast.literal_eval) # Parse the string representations of lists into actual lists
train_df['Sentence'] = train_df['Sentence'].apply(augment_sentence)

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [10]:
# Print a few samples
print(train_df['Sentence'].head().tolist())

[['Thousands', 'of', 'demonstrators', 'have', 'marched', 'through', 'London', 'to', 'protest', 'the', 'war', 'in', 'Iraq', 'and', 'exact', 'the', 'withdrawal', 'of', 'British', 'troops', 'from', 'that', 'country', '.'], ['Iranian', 'officials', 'say', 'they', 'expect', 'to', 'get', 'access', 'to', 'sealed', 'sensitive', 'parts', 'of', 'the', 'plant', 'Wednesday', ',', 'after', 'an', 'International_Atomic_Energy_Agency', 'surveillance', 'system', 'begins', 'function', '.'], ['Helicopter', 'gunships', 'Saturday', 'pounded', 'militant', 'hideouts', 'indium', 'the', 'Orakzai', 'tribal', 'region', ',', 'where', 'many', 'Taliban', 'militants', 'are', 'believed', 'to', 'give', 'fled', 'to', 'avoid', 'AN', 'earlier', 'military', 'offensive', 'in', 'nearby', 'South', 'Waziristan', '.'], ['They', 'left', 'after', 'A', 'tense', 'hour-long', 'standoff', 'with', 'riot', 'police', '.'], ['U.N.', 'relief', 'coordinator', 'Jan', 'Egeland', 'said', 'Sunday', ',', 'U.S.', ',', 'Indonesian', 'and', 'Aust

In [11]:
train_df['NER Tag'] = train_df['NER Tag'].apply(ast.literal_eval)
test_df['Sentence'] = test_df['Sentence'].apply(ast.literal_eval)

In [14]:
from collections import Counter

# Inspect class distribution
train_labels = [label for sample in train_df["NER Tag"] for label in sample]
label_counts = Counter(train_labels)
print("Class distribution:", label_counts)

Class distribution: Counter({'O': 741576, 'B-geo': 31368, 'B-tim': 16982, 'B-org': 16825, 'I-per': 14456, 'B-per': 14204, 'I-org': 14138, 'B-gpe': 13080, 'I-geo': 6154, 'I-tim': 5510, 'B-art': 333, 'I-art': 257, 'B-eve': 250, 'I-eve': 211, 'B-nat': 170, 'I-gpe': 160, 'I-nat': 35})


In [17]:
# Encode NER tags as integers
unique_tags = sorted(set(tag for tags in train_df['NER Tag'] for tag in tags))
tag2id = {tag: i for i, tag in enumerate(unique_tags)}
id2tag = {i: tag for tag, i in tag2id.items()}
train_df['NER Tag'] = train_df['NER Tag'].apply(lambda tags: [tag2id[tag] for tag in tags])

In [18]:
# Calculate weight for each tags
class_weights = []
total_samples = sum(label_counts.values())
for tag in unique_tags:
    class_weights.append(total_samples / label_counts[tag])

# Convert to tensor
class_weights = torch.FloatTensor(class_weights).to(device)

# Define cross-entropy loss
criterion = nn.CrossEntropyLoss(weight=class_weights)

In [19]:
# Split the training set to training and validation sets
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=4211)

In [20]:
# Convert DataFrame to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

In [30]:
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('roberta-large', add_prefix_space=True)
model = AutoModelForTokenClassification.from_pretrained('roberta-large', num_labels=len(unique_tags))

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
total_params = 0
for param in model.parameters():
    total_params += param.numel()
print("Number of parameters: ", total_params)

Number of parameters:  354327569


In [26]:
# Tokenize the dataset
def tokenize_function(examples, is_test=False):
    tokenized_inputs = tokenizer(
        examples["Sentence"],
        truncation=True,
        is_split_into_words=True,
        padding=True,
        return_tensors=None
    )

    if not is_test:
        labels = []
        for i, label in enumerate(examples["NER Tag"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                if word_idx is None:
                    label_ids.append(-100)
                elif word_idx != previous_word_idx:
                    label_ids.append(label[word_idx])
                else:
                    label_ids.append(-100)
                previous_word_idx = word_idx
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels

    # Store word_ids for later use
    tokenized_inputs["word_ids"] = [tokenized_inputs.word_ids(batch_index=i) for i in range(len(examples["Sentence"]))]

    return tokenized_inputs

In [27]:
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_val_dataset = val_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(lambda x: tokenize_function(x, is_test=True), batched=True)

Map:   0%|          | 0/32000 [00:00<?, ? examples/s]

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [35]:
# Create data collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [36]:
# Self-defined loss function
class CustomLossCallback(TrainerCallback):
    def on_train_begin(self, args, state, control, **kwargs):
        kwargs['model'].config.problem_type = "single_label_classification"
        kwargs['model'].loss_fct = criterion

In [37]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    warmup_steps=750,
    weight_decay=0.01,
    fp16=True,
    logging_dir="logs",
    logging_steps=500,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none"
)

In [38]:
# Create a Trainer
early_stopping = EarlyStoppingCallback(early_stopping_patience = 5)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=data_collator,
    callbacks = [early_stopping, CustomLossCallback()]
)

In [None]:
# Fine-tune the model
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
# Save the model and tokenizer
model.save_pretrained("./ner_model")
tokenizer.save_pretrained("./ner_model")

In [None]:
# Make predictions
predictions = trainer.predict(tokenized_test_dataset)
# Convert numpy array to PyTorch tensor
predictions_tensor = torch.from_numpy(predictions.predictions)
predicted_labels = torch.argmax(predictions_tensor, dim=2)

In [None]:
# Generate submission file
submission_tags = []
for i, prediction in enumerate(predicted_labels):
    word_ids = tokenized_test_dataset[i]['word_ids']
    previous_word_idx = None
    tags = []
    for j, word_idx in enumerate(word_ids):
        if word_idx is not None and word_idx != previous_word_idx:
            tags.append(id2tag[int(prediction[j])])
        previous_word_idx = word_idx
    submission_tags.append(str(tags))

submission_df = pd.DataFrame({
    'id': test_df['id'],
    'NER Tag': submission_tags
})

submission_df.to_csv('submission.csv', index=False)