In [4]:
# Import libraries
import transformers
import torch
import numpy as np
import sklearn

# Define model and tokenizer
model_name = "bert-base-multilingual-cased"
model = transformers.BertForTokenClassification.from_pretrained(model_name)
tokenizer = transformers.BertTokenizer.from_pretrained(model_name)

# Create dataset
sentences = ["Presiden Joko Widodo mengunjungi kota Surabaya pada hari Selasa untuk meninjau vaksinasi massal Covid-19."]
labels = ["B-PER I-PER I-PER O B-LOC I-LOC O O O O O O O B-DIS O"]

# Split dataset into train, validation, and test sets
train_sentences, test_sentences, train_labels, test_labels = sklearn.model_selection.train_test_split(sentences, labels, test_size=0.2)
train_sentences, val_sentences, train_labels, val_labels = sklearn.model_selection.train_test_split(train_sentences, train_labels, test_size=0.2)

# Define function to convert dataset into BERT format
def convert_to_bert_format(sentences, labels):
  input_ids = []
  attention_mask = []
  label_ids = []
  for sentence, label in zip(sentences, labels):
    # Tokenize sentence and label
    tokens = tokenizer.tokenize(sentence)
    label_tokens = label.split()
    # Add special tokens
    tokens = ["[CLS]"] + tokens + ["[SEP]"]
    label_tokens = ["O"] + label_tokens + ["O"]
    # Convert tokens and labels to ids
    input_id = tokenizer.convert_tokens_to_ids(tokens)
    label_id = [label_to_id[l] for l in label_tokens] # label_to_id is a dictionary that maps labels to ids
    # Create attention mask
    att_mask = [1] * len(input_id)
    # Pad or truncate to max length
    max_length = 128
    if len(input_id) > max_length:
      input_id = input_id[:max_length]
      label_id = label_id[:max_length]
      att_mask = att_mask[:max_length]
    else:
      padding_length = max_length - len(input_id)
      input_id = input_id + [0] * padding_length
      label_id = label_id + [-100] * padding_length # -100 is the ignore index for the loss function
      att_mask = att_mask + [0] * padding_length
    # Append to lists
    input_ids.append(input_id)
    attention_mask.append(att_mask)
    label_ids.append(label_id)
  # Convert lists to tensors
  input_ids = torch.tensor(input_ids)
  attention_mask = torch.tensor(attention_mask)
  label_ids = torch.tensor(label_ids)
  return input_ids, attention_mask, label_ids

# Convert train, validation, and test sets into BERT format
train_input_ids, train_attention_mask, train_label_ids = convert_to_bert_format(train_sentences, train_labels)
val_input_ids, val_attention_mask, val_label_ids = convert_to_bert_format(val_sentences, val_labels)
test_input_ids, test_attention_mask, test_label_ids = convert_to_bert_format(test_sentences, test_labels)

# Define function to train BERT model
def train_bert_model(model, train_input_ids, train_attention_mask, train_label_ids, val_input_ids, val_attention_mask, val_label_ids):
  # Define parameters
  learning_rate = 2e-5
  batch_size = 32
  epoch = 3
  # Define optimizer, scheduler, and loss function
  optimizer = transformers.AdamW(model.parameters(), lr=learning_rate)
  scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_input_ids) // batch_size * epoch)
  loss_function = torch.nn.CrossEntropyLoss(ignore_index=-100)
  # Define device
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)
  # Define best validation loss
  best_val_loss = float("inf")
  # Loop over epochs
  for e in range(epoch):
    # Set model to training mode
    model.train()
    # Initialize training loss
    train_loss = 0
    # Loop over batches
    for i in range(0, len(train_input_ids), batch_size):
      # Get batch data
      batch_input_ids = train_input_ids[i:i+batch_size].to(device)
      batch_attention_mask = train_attention_mask[i:i+batch_size].to(device)
      batch_label_ids = train_label_ids[i:i+batch_size].to(device)
      # Zero the gradients
      optimizer.zero_grad()
      # Forward pass
      output = model(batch_input_ids, attention_mask=batch_attention_mask, labels=batch_label_ids)
      # Get loss
      loss = output[0]
      # Backward pass
      loss.backward()
      # Update parameters
      optimizer.step()
      scheduler.step()
      # Update training loss
      train_loss += loss.item()
    # Calculate average training loss
    avg_train_loss = train_loss / len(train_input_ids)
    # Set model to evaluation mode
    model.eval()
    # Initialize validation loss
    val_loss = 0
    # Loop over batches
    for i in range(0, len(val_input_ids), batch_size):
      # Get batch data
      batch_input_ids = val_input_ids[i:i+batch_size].to(device)
      batch_attention_mask = val_attention_mask[i:i+batch_size].to(device)
      batch_label_ids = val_label_ids[i:i+batch_size].to(device)
      # Forward pass with no gradient calculation
      with torch.no_grad():
        output = model(batch_input_ids, attention_mask=batch_attention_mask, labels=batch_label_ids)
        # Get loss
        loss = output[0]
        # Update validation loss
        val_loss += loss.item()
    # Calculate average validation loss
    avg_val_loss = val_loss / len(val_input_ids)
    # Print epoch summary
    print(f"Epoch {e+1}: Train loss: {avg_train_loss:.4f}, Val loss: {avg_val_loss:.4f}")
    # Save the best model
    if avg_val_loss < best_val_loss:
      best_val_loss = avg_val_loss
      torch.save(model.state_dict(), "best_model.pt")
  
# Train BERT model using train and validation sets
train_bert_model(model, train_input_ids, train_attention_mask, train_label_ids, val_input_ids, val_attention_mask, val_label_ids)

# Load the best model
model.load_state_dict(torch.load("best_model.pt"))

# Define function to evaluate BERT model using test set
def evaluate_bert_model(model, test_input_ids, test_attention_mask, test_label_ids):
  # Define device
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)
  # Set model to evaluation mode
  model.eval()
  # Initialize predictions and true labels lists
  predictions = []
  true_labels = []
  # Loop over batches
  for i in range(0, len(test_input_ids), batch_size):
    # Get batch data
    batch_input_ids = test_input_ids[i:i+batch_size].to(device)
    batch_attention_mask = test_attention_mask[i:i+batch_size].to(device)
    batch_label_ids = test_label_ids[i:i+batch_size].to(device)
    # Forward pass with no gradient calculation
    with torch.no_grad():
      output = model(batch_input_ids, attention_mask=batch_attention_mask)
      # Get logits and labels
      logits = output[0]
      labels = batch_label_ids.cpu().numpy()
      # Convert logits to predictions
      preds = np.argmax(logits.cpu().numpy(), axis=2)
      # Append predictions and true labels to lists
      predictions.extend(preds.tolist())
      true_labels.extend(labels.tolist())
  
  return predictions, true_labels



Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AttributeError: module 'sklearn' has no attribute 'model_selection'