In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the CSV
# files read later in this notebook live underneath that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import time
import random
import numpy as np
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForTokenClassification
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F
start_time = time.time()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed every random number generator in play so the train/validation split,
# the freshly initialised classifier head and the DataLoader shuffling are
# repeatable from one "Restart & Run All" to the next.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)


def load_labelled_csv(csv_path):
    """Read a CSV with a ``label`` column and keep only cleanly labelled rows.

    The label column is read as text so that a digit string such as ``0110``
    keeps its leading zero instead of being parsed as a number. Rows with a
    missing value in any column are dropped *before* the labels are turned
    into lists (a list is never NaN, so dropping afterwards would keep them).
    Rows whose label contains a non-digit character are dropped as well.

    Args:
        csv_path: path of the CSV file to read.

    Returns:
        A DataFrame with a fresh 0..n-1 index whose ``label`` column holds
        one list of ints per row.
    """
    frame = pd.read_csv(csv_path, dtype={'label': str}).dropna()
    parsed = frame['label'].apply(
        # isdecimal() only accepts characters that int() can convert.
        lambda raw: [int(ch) if ch.isdecimal() else None for ch in raw]
    )
    is_clean = parsed.apply(lambda digits: None not in digits)
    return frame.assign(label=parsed)[is_clean].reset_index(drop=True)


# Read CSV files
# To test the models' performance in low-resource settings, the training files include 9k words. The complete files are available in the main branch.
# Adjust the file path according to your needs.
traindata = load_labelled_csv('/content/drive/MyDrive/segmentation/Evahan_train_WBD_processed.csv')
testdata = load_labelled_csv('/content/drive/MyDrive/segmentation/EvaHan_WBD_testb_gold_processed.csv')
# Split the training set and validation set (fixed seed => same split every run)
train_df, val_df = train_test_split(traindata, test_size=0.2, random_state=SEED)
test_df = testdata

# Load the BERT tokenizer and model
model_name = "bert-base-chinese"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForTokenClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

# Define data processing function

def process_data(texts, labels, max_length=256, label_pad_value=-100):
    """Tokenize texts and pad their label sequences into fixed-size tensors.

    Samples whose text is longer than ``max_length`` characters are skipped,
    so the returned tensors may hold fewer rows than ``texts``. The evaluation
    cells re-apply the same length filter to line sentences up with rows, so
    the two must stay in sync.

    Args:
        texts: iterable of strings to encode with the module-level
            ``tokenizer``.
        labels: iterable, parallel to ``texts``, of integer label sequences.
        max_length: length every encoded text and every label row is padded
            (or truncated) to.
        label_pad_value: value written after the end of each label sequence.
            The default -100 is the ``ignore_index`` of
            ``torch.nn.CrossEntropyLoss``, so padded positions do not count
            towards the loss. Padding with 0 (the previous behaviour) trained
            the model on padding as if it were real class-0 tokens.

    Returns:
        ``(input_ids, attention_masks, labels)``: three tensors of shape
        ``(n_kept, max_length)`` located on the module-level ``device``.

    Raises:
        ValueError: if no sample passes the length filter.
    """
    # NOTE(review): label i is stored at position i, while
    # add_special_tokens=True puts a special token at position 0 of the
    # encoded text, so labels appear to sit one position ahead of their
    # characters. The evaluation cells slice predictions with [:len(text)],
    # i.e. they rely on this layout, so it is kept here -- confirm intent.
    input_ids = []
    attention_masks = []
    padded_labels = []
    for text, label in zip(texts, labels):
        if len(text) > max_length:
            continue
        encoded_text = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=max_length,
            padding='max_length',
            return_tensors='pt'
        )
        input_ids.append(encoded_text['input_ids'])
        attention_masks.append(encoded_text['attention_mask'])
        # An explicit dtype keeps an empty label list integer-typed; the slice
        # crops over-long label sequences to max_length.
        label_tensor = torch.tensor(label, dtype=torch.long)[:max_length]
        padded_labels.append(
            F.pad(label_tensor, (0, max_length - label_tensor.numel()), value=label_pad_value)
        )

    if not input_ids:
        raise ValueError(
            f"process_data: no sample with at most {max_length} characters to encode"
        )

    # Assemble on the CPU and transfer each tensor to the device once, rather
    # than issuing one small host-to-device copy per sample.
    input_ids = torch.cat(input_ids, dim=0).to(device)
    attention_masks = torch.cat(attention_masks, dim=0).to(device)
    labels = torch.stack(padded_labels).to(device)
    return input_ids, attention_masks, labels

# Build one DataLoader per split
batch_size = 32


def _tensorize(frame):
    """Encode one split with process_data and wrap the result in a TensorDataset.

    Returns the raw (inputs, masks, labels) tuple alongside the dataset so the
    individual tensors stay available to later cells.
    """
    tensors = process_data(frame['input'], frame['label'])
    return tensors, TensorDataset(*tensors)


# Training split -- the only loader that reshuffles its samples
(train_inputs, train_masks, train_labels), train_data = _tensorize(train_df)
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)

# Validation split
(val_inputs, val_masks, val_labels), val_data = _tensorize(val_df)
val_loader = DataLoader(val_data, batch_size=batch_size)

# Test split
(test_inputs, test_masks, test_labels), test_data = _tensorize(test_df)
test_loader = DataLoader(test_data, batch_size=batch_size)

# Switch the model to training mode and make sure it sits on the device
model.train()
model.to(device)

# Optimizer and loss function
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)
loss_fn = torch.nn.CrossEntropyLoss()

# Train the model, remembering the weights of the epoch with the lowest
# validation loss and restoring them once training is finished.
epochs = 5
best_loss = float('inf')
best_model_weights = None
val_losses = []

for epoch in range(epochs):
    total_loss = 0
    model.train()
    for batch in train_loader:
        input_ids, attention_masks, labels = batch

        optimizer.zero_grad()

        # Passing labels makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1}/{epochs} - Average Loss: {average_loss}')

    # Validation step
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_masks, labels = batch

            outputs = model(input_ids, attention_mask=attention_masks, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()

    average_val_loss = val_loss / len(val_loader)
    val_losses.append(average_val_loss)
    print(f'Epoch {epoch+1}/{epochs} - Validation Loss: {average_val_loss}')

    # Update the best model weights based on validation set loss.
    # state_dict() hands out references to the live parameters, so every
    # tensor is cloned; otherwise the "snapshot" would keep changing as
    # training continues.
    if average_val_loss < best_loss:
        best_loss = average_val_loss
        best_model_weights = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }

# Put the best-scoring weights back so that the prediction cells evaluate
# that checkpoint rather than whatever the final epoch produced.
if best_model_weights is not None:
    model.load_state_dict(best_model_weights)

end_time = time.time()
run_time = end_time - start_time
print("Run time:", run_time, "seconds")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5 - Average Loss: 0.10515152535787443
Epoch 1/5 - Validation Loss: 0.024668315002186733
Epoch 2/5 - Average Loss: 0.020633490733438876
Epoch 2/5 - Validation Loss: 0.011538354129615154
Epoch 3/5 - Average Loss: 0.012543127099733526
Epoch 3/5 - Validation Loss: 0.009033469864251938
Epoch 4/5 - Average Loss: 0.009039943413127487
Epoch 4/5 - Validation Loss: 0.007260194259949706
Epoch 5/5 - Average Loss: 0.006820232618781851
Epoch 5/5 - Validation Loss: 0.006641870131716132
Run time: 301.0223116874695 seconds


In [None]:
# Run the model over the test set and keep, for every sequence, the
# highest-scoring label id at each position.
model.eval()
predictions = []
with torch.no_grad():
    for input_ids, attention_masks, labels in test_loader:
        logits = model(input_ids, attention_mask=attention_masks).logits
        predictions += logits.argmax(dim=2).tolist()

# Render each predicted label sequence as one string of digits
predicted_labels_str = [''.join(map(str, pred)) for pred in predictions]

In [None]:
# Alias the test-set label tensor returned by process_data under the name
# ("ceshi" = test) that the evaluation cells below use.
ceshi_labels = test_labels

In [None]:
# Keep only the test sentences of at most 256 characters -- the same length
# filter process_data applies -- then print both counts so they can be compared.
text_changdu = [sentence for sentence in test_df['input'] if len(sentence) <= 256]
print(len(text_changdu))
print(len(predictions))

2144
2144


In [None]:
# Cut every label row and every prediction row down to the character count
# of its sentence, discarding the padded tail.
ceshi_labels_trimmed, predictions_trimmed = [], []

for row, sentence_prediction in enumerate(predictions):
    n_chars = len(text_changdu[row])
    ceshi_labels_trimmed.append(ceshi_labels[row][:n_chars])
    predictions_trimmed.append(sentence_prediction[:n_chars])

In [None]:
from sklearn.metrics import classification_report

# Flatten the per-sentence sequences into two parallel lists of plain ints
predicted_labels_flat = []
for pred in predictions_trimmed:
    predicted_labels_flat.extend(int(p) for p in pred)

filtered_labels_flat = []
for label in ceshi_labels_trimmed:
    filtered_labels_flat.extend(int(l) for l in label)

# Precision, recall and F1-score per label, printed with four decimals
report = classification_report(
    filtered_labels_flat,
    predicted_labels_flat,
    digits=4,
)
print(report)
# Reminder: this is NOT the actual evaluation of the segmentation.
# It is only a per-label metric over individual positions.
# Please refer to the evaluation code for the actual scores.

              precision    recall  f1-score   support

           0     0.7483    0.8918    0.8137      8970
           1     0.9808    0.9486    0.9644     52310

    accuracy                         0.9402     61280
   macro avg     0.8645    0.9202    0.8891     61280
weighted avg     0.9468    0.9402    0.9424     61280



In [None]:
# Test sentences of at most 256 characters, in their original order
text = [sentence for sentence in test_df['input'] if len(sentence) <= 256]

In [None]:
# Convert every trimmed label tensor to a plain Python list. Entries that are
# already lists (e.g. when this cell is run a second time) are passed through
# instead of failing on a missing .tolist() method.
ceshi_labels_trimmed = [
    t.tolist() if hasattr(t, 'tolist') else list(t)
    for t in ceshi_labels_trimmed
]

In [None]:
import csv

# Dump sentence, gold labels and predicted labels to a CSV file, one row per
# sentence, below a header row.
with open('WBD_BERT_output.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['text', 'label', 'prediction'])
    writer.writerows(
        [text[row], ceshi_labels_trimmed[row], predictions_trimmed[row]]
        for row in range(len(text))
    )