In [1]:
# Shell escape: list the machine's GPUs with their current memory use and utilisation.
!nvidia-smi

Tue Nov 25 10:54:37 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.5     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off | 00000000:17:00.0 Off |                    0 |
| N/A   53C    P0              68W / 300W |  81044MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100 80GB PCIe          Off | 00000000:31:00.0 Off |  

In [2]:
import os
# Expose only physical GPU 3 to CUDA. This is done in an early cell so the
# variable is already in place when torch is imported further down.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

# Echo the working directory; the data file loaded later is given as a relative path.
print(os.getcwd())

/workspace/Approach1


In [9]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Device used by the explicit `model.to(...)` call at the end of this cell.
# NOTE(review): this is pinned to CPU even though an earlier cell selects a GPU
# through CUDA_VISIBLE_DEVICES, and the Trainer created later may place the
# model on a device of its own choosing — confirm where inference really runs.
device = torch.device("cpu")
print(f"Using device: {device}")

# Load the pretrained token-classification checkpoint and its tokenizer by name.
model_name = "thenlpresearcher/bert_punct_model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Kept as the cell's last expression so the model architecture is displayed.
model.to(device)

Using device: cpu


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024

In [10]:
import re
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import BertTokenizerFast
import os
from tqdm import tqdm

# 1. Punctuation inventory: maps each supported punctuation character to the
#    name of the label used for it.
#    The order of the entries matters: it determines the integer id of every
#    label below, and predicted ids are decoded by indexing into label_list.
punctuation_map = {
    ",": "COMMA",
    ".": "PERIOD",
    "?": "QUESTION",
    "!": "EXCLAMATION",
    ";": "SEMICOLON",
    ":": "COLON",
    "-": "HYPHEN",
    "–": "EN_DASH",
    "—": "EM_DASH",
    "(": "LEFT_PAREN",
    ")": "RIGHT_PAREN",
    "[": "LEFT_BRACKET",
    "]": "RIGHT_BRACKET",
    "{": "LEFT_BRACE",
    "}": "RIGHT_BRACE",
    '"': "DOUBLE_QUOTE",
    "'": "SINGLE_QUOTE",
    "…": "ELLIPSIS",
    "/": "SLASH",
    "\\": "BACKSLASH",
    "@": "AT_SYMBOL",
    "#": "HASH",
    "$": "DOLLAR",
    "%": "PERCENT",
    "&": "AMPERSAND",
    "*": "ASTERISK",
    "+": "PLUS",
    "=": "EQUALS",
    "<": "LESS_THAN",
    ">": "GREATER_THAN",
    "|": "PIPE",
    "^": "CARET",
    "`": "BACKTICK",
    "~": "TILDE",
}

# Label vocabulary: "O" (no punctuation) takes id 0, followed by one label per
# punctuation mark in the dict's insertion order.
label_list = ["O", *punctuation_map.values()]
label_to_id = {name: idx for idx, name in enumerate(label_list)}

print(f"Label list: {label_list}")

Label list: ['O', 'COMMA', 'PERIOD', 'QUESTION', 'EXCLAMATION', 'SEMICOLON', 'COLON', 'HYPHEN', 'EN_DASH', 'EM_DASH', 'LEFT_PAREN', 'RIGHT_PAREN', 'LEFT_BRACKET', 'RIGHT_BRACKET', 'LEFT_BRACE', 'RIGHT_BRACE', 'DOUBLE_QUOTE', 'SINGLE_QUOTE', 'ELLIPSIS', 'SLASH', 'BACKSLASH', 'AT_SYMBOL', 'HASH', 'DOLLAR', 'PERCENT', 'AMPERSAND', 'ASTERISK', 'PLUS', 'EQUALS', 'LESS_THAN', 'GREATER_THAN', 'PIPE', 'CARET', 'BACKTICK', 'TILDE']


In [11]:
# -------------------------------
# 2. Token-label creation function
# -------------------------------
def create_token_labels(sentence):
    """Split a sentence into words and tag each word with the mark that follows it.

    The sentence is cut into runs of word characters and single punctuation
    characters. Only the word runs are returned as tokens. A word's label is
    the ``punctuation_map`` name of the piece immediately after it when that
    piece is a known punctuation mark, otherwise ``"O"``.

    Args:
        sentence: The raw sentence text.

    Returns:
        A ``(tokens, labels)`` tuple of two equally long lists of strings.
    """
    pieces = re.findall(r"\w+|[^\w\s]", sentence)
    # Pair every piece with its right-hand neighbour; the last piece has none.
    followers = pieces[1:] + [None]

    tokens, labels = [], []
    for piece, follower in zip(pieces, followers):
        if not re.match(r"\w+", piece):
            # Punctuation never becomes a token; it only serves as a label source.
            continue
        tokens.append(piece)
        labels.append(punctuation_map.get(follower, "O"))
    return tokens, labels

# -------------------------------
# 3. Load CSV and create token-label dataset
# -------------------------------
def load_and_process(csv_file):
    """Read a CSV of sentences and build a word-level token/label dataset.

    Args:
        csv_file: Path to a CSV file that has a ``text`` column holding one
            sentence per row.

    Returns:
        A ``datasets.Dataset`` with a ``tokens`` column (list of words per row)
        and a ``labels`` column (the matching list of integer ids looked up in
        ``label_to_id``). Rows keep the order and count of the CSV; a row
        without any word yields two empty lists.

    Raises:
        KeyError: If the CSV has no ``text`` column.
    """
    # keep_default_na=False reads every cell verbatim. With pandas' default NA
    # parsing, blank cells and sentences such as "NA" or "null" become float
    # NaN, which str() below would turn into the bogus one-word sentence "nan".
    df = pd.read_csv(csv_file, keep_default_na=False)
    all_tokens = []
    all_labels = []

    for sent in df['text']:
        # str() still covers cells that pandas parsed as numbers.
        tokens, labels = create_token_labels(str(sent))
        all_tokens.append(tokens)
        all_labels.append([label_to_id[l] for l in labels])

    return Dataset.from_dict({"tokens": all_tokens, "labels": all_labels})


# Build the word-level evaluation set from the test CSV (the file name suggests
# the IWSLT 2017 English test split). The path is relative to the working directory.
test_dataset  = load_and_process("iwslt2017_en_test.csv")

def tokenize_and_align_labels(batch):
    """Tokenize pre-split words and spread word-level labels over the sub-tokens.

    Every row is truncated/padded to 128 positions. Each position that maps
    back to a word receives that word's label (so all pieces of a split word
    share one label); positions without a source word get ``-100``.

    Args:
        batch: Mapping with ``tokens`` (list of word lists) and ``labels``
            (list of per-word label id lists), as passed by a batched ``map``.

    Returns:
        The tokenizer output with an added ``labels`` entry holding one aligned
        label list per row.
    """
    encoded = tokenizer(
        batch["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    encoded["labels"] = [
        [
            -100 if word_idx is None else word_labels[word_idx]
            for word_idx in encoded.word_ids(batch_index=row)
        ]
        for row, word_labels in enumerate(batch["labels"])
    ]
    return encoded

# Tokenize and align labels batch by batch, then drop the raw string column
# "tokens", which is no longer needed once the encoded columns exist.
test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True).remove_columns(["tokens"])

# Switch the dataset's output format to torch; the result is not reassigned,
# so this call is relied on to modify the dataset in place.
test_dataset.set_format(type="torch")

Map: 100%|█████████████████████| 8079/8079 [00:01<00:00, 6713.67 examples/s]


In [12]:
# Sanity check: number of examples in the prepared test set.
print(len(test_dataset))

8079


In [13]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer

import inspect

# Recent transformers releases deprecate Trainer's `tokenizer=` argument in
# favour of `processing_class=` and emit a FutureWarning at the constructor
# call. Use the new name when the installed version has it, and fall back to
# the old one otherwise so the cell also runs on older installations.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(model=model, **{_tokenizer_kwarg: tokenizer})

# -------------------------------
# 4. Make predictions on test dataset
# -------------------------------
# test_dataset must already be tokenized and carry a "labels" column.
predictions, labels, _ = trainer.predict(test_dataset)

# Predicted ids are decoded by indexing into label_list, so the model's output
# width has to match it; otherwise ids would be mis-decoded or raise IndexError.
num_model_labels = np.shape(predictions)[-1]
if num_model_labels != len(label_list):
    raise ValueError(
        f"Model predicts {num_model_labels} classes but label_list has "
        f"{len(label_list)} entries; predictions cannot be decoded reliably."
    )

pred_ids = np.argmax(predictions, axis=-1)

# -------------------------------
# 5. Map predictions and labels back to strings
# -------------------------------
true_labels_list = []
pred_labels_list = []

for label_row, pred_row in zip(labels, pred_ids):
    # -100 marks every position without a source word (special tokens and
    # padding); those positions are excluded from the evaluation.
    keep = label_row != -100
    true_labels_list.append([label_list[i] for i in label_row[keep]])
    pred_labels_list.append([label_list[i] for i in pred_row[keep]])


  trainer = Trainer(
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [14]:
from sklearn.metrics import classification_report

# Flatten the per-sentence label sequences into two parallel flat lists,
# one entry per evaluated position, for sklearn.
y_true_flat = [label for seq in true_labels_list for label in seq]
y_pred_flat = [label for seq in pred_labels_list for label in seq]

print("\nDetailed classification report (per label) using sklearn:")
# zero_division=0: labels that are never predicted (or never occur) have an
# undefined precision/recall. They are reported as 0.0 either way, but stating
# it explicitly avoids one UndefinedMetricWarning per affected metric.
print(classification_report(y_true_flat, y_pred_flat, digits=4, zero_division=0))


Detailed classification report (per label) using sklearn:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


               precision    recall  f1-score   support

    AMPERSAND     1.0000    1.0000    1.0000         8
        COLON     0.5710    0.6336    0.6006       292
        COMMA     0.7985    0.8220    0.8101      9916
       DOLLAR     0.6000    0.3750    0.4615         8
 DOUBLE_QUOTE     0.5837    0.3898    0.4674       313
      EM_DASH     0.0000    0.0000    0.0000        27
       EQUALS     0.0000    0.0000    0.0000         2
  EXCLAMATION     0.7500    0.1552    0.2571        58
         HASH     0.0000    0.0000    0.0000         2
       HYPHEN     0.6538    0.5157    0.5766      1117
 LEFT_BRACKET     0.0000    0.0000    0.0000        15
            O     0.9844    0.9858    0.9851    117294
      PERCENT     0.0000    0.0000    0.0000         0
       PERIOD     0.9662    0.9714    0.9688      8729
         PLUS     0.0000    0.0000    0.0000         2
     QUESTION     0.9108    0.8994    0.9051       795
RIGHT_BRACKET     0.0000    0.0000    0.0000        34
    SEMIC

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
