In [1]:
import re
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import BertTokenizerFast
import os
from tqdm import tqdm

# 1. define punctuation map
punctuation_map = {
    ',': 'COMMA',
    '.': 'PERIOD',
    '?': 'QUESTION',
    '!': 'EXCLAMATION',
    ';': 'SEMICOLON',
    ':': 'COLON',
    '-': 'HYPHEN',
    '–': 'EN_DASH',
    '—': 'EM_DASH',
    '(': 'LEFT_PAREN',
    ')': 'RIGHT_PAREN',
    '[': 'LEFT_BRACKET',
    ']': 'RIGHT_BRACKET',
    '{': 'LEFT_BRACE',
    '}': 'RIGHT_BRACE',
    '"': 'DOUBLE_QUOTE',
    "'": 'SINGLE_QUOTE',
    '…': 'ELLIPSIS',
    '/': 'SLASH',
    '\\': 'BACKSLASH',
    '@': 'AT_SYMBOL',
    '#': 'HASH',
    '$': 'DOLLAR',
    '%': 'PERCENT',
    '&': 'AMPERSAND',
    '*': 'ASTERISK',
    '+': 'PLUS',
    '=': 'EQUALS',
    '<': 'LESS_THAN',
    '>': 'GREATER_THAN',
    '|': 'PIPE',
    '^': 'CARET',
    '`': 'BACKTICK',
    '~': 'TILDE'
}

# Automatically create label_list from punctuation_map
label_list = ["O"] + list(punctuation_map.values())
label_to_id = {l: i for i, l in enumerate(label_list)}

print("Label list:", label_list)

  from .autonotebook import tqdm as notebook_tqdm


Label list: ['O', 'COMMA', 'PERIOD', 'QUESTION', 'EXCLAMATION', 'SEMICOLON', 'COLON', 'HYPHEN', 'EN_DASH', 'EM_DASH', 'LEFT_PAREN', 'RIGHT_PAREN', 'LEFT_BRACKET', 'RIGHT_BRACKET', 'LEFT_BRACE', 'RIGHT_BRACE', 'DOUBLE_QUOTE', 'SINGLE_QUOTE', 'ELLIPSIS', 'SLASH', 'BACKSLASH', 'AT_SYMBOL', 'HASH', 'DOLLAR', 'PERCENT', 'AMPERSAND', 'ASTERISK', 'PLUS', 'EQUALS', 'LESS_THAN', 'GREATER_THAN', 'PIPE', 'CARET', 'BACKTICK', 'TILDE']


In [5]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Device setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Load tokenizer and model
model_name = "thenlpresearcher/bert_punct_model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
model.to(device)

import torch
import numpy as np

# -------------------------------
# 3. Corrected Prediction Function
# -------------------------------
def get_word_and_prediction_ids(text: str, model, tokenizer, device) -> tuple[np.ndarray, np.ndarray]:
    """
    Takes an unpunctuated text string and returns the word IDs and prediction IDs 
    for all tokens in the sequence.
    """
    words = text.lower().split()
    
    if not words:
        return np.array([], dtype=np.int64), np.array([], dtype=np.int64)
    
    encoded_input = tokenizer(
        words, 
        is_split_into_words=True, 
        return_tensors="pt", 
        padding=True, 
        truncation=True
    ).to(device)

    with torch.no_grad():
        outputs = model(**encoded_input)
    
    logits = outputs.logits
    
    # Squeeze to handle single-item batch
    pred_ids = torch.argmax(logits, dim=-1).squeeze().cpu().numpy()
    
    # Convert list of word IDs (including None) to a numpy array, 
    # replacing None with a placeholder -1 for special tokens
    word_ids_list = encoded_input.word_ids()
    word_ids_array = np.array([w if w is not None else -1 for w in word_ids_list], dtype=np.int64)

    return word_ids_array, pred_ids

# -------------------------------
# 4. Example Usage and Cleaned Output
# -------------------------------

def print_predictions_table(input_text: str, model, tokenizer, device, label_list):
    """Runs prediction and prints the formatted table."""
    print(f"\nOriginal Text: {input_text}")
    
    w_ids, p_ids = get_word_and_prediction_ids(input_text, model, tokenizer, device)
    
    # Check if the sentence was truncated or padded (resulting in different lengths)
    if w_ids.ndim == 0 and w_ids.size == 0:
         print("No words found.")
         return
         
    # Handle the case where the output is a scalar (single token)
    if w_ids.ndim == 0:
        w_ids = np.array([w_ids.item()])
        p_ids = np.array([p_ids.item()])
        
    print("\n| Token Word ID | Prediction ID | Predicted Label |")
    print("|---------------|---------------|-----------------|")
    
    for w, p in zip(w_ids, p_ids):
        # Look up the label. Use 'PAD/UNK' if ID is out of bounds (like 17)
        label = label_list[p] if p < len(label_list) else "PAD/UNK"
        print(f"| {w:13d} | {p:13d} | {label:15s} |")
    print("-" * 50)
    
# Example 1
input_text_1 = "how old are you i am a language model"
print_predictions_table(input_text_1, model, tokenizer, device, label_list)

# Example 2
input_text_2 = "what is the capital of france it is paris"
print_predictions_table(input_text_2, model, tokenizer, device, label_list)

Using device: cuda

Original Text: how old are you i am a language model

| Token Word ID | Prediction ID | Predicted Label |
|---------------|---------------|-----------------|
|            -1 |             0 | O               |
|             0 |             0 | O               |
|             1 |             0 | O               |
|             2 |             0 | O               |
|             3 |             3 | QUESTION        |
|             4 |             0 | O               |
|             5 |             0 | O               |
|             6 |             0 | O               |
|             7 |             0 | O               |
|             8 |             2 | PERIOD          |
|            -1 |            17 | SINGLE_QUOTE    |
--------------------------------------------------

Original Text: what is the capital of france it is paris

| Token Word ID | Prediction ID | Predicted Label |
|---------------|---------------|-----------------|
|            -1 |             0 | O 

In [10]:
punctuation_reverse_map = {v: k for k, v in punctuation_map.items()}
punctuation_reverse_map["O"] = ""   # no punctuation

In [11]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification


tokenizer = AutoTokenizer.from_pretrained("thenlpresearcher/mpnet_token_cls_model")
model = AutoModelForTokenClassification.from_pretrained("thenlpresearcher/mpnet_token_cls_model")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()



In [12]:
text = "The marshmallow has to be on top"
print(restore_punctuation(text, tokenizer, model, label_list))

The
0
O
marshmallow
0
O
has
0
O
to
1
COMMA
be
0
O
on
0
O
top
0
O
The marshmallow has to, be on top


In [6]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer
from datasets import load_dataset

# -------------------------------
# 1. Device setup
# -------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# -------------------------------
# 2. Load tokenizer and model
# -------------------------------
model_name = "thenlpresearcher/mpnet_token_cls_model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
model.to(device)
model.eval()



def create_token_labels(sentence):
    tokens = []
    labels = []
    parts = re.findall(r"\w+|[^\w\s]", sentence)
    for i, part in enumerate(parts):
        if re.match(r"\w+", part):  # token
            tokens.append(part)
            if i+1 < len(parts) and parts[i+1] in punctuation_map:
                labels.append(punctuation_map[parts[i+1]])
            else:
                labels.append("O")
    return tokens, labels


# ---------------------------
# 3. Load CSV and create token-label dataset
# -------------------------------
def load_and_process(csv_file):
    df = pd.read_csv(csv_file)
    all_tokens = []
    all_labels = []

    for sent in df['text']:
        tokens, labels = create_token_labels(str(sent))
        all_tokens.append(tokens)
        all_labels.append([label_to_id[l] for l in labels])

    return Dataset.from_dict({"tokens": all_tokens, "labels": all_labels})

test_dataset  = load_and_process("../iwslt2017_en_test.csv")

def tokenize_and_align_labels(batch):
    tokenized_inputs = tokenizer(batch["tokens"], is_split_into_words=True, truncation=True, padding="max_length", max_length=128)
    new_labels = []
    for i, label in enumerate(batch["labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        aligned_labels = []
        for word_id in word_ids:
            if word_id is None:
                aligned_labels.append(-100)
            else:
                aligned_labels.append(label[word_id])
        new_labels.append(aligned_labels)
    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

test_dataset = test_dataset.map(tokenize_and_align_labels, batched=True)

# -------------------------------
# 4. Prepare Trainer
# -------------------------------
trainer = Trainer(model=model, tokenizer=tokenizer)

# -------------------------------
# 5. Make predictions
# -------------------------------
predictions, labels, _ = trainer.predict(test_dataset)
pred_ids = np.argmax(predictions, axis=-1)

# -------------------------------
# 6. Map predictions and labels back to strings
# -------------------------------
true_labels_list = []
pred_labels_list = []

for label_row, pred_row in zip(labels, pred_ids):
    true_row = []
    pred_row_labels = []
    for l, p in zip(label_row, pred_row):
        if l != -100:  # ignore padding
            true_row.append(label_list[l])
            pred_row_labels.append(label_list[p])
    true_labels_list.append(true_row)
    pred_labels_list.append(pred_row_labels)

print("Prediction sample:", pred_labels_list[0])

Using device: cuda


Map: 100%|█████████████████████| 8079/8079 [00:01<00:00, 6188.07 examples/s]
  trainer = Trainer(model=model, tokenizer=tokenizer)
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Prediction sample: ['O', 'O', 'O', 'PERIOD', 'O', 'O', 'PERIOD', 'O', 'COMMA', 'O', 'O', 'O', 'COMMA', 'O', 'O', 'O', 'O', 'O', 'PERIOD']


In [8]:
from sklearn.metrics import classification_report

# Flatten lists for sklearn
y_true_flat = [label for seq in true_labels_list for label in seq]
y_pred_flat = [label for seq in pred_labels_list for label in seq]

print("\nDetailed classification report (per label) using sklearn:")
print(classification_report(y_true_flat, y_pred_flat, digits=4))


Detailed classification report (per label) using sklearn:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


               precision    recall  f1-score   support

    AMPERSAND     0.0000    0.0000    0.0000         8
        COLON     0.0527    0.0925    0.0672       292
        COMMA     0.2107    0.2850    0.2423      9916
       DOLLAR     0.0000    0.0000    0.0000         8
 DOUBLE_QUOTE     0.0000    0.0000    0.0000       313
      EM_DASH     0.0000    0.0000    0.0000        27
       EQUALS     0.0000    0.0000    0.0000         2
  EXCLAMATION     0.0000    0.0000    0.0000        58
         HASH     0.0000    0.0000    0.0000         2
       HYPHEN     0.0184    0.0206    0.0194      1117
 LEFT_BRACKET     0.0000    0.0000    0.0000        15
            O     0.8914    0.8892    0.8903    117294
       PERIOD     0.7745    0.8394    0.8057      8729
         PLUS     0.0000    0.0000    0.0000         2
     QUESTION     0.0420    0.0201    0.0272       795
RIGHT_BRACKET     0.0000    0.0000    0.0000        34
    SEMICOLON     0.0000    0.0000    0.0000       132
 SINGLE_Q

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
!nvidia-smi

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Mon Nov 24 17:52:17 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.5     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off | 00000000:17:00.0 Off |                    0 |
| N/A   55C    P0              71W / 300W |  16273MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100 80GB PCIe          Off | 00000000:31:00.0 Off |  

In [22]:
# -------------------------------
# 3. The Punctuation Restoration Function
# -------------------------------
def restore_punctuation(text: str, model, tokenizer, label_list, device) -> str:
    """
    Restores punctuation to an unpunctuated text string using the BERT Punctuation model.
    """
    words = text.strip().split()
    w_ids, p_ids = get_word_and_prediction_ids(text, model, tokenizer, device)
    w_ids = w_ids[1:-1]
    print(w_ids)
    
    p_ids = p_ids[1:-1]
    print(p_ids)
    
    final_output = []
    for w_id, p_id in zip(w_ids, p_ids):
        if label_list[p_id] != "O":
            punct = label_list[p_id]
        else:
            punct = ""
        final_output.extend([words[w_id], punct])
                 
    # Join the words back into a sentence string, capitalizing the first letter.
    result = " ".join(final_output).strip()
    
    if result:
        # Capitalize the first letter
        return result[0].upper() + result[1:]
    return ""

# -------------------------------
# 4. Apply to Example Sentences
# -------------------------------
print("\n--- Punctuation Restoration Results ---")

# Example 1
input_text_1 = "how old are you i am a language model"
punctuated_text_1 = restore_punctuation(input_text_1, model, tokenizer, label_list, device)
print(f"Original: {input_text_1}")
print(f"Punctuated: **{punctuated_text_1}**")

# Example 2
input_text_2 = "what is the capital of france it is paris"
punctuated_text_2 = restore_punctuation(input_text_2, model, tokenizer, label_list, device)
print(f"\nOriginal: {input_text_2}")
print(f"Punctuated: **{punctuated_text_2}**")

# Example 3 (for demonstration of comma prediction)
input_text_3 = "if you want to know more ask me anything"
punctuated_text_3 = restore_punctuation(input_text_3, model, tokenizer, label_list, device)
print(f"\nOriginal: {input_text_3}")
print(f"Punctuated: **{punctuated_text_3}**")

# Example 4 (for demonstration of exclamation)
input_text_4 = "wow that is amazing"
punctuated_text_4 = restore_punctuation(input_text_4, model, tokenizer, label_list, device)
print(f"\nOriginal: {input_text_4}")
print(f"Punctuated: **{punctuated_text_4}**")


--- Punctuation Restoration Results ---
[0 1 2 3 4 5 6 7 8]
[0 0 0 3 0 0 0 0 2]
Original: how old are you i am a language model
Punctuated: **How  old  are  you QUESTION i  am  a  language  model PERIOD**
[0 1 2 3 4 5 6 7 8]
[0 0 0 0 0 3 0 0 2]

Original: what is the capital of france it is paris
Punctuated: **What  is  the  capital  of  france QUESTION it  is  paris PERIOD**
[0 1 2 3 4 5 6 7 8]
[0 0 0 0 0 1 0 0 2]

Original: if you want to know more ask me anything
Punctuated: **If  you  want  to  know  more COMMA ask  me  anything PERIOD**
[0 1 2 3]
[1 0 0 2]

Original: wow that is amazing
Punctuated: **Wow COMMA that  is  amazing PERIOD**
