In [1]:
# Show the GPUs on this machine together with their current memory usage.
!nvidia-smi

Tue Nov 25 12:09:21 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.5     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off | 00000000:17:00.0 Off |                    0 |
| N/A   53C    P0              68W / 300W |  58395MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100 80GB PCIe          Off | 00000000:31:00.0 Off |  

In [2]:
import os
# Record the working directory this notebook was run from.
print(os.getcwd())
# Expose only the GPU with index 3 to CUDA. This is set here, ahead of the
# later cell that imports torch.
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

/workspace/Approach1


In [12]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Device setup
# NOTE(review): the device is pinned to CPU even though CUDA_VISIBLE_DEVICES is
# set in an earlier cell — confirm this is intentional.
device = torch.device("cpu")
print("Using device:", device)

# Load tokenizer and model
# model_name = "thenlpresearcher/bert_punct_model"
model_name = "thenlpresearcher/mpnet_token_cls_model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
# As the last expression of the cell, this also displays the full module repr.
model.to(device)

Using device: cpu


MPNetForTokenClassification(
  (mpnet): MPNetModel(
    (embeddings): MPNetEmbeddings(
      (word_embeddings): Embedding(30527, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): MPNetEncoder(
      (layer): ModuleList(
        (0-11): 12 x MPNetLayer(
          (attention): MPNetAttention(
            (attn): MPNetSelfAttention(
              (q): Linear(in_features=768, out_features=768, bias=True)
              (k): Linear(in_features=768, out_features=768, bias=True)
              (v): Linear(in_features=768, out_features=768, bias=True)
              (o): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
    

In [1]:
import re
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import BertTokenizerFast
import os
from tqdm import tqdm

# 1. define punctuation map
# Maps one punctuation character to the label name given to the word that
# precedes it. Keys are single characters because the sentence splitter emits
# punctuation one character at a time. Insertion order matters: the label ids
# built from this dict follow it.
punctuation_map = {
    ',': 'COMMA',
    '.': 'PERIOD',
    '?': 'QUESTION',
    '!': 'EXCLAMATION',
    ';': 'SEMICOLON',
    ':': 'COLON',
    '-': 'HYPHEN',
    '–': 'EN_DASH',
    '—': 'EM_DASH',
    '(': 'LEFT_PAREN',
    ')': 'RIGHT_PAREN',
    '[': 'LEFT_BRACKET',
    ']': 'RIGHT_BRACKET',
    '{': 'LEFT_BRACE',
    '}': 'RIGHT_BRACE',
    '"': 'DOUBLE_QUOTE',
    "'": 'SINGLE_QUOTE',
    '…': 'ELLIPSIS',
    '/': 'SLASH',
    '\\': 'BACKSLASH',
    '@': 'AT_SYMBOL',
    '#': 'HASH',
    '$': 'DOLLAR',
    '%': 'PERCENT',
    '&': 'AMPERSAND',
    '*': 'ASTERISK',
    '+': 'PLUS',
    '=': 'EQUALS',
    '<': 'LESS_THAN',
    '>': 'GREATER_THAN',
    '|': 'PIPE',
    '^': 'CARET',
    '`': 'BACKTICK',
    '~': 'TILDE'
}

# Build the label vocabulary from punctuation_map: "O" takes id 0, followed by
# every punctuation label in the map's insertion order.
label_list = ["O", *punctuation_map.values()]
# Reverse lookup: label name -> integer id (its position in label_list).
label_to_id = dict(zip(label_list, range(len(label_list))))

print("Label list:", label_list)

  from .autonotebook import tqdm as notebook_tqdm


Label list: ['O', 'COMMA', 'PERIOD', 'QUESTION', 'EXCLAMATION', 'SEMICOLON', 'COLON', 'HYPHEN', 'EN_DASH', 'EM_DASH', 'LEFT_PAREN', 'RIGHT_PAREN', 'LEFT_BRACKET', 'RIGHT_BRACKET', 'LEFT_BRACE', 'RIGHT_BRACE', 'DOUBLE_QUOTE', 'SINGLE_QUOTE', 'ELLIPSIS', 'SLASH', 'BACKSLASH', 'AT_SYMBOL', 'HASH', 'DOLLAR', 'PERCENT', 'AMPERSAND', 'ASTERISK', 'PLUS', 'EQUALS', 'LESS_THAN', 'GREATER_THAN', 'PIPE', 'CARET', 'BACKTICK', 'TILDE']


In [2]:
import re
def create_token_labels(sentence):
    """Split a sentence into words and tag each word with the punctuation after it.

    The sentence is cut into pieces that are either a run of word characters
    or a single non-word, non-space character. Only the word pieces are
    returned as tokens. A word's label is looked up in ``punctuation_map``
    using the piece that immediately follows it; when there is no following
    piece, or it is not a key of the map, the label is ``"O"``.

    Args:
        sentence: The text to split.

    Returns:
        A ``(tokens, labels)`` tuple of two equally long lists.
    """
    pieces = re.findall(r"\w+|[^\w\s]", sentence)
    words, tags = [], []
    # Pair every piece with its successor; the final piece has none.
    for piece, following in zip(pieces, pieces[1:] + [None]):
        if not re.match(r"\w+", piece):
            continue  # punctuation is never emitted as a token of its own
        words.append(piece)
        tags.append(punctuation_map.get(following, "O"))
    return words, tags

In [8]:
# Quick smoke test of the labelling helper on a short sentence.
tokens, labels = create_token_labels("Wow, this is amazing!")

In [10]:
# Show each word next to the label assigned to it.
list(zip(tokens, labels))

[('Wow', 'COMMA'), ('this', 'O'), ('is', 'O'), ('amazing', 'EXCLAMATION')]

In [15]:
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm

# -------------------------------
# 2. Token-label creation function
# -------------------------------
def create_token_labels(sentence):
    """Split a sentence into words and tag each word with the punctuation after it.

    The sentence is cut into pieces that are either a run of word characters
    or a single non-word, non-space character. Only the word pieces are
    returned as tokens. A word's label is looked up in ``punctuation_map``
    using the piece that immediately follows it; when there is no following
    piece, or it is not a key of the map, the label is ``"O"``.

    NOTE(review): this notebook also defines ``create_token_labels`` in an
    earlier cell; when cells run in order, this later definition is the one
    in effect.

    Args:
        sentence: The text to split.

    Returns:
        A ``(tokens, labels)`` tuple of two equally long lists.
    """
    pieces = re.findall(r"\w+|[^\w\s]", sentence)
    words, tags = [], []
    # Pair every piece with its successor; the final piece has none.
    for piece, following in zip(pieces, pieces[1:] + [None]):
        if not re.match(r"\w+", piece):
            continue  # punctuation is never emitted as a token of its own
        words.append(piece)
        tags.append(punctuation_map.get(following, "O"))
    return words, tags

# -------------------------------
# 3. Load CSV and create token-label dataset
# -------------------------------
def load_and_process(csv_file, text_column="sent_meant"):
    """Read sentences from a CSV file and build a word-level labelled dataset.

    Args:
        csv_file: Path or file-like object handed to ``pandas.read_csv``.
        text_column: Name of the CSV column holding the sentences. Defaults
            to ``"sent_meant"``, the column this function always used.

    Returns:
        The result of ``Dataset.from_dict`` with one row per CSV row and two
        columns: ``tokens`` (list of words) and ``labels`` (the matching
        label ids taken from ``label_to_id``).

    Raises:
        KeyError: If ``text_column`` is not a column of the CSV.
    """
    df = pd.read_csv(csv_file)
    all_tokens = []
    all_labels = []

    for sent in df[text_column]:
        # A missing cell arrives as NaN; str(NaN) would become the literal word
        # "nan". Treat it as an empty sentence so the row count is preserved.
        text = "" if pd.isna(sent) else str(sent)
        tokens, labels = create_token_labels(text)
        all_tokens.append(tokens)
        all_labels.append([label_to_id[l] for l in labels])

    return Dataset.from_dict({"tokens": all_tokens, "labels": all_labels})


# test_dataset  = load_and_process("iwslt2017_en_test.csv")
# Load the "test" split of the dataset; it is converted to tokens/labels
# further down in this cell.
test_dataset = load_dataset("thenlpresearcher/test_data_marathi")["test"]

def load_and_process_hf(ds, text_column="sent_meant"):
    """Build a word-level labelled dataset from an already loaded dataset.

    Args:
        ds: Object indexable by column name; ``ds[text_column]`` must yield
            the sentences one by one.
        text_column: Name of the column holding the sentences.

    Returns:
        The result of ``Dataset.from_dict`` with one row per input sentence
        and two columns: ``tokens`` (list of words) and ``labels`` (the
        matching label ids taken from ``label_to_id``).
    """
    all_tokens = []
    all_labels = []

    for sent in ds[text_column]:
        # str(None) would become the literal word "None". Treat a missing
        # sentence as empty so the row count is preserved.
        text = "" if sent is None else str(sent)
        tokens, labels = create_token_labels(text)
        all_tokens.append(tokens)
        all_labels.append([label_to_id[l] for l in labels])

    return Dataset.from_dict({"tokens": all_tokens, "labels": all_labels})

# Rebinds `test_dataset` from the raw split to the derived tokens/labels dataset.
test_dataset = load_and_process_hf(test_dataset)

def tokenize_and_align_labels(batch):
    """Tokenize pre-split words and expand word-level labels to token level.

    Each example is encoded with the notebook-level ``tokenizer`` (truncated
    and padded to 128 positions). Every position that maps back to a word
    receives that word's label, so all pieces of a split word share one
    label. Positions whose word id is ``None`` receive ``-100``, the value the
    evaluation cell later filters out.

    Args:
        batch: Mapping with ``"tokens"`` (list of word lists) and ``"labels"``
            (list of per-word label-id lists), aligned example by example.

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding the
        per-position label ids.
    """
    encoding = tokenizer(
        batch["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    encoding["labels"] = [
        [
            -100 if wid is None else word_labels[wid]
            for wid in encoding.word_ids(batch_index=row)
        ]
        for row, word_labels in enumerate(batch["labels"])
    ]
    return encoding

# Tokenize in batches and attach the per-position label ids.
test_dataset  = test_dataset.map(tokenize_and_align_labels, batched=True)

# Drop the raw word lists from the dataset.
test_dataset  = test_dataset.remove_columns(["tokens"])

# Switch the dataset's output format to torch.
test_dataset.set_format(type="torch")

Map: 100%|█████████████████████████| 54/54 [00:00<00:00, 1963.05 examples/s]


In [16]:
# Sanity check: number of examples in the prepared test set.
print(len(test_dataset))

54


In [17]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer

# Trainer built without explicit training arguments; it is only used here to
# run prediction over the test set.
# NOTE(review): Trainer picks its own device from its arguments, so the earlier
# `model.to(device)` may not determine where inference runs — confirm.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer
)

# -------------------------------
# 4. Make predictions on test dataset
# -------------------------------
predictions, labels, _ = trainer.predict(test_dataset)  # test_dataset must be already prepared
# Pick the highest-scoring class id along the last axis.
pred_ids = np.argmax(predictions, axis=-1)

# -------------------------------
# 5. Map predictions and labels back to strings
# -------------------------------
true_labels_list = []
pred_labels_list = []

for gold_ids, guess_ids in zip(labels, pred_ids):
    # Keep only the positions whose gold id is not the -100 ignore marker.
    kept = [(g, p) for g, p in zip(gold_ids, guess_ids) if g != -100]
    # Translate the surviving ids to label names, one list per sequence.
    true_labels_list.append([label_list[g] for g, _ in kept])
    pred_labels_list.append([label_list[p] for _, p in kept])

  trainer = Trainer(
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [18]:
from sklearn.metrics import classification_report

# Flatten the per-sequence label lists into two parallel flat lists for sklearn.
y_true_flat = [label for seq in true_labels_list for label in seq]
y_pred_flat = [label for seq in pred_labels_list for label in seq]

# Some classes are never predicted, or never occur in the gold labels, so their
# precision or recall is undefined. zero_division=0 reports those scores as 0.0,
# the same numbers the default produces, without a warning for each metric.
print(classification_report(y_true_flat, y_pred_flat, digits=4, zero_division=0))

              precision    recall  f1-score   support

       COLON     1.0000    0.6000    0.7500         5
       COMMA     0.7606    0.6835    0.7200        79
      DOLLAR     0.0000    0.0000    0.0000         0
DOUBLE_QUOTE     0.0000    0.0000    0.0000         0
     EM_DASH     0.0000    0.0000    0.0000         1
 EXCLAMATION     0.0000    0.0000    0.0000         1
      HYPHEN     0.9000    0.5294    0.6667        17
           O     0.9574    0.9728    0.9650       808
     PERCENT     0.0000    0.0000    0.0000         2
      PERIOD     0.8750    0.9825    0.9256        57
    QUESTION     1.0000    1.0000    1.0000         1
 RIGHT_PAREN     0.0000    0.0000    0.0000         2
   SEMICOLON     0.0000    0.0000    0.0000         2
SINGLE_QUOTE     0.0000    0.0000    0.0000         2
       SLASH     1.0000    0.3333    0.5000         3

    accuracy                         0.9286       980
   macro avg     0.4329    0.3401    0.3685       980
weighted avg     0.9263   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
