In [20]:
!pip uninstall apex -y


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Found existing installation: apex 0.1
Uninstalling apex-0.1:
  Successfully uninstalled apex-0.1
[0m

In [1]:
!nvidia-smi

Mon Nov 24 14:16:15 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.5     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off | 00000000:17:00.0 Off |                    0 |
| N/A   56C    P0              71W / 300W |  31780MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100 80GB PCIe          Off | 00000000:31:00.0 Off |  

In [2]:
import os
# Show the working directory; the CSV files written below use relative paths.
print(os.getcwd())
# Expose only GPU index 1 to this process.
# NOTE(review): presumably this has to run before any CUDA-using library is
# initialized in the kernel — confirm the cell order on a fresh run.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

/workspace/Approach1/seq2seq


In [5]:
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm

# Step 1: Load dataset
# Loads the "iwslt2017-en-de" configuration of the IWSLT/iwslt2017 dataset.
# Only the English side of each translation pair is used further down.
print("üîΩ Loading IWSLT2017 English‚ÄìGerman dataset...")
dataset = load_dataset("IWSLT/iwslt2017", "iwslt2017-en-de")

# Step 2: Extract English sentences from each split
def extract_english(ds):
    """Return the English side of every translation pair in a split.

    Each element of ``ds`` is read as ``example["translation"]["en"]``; the
    resulting strings are returned as a list, in iteration order.
    """
    english_sentences = []
    for example in ds:
        english_sentences.append(example["translation"]["en"])
    return english_sentences

print("üìÑ Extracting English sentences from each split...")
# Same three splits, extracted in the same order (train, validation, test).
train_en, valid_en, test_en = (
    extract_english(dataset[split_name])
    for split_name in ("train", "validation", "test")
)

# Step 3: write one single-column ("text") CSV per split
print("üíæ Saving splits...")

for csv_name, sentences in (
    ("iwslt2017_en_train.csv", train_en),
    ("iwslt2017_en_valid.csv", valid_en),
    ("iwslt2017_en_test.csv", test_en),
):
    pd.DataFrame({"text": sentences}).to_csv(csv_name, index=False)

print(f"""
‚úÖ Done!
Train sentences: {len(train_en):,}
Validation sentences: {len(valid_en):,}
Test sentences: {len(test_en):,}

Saved files:
 - iwslt2017_en_train.csv
 - iwslt2017_en_valid.csv
 - iwslt2017_en_test.csv
""")

# Optional preview of the first three training sentences
print("üîç Example:")
preview = pd.DataFrame({'train_example': train_en[:3]})
print(preview)

üîΩ Loading IWSLT2017 English‚ÄìGerman dataset...
üìÑ Extracting English sentences from each split...
üíæ Saving splits...

‚úÖ Done!
Train sentences: 206,112
Validation sentences: 888
Test sentences: 8,079

Saved files:
 - iwslt2017_en_train.csv
 - iwslt2017_en_valid.csv
 - iwslt2017_en_test.csv

üîç Example:
                                       train_example
0                          Thank you so much, Chris.
1  And it's truly a great honor to have the oppor...
2  I have been blown away by this conference, and...


In [6]:
import re
import pandas as pd

# -------------------------------
# 1. Function to remove punctuation
# -------------------------------
def remove_puncts(text_series):
    """
    Remove all punctuation from a Pandas Series of text.

    Every character that is neither a word character nor whitespace is
    deleted. The underscore is deleted explicitly as well: the regex class
    ``\\w`` matches ``_``, so ``[^\\w\\s]`` alone would leave it in place even
    though it is punctuation.

    Parameters
    ----------
    text_series : pandas.Series
        Series of strings, accessed through the ``.str`` accessor.

    Returns
    -------
    pandas.Series
        A new Series with the punctuation characters removed. Whitespace is
        left untouched, so removing a free-standing mark (e.g. " - ") leaves
        the spaces that surrounded it.
    """
    return text_series.str.replace(r"[^\w\s]|_", "", regex=True)

# -------------------------------
# 2. Function to prepare dataset
# -------------------------------
def prepare_punct_dataset(df, text_col="text"):
    """
    Build a two-column frame of (unpunctuated, original) sentence pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    text_col : str, default "text"
        Name of the column holding the original, punctuated text.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index as ``df`` and exactly two columns, in this
        order: 'src' (``remove_puncts`` applied to the text) and 'tgt' (the
        original text).

    Raises
    ------
    KeyError
        If ``text_col`` is not a column of ``df``.
    """
    # Read the original text once, before any output column is created.
    # Otherwise text_col="src" would be overwritten by the stripped text and
    # 'tgt' would silently lose its punctuation too.
    original_text = df[text_col]

    # Build the result from scratch instead of copying every column of `df`.
    pairs = pd.DataFrame(index=df.index)
    pairs["src"] = remove_puncts(original_text)
    pairs["tgt"] = original_text
    return pairs

# -------------------------------
# 3. Example usage
# -------------------------------
# Load CSVs
# na_filter=False turns off pandas' missing-value detection, so every cell is
# read back as the literal string that was written. Without it, an empty
# sentence or a sentence that is exactly "NA", "null", "None", ... would come
# back as NaN and break the string-only processing further down.
train_df = pd.read_csv("iwslt2017_en_train.csv", na_filter=False)
val_df = pd.read_csv("iwslt2017_en_valid.csv", na_filter=False)
test_df = pd.read_csv("iwslt2017_en_test.csv", na_filter=False)

# Prepare datasets: 'src' = text without punctuation, 'tgt' = original text
train_dataset = prepare_punct_dataset(train_df)
val_dataset = prepare_punct_dataset(val_df)
test_dataset = prepare_punct_dataset(test_df)

# Quick check
print(train_dataset.head())

                                                 src  \
0                            Thank you so much Chris   
1  And its truly a great honor to have the opport...   
2  I have been blown away by this conference and ...   
3  And I say that sincerely partly because  I nee...   
4                      Put yourselves in my position   

                                                 tgt  
0                          Thank you so much, Chris.  
1  And it's truly a great honor to have the oppor...  
2  I have been blown away by this conference, and...  
3  And I say that sincerely, partly because  I ne...  
4                     Put yourselves in my position.  


In [7]:
!huggingface-cli login --token {hf_token}

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `hf`CLI if you want to set the git credential as well.
Token is valid (permission: fineGrained).
The token `llm_finetuning` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `llm_finetuning`


In [8]:
from datasets import Dataset, DatasetDict, Features, Value

In [7]:
# Both columns are plain strings.
features = Features({column: Value("string") for column in ("src", "tgt")})

# Convert the three prepared frames, in train / validation / test order.
hf_train, hf_val, hf_test = (
    Dataset.from_pandas(frame, features=features)
    for frame in (train_dataset, val_dataset, test_dataset)
)

dataset = DatasetDict({"train": hf_train, "validation": hf_val, "test": hf_test})

# -------------------------------
# Publish the three splits to the Hugging Face Hub
# -------------------------------
dataset_name = "english_punctuation_restoration"
# private=False publishes the repository publicly; pass private=True to restrict it.
dataset.push_to_hub(dataset_name, private=False)

Uploading the dataset shards:   0%|                   | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format:   0%|           | 0/207 [00:00<?, ?ba/s][A
Creating parquet from Arrow format:  34%|‚ñé| 71/207 [00:00<00:00, 703.41ba/s][A
Creating parquet from Arrow format: 100%|‚ñà| 207/207 [00:00<00:00, 679.82ba/s[A
Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.
Uploading the dataset shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:05<00:00,  5.75s/it]
Uploading the dataset shards:   0%|                   | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 178.28ba/s][A
Uploading files as a binary IO buffer is not supported by Xet Storage. Falling back to HTTP upload.
Uploading the dataset shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:01<00:00,  1.11s/it]
Uploading the dataset shards:   0%|                   | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100

CommitInfo(commit_url='https://huggingface.co/datasets/thenlpresearcher/english_punctuation_restoration/commit/9e20f337daf0f00dfe98d454f96908cb53609d17', commit_message='Upload dataset', commit_description='', oid='9e20f337daf0f00dfe98d454f96908cb53609d17', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/thenlpresearcher/english_punctuation_restoration', endpoint='https://huggingface.co', repo_type='dataset', repo_id='thenlpresearcher/english_punctuation_restoration'), pr_revision=None, pr_num=None)

In [2]:
from datasets import load_dataset

# Load the punctuation-restoration dataset from its Hub repository.
# This rebinds the name `dataset`.
dataset = load_dataset("thenlpresearcher/english_punctuation_restoration")

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# T5 base checkpoint
model_checkpoint = "google-t5/t5-base"

# Load tokenizer
# `return_tensors` is an option of the tokenizer *call*, not of loading.
# Passed to from_pretrained it has no effect on later calls (the example
# below still returns plain Python lists), so it is not passed here.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [9]:
# Fetch training example 1 once and reuse it for tokenizing and printing.
example = dataset["train"][1]
without_punct_sentence = example["src"]
punct_sentence = example["tgt"]

# `text_target` makes the tokenizer emit the target ids under "labels".
inputs = tokenizer(without_punct_sentence, text_target=punct_sentence)
print(without_punct_sentence)
print(punct_sentence)

inputs

And its truly a great honor to have the opportunity to come to this stage twice Im extremely grateful
And it's truly a great honor to have the opportunity to come to this stage twice; I'm extremely grateful.


{'input_ids': [275, 165, 1892, 3, 9, 248, 3610, 12, 43, 8, 1004, 12, 369, 12, 48, 1726, 4394, 1318, 2033, 7335, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [275, 34, 31, 7, 1892, 3, 9, 248, 3610, 12, 43, 8, 1004, 12, 369, 12, 48, 1726, 4394, 117, 27, 31, 51, 2033, 7335, 5, 1]}

In [10]:
# Tokenize the punctuated sentence through the ordinary input-side call
# (no `text_target`), to compare against the labels produced earlier.
wrong_targets = tokenizer(punct_sentence)
# Tokenize the unpunctuated source sentence the same way.
targets1 = tokenizer(without_punct_sentence)
# Printed in order:
#   1. the punctuated sentence tokenized as an input,
#   2. the `labels` produced via `text_target` for the same sentence,
#   3. the unpunctuated source sentence.
print(tokenizer.convert_ids_to_tokens(wrong_targets["input_ids"]))
print(tokenizer.convert_ids_to_tokens(inputs["labels"]))
print(tokenizer.convert_ids_to_tokens(targets1["input_ids"]))

['‚ñÅAnd', '‚ñÅit', "'", 's', '‚ñÅtruly', '‚ñÅ', 'a', '‚ñÅgreat', '‚ñÅhonor', '‚ñÅto', '‚ñÅhave', '‚ñÅthe', '‚ñÅopportunity', '‚ñÅto', '‚ñÅcome', '‚ñÅto', '‚ñÅthis', '‚ñÅstage', '‚ñÅtwice', ';', '‚ñÅI', "'", 'm', '‚ñÅextremely', '‚ñÅgrateful', '.', '</s>']
['‚ñÅAnd', '‚ñÅit', "'", 's', '‚ñÅtruly', '‚ñÅ', 'a', '‚ñÅgreat', '‚ñÅhonor', '‚ñÅto', '‚ñÅhave', '‚ñÅthe', '‚ñÅopportunity', '‚ñÅto', '‚ñÅcome', '‚ñÅto', '‚ñÅthis', '‚ñÅstage', '‚ñÅtwice', ';', '‚ñÅI', "'", 'm', '‚ñÅextremely', '‚ñÅgrateful', '.', '</s>']
['‚ñÅAnd', '‚ñÅits', '‚ñÅtruly', '‚ñÅ', 'a', '‚ñÅgreat', '‚ñÅhonor', '‚ñÅto', '‚ñÅhave', '‚ñÅthe', '‚ñÅopportunity', '‚ñÅto', '‚ñÅcome', '‚ñÅto', '‚ñÅthis', '‚ñÅstage', '‚ñÅtwice', '‚ñÅIm', '‚ñÅextremely', '‚ñÅgrateful', '</s>']


In [11]:
# Maximum number of tokens kept per source and per target sequence.
max_length = 128


def preprocess_function(examples):
    """Tokenize one batch of (src, tgt) pairs for seq2seq training.

    ``examples["src"]`` is tokenized as the model input and
    ``examples["tgt"]`` as the target, both truncated to ``max_length``
    tokens. Returns the tokenizer output unchanged.
    """
    sources = list(examples["src"])
    references = list(examples["tgt"])
    return tokenizer(
        sources,
        text_target=references,
        max_length=max_length,
        truncation=True,
    )

In [12]:
from datasets import load_dataset

# Requires `dataset` (with its splits) and `preprocess_function` from earlier cells.
# Every split is tokenized in batches, and its original columns are dropped so
# that only the tokenizer outputs remain.
tokenized_datasets = {}
for split, split_dataset in dataset.items():  # e.g., "train", "validation", "test"
    tokenized_datasets[split] = split_dataset.map(
        preprocess_function,
        batched=True,
        remove_columns=split_dataset.column_names,
    )


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 206112/206112 [00:12<00:00, 16585.09 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 888/888 [00:00<00:00, 13270.23 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8079/8079 [00:00<00:00, 20575.31 examples/s]


In [13]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Rebinds `tokenizer` from the "google-t5/t5-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-base")
# Loads the pretrained seq2seq model from the same checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-base")

2025-11-25 06:48:57.763633: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-25 06:48:57.891760: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-11-25 06:49:00.112919: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [14]:
from transformers import DataCollatorForSeq2Seq

# Collator used to assemble batches; it is given both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [15]:
# Collate tokenized training examples 1 and 2 into one batch,
# then list the fields the collator produced.
batch = data_collator([tokenized_datasets["train"][i] for i in range(1, 3)])
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])

In [16]:
batch["labels"]

tensor([[  275,    34,    31,     7,  1892,     3,     9,   248,  3610,    12,
            43,     8,  1004,    12,   369,    12,    48,  1726,  4394,   117,
            27,    31,    51,  2033,  7335,     5,     1,  -100,  -100,  -100,
          -100,  -100,  -100,  -100],
        [   27,    43,   118,     3, 17378,   550,    57,    48,  2542,     6,
            11,    27,   241,    12,  2763,    66,    13,    25,    21,     8,
           186,  1245,  2622,    81,   125,    27,   141,    12,   497,     8,
           119,   706,     5,     1]])

In [17]:
batch["decoder_input_ids"]

tensor([[    0,   275,    34,    31,     7,  1892,     3,     9,   248,  3610,
            12,    43,     8,  1004,    12,   369,    12,    48,  1726,  4394,
           117,    27,    31,    51,  2033,  7335,     5,     1,     0,     0,
             0,     0,     0,     0],
        [    0,    27,    43,   118,     3, 17378,   550,    57,    48,  2542,
             6,    11,    27,   241,    12,  2763,    66,    13,    25,    21,
             8,   186,  1245,  2622,    81,   125,    27,   141,    12,   497,
             8,   119,   706,     5]])

In [18]:
# Print the label ids of the same two examples as stored in the tokenized
# dataset (before collation), for comparison with the collated batch.
for i in range(1, 3):
    print(tokenized_datasets["train"][i]["labels"])

[275, 34, 31, 7, 1892, 3, 9, 248, 3610, 12, 43, 8, 1004, 12, 369, 12, 48, 1726, 4394, 117, 27, 31, 51, 2033, 7335, 5, 1]
[27, 43, 118, 3, 17378, 550, 57, 48, 2542, 6, 11, 27, 241, 12, 2763, 66, 13, 25, 21, 8, 186, 1245, 2622, 81, 125, 27, 141, 12, 497, 8, 119, 706, 5, 1]


In [19]:
import evaluate

metric = evaluate.load("sacrebleu")

In [20]:
import numpy as np


def compute_metrics(eval_preds):
    """Decode predictions and labels and score them with the loaded metric.

    Parameters
    ----------
    eval_preds : tuple
        ``(preds, labels)`` pair of token-id arrays. ``preds`` may itself be
        a tuple, in which case its first element is used.

    Returns
    -------
    dict
        ``{"bleu": score}`` where ``score`` is the ``"score"`` field of the
        metric result.
    """
    preds, labels = eval_preds
    # In case the model returns more than the prediction logits
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a padding/ignore marker, not a vocabulary id, and cannot be
    # decoded. Map it to the pad token in BOTH arrays: predictions can carry
    # -100 padding as well, not only labels.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing: predictions are flat strings, while each
    # reference is wrapped in its own single-element list.
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

In [25]:
from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    # Output directory (plain string; the f-prefix had no placeholders).
    "iitb-t5-finetuned-punctuation",
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    eval_strategy="epoch",        # evaluate once per epoch
    save_strategy="epoch",        # checkpoint once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    save_total_limit=3,           # keep at most three checkpoints on disk
    num_train_epochs=3,
    predict_with_generate=True,   # evaluation metrics are computed on generated text
    fp16=True,
    push_to_hub=True,
)



In [26]:
from transformers import Seq2SeqTrainer

# Wire the model, training arguments, tokenized splits, collator and metric
# function into one trainer; every argument is passed by keyword.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Seq2SeqTrainer(
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [23]:
# Evaluate on the trainer's eval dataset (the validation split), passing
# `max_length` through to the evaluation call.
# NOTE(review): the execution counts suggest this was run before
# `trainer.train()` as a pre-fine-tuning baseline — confirm.
trainer.evaluate(max_length=max_length)

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


{'eval_loss': 1.501761794090271,
 'eval_model_preparation_time': 0.0163,
 'eval_bleu': 18.28442992448596,
 'eval_runtime': 26.1347,
 'eval_samples_per_second': 33.978,
 'eval_steps_per_second': 0.536}

In [27]:
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu
1,0.0988,0.094745,52.882251
2,0.0879,0.090962,52.969104
3,0.0832,0.089705,53.029294


TrainOutput(global_step=19323, training_loss=0.0932592289211994, metrics={'train_runtime': 2094.0836, 'train_samples_per_second': 295.278, 'train_steps_per_second': 9.227, 'total_flos': 4.520747334057984e+16, 'train_loss': 0.0932592289211994, 'epoch': 3.0})

In [47]:
# Upload the trainer's model files to the Hub with the given tag and commit message.
trainer.push_to_hub(tags="text2text-generation", commit_message="Training complete")

Processing Files (0 / 0): |                    |  0.00B /  0.00B            
Processing Files (5 / 5): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà|  892MB /  892MB,  179MB/s  [A
New Data Upload: |                             |  0.00B /  0.00B,  0.00B/s  


CommitInfo(commit_url='https://huggingface.co/thenlpresearcher/iitb-t5-finetuned-punctuation/commit/f924c3a798ad838f8f05be932d4bbd0e0d113555', commit_message='Training complete', commit_description='', oid='f924c3a798ad838f8f05be932d4bbd0e0d113555', pr_url=None, repo_url=RepoUrl('https://huggingface.co/thenlpresearcher/iitb-t5-finetuned-punctuation', endpoint='https://huggingface.co', repo_type='model', repo_id='thenlpresearcher/iitb-t5-finetuned-punctuation'), pr_revision=None, pr_num=None)

In [48]:
from transformers import pipeline
# The task is passed explicitly ("text2text-generation"), and the model is
# loaded from the Hub repository the trainer pushed to.
punctuator_pipeline = pipeline("text2text-generation", model="thenlpresearcher/iitb-t5-finetuned-punctuation")

Device set to use cuda:0


In [56]:
# Lower-cased, unpunctuated sample sentence used as a quick manual check.
text = "the morning sky stretched over the city like a quiet sheet of pale blue while people hurried through the streets"
# max_length=128 is the same value as the `max_length` constant used for training.
punctuator_pipeline(text,
                   max_length=128)

[{'generated_text': 'the morning sky stretched over the city like a quiet sheet of pale blue while people hurried through the streets.'}]

In [57]:
from transformers import pipeline
# Self-contained usage snippet: builds the pipeline and runs it on the same
# sample sentence as the two preceding cells. The task is passed explicitly
# as "text2text-generation".
punctuator_pipeline = pipeline("text2text-generation", model="thenlpresearcher/iitb-t5-finetuned-punctuation")

text = "the morning sky stretched over the city like a quiet sheet of pale blue while people hurried through the streets"
punctuator_pipeline(text,
                   max_length=128)

# Expected output:
# [{'generated_text': 'the morning sky stretched over the city like a quiet sheet of pale blue while people hurried through the streets.'}]

Device set to use cuda:0


[{'generated_text': 'the morning sky stretched over the city like a quiet sheet of pale blue while people hurried through the streets.'}]

In [3]:
len(dataset["test"])

8079

In [4]:
dataset["test"][0]

{'src': 'Several years ago here at TED Peter Skillman  introduced a design challenge  called the marshmallow challenge',
 'tgt': 'Several years ago here at TED, Peter Skillman  introduced a design challenge  called the marshmallow challenge.'}

In [5]:
!nvidia-smi

Tue Nov 25 12:20:59 2025       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.54.03              Driver Version: 535.54.03    CUDA Version: 12.5     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100 80GB PCIe          Off | 00000000:17:00.0 Off |                    0 |
| N/A   52C    P0              67W / 300W |  57384MiB / 81920MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
|   1  NVIDIA A100 80GB PCIe          Off | 00000000:31:00.0 Off |  

In [6]:
import os
# Show the working directory; the output CSV below is written with a relative path.
print(os.getcwd())
# Expose only GPU index 0 to this process.
# NOTE(review): presumably this has to run before any CUDA-using library is
# initialized in the kernel — confirm the cell order on a fresh run.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

/workspace/Approach1/seq2seq


In [8]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [10]:
from transformers import pipeline

# Use batching + GPU (if available)
# Builds the inference pipeline from the fine-tuned checkpoint on the Hub.
punctuator_pipeline = pipeline(
    "text2text-generation",
    model="thenlpresearcher/iitb-t5-finetuned-punctuation",
    device=device,            # torch.device selected earlier: "cuda" if available, else "cpu"
    batch_size=64        # inputs per batch; adjust based on GPU RAM
)

def restore_punctuation_t5_batch(text_list, max_length=128):
    """Restore punctuation for a list of sentences with ``punctuator_pipeline``.

    Parameters
    ----------
    text_list : list of str
        Unpunctuated input sentences.
    max_length : int, default 128
        Generation length limit forwarded to the pipeline. The default is the
        value that used to be hard-coded here.

    Returns
    -------
    list of str
        The ``"generated_text"`` of each pipeline output, in input order.
        An empty input yields an empty list without invoking the model.
    """
    # Nothing to do for empty input; skip the pipeline call entirely.
    if len(text_list) == 0:
        return []

    # Batching is handled by the pipeline itself, using the batch size it
    # was constructed with.
    outputs = punctuator_pipeline(text_list, max_length=max_length)

    # Pipeline returns a list of dicts; keep only the generated strings.
    return [output["generated_text"] for output in outputs]

# Collect all input sentences from the test split as a plain list
src_texts = list(dataset["test"]["src"])

# Send the whole test split through the pipeline in a single call; the
# pipeline was created with batch_size=64 and handles the batching itself.
predicted_sentences = restore_punctuation_t5_batch(src_texts)

# # Optional: print paired outputs
# for src, pred in zip(src_texts, predicted_sentences):
#     print(src)
#     print(pred)
#     print("---")
# Number of predictions returned, for comparison with the test split size.
print(len(predicted_sentences))

Device set to use cuda


8079


In [13]:
import pandas as pd

# Assemble predictions, model inputs and references side by side.
# Column order (prediction, src, gt) is the order written to the CSV.
df = pd.DataFrame(
    {
        "prediction": predicted_sentences,
        "src": list(dataset["test"]["src"]),
        "gt": list(dataset["test"]["tgt"]),
    }
)

# Write the table without the index column
output_file = "approach1_eng_to_eng_t5_outputs_punct_restor_data.csv"
df.to_csv(output_file, index=False)

print("Saved:", output_file)

Saved: approach1_eng_to_eng_t5_outputs_punct_restor_data.csv
