In [1]:
from datasets import load_dataset
from transformers import pipeline, DataCollatorWithPadding, AutoTokenizer, AutoModelForMaskedLM, TrainingArguments, Trainer
from collections import Counter
from evaluate import load
import numpy as np
import torch

2025-05-05 15:12:33.532412: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746457954.143980   29046 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746457954.357434   29046 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746457955.724370   29046 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746457955.724462   29046 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746457955.724464   29046 computation_placer.cc:177] computation placer alr

In [2]:
dataset = load_dataset("community-datasets/youtube_caption_corrections")
dataset

DatasetDict({
    train: Dataset({
        features: ['video_ids', 'default_seq', 'correction_seq', 'diff_type'],
        num_rows: 10769
    })
})

In [3]:
# Mapping function
def find_punctuation_errors(dataset):
    masked_default_seq_list = []
    correct_punctuation = ''
    first_mask_detected = 0
    
    for i, error_val in enumerate(dataset["diff_type"]):
        masked_default_seq_list.append(dataset['default_seq'][i])
        if error_val == 2 and first_mask_detected:
            break
        elif error_val == 2:
            first_mask_detected += 1
            masked_default_seq_list.append("[MASK]")  
            # at this index, we want to extract the correct punctuation at the same index in correction_seq ()
            # dataset has corrected punctuations stored in this format: "[word][punctuation]"
            correct_punctuation = dataset["correction_seq"][i][-1]

    # handle case where no punctuation error was found
    if correct_punctuation == '': 
        return None
        
    return {
        "masked_default_seq": ' '.join(masked_default_seq_list),
        "punctuation": correct_punctuation
    }

In [4]:
subset = dataset['train'].select([0])
subset

Dataset({
    features: ['video_ids', 'default_seq', 'correction_seq', 'diff_type'],
    num_rows: 1
})

In [5]:
# Apply mapping on subset test to verify correctness
modified_subset = subset.map(find_punctuation_errors)
modified_subset

Dataset({
    features: ['video_ids', 'default_seq', 'correction_seq', 'diff_type', 'masked_default_seq', 'punctuation'],
    num_rows: 1
})

In [6]:
modified_subset['masked_default_seq'][0]

'hey everybody [MASK] ivan from weights and biases here'

In [7]:
modified_subset['punctuation'][0]

','

In [8]:
# Now lets apply masking to entire dataset to create the new 'masked_default_seq' column
new_dataset = dataset.map(find_punctuation_errors)
new_dataset = new_dataset['train']
new_dataset

Dataset({
    features: ['video_ids', 'default_seq', 'correction_seq', 'diff_type', 'masked_default_seq', 'punctuation'],
    num_rows: 10747
})

In [9]:
# Filter out None rows and Nonpunctuations
acceptedPunctuationTypes = ['.', ',', ';', ':', '-', '?', '!']
cleaned_dataset = new_dataset.filter(lambda x: x['masked_default_seq'] is not None and x['punctuation'] in acceptedPunctuationTypes)
cleaned_dataset

Dataset({
    features: ['video_ids', 'default_seq', 'correction_seq', 'diff_type', 'masked_default_seq', 'punctuation'],
    num_rows: 10648
})

In [10]:
# Split into train, test, and validation
cleaned_dataset = cleaned_dataset.train_test_split(test_size=0.2, seed=1227)
cleaned_dataset

DatasetDict({
    train: Dataset({
        features: ['video_ids', 'default_seq', 'correction_seq', 'diff_type', 'masked_default_seq', 'punctuation'],
        num_rows: 8518
    })
    test: Dataset({
        features: ['video_ids', 'default_seq', 'correction_seq', 'diff_type', 'masked_default_seq', 'punctuation'],
        num_rows: 2130
    })
})

In [11]:
# Uncomment Section 1 if CPU/GPU Usage is not a problem, then uncomment the subsequent sections involving 'validationSet'

# 1)
train_validation_dataset = cleaned_dataset['train'].train_test_split(test_size= 0.25, seed=1227)
testSet = cleaned_dataset['test']
trainSet = train_validation_dataset['train']
validationSet = train_validation_dataset['test']

# 2)
# testSet = cleaned_dataset['test']
# trainSet = cleaned_dataset['train']

In [12]:
counts = Counter(trainSet['punctuation'])
counts

Counter({',': 3079, '.': 3017, '?': 168, '-': 66, ':': 41, ';': 11, '!': 6})

In [13]:
train_example = trainSet['masked_default_seq'][0]
train_example

"you might recognize what we have here in yellow as the general form of a p-series and what we're going to do in this video is think about under which conditions under for what Peas will this pea series converge and for it to be a p-series by definition P is going to be greater than zero [MASK] so I've set up some visualizations to think about how we are going to understand when this pea series converges so over here you have the graph"

In [14]:
trainSet['punctuation'][0]

'.'

In [15]:
# Inference model on basic example via a pipeline
fill_mask = pipeline("fill-mask", model="google-bert/bert-base-uncased")
result = fill_mask(train_example)
result

Some weights of the model checkpoint at google-bert/bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


[{'score': 0.34914007782936096,
  'token': 1010,
  'token_str': ',',
  'sequence': "you might recognize what we have here in yellow as the general form of a p - series and what we ' re going to do in this video is think about under which conditions under for what peas will this pea series converge and for it to be a p - series by definition p is going to be greater than zero, so i ' ve set up some visualizations to think about how we are going to understand when this pea series converges so over here you have the graph"},
 {'score': 0.3287944495677948,
  'token': 1012,
  'token_str': '.',
  'sequence': "you might recognize what we have here in yellow as the general form of a p - series and what we ' re going to do in this video is think about under which conditions under for what peas will this pea series converge and for it to be a p - series by definition p is going to be greater than zero. so i ' ve set up some visualizations to think about how we are going to understand when this p

In [16]:
# Set up tokenizer, collator and metric for model testing
checkpoint = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

metric = load('accuracy')
def compute_metrics(preds):
    logits, labels = preds
    predictions = np.argmax(logits, axis=-1)
    mask = labels != -100
    return metric.compute(predictions=predictions[mask], references=labels[mask])

In [17]:
# Tokenize datasets to create ids of the input str list containing the [MASK] token
# def preprocess(examples):
#     model_inputs = tokenizer(examples["masked_default_seq"], truncation=True, padding=True)
#     labels = tokenizer(examples["punctuation"], padding=True, truncation=True)
#     model_inputs["labels"] = labels["input_ids"] # ground truth labels in id form
#     return model_inputs

# Compute loss at [MASK] position
def preprocess(example):
    tokens = tokenizer(example["masked_default_seq"], padding="max_length", truncation=True, max_length=128) # needed to specify padding to max_length, else unable to create tensors
    labels = [-100] * len(tokens["input_ids"])
    mask_token_id = tokenizer.mask_token_id
    if mask_token_id in tokens["input_ids"]:
        mask_index = tokens["input_ids"].index(mask_token_id)
        label_token_id = tokenizer.convert_tokens_to_ids(example["punctuation"])
        labels[mask_index] = label_token_id
    tokens["labels"] = labels
    return tokens

trainSet_tokenized = trainSet.map(preprocess)
validationSet_tokenized = validationSet.map(preprocess)
testSet_tokenized = testSet.map(preprocess)

In [18]:
trainSet_tokenized

Dataset({
    features: ['video_ids', 'default_seq', 'correction_seq', 'diff_type', 'masked_default_seq', 'punctuation', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
    num_rows: 6388
})

In [19]:
# Drop unused features
unused_features = ['video_ids', 'default_seq', 'correction_seq', 'diff_type', 'masked_default_seq', 'punctuation', 'token_type_ids']
trainSet_tokenized = trainSet_tokenized.remove_columns(unused_features)
validationSet_tokenized = validationSet_tokenized.remove_columns(unused_features)
testSet_tokenized = testSet_tokenized.remove_columns(unused_features)

In [20]:
trainSet_tokenized

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 6388
})

In [21]:
# Set up general training args
batch_size = 16
args = TrainingArguments(
    f"BERT-finetuned-punctuationAccuracy", 
    eval_strategy = "no", # kernel kept dying so changed this from "epoch"
    save_strategy = "no", # kernel kept dying so chnaged this from "epoch"
    learning_rate=2e-5, 
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1, # kernel kept dying so changed this from "5"
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

# Model Instantiation
model = AutoModelForMaskedLM.from_pretrained(checkpoint) # model architecture for fill-mask operations
trainer = Trainer(
    model, 
    args,  
    train_dataset=trainSet_tokenized, 
    #eval_dataset=validationSet_tokenized, # kernel kept dying so removed validation set entirely
    tokenizer=tokenizer, # note: deprecated argument
    data_collator=data_collator,  
    compute_metrics=compute_metrics
)

Some weights of the model checkpoint at google-bert/bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  trainer = Trainer(


In [22]:
trainer.train()
trainer.save_model("finetuned-BERT-punctuation-restoration")

Step,Training Loss


In [22]:
# POST-TRAINING
# Reloading trained model from local directory
finetuned_model = AutoModelForMaskedLM.from_pretrained("finetuned-BERT-punctuation-restoration")
finetuned_trainer = Trainer(model=finetuned_model, compute_metrics=compute_metrics)
results = finetuned_trainer.evaluate(eval_dataset=testSet_tokenized.select(range(500))) # kernel dies here, so operate on subset of test data
results

{'eval_loss': 0.43367841839790344,
 'eval_model_preparation_time': 0.0026,
 'eval_accuracy': 0.8444444444444444,
 'eval_runtime': 95.8793,
 'eval_samples_per_second': 5.215,
 'eval_steps_per_second': 0.657}

In [23]:
# Compare with base model
base = AutoModelForMaskedLM.from_pretrained(checkpoint)
base_trainer = Trainer(model=base, compute_metrics=compute_metrics)
base_results = base_trainer.evaluate(eval_dataset=testSet_tokenized.select(range(500)))
base_results

Some weights of the model checkpoint at google-bert/bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'eval_loss': 1.4030957221984863,
 'eval_model_preparation_time': 0.0026,
 'eval_accuracy': 0.5838383838383838,
 'eval_runtime': 88.5696,
 'eval_samples_per_second': 5.645,
 'eval_steps_per_second': 0.711}