# MiniSeg

## Initialization

In [3]:
!pip install -qU torch accelerate transformers transformers[torch] datasets sentence-transformers evaluate segeval pynvml

[0m

In [4]:
from pynvml import *


def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")

### Data

In [5]:
from datasets import load_dataset

data_files = {
    "train": "data/partitions/yt_seg.train.json",
    "validation": "data/partitions/yt_seg.val.json",
    "test": "data/partitions/yt_seg.test.json"
}

dataset = load_dataset("retkowski/ytseg", data_files=data_files)


def map_fn(data):
  data["targets"] = [1 if c == '1' else 0 for c in data["targets"][2:]] # Remove '|=' prefix
  return data

dataset = dataset.map(
    map_fn,
    remove_columns=["channel_id", "video_id", "audio_path"]
)

dataset = dataset.filter(
    lambda row: len(row["text"]) < 800 and max(len(sentence) for sentence in row["text"]) < 500
)

dataset = dataset.rename_column("targets", "labels")

dataset.set_format('torch')
print(dataset)

Downloading readme:   0%|          | 0.00/5.82k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/314M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/28.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/28.5M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/16404 [00:00<?, ? examples/s]

Map:   0%|          | 0/1447 [00:00<?, ? examples/s]

Map:   0%|          | 0/1448 [00:00<?, ? examples/s]

Filter:   0%|          | 0/16404 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1447 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1448 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 14377
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 1256
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 1266
    })
})


In [6]:
dataset.save_to_disk("ytseg_clean")

Saving the dataset (0/1 shards):   0%|          | 0/14377 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1256 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1266 [00:00<?, ? examples/s]

In [7]:
import matplotlib.pyplot as plt

plt.hist([len(text) for text in dataset["train"]["text"]])
plt.hist([len(text) for text in dataset["validation"]["text"]])
plt.hist([len(text) for text in dataset["test"]["text"]])
plt.show()

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
plt.hist([max(len(sentence) for sentence in text) for text in dataset["train"]["text"]])
plt.hist([max(len(sentence) for sentence in text) for text in dataset["validation"]["text"]])
plt.hist([max(len(sentence) for sentence in text) for text in dataset["test"]["text"]])
plt.show()

### The Model

In [9]:
import numpy as np
import torch
import gc
from torch import nn
import torch.nn.functional as F
import torch.nn.utils.rnn as rnn_utils
from transformers.modeling_outputs import TokenClassifierOutput

device = 'cuda' if torch.cuda.is_available() else 'cpu'
# assert device == 'cuda'

class MiniSeg(nn.Module):
  def __init__(self, sentence_encoder, doc_encoder):
    super(MiniSeg, self).__init__()
    self.num_labels = 2
    self.sentence_encoder = sentence_encoder
    self.doc_encoder = doc_encoder

  def forward(self, texts, labels_list):
    # encoder_input = np.concatenate(data, axis=0)
    # sentence_encodings = self.sentence_encoder.encode(
    #     encoder_input,
    #     convert_to_tensor=True
    # )
    #
    # cum = np.cumsum([len(b) for b in data])[:-1]
    # embeddings = np.split(sentence_encodings, cum)
    # # Padding
    # # Do this in collate?
    # lengths = torch.tensor([len(sen) for sen in embeddings]).to(device)
    # padded_embeddings = rnn_utils.pad_sequence(embeddings, batch_first=True)

    # Or should I concat these first?
    # print("Calling forward()..., len: ", len(texts[0]["input_ids"]))
    # print_gpu_utilization()

    batch_sentence_embeddings = [
        self._get_sentence_encoding(self.sentence_encoder(**text), text["attention_mask"]) for text in texts
    ]

    lengths = torch.tensor([len(text) for text in batch_sentence_embeddings]).to(device)

    padded_embeddings = rnn_utils.pad_sequence(batch_sentence_embeddings, batch_first=True)

    masks = self._generate_masks(
        lengths,
        padded_embeddings.size(1),
        padded_embeddings.size(0),
    ).float()

    # print("Sentence encoding complete")
    # print_gpu_utilization()

    result = self.doc_encoder(
        inputs_embeds=padded_embeddings,
        attention_mask=masks,
        labels=labels_list,
        return_dict=True
    )

    # print("Doc encoding complete")
    # print_gpu_utilization()
    return result
    # if labels is not None:
    #   loss = nn.CrossEntropyLoss(logits.view(-1, self.num_labels), labels.view(-1))

    # # return F.softmax(logits, dim=-1)
    # return TokenClassifierOutput(logits=logits, loss=loss)

  def _generate_masks(self, lengths, max_len, batch_size):
    return torch.arange(max_len).to(device).expand(len(lengths), max_len) < lengths.unsqueeze(1)

  def _get_sentence_encoding(self, encoder_output, attention_mask):
    # Perform pooling
    sentence_embeddings = self.mean_pooling(encoder_output, attention_mask)

    # Normalize embeddings
    sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
    return sentence_embeddings

  def mean_pooling(self, model_output, attention_mask):
    token_embeddings = model_output[0] # First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

  def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
    self.sentence_encoder.gradient_checkpointing_enable(gradient_checkpointing_kwargs)
    self.doc_encoder.gradient_checkpointing_enable(gradient_checkpointing_kwargs)


In [10]:
from transformers import AutoTokenizer, AutoModel, RoFormerForTokenClassification, RoFormerConfig
from sentence_transformers import SentenceTransformer

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-MiniLM-L6-v2')
sentence_encoder = AutoModel.from_pretrained('sentence-transformers/paraphrase-MiniLM-L6-v2')
# sentence_encoder = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')

cfg = RoFormerConfig(embedding_size=384, num_hidden_layers=12, num_attention_heads=8, num_labels=2)
doc_encoder = RoFormerForTokenClassification(cfg)

model = MiniSeg(sentence_encoder, doc_encoder).to(device)
#print(model.num_parameters())

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

In [11]:
# sentences = [
#     ["Hi world", "My name is ..."], # Paragraph 1
#     ["Hi", "My name is Guja", "Nice to meet you"] # Paragraph 2
# ]

# lbls = [
#     [0, 1, 0], [1, 0, 0]
# ]

# ids = [
#     tokenizer(sentences[0], padding=True, truncation=True, return_tensors='pt').to(device),
#     tokenizer(sentences[1], padding=True, truncation=True, return_tensors='pt').to(device)
# ]

# print(tokenizer(sentences[0], padding=True, truncation=True, return_tensors='pt'))
# print(tokenizer(sentences[1], padding=True, truncation=True, return_tensors='pt'))
# print(tokenizer(sentences[0] + sentences[1], padding=True, truncation=True, return_tensors='pt'))
# tokenized = tokenizer(sentences[0], padding=True, truncation=True, return_tensors='pt')
# print(tokenized)

# tokenized = tokenizer(sentences[0] + sentences[1], padding=True, truncation=True, return_tensors='pt')
# output = model(tokenized, torch.tensor(lbls).to(device))
# model(ids, lbls)
# print(output)

## Training

In [12]:
class CustomDataCollator:
  def __init__(self, tokenizer):
    self.tokenizer = tokenizer

  def __call__(self, entries):
    # Why tf is this a list

    texts, labels_list = zip(*[ # Super-cool "splat" operator
        self._process_entry(entry) for entry in entries
    ])

    padded_labels_list = torch.nn.utils.rnn.pad_sequence(labels_list, batch_first=True, padding_value=-100)
    batch = {
        "texts": texts,
        "labels_list": torch.tensor(padded_labels_list)
    }

    return batch

  def _process_entry(self, entry):
    text = self.tokenizer(entry["text"], padding=True, truncation=True, return_tensors='pt')
    labels = self._process_labels_string(entry["labels"])
    return text, labels

  def _process_labels_string(self, labels):
    return torch.tensor([int(c) for c in labels])


In [14]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m983.2 kB/s[0m eta [36m0:00:00[0m00:01[0m:01[0m0m
[?25hInstalling collected packages: nltk
Successfully installed nltk-3.8.1
[0m

In [15]:
import nltk
import segeval
import evaluate

accuracy_metric = evaluate.load("accuracy")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")
f1_metric = evaluate.load("f1")

def compute_boundary_similarity(result: str, actual: str):
  s1 = segeval.convert_nltk_to_masses(result, boundary_symbol='1')
  s2 = segeval.convert_nltk_to_masses(actual, boundary_symbol='1')
  score = segeval.boundary_similarity(s1, s2)
  return float(score)

def compute_segment_metrics(predictions, labels):
  pk_scores = []
  b_scores = []

  for p, l in zip(predictions, labels):
    invalid_indices = np.where(l == -100)[0]
    if len(invalid_indices) != 0:
      last_index = invalid_indices[0]
      p = p[:last_index]
      l = l[:last_index]

    # Last entry needs to be chopped off because
    # in NLTK a bit means a boundary while in dataset
    # a bit means a sentence
    predicted_boundaries = "".join([str(c) for c in p])[:-1]
    actual_boundaries = "".join([str(c) for c in l])[:-1]

    pk = nltk.pk(actual_boundaries, predicted_boundaries)
    b = compute_boundary_similarity(predicted_boundaries, actual_boundaries)

    pk_scores.append(pk)
    b_scores.append(b)

  # Is average right tho?
  average_pk = sum(pk_scores)/len(pk_scores)
  average_b = sum(b_scores)/len(b_scores)

  return {
      "pk": average_pk,
      "boundary_similarity": average_b
  }

def compute_individual_metrics(predictions, labels):
  predictions = np.ravel(predictions)
  labels = np.ravel(labels)
  valid_indices = np.where(labels != -100)

  predictions = predictions[valid_indices]
  labels = labels[valid_indices]

  return {
    "accuracy": accuracy_metric.compute(predictions=predictions, references=labels)["accuracy"],
    "precision": precision_metric.compute(predictions=predictions, references=labels)["precision"],
    "recall": recall_metric.compute(predictions=predictions, references=labels)["recall"],
    "f1": f1_metric.compute(predictions=predictions, references=labels)["f1"]
  }


def compute_metrics(eval_pred):
  logits = eval_pred.predictions
  labels = eval_pred.label_ids
  predictions = np.argmax(logits, axis=2)

  individual_metrics = compute_individual_metrics(predictions, labels)
  segment_metrics = compute_segment_metrics(predictions, labels)

  return individual_metrics | segment_metrics


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

In [26]:
from transformers import Trainer, TrainingArguments, DataCollatorForTokenClassification

training_args = TrainingArguments(
    output_dir="trainer_output",
    optim="adamw_torch",
    learning_rate=2.5e-5,
    lr_scheduler_type="cosine",
    num_train_epochs=15,
    remove_unused_columns=False,
    eval_strategy="steps",
    eval_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="boundary_similarity",

    save_strategy="steps",
    save_steps = 2000,
    save_total_limit=1,
    # Optimizations for gpu:
    per_device_train_batch_size=16,
    # gradient_accumulation_steps=8,
    per_device_eval_batch_size=16,
    # fp16=True,
    gradient_checkpointing=True
)

trainer = Trainer(
    model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["validation"],
    data_collator=CustomDataCollator(tokenizer=tokenizer),
    compute_metrics=compute_metrics
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [27]:
trainer.train("/home/checkpoint-12000")


  "labels_list": torch.tensor(padded_labels_list)


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,Pk,Boundary Similarity
12500,0.0962,0.14642,0.955795,0.609478,0.40295,0.485149,0.297251,0.375339
13000,0.0934,0.148863,0.956384,0.623135,0.395117,0.483596,0.300961,0.371629


  "labels_list": torch.tensor(padded_labels_list)
  "labels_list": torch.tensor(padded_labels_list)
Could not locate the best model at trainer_output/checkpoint-12000/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


TrainOutput(global_step=13485, training_loss=0.010446607443328959, metrics={'train_runtime': 2105.8747, 'train_samples_per_second': 102.406, 'train_steps_per_second': 6.404, 'total_flos': 0.0, 'train_loss': 0.010446607443328959, 'epoch': 15.0})

In [28]:
output_dir = '/home/best_model'
# Save the model using Trainer
trainer.save_model(output_dir)

# Additionally, save the sentence_encoder and tokenizer if they are separate
sentence_encoder.save_pretrained(f"{output_dir}/sentence_encoder")
tokenizer.save_pretrained(f"{output_dir}/sentence_encoder")

# Save the doc_encoder if it is separate
doc_encoder.save_pretrained(f"{output_dir}/doc_encoder")

# Save the MiniSeg model's state dict
import torch
torch.save(model.state_dict(), f"{output_dir}/miniseg_model.pth")


In [29]:
from transformers import AutoTokenizer, AutoModel, RoFormerForTokenClassification
import torch

sentence_encoder = AutoModel.from_pretrained(f"{output_dir}/sentence_encoder")
tokenizer = AutoTokenizer.from_pretrained(f"{output_dir}/sentence_encoder")

doc_encoder = RoFormerForTokenClassification.from_pretrained(f"{output_dir}/doc_encoder")

model = MiniSeg(sentence_encoder, doc_encoder).to(device)

model.load_state_dict(torch.load(f"{output_dir}/miniseg_model.pth"))

<All keys matched successfully>

In [51]:
sentences = {"text":[
    "Apples are delicious and nutritious.",
    "They are rich in vitamins and fiber.",
    "Eating apples can improve your health.",
    "Many people enjoy apples as a snack.",
    "Okay, let's switch the topic now and talk about programming.",
    "Programming can be fun and rewarding.",
    "There are many programming languages to learn.",
    "Python is popular for data science.",
    "Now let's discuss something different, like sports.",
    "Football is a widely watched sport.",
    "Basketball is also very popular.",
    "Sports can improve physical fitness."
], "labels": [0, 0, 0]}
model.eval()
# Tokenize the input sentences
inputs = [sentences]
collator = CustomDataCollator(tokenizer=tokenizer)
outputs = collator.__call__(inputs)
outputs = model(texts=outputs, labels_list = None)

# Get the predicted labels
predictions = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
predictions

  "labels_list": torch.tensor(padded_labels_list)


TypeError: BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 384, padding_idx=0)
    (position_embeddings): Embedding(512, 384)
    (token_type_embeddings): Embedding(2, 384)
    (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-5): 6 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=384, out_features=384, bias=True)
            (key): Linear(in_features=384, out_features=384, bias=True)
            (value): Linear(in_features=384, out_features=384, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=384, out_features=384, bias=True)
            (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Linear(in_features=384, out_features=1536, bias=True)
          (intermediate_act_fn): GELUActivation()
        )
        (output): BertOutput(
          (dense): Linear(in_features=1536, out_features=384, bias=True)
          (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
  )
  (pooler): BertPooler(
    (dense): Linear(in_features=384, out_features=384, bias=True)
    (activation): Tanh()
  )
) argument after ** must be a mapping, not str