In [None]:
# Install the third-party packages imported below (Hugging Face datasets/transformers and NLTK).
!pip install datasets transformers nltk



In [None]:
# import pandas torch transformers nltk numpy matplotlib
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import T5Tokenizer, T5ForConditionalGeneration
import nltk
import numpy as np
import random
import matplotlib.pyplot as plt
from datasets import load_dataset
from tqdm import tqdm
import nltk
import re

from nltk.corpus import stopwords
from collections import Counter


# Fetch NLTK's stop-word corpus and keep the English list as a set, which the
# `word not in stop_words` membership tests in the text helpers below rely on.
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def set_seed(seed):
    """Seed every random number generator this notebook touches.

    Seeds Python's ``random``, NumPy, PyTorch (CPU generator and all CUDA
    devices) and Hugging Face Transformers with the same value, and configures
    cuDNN for deterministic kernels with its benchmark autotuner turned off.

    Args:
        seed: Integer seed passed unchanged to each library.
    """
    # Python, NumPy and PyTorch generators, in that order.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # Deterministic cuDNN kernels, no autotuning.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    # Transformers ships its own seeding helper; it is imported lazily so the
    # function only needs the library at call time.
    from transformers import set_seed as hf_set_seed
    hf_set_seed(seed)

# Seed all RNGs once, before any cell that draws random numbers
# (the soft-prompt initialisation and the shuffled training DataLoader).
set_seed(42)

In [None]:
### Optimizations:
"""
  1 - Use flash attention and rewrite forward of it
  2 - Use torch.compile
  3 - use kv-cache
  4 - use MixedPrecision training

"""

'\n  1 - Use flash attention and rewrite forward of it\n  2 - Use torch.compile\n  3 - use kv-cache\n  4 - use MixedPrecision training\n\n'

In [None]:
# load model
def load_model():
    """Load the pretrained ``t5-small`` tokenizer and sequence-to-sequence model.

    Which T5 class to pick:

    | Model Class                  | Encoder-Decoder | Vocabulary Logits | Typical Use Case                             |
    |------------------------------|-----------------|-------------------|----------------------------------------------|
    | `T5ForConditionalGeneration` | Yes             | Yes               | Text generation (summarization, translation) |
    | `T5Model`                    | Yes             | No                | Feature extraction, embeddings               |
    | `T5EncoderModel`             | Encoder only    | No                | Classification, regression                   |

    The label is produced as text here, so the class with vocabulary logits is used.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    checkpoint = "t5-small"
    tokenizer = T5Tokenizer.from_pretrained(checkpoint)
    model = T5ForConditionalGeneration.from_pretrained(checkpoint)
    return model, tokenizer

In [None]:
# load dataset
def load_imdb_data():
    """Download the IMDB dataset and return its labelled splits as DataFrames.

    Only the ``train`` and ``test`` splits are converted. The ``unsupervised``
    split is not part of the returned dictionary, so it is no longer
    materialised as a pandas frame just to be thrown away.

    Returns:
        dict: ``{'train': DataFrame, 'test': DataFrame}``.
    """
    dataset = load_dataset("imdb")
    return {split: dataset[split].to_pandas() for split in ("train", "test")}

def label_map(label):
    """Translate a numeric sentiment label into its target word.

    Args:
        label: Class id; a value equal to 0 means negative.

    Returns:
        str: ``'negative'`` when ``label == 0``, otherwise ``'positive'``.
    """
    return 'negative' if label == 0 else 'positive'



# Load the two labelled splits and replace the integer labels with the words the
# model is trained to generate (0 -> 'negative', anything else -> 'positive').
dataset = load_imdb_data()
train = dataset['train']
test = dataset['test']
tqdm.pandas()  # makes `progress_apply` available on pandas objects
# The 'label' column is overwritten in place on both frames.
train['label'] = train['label'].progress_apply(label_map)
test['label'] = test['label'].progress_apply(label_map)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
100%|██████████| 25000/25000 [00:00<00:00, 1060206.46it/s]
100%|██████████| 25000/25000 [00:00<00:00, 1089022.29it/s]


In [None]:
def normalize_text(
      text,
      remove_stopwords=False,
      remove_punctuation=False,
      remove_numbers=False,
    ):
    """Lower-case a string, optionally strip character classes, and collapse whitespace.

    Steps run in a fixed order: lower-casing, punctuation removal, digit
    removal, whitespace collapsing/trimming, and finally stop-word filtering.

    Args:
        text: Input string.
        remove_stopwords: Drop whitespace-separated tokens found in ``stop_words``.
        remove_punctuation: Delete every character that is neither a word
            character nor whitespace (underscores are word characters and stay).
        remove_numbers: Delete every run of digits.

    Returns:
        str: The normalized text.
    """
    cleaned = text.lower()

    # Each optional deletion is a (flag, pattern) pair applied in this order.
    deletions = (
        (remove_punctuation, r"[^\w\s]"),
        (remove_numbers, r"\d+"),
    )
    for enabled, pattern in deletions:
        if enabled:
            cleaned = re.sub(pattern, "", cleaned)

    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    if not remove_stopwords:
        return cleaned
    return " ".join(token for token in cleaned.split() if token not in stop_words)

In [None]:
def get_most_frequent_words(text_series, n=10):
    """Count the most frequent non-stop-words across a collection of texts.

    Each text is lower-cased and split into ``\\w+`` tokens; tokens present in
    ``stop_words`` are ignored.

    Args:
        text_series: Iterable of strings (a progress bar is shown while iterating).
        n: Number of entries to return.

    Returns:
        list[tuple[str, int]]: ``(word, count)`` pairs, most frequent first.
    """
    counts = Counter()
    for text in tqdm(text_series):
        counts.update(
            token for token in re.findall(r'\w+', text.lower()) if token not in stop_words
        )
    return counts.most_common(n)


def get_top_ngrams(text_series, n=8, top_k=50):
    """Count the most frequent word n-grams across a collection of texts.

    Each text is lower-cased and split into ``\\w+`` tokens (stop-words are
    kept); every window of ``n`` consecutive tokens is joined with single
    spaces and counted. Texts shorter than ``n`` tokens contribute nothing.

    Args:
        text_series: Iterable of strings (a progress bar is shown while iterating).
        n: Window size in tokens.
        top_k: Number of entries to return.

    Returns:
        list[tuple[str, int]]: ``(ngram, count)`` pairs, most frequent first.
    """
    counts = Counter()
    for text in tqdm(text_series):
        tokens = re.findall(r'\w+', text.lower())
        window_starts = range(len(tokens) - n + 1)
        counts.update(" ".join(tokens[start:start + n]) for start in window_starts)
    return counts.most_common(top_k)


# Quick corpus inspection on the training reviews, using the helpers' defaults:
# the 10 most frequent non-stop-words and the 50 most frequent 8-grams.
most_common_words = get_most_frequent_words(train['text'])
most_common_ngrams = get_top_ngrams(train['text'])
# The n-grams are printed before the single words.
print(most_common_ngrams)
print(most_common_words)

100%|██████████| 25000/25000 [00:02<00:00, 10737.70it/s]
100%|██████████| 25000/25000 [00:04<00:00, 5853.29it/s]


[('one of the worst movies i have ever', 45), ('of the worst movies i have ever seen', 44), ('one of the worst movies i ve ever', 29), ('of the worst movies i ve ever seen', 28), ('this is one of the worst movies i', 23), ('one of the worst films i have ever', 22), ('is one of the worst movies i have', 21), ('of the worst films i have ever seen', 20), ('my vote is eight br br title brazil', 17), ('is the worst movie i have ever seen', 15), ('don t say i didn t warn you', 15), ('br br don t get me wrong i', 15), ('one of the worst films i ve ever', 15), ('br br if you re looking for a', 14), ('this has to be one of the worst', 13), ('my vote is seven br br title brazil', 13), ('don t waste your time or money on', 12), ('br br if you want to see a', 12), ('of the worst films i ve ever seen', 12), ('i went out after and bought a case', 12), ('went out after and bought a case of', 12), ('out after and bought a case of cognac', 12), ('t waste your time or money on this', 11), ('don t even g

In [None]:
import torch

class IMDBDataset(torch.utils.data.Dataset):
    """Torch dataset that turns IMDB rows into tokenized (prompt, label-word) pairs.

    Every review is prefixed with a fixed instruction before tokenization, and
    the label word is tokenized separately as the target sequence. Each call
    encodes a single string, so items come back at their own length; padding
    to a common length is left to the batch collate function.

    Args:
        data: DataFrame with ``"text"`` and ``"label"`` columns.
        tokenizer: Object exposing ``encode_plus``.
        max_length: Truncation length for the prompted review
            (labels are truncated at 8 tokens).
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def _encode(self, text, max_length):
        """Tokenize one string into ``(1, length)`` PyTorch tensors with an attention mask."""
        return self.tokenizer.encode_plus(
            text,
            return_tensors='pt',
            padding='longest',
            truncation=True,
            max_length=max_length,
            return_attention_mask=True
        )

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        prompt = (
            "Answer this question just with positive or negative words.What is the sentiment of this review? "
            + row["text"]
        )

        encoded = {
            "": self._encode(prompt, self.max_length),  # model input
            "label_": self._encode(row["label"], 8),    # target word
        }

        # Drop the leading batch axis so every value is a 1-D tensor.
        item = {}
        for prefix, encoding in encoded.items():
            item[prefix + "input_ids"] = encoding["input_ids"].squeeze(0)
            item[prefix + "attention_mask"] = encoding["attention_mask"].squeeze(0)
        return item




def collate_fn(batch):
    """Right-pad the 1-D tensors of a batch with zeros and stack them.

    ``input_ids`` and ``attention_mask`` are padded to the longest
    ``input_ids`` in the batch; ``label_input_ids`` and
    ``label_attention_mask`` to the longest ``label_input_ids``.

    Args:
        batch: List of dicts holding the four 1-D tensors named above.

    Returns:
        dict: The same four keys, each mapped to a ``(batch, width)`` tensor.
    """
    def right_pad(tensor, width):
        return torch.nn.functional.pad(tensor, (0, width - tensor.size(0)), value=0)

    input_width = max(sample['input_ids'].size(0) for sample in batch)
    label_width = max(sample['label_input_ids'].size(0) for sample in batch)

    # Target width per field; the masks follow the ids they belong to.
    field_widths = (
        ('input_ids', input_width),
        ('attention_mask', input_width),
        ('label_input_ids', label_width),
        ('label_attention_mask', label_width),
    )
    return {
        field: torch.stack([right_pad(sample[field], width) for sample in batch])
        for field, width in field_widths
    }


In [None]:
import torch
import torch.nn as nn

class SoftPrompt(nn.Module):
    """Embedding wrapper that prepends learnable prompt vectors to every sequence.

    Args:
        n_tokens: Number of prompt vectors.
        embedding_layer: Existing token-embedding module to wrap.
        hidden_size: Width of each prompt vector; it has to equal the wrapped
            embedding's output width for the concatenation to work.
    """

    def __init__(self, n_tokens, embedding_layer, hidden_size):
        super().__init__()
        self.embedding = embedding_layer
        # Learnable prompt, initialised from a standard normal distribution.
        self.prompt_embedding = nn.Parameter(torch.randn(n_tokens, hidden_size))
        self.n_tokens = n_tokens

    def forward(self, input_ids):
        """Return ``[prompt ; token embeddings]`` concatenated along the sequence axis."""
        # The token lookup runs without autograd tracking, so only the prompt
        # vectors receive gradients through this module's output.
        with torch.no_grad():
            token_embeddings = self.embedding(input_ids)

        # Share the single prompt across the batch without copying it.
        prompt = self.prompt_embedding[None].expand(token_embeddings.size(0), -1, -1)
        return torch.cat((prompt, token_embeddings), dim=1)



In [None]:
# Load the pretrained model and freeze every existing weight; only the soft
# prompt created below is left trainable.
model , tokenizer = load_model()
for param in model.parameters():
    param.requires_grad = False

# Wrap the encoder's token embedding so that 10 learnable vectors are prepended
# to every encoder input. NOTE: this 10 must stay in sync with the `n_tokens`
# default of `adjust_attention_mask`, which extends the attention mask to match.
soft = SoftPrompt(
    n_tokens=10,
    embedding_layer=model.encoder.embed_tokens,
    hidden_size=model.config.hidden_size
)
model.encoder.embed_tokens = soft

# Only parameters with requires_grad=True are optimized, which excludes the
# frozen embedding wrapped inside the SoftPrompt.
# NOTE(review): fused=True is requested while the parameters have not yet been
# moved to `device` (that happens at the end of this cell) - confirm the
# installed PyTorch accepts this, including on CPU-only runtimes.
optimizer = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, model.encoder.embed_tokens.parameters()),
    lr=0.001,
    fused=True
)

# Inputs are truncated to 384 tokens; that budget includes the instruction text
# the dataset prepends to each review. Only the training loader is shuffled.
trainset = IMDBDataset(train, tokenizer, max_length=384)
testset = IMDBDataset(test, tokenizer, max_length=384)
trainloader = DataLoader(trainset, batch_size=128, shuffle=True,collate_fn=collate_fn)
testloader = DataLoader(testset, batch_size=128, shuffle=False,collate_fn=collate_fn)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
import torch
import torch.nn as nn
import time
from tqdm import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def adjust_attention_mask(attention_mask, n_tokens=10):
    """Extend an attention mask on the left to cover prepended prompt positions.

    Args:
        attention_mask: ``(batch, seq_len)`` mask tensor.
        n_tokens: Number of prompt positions to mark as attended.

    Returns:
        Tensor of shape ``(batch, n_tokens + seq_len)`` with the same dtype and
        device as the input; the first ``n_tokens`` columns are all ones.
    """
    prompt_mask = attention_mask.new_ones((attention_mask.size(0), n_tokens))
    return torch.cat((prompt_mask, attention_mask), dim=1)


def train_step(model, data_loader, optimizer, device, mixed_precision=False):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss``.
        data_loader: Iterable of batches with ``input_ids``, ``attention_mask``
            and ``label_input_ids`` tensors, plus an optional
            ``label_attention_mask``.
        optimizer: Optimizer holding the trainable parameters.
        device: Device the batch tensors are moved to.
        mixed_precision: Use CUDA automatic mixed precision. Ignored (full
            precision is used) when ``device`` is not a CUDA device.

    Returns:
        float: Sum of per-batch losses divided by ``len(data_loader)``.
    """
    model.train()
    running_loss = 0.0
    start_time = time.time()

    # AMP with a GradScaler is only meaningful on CUDA; on any other device the
    # step falls back to plain full-precision training instead of requesting
    # CUDA autocast.
    use_amp = mixed_precision and torch.device(device).type == "cuda"
    # torch.amp.* replaces the deprecated torch.cuda.amp.* entry points.
    scaler = torch.amp.GradScaler("cuda") if use_amp else None

    for batch in tqdm(data_loader, desc="Training", leave=False):
        input_ids = batch['input_ids'].to(device)
        # The soft prompt adds positions in front of every sequence; the mask is
        # widened by adjust_attention_mask's default prompt length to match.
        attention_mask = adjust_attention_mask(batch['attention_mask'].to(device))
        labels = batch['label_input_ids'].to(device)

        # Padded target positions must not contribute to the loss: -100 is the
        # index the Hugging Face loss ignores.
        label_mask = batch.get('label_attention_mask')
        if label_mask is not None:
            labels = labels.masked_fill(label_mask.to(device) == 0, -100)

        optimizer.zero_grad()

        if use_amp:
            with torch.amp.autocast("cuda"):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss
            # Scale the loss so small gradients do not underflow; step() unscales
            # first and skips the update when inf/NaN gradients are found.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

        running_loss += loss.item()

    epoch_time = time.time() - start_time
    print(f"Training Step Time: {epoch_time:.2f} seconds")
    return running_loss / len(data_loader)

def eval_step(model, data_loader, device):
    """Evaluate the model for one pass and collect token-level predictions.

    Predictions are the arg-max of the logits returned for the supplied labels
    (teacher forcing), compared position by position against the label
    token ids. Positions that are padding according to the batch's
    ``label_attention_mask`` are excluded from both the loss and the
    returned lists; batches without that key are used as-is.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        data_loader: Iterable of batches with ``input_ids``, ``attention_mask``
            and ``label_input_ids`` tensors, plus an optional
            ``label_attention_mask``.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(mean_loss, all_preds, all_labels)`` where the two lists hold
        one token id per evaluated label position.
    """
    model.eval()
    running_loss = 0.0
    all_preds = []
    all_labels = []
    start_time = time.time()

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating", leave=False):
            input_ids = batch['input_ids'].to(device)
            # Widen the mask by adjust_attention_mask's default prompt length so
            # it also covers the positions added by the soft prompt.
            attention_mask = adjust_attention_mask(batch['attention_mask'].to(device))
            label_input_ids = batch['label_input_ids'].to(device)

            label_mask = batch.get('label_attention_mask')
            if label_mask is None:
                keep = torch.ones_like(label_input_ids, dtype=torch.bool)
                loss_labels = label_input_ids
            else:
                keep = label_mask.to(device) != 0
                # -100 is the index the Hugging Face loss ignores.
                loss_labels = label_input_ids.masked_fill(~keep, -100)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=loss_labels)
            running_loss += outputs.loss.item()

            # Boolean indexing flattens in row-major order, i.e. the same order
            # as a plain view(-1) when nothing is masked out.
            preds = torch.argmax(outputs.logits, dim=-1)
            all_preds.extend(preds[keep].cpu().numpy())
            all_labels.extend(label_input_ids[keep].cpu().numpy())

    eval_time = time.time() - start_time
    print(f"Evaluation Step Time: {eval_time:.2f} seconds")
    return running_loss / len(data_loader), all_preds, all_labels

def compute_metrics(all_preds, all_labels):
    """Print and return accuracy plus weighted precision, recall and F1.

    Args:
        all_preds: Predicted class ids (here: token ids), one per position.
        all_labels: Reference class ids aligned with ``all_preds``.

    Returns:
        dict: ``{"accuracy", "precision", "recall", "f1"}`` mapped to floats.
    """
    # zero_division=0 keeps the value scikit-learn already falls back to for
    # classes that are never predicted (or never present), without emitting an
    # UndefinedMetricWarning on every evaluation.
    weighted = {"average": "weighted", "zero_division": 0}
    metrics = {
        "accuracy": accuracy_score(all_labels, all_preds),
        "precision": precision_score(all_labels, all_preds, **weighted),
        "recall": recall_score(all_labels, all_preds, **weighted),
        "f1": f1_score(all_labels, all_preds, **weighted),
    }

    print("Evaluation Metrics:")
    print(f" - Accuracy: {metrics['accuracy']:.4f}")
    print(f" - Precision: {metrics['precision']:.4f}")
    print(f" - Recall: {metrics['recall']:.4f}")
    print(f" - F1 Score: {metrics['f1']:.4f}")
    return metrics

def train_and_evaluate(model, train_loader, test_loader, optimizer, device, epochs=3, mixed_precision=False, compile_model=False):
    """Alternate training and evaluation for a number of epochs, printing progress.

    Args:
        model: Model to train.
        train_loader: Batches used by ``train_step``.
        test_loader: Batches used by ``eval_step``.
        optimizer: Optimizer passed to ``train_step``.
        device: Device the batches are moved to.
        epochs: Number of train/eval rounds.
        mixed_precision: Forwarded to ``train_step``.
        compile_model: Wrap the model with ``torch.compile`` before training.
    """
    # Make TorchDynamo fall back instead of raising on compilation errors.
    # This global flag is set whether or not compilation is requested.
    import torch._dynamo
    torch._dynamo.config.suppress_errors = True

    if not compile_model:
        print("Not compiling model.")
    else:
        print("Compiling model...")
        model = torch.compile(model)

    print("Using mixed precision training" if mixed_precision else "Not using mixed precision training")

    for epoch_number in range(1, epochs + 1):
        print(f"\nEpoch {epoch_number}/{epochs}")
        epoch_start = time.time()

        train_loss = train_step(model, train_loader, optimizer, device, mixed_precision=mixed_precision)
        print(f"Train Loss: {train_loss:.4f}")

        eval_loss, epoch_preds, epoch_labels = eval_step(model, test_loader, device)
        print(f"Eval Loss: {eval_loss:.4f}")

        compute_metrics(epoch_preds, epoch_labels)

        # Wall-clock time for training plus evaluation of this epoch.
        elapsed = time.time() - epoch_start
        print(f"Epoch Time: {elapsed:.2f} seconds")

In [None]:
# First run: mixed precision enabled, torch.compile disabled.
train_and_evaluate(
    model=model,
    train_loader=trainloader,
    test_loader=testloader,
    optimizer=optimizer,
    device=device,
    epochs=5,
    mixed_precision=True,      # Set to True to enable mixed precision
    compile_model=False         # Set to True to enable torch.compile
)


  scaler = torch.cuda.amp.GradScaler() if mixed_precision else None


Not compiling model.
Using mixed precision training

Epoch 1/5


  with torch.cuda.amp.autocast():


Training Step Time: 176.24 seconds
Train Loss: 10.1323


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  scaler = torch.cuda.amp.GradScaler() if mixed_precision else None


Evaluation Step Time: 154.90 seconds
Eval Loss: 7.1349
Evaluation Metrics:
 - Accuracy: 0.0004
 - Precision: 0.1957
 - Recall: 0.0004
 - F1 Score: 0.0007
Epoch Time: 331.30 seconds

Epoch 2/5


  with torch.cuda.amp.autocast():


Training Step Time: 175.22 seconds
Train Loss: 6.8440


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  scaler = torch.cuda.amp.GradScaler() if mixed_precision else None


Evaluation Step Time: 155.53 seconds
Eval Loss: 1.7386
Evaluation Metrics:
 - Accuracy: 0.4177
 - Precision: 0.8512
 - Recall: 0.4177
 - F1 Score: 0.4953
Epoch Time: 330.90 seconds

Epoch 3/5


  with torch.cuda.amp.autocast():


Training Step Time: 174.88 seconds
Train Loss: 2.3634


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  scaler = torch.cuda.amp.GradScaler() if mixed_precision else None


Evaluation Step Time: 155.29 seconds
Eval Loss: 0.8051
Evaluation Metrics:
 - Accuracy: 0.5901
 - Precision: 0.8839
 - Recall: 0.5901
 - F1 Score: 0.6166
Epoch Time: 330.32 seconds

Epoch 4/5


  with torch.cuda.amp.autocast():


Training Step Time: 175.73 seconds
Train Loss: 1.0824


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  scaler = torch.cuda.amp.GradScaler() if mixed_precision else None


Evaluation Step Time: 156.00 seconds
Eval Loss: 0.4360
Evaluation Metrics:
 - Accuracy: 0.7632
 - Precision: 0.8751
 - Recall: 0.7632
 - F1 Score: 0.7146
Epoch Time: 331.88 seconds

Epoch 5/5


  with torch.cuda.amp.autocast():
Training:  64%|██████▍   | 126/196 [01:52<01:00,  1.16it/s]

In [None]:
import torch
import torch.nn.functional as F
from torch import nn
from transformers.models.t5.modeling_t5 import T5Attention

class T5FlashAttention(T5Attention):
    """T5 attention whose core runs through ``F.scaled_dot_product_attention``.

    Projections, key/value caching and the returned tuple follow the parent
    class. The attention itself is delegated to PyTorch's fused SDPA kernel
    with two T5-specific adjustments:

    * the relative position bias (already combined with the attention mask) is
      passed as an additive ``attn_mask``, and is returned so that later layers
      can reuse it;
    * ``scale=1.0`` is passed, because stock T5 attention does not divide the
      scores by ``sqrt(d_kv)`` and SDPA would do so by default.

    Limitations: attention probabilities are never materialised, so
    ``output_attentions=True`` yields ``None`` in their place, and
    ``layer_head_mask`` is accepted for interface compatibility but not applied.
    """

    def forward(
        self,
        hidden_states,
        mask=None,
        key_value_states=None,
        position_bias=None,
        past_key_value=None,
        layer_head_mask=None,
        query_length=None,
        use_cache=False,
        output_attentions=False,
        cache_position=None,
    ):
        """
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).

        Returns:
            tuple: ``(attn_output, past_key_value, position_bias)``, followed by
            ``None`` when ``output_attentions`` is set.
        """
        # Input is (batch_size, seq_length, dim)
        # Mask is (batch_size, 1, 1, key_length) (non-causal encoder) or (batch_size, 1, seq_length, key_length) (causal decoder)
        batch_size, seq_length = hidden_states.shape[:2]

        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None

        query_states = self.q(hidden_states)
        query_states = query_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)

        is_updated = False
        curr_past_key_value = None
        if past_key_value is not None:
            is_updated = past_key_value.is_updated.get(self.layer_idx)
            if is_cross_attention:
                # after the first generated id, we can subsequently re-use all key/value_states from cache
                curr_past_key_value = past_key_value.cross_attention_cache
            else:
                curr_past_key_value = past_key_value.self_attention_cache

        current_states = key_value_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_value is not None and is_updated:
            # reuse k,v, cross_attentions
            key_states = curr_past_key_value.key_cache[self.layer_idx]
            value_states = curr_past_key_value.value_cache[self.layer_idx]
        else:
            key_states = self.k(current_states)
            value_states = self.v(current_states)
            key_states = key_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)
            value_states = value_states.view(batch_size, -1, self.n_heads, self.key_value_proj_dim).transpose(1, 2)

            if past_key_value is not None:
                # save all key/value_states to cache to be re-used for fast auto-regressive generation
                cache_position = cache_position if not is_cross_attention else None
                key_states, value_states = curr_past_key_value.update(
                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
                )
                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
                if is_cross_attention:
                    past_key_value.is_updated[self.layer_idx] = True

        # Build the additive bias exactly once per stack: the first layer computes
        # it (relative position bias + mask) and every later layer receives it
        # through the `position_bias` argument.
        if position_bias is None:
            key_length = key_states.shape[-2]
            if query_length is not None:
                real_seq_length = query_length
            elif cache_position is not None:
                # cache position is 0-indexed, so +1 gives the query length including the past
                real_seq_length = cache_position[-1] + 1
            else:
                real_seq_length = seq_length

            if not self.has_relative_attention_bias:
                position_bias = torch.zeros(
                    (1, self.n_heads, seq_length, key_length),
                    device=query_states.device,
                    dtype=query_states.dtype,
                )
                if self.gradient_checkpointing and self.training:
                    position_bias.requires_grad = True
            else:
                position_bias = self.compute_bias(
                    real_seq_length, key_length, device=query_states.device, cache_position=cache_position
                )
                position_bias = position_bias[:, :, -seq_length:, :]

            if mask is not None:
                causal_mask = mask[:, :, :, :key_length]
                position_bias = position_bias + causal_mask

        if self.pruned_heads:
            head_keep = torch.ones(position_bias.shape[1])
            head_keep[list(self.pruned_heads)] = 0
            attn_bias = position_bias[:, head_keep.bool()]
        else:
            attn_bias = position_bias

        # A floating-point attn_mask is added to the scores before the softmax,
        # which is how T5 applies its bias; it has to share the query dtype
        # (relevant under autocast, where the projections come out in half precision).
        attn_output = F.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=attn_bias.to(query_states.dtype),
            dropout_p=self.dropout if self.training else 0.0,
            scale=1.0,  # T5 uses unscaled dot products
        )

        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.view(batch_size, -1, self.inner_dim)

        attn_output = self.o(attn_output)

        outputs = (attn_output, past_key_value, position_bias)

        if output_attentions:
            # SDPA does not expose the attention probabilities.
            outputs = outputs + (None,)
        return outputs


In [None]:
def _as_flash_attention(attention, config):
    """Return a T5FlashAttention equivalent to ``attention``.

    A freshly constructed module would start from random weights, without a
    relative-attention-bias table or layer index, on the CPU and trainable.
    Everything that makes the existing module what it is gets carried over:
    constructor flags, weights, device/dtype, frozen state and train/eval mode.
    """
    if isinstance(attention, T5FlashAttention):
        return attention  # already swapped; keeps this cell safe to re-run

    flash = T5FlashAttention(
        config,
        has_relative_attention_bias=attention.has_relative_attention_bias,
        layer_idx=getattr(attention, "layer_idx", None),
    )
    flash.load_state_dict(attention.state_dict())

    reference = next(attention.parameters())
    flash.to(device=reference.device, dtype=reference.dtype)

    # Keep frozen weights frozen so the replacement does not accumulate gradients.
    old_params = dict(attention.named_parameters())
    for name, param in flash.named_parameters():
        param.requires_grad_(old_params[name].requires_grad)

    flash.train(attention.training)
    return flash


# Each stack carries its own config (the decoder's differs from the encoder's),
# so the replacement is built from the config of the stack it lives in.
for stack in (model.encoder, model.decoder):
    for layer in stack.block:
        layer.layer[0].SelfAttention = _as_flash_attention(layer.layer[0].SelfAttention, stack.config)
        # Only decoder blocks have a cross-attention sub-layer.
        if hasattr(layer.layer[1], "EncDecAttention"):
            layer.layer[1].EncDecAttention = _as_flash_attention(layer.layer[1].EncDecAttention, stack.config)

In [None]:
# Second run with the same settings (5 epochs, mixed precision on, torch.compile
# off), now on the model whose attention modules were replaced with
# T5FlashAttention in the previous cell. The `optimizer` object from the first
# run is reused.
print("Replace Attention with FlashAttention for optimize the calculations")
train_and_evaluate(
    model=model,
    train_loader=trainloader,
    test_loader=testloader,
    optimizer=optimizer,
    device=device,
    epochs=5,
    mixed_precision=True,      # Set to True to enable mixed precision
    compile_model=False         # Set to True to enable torch.compile
)


Unnamed: 0,text,label
0,I love sci-fi and am willing to put up with a ...,negative
1,"Worth the entertainment value of a rental, esp...",negative
2,its a totally average film with a few semi-alr...,negative
3,STAR RATING: ***** Saturday Night **** Friday ...,negative
4,"First off let me say, If you haven't enjoyed a...",negative
...,...,...
24995,Just got around to seeing Monster Man yesterda...,positive
24996,I got this as part of a competition prize. I w...,positive
24997,I got Monster Man in a box set of three films ...,positive
24998,"Five minutes in, i started to feel how naff th...",positive
