# Gemma + RoBERTa Embedder
This notebook evaluates the effectiveness of a pipeline that generates embeddings by chaining three components: a large language model (LLM) used as a decoder (specifically, gemma_2b_en), a feed-forward projection layer that maps the LLM hidden states to the input dimension of the encoder, and a small language model (SLM) used as an encoder (RoBERTa base). We call the full embedding pipeline the "Frankenstein Model".
An extensive set of experiments has been conducted on two datasets: 
1. Amazon Counterfactual Classification: binary classification task.
2. Emotion Classification: multiclass classification task (6 classes).

The experiments gave us the following results:

\begin{array}{|c|c|c|c|}
\hline
\textbf{Model} & \textbf{Amazon Counterfactual Classification (Accuracy, F1)} & \textbf{Emotion Classification (Accuracy, F1)} & \textbf{Layer Number} \\
\hline
\text{Roberta-Only} & (0.92358, 0.88786) & (0.84130, 0.79769) & - \\
\text{Gemma-Only} & (0.65224, 0.59531) & (0.28535, 0.24303) & - \\
\text{Frankenstein} & (0.59567, 0.53944) & (0.32005, 0.20509) & -1 \\
\text{Frankenstein} & (0.73567, 0.67447) & (0.29070, 0.23099) & -2 \\
\text{Frankenstein} & (0.78910, 0.73468) & (0.33010, 0.25578) & -3 \\
\text{Frankenstein} & (0.81881, 0.75857) & (0.33950, 0.27807) & -4 \\
\text{Frankenstein} & (0.83522, 0.77633) & (0.17060, 0.11551) & -5 \\
\text{Frankenstein} & (0.82284, 0.76580) & (0.35050, 0.28033) & -6 \\
\text{Frankenstein} & (0.84746, 0.79004) & (0.21150, 0.12383) & -7 \\
\text{Frankenstein} & (0.73687, 0.66874) & (0.21770, 0.17535) & -8 \\
\text{Frankenstein} & (0.85896, 0.80996) & (0.35980, 0.28138) & -9 \\
\text{Frankenstein} & (0.82060, 0.76040) & (0.33605, 0.26878) & -10 \\
\text{Frankenstein} & (0.77179, 0.72740) & (0.29690, 0.24354) & -11 \\
\text{Frankenstein} & (0.78119, 0.71965) & (0.39920, 0.32163) & -12 \\
\text{Frankenstein} & (0.80448, 0.75166) & (0.24650, 0.19516) & -13 \\
\text{Frankenstein} & (0.85119, 0.79530) & (0.39890, 0.32221) & -14 \\
\text{Frankenstein} & (0.87493, 0.82954) & (0.18935, 0.14127) & -15 \\
\text{Frankenstein} & (0.82194, 0.76370) & (0.19300, 0.13079) & -16 \\
\text{Frankenstein} & (0.77567, 0.71621) & (0.18550, 0.14802) & -17 \\
\text{Frankenstein} & (0.65090, 0.58423) & (0.32005, 0.20508) & -18 \\
\hline
\end{array}

In this table, the 'Layer Number' column specifies from which layer the output of Gemma is being extracted. Negative numbers indicate the layer count starting from the topmost layer downward. The rows labeled 'Roberta-Only' and 'Gemma-Only' serve as baseline comparisons, demonstrating the performance of the standalone models without integration.

In order to reproduce these results (or even test the embedders on new datasets) it is sufficient to modify the constants in the following cell:

In [None]:
"""
Possible tasks to test:
    "AmazonCounterfactualClassification",
    "AmazonPolarityClassification",
    "AmazonReviewsClassification",
    "Banking77Classification",
    "EmotionClassification",
    "ImdbClassification",
    "MassiveIntentClassification",
    "MassiveScenarioClassification",
    "MTOPDomainClassification",
    "MTOPIntentClassification",
    "ToxicConversationsClassification",
    "TweetSentimentExtractionClassification"
"""

# Random seed shared by Python, NumPy, PyTorch and the train/validation split
# default value = 33
SEED = 33

# training hyperparameters
EPOCHS = 5  # default number of passes of train_model over the training loader
BATCH_SIZE = 32  # batch size of the dataloaders and of the MTEB encoder
LEARNING_RATE = 1e-5  # learning rate of the Adam optimizer
# Index (int) or list of indices of the Gemma hidden state(s) to extract;
# negative values count from the topmost layer downward
# (see the 'Layer Number' column of the results table).
LLM_layers = [-17]

# Baseline switches. Set at most one of them to True: the model selection
# checks ONLY_LLM first, while the tokenizer selection only looks at ONLY_SLM.
ONLY_LLM = False # True if you want to use Gemma-only model
ONLY_SLM = True # True if you want to use Roberta-only model

# Choose only one task from the list above
TASK = "EmotionClassification"

In [None]:
# ensure compatibility with accelerate and bitsandbytes
# NOTE(review): Gemma support is believed to have landed in a transformers
# release newer than 4.30 — confirm that this pin (or whatever version the
# installs below end up resolving to) can actually load "google/gemma-2b".
!pip install transformers==4.30

# default required installations
!pip install mteb
# CUDA 11.8 wheels of PyTorch
!pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu118
!pip install torchinfo
# bitsandbytes + accelerate back the 4-bit quantized loading of Gemma
!pip install bitsandbytes
!pip install accelerate
!pip install gputil

In [None]:
import numpy as np
import pandas as pd
from mteb import MTEB
import warnings
import os
from tqdm import tqdm
import bitsandbytes as bnb
import random

import shutil
from IPython.display import FileLink
import zipfile

from sklearn.model_selection import train_test_split

import torch
from torchinfo import summary
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
from torch import nn
import torch.nn.functional as F

from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, BitsAndBytesConfig
from transformers import PreTrainedTokenizer

from kaggle_secrets import UserSecretsClient

# import valid hugging_face token (update secret on Kaggle with your token)
user_secrets = UserSecretsClient()
HUGGING_FACE_TOKEN = user_secrets.get_secret("HUGGING_FACE_TOKEN")

# Suppress Warning that asks if the datasets are reliable
warnings.filterwarnings("ignore", category=FutureWarning)

# Suppress Warning that claims slow training and inference during the fitting
warnings.filterwarnings("ignore", category=UserWarning)

# Disable tokenizer-level parallelism
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Use the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Intended strategy ('mean' or 'max') for combining several LLM layers
# when LLM_layers is a list
pooling = 'mean'
# Fraction of the training split held out for validation when the dataset
# ships without a predefined validation split
VAL_SPLIT = 0.2

# SEED settings to ensure reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED);

## Data Import
Load the selected task and store its dataset in a dictionary keyed by task name (as a `datasets.dataset_dict.DatasetDict` object).

In [None]:
# Maps a task name to the dataset loaded for it
datasets_vocabulary = {}
# MTEB benchmark restricted to the single selected task, English only
evaluation_pipeline = MTEB(tasks=[TASK], task_langs=["en"])

def extract_dataset(datasets_vocabulary, dataset_name, val_split):
    """Build the train / validation / test DataFrames of one task.

    Args:
        datasets_vocabulary: mapping from task name to its loaded dataset.
        dataset_name: key of the task to extract.
        val_split: fraction of the training data held out for validation
            when the dataset has no "validation" split of its own.

    Returns:
        A ``(train_df, val_df, test_df)`` tuple of pandas DataFrames.
    """
    # Multilingual datasets keep their splits under a language key;
    # monolingual ones expose the splits directly.
    splits = datasets_vocabulary[dataset_name].get("en", datasets_vocabulary[dataset_name])

    test_frame = pd.DataFrame(splits["test"])
    full_train = pd.DataFrame(splits["train"])

    # No predefined validation split: carve one out of the training data.
    if "validation" not in splits:
        train_part, val_part = train_test_split(full_train, test_size=val_split, random_state=SEED)
        return train_part, val_part, test_frame

    # Predefined validation split: the whole training data is used for training.
    val_frame = pd.DataFrame(splits["validation"])
    return full_train, val_frame, test_frame

# Download the data of the selected task and register it under its name
for task, task_name in zip(evaluation_pipeline.tasks, [TASK]):
  task.load_data(trust_remote_code=True)
  datasets_vocabulary[task_name] = task.dataset

train_df, val_df, test_df = extract_dataset(datasets_vocabulary, TASK, VAL_SPLIT)

# Count the distinct labels over all three splits
num_of_labels = len(np.unique(pd.concat([train_df["label"], val_df["label"], test_df["label"]])))
# Binary tasks use a single output unit (paired with BCEWithLogitsLoss later on);
# every other task gets one output unit per class.
N_CLASSES = 1 if num_of_labels == 2 else num_of_labels

print(f"\nDataset - {TASK}")
print(f"Size of dataframes:\t train - {len(train_df)}\t validation - {len(val_df)}\t test - {len(test_df)}")
# The expression below recovers the real number of classes (2) for binary tasks
print(f"This dataset has {N_CLASSES+1 if num_of_labels == 2 else N_CLASSES} different classes \n")
display(train_df)

## Download of Gemma and RoBERTa

In [None]:
# Gemma is a gated checkpoint, hence the Hugging Face token
gemma_tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b", use_auth_token=HUGGING_FACE_TOKEN)
roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Both the Frankenstein and the Gemma-only models consume Gemma token ids;
# only the RoBERTa-only baseline needs the RoBERTa tokenizer.
TOKENIZER = roberta_tokenizer if ONLY_SLM else gemma_tokenizer

# Define the quantization configuration for 4-bit loading.
# The compute dtype must be passed as `bnb_4bit_compute_dtype`: the previous
# `quantization_dtype` / `compute_dtype` keywords are not parameters of
# BitsAndBytesConfig and were ignored.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float32,
)

# Load gemma_model with 4-bit precision
gemma_model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2b",
    quantization_config=quantization_config,
    use_auth_token=HUGGING_FACE_TOKEN,
    device_map="auto"
)

roberta_model = AutoModel.from_pretrained("roberta-base")

## Models Definition

In [None]:
# superclass of Frankenstein Model, Gemma_only and RoBERTa_only
class BaseModel(nn.Module):
    """Common base of the embedders, adding an ``encode`` method on top of ``forward``.

    Subclasses implement ``forward(input_ids, attention_mask, clf_head)``;
    ``encode`` calls it with ``clf_head=False`` to obtain sentence embeddings.
    """

    def __init__(self):
        super().__init__()

    def forward(self, *args, **kwargs):
        raise NotImplementedError("This method should be implemented by subclasses.")

    def set_encoder_parameters(self, batch_size, max_length, tokenizer, device):
        """Store the settings used by ``encode``.

        Args:
            batch_size: number of sentences encoded per forward pass.
            max_length: upper bound on the tokenized length (further capped
                by ``tokenizer.model_max_length``).
            tokenizer: tokenizer called on each batch of sentences.
            device: device the model and the token tensors are moved to.
        """
        self.batch_size = batch_size
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.device = device

    def encode(self, sentences: list[str], **kwargs):
        """Embed ``sentences`` batch by batch.

        Args:
            sentences: texts to embed.
            **kwargs: accepted and ignored.

        Returns:
            A CPU tensor with one embedding row per sentence.

        Raises:
            AttributeError: if ``set_encoder_parameters`` was not called first.
        """
        missing = [
            name
            for name in ("batch_size", "max_length", "tokenizer", "device")
            if not hasattr(self, name)
        ]
        if missing:
            # Fail immediately with an actionable message instead of printing
            # a hint and crashing a few lines later.
            raise AttributeError(
                f"Encoder parameters not set ({', '.join(missing)}). "
                "Before running the evaluation set its parameters with the function "
                "set_encoder_parameters(batch_size, max_length, tokenizer, device)"
            )

        # Use the configured device rather than a module-level global.
        self.to(self.device)
        max_length = min(self.max_length, self.tokenizer.model_max_length)

        # Embeddings must be deterministic: disable dropout while encoding and
        # restore the previous train/eval mode afterwards.
        was_training = self.training
        self.eval()
        all_embeddings = []
        try:
            with torch.no_grad():
                for start in range(0, len(sentences), self.batch_size):
                    batch_sentences = sentences[start:start + self.batch_size]
                    inputs = self.tokenizer(
                        batch_sentences,
                        padding=True,
                        truncation=True,
                        max_length=max_length,
                        return_tensors='pt',
                    )
                    input_ids = inputs['input_ids'].to(self.device)
                    attention_mask = inputs['attention_mask'].to(self.device)

                    embeddings = self(input_ids=input_ids, attention_mask=attention_mask, clf_head=False)
                    all_embeddings.append(embeddings.cpu())
        finally:
            self.train(was_training)

        return torch.cat(all_embeddings, dim=0)

In [None]:
class FrankensteinModel(BaseModel):
    """LLM decoder -> linear projection -> SLM encoder -> optional classifier.

    Hidden states are taken from the frozen LLM, projected to the SLM hidden
    size, fed to the SLM as ``inputs_embeds`` and mean-pooled over the
    sequence axis.

    Args:
        large_language_model: model exposing ``config.hidden_size`` and
            returning ``hidden_states`` when called with
            ``output_hidden_states=True``. Its parameters are frozen.
        small_language_model: model exposing ``config.hidden_size``, accepting
            ``inputs_embeds`` and returning ``last_hidden_state``.
        n_classes: number of output units of the classification head.
        LLM_layers: index (int) or list/tuple of indices of the LLM hidden
            states to use.
        pooling: ``'mean'`` or ``'max'``; how several layers are combined
            when ``LLM_layers`` is a list/tuple.
    """

    def __init__(self, large_language_model, small_language_model, n_classes, LLM_layers = -1, pooling = 'max'):
        super().__init__()
        self.LLM_layers = LLM_layers
        self.pooling = pooling
        self.llm = large_language_model
        self.slm = small_language_model
        # Maps LLM hidden states to the embedding size expected by the SLM
        self.projection = nn.Linear(self.llm.config.hidden_size, self.slm.config.hidden_size)
        self.clf_head = nn.Linear(self.slm.config.hidden_size, n_classes)

        # The LLM is a frozen feature extractor
        for param in self.llm.parameters():
            param.requires_grad = False

    def _select_llm_features(self, hidden_states):
        """Pick (and, for several layers, combine) the configured LLM hidden states."""
        layers = self.LLM_layers
        if isinstance(layers, int):
            return hidden_states[layers]
        if isinstance(layers, (list, tuple)):
            # Combine multiple layers (mean or max pooling) along a new leading axis
            stacked = torch.stack([hidden_states[layer] for layer in layers])
            if self.pooling == 'mean':
                return torch.mean(stacked, dim=0)
            if self.pooling == 'max':
                return torch.max(stacked, dim=0).values
            raise ValueError(f"Invalid pooling method: {self.pooling}")
        raise ValueError(f"Invalid type for LLM_layers: {type(layers)}. Must be int or list of int.")

    def forward(self, input_ids, attention_mask=None, clf_head = True):
        """Return class logits (``clf_head=True``) or the pooled embedding.

        Raises:
            ValueError: if ``LLM_layers`` is neither an int nor a list/tuple,
                or if ``pooling`` is not ``'mean'`` / ``'max'``.
        """
        llm_outputs = self.llm(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
        # Cast to float32 so the features match the dtype of the projection layer
        llm_features = self._select_llm_features(llm_outputs.hidden_states).to(torch.float32)

        projected_features = self.projection(llm_features)
        slm_outputs = self.slm(inputs_embeds=projected_features, attention_mask=attention_mask)
        # NOTE(review): plain mean over the sequence axis — padded positions
        # are averaged in as well; kept unchanged to preserve reported results.
        pooled_output = torch.mean(slm_outputs.last_hidden_state, dim=1)

        if clf_head:
            return self.clf_head(pooled_output)
        return pooled_output

    
class RoBERTa_only(BaseModel):
    """Baseline: the small language model alone, followed by a linear classifier.

    Args:
        small_language_model: encoder exposing ``config.hidden_size`` and
            returning ``last_hidden_state``.
        n_classes: number of output units of the classification head.
    """

    def __init__(self, small_language_model, n_classes):
        super().__init__()

        encoder = small_language_model
        self.slm = encoder
        self.clf_head = nn.Linear(encoder.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask=None, clf_head=True):
        """Return class logits when ``clf_head`` is true, else the mean-pooled embedding."""
        token_states = self.slm(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        # Average the token representations over the sequence axis
        sentence_vector = torch.mean(token_states, dim=1)

        return self.clf_head(sentence_vector) if clf_head else sentence_vector
    
    
class Gemma_only(BaseModel):
    """Baseline: the frozen large language model alone, followed by a linear classifier.

    Args:
        large_language_model: model exposing ``config.hidden_size`` and
            returning ``hidden_states`` when called with
            ``output_hidden_states=True``. Its parameters are frozen.
        n_classes: number of output units of the classification head.
        LLM_layers: index (int) or list/tuple of indices of the LLM hidden
            states to use.
        pooling: ``'mean'`` or ``'max'``; how several layers are combined
            when ``LLM_layers`` is a list/tuple.
    """

    def __init__(self, large_language_model, n_classes, LLM_layers = -1, pooling = 'max'):
        super().__init__()
        self.LLM_layers = LLM_layers
        self.pooling = pooling
        self.llm = large_language_model
        self.clf_head = nn.Linear(self.llm.config.hidden_size, n_classes)

        # The LLM is a frozen feature extractor; only the head is trainable
        for param in self.llm.parameters():
            param.requires_grad = False

    def _select_llm_features(self, hidden_states):
        """Pick (and, for several layers, combine) the configured LLM hidden states."""
        layers = self.LLM_layers
        if isinstance(layers, int):
            return hidden_states[layers]
        if isinstance(layers, (list, tuple)):
            # Combine multiple layers (mean or max pooling) along a new leading axis
            stacked = torch.stack([hidden_states[layer] for layer in layers])
            if self.pooling == 'mean':
                return torch.mean(stacked, dim=0)
            if self.pooling == 'max':
                return torch.max(stacked, dim=0).values
            raise ValueError(f"Invalid pooling method: {self.pooling}")
        raise ValueError(f"Invalid type for LLM_layers: {type(layers)}. Must be int or list of int.")

    def forward(self, input_ids, attention_mask=None, clf_head = True):
        """Return class logits (``clf_head=True``) or the pooled embedding.

        Raises:
            ValueError: if ``LLM_layers`` is neither an int nor a list/tuple,
                or if ``pooling`` is not ``'mean'`` / ``'max'``.
        """
        llm_outputs = self.llm(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
        # Cast to float32 so the features match the dtype of the classification head
        llm_features = self._select_llm_features(llm_outputs.hidden_states).to(torch.float32)

        # NOTE(review): plain mean over the sequence axis — padded positions
        # are averaged in as well; kept unchanged to preserve reported results.
        pooled_output = torch.mean(llm_features, dim=1)

        if clf_head:
            return self.clf_head(pooled_output)
        return pooled_output
    
    
# The tokenizer is chosen from ONLY_SLM while the model is chosen from ONLY_LLM
# first, so enabling both would feed RoBERTa token ids to Gemma.
if ONLY_LLM and ONLY_SLM:
    raise ValueError("ONLY_LLM and ONLY_SLM are mutually exclusive: set at most one of them to True.")

# Forward the configured layer selection and pooling strategy; without them the
# models silently fall back to their defaults (last layer, 'max' pooling).
if ONLY_LLM:
    model = Gemma_only(
        large_language_model=gemma_model,
        n_classes=N_CLASSES,
        LLM_layers=LLM_layers,
        pooling=pooling,
    )
elif ONLY_SLM:
    model = RoBERTa_only(small_language_model=roberta_model, n_classes=N_CLASSES)
else:
    model = FrankensteinModel(
        large_language_model=gemma_model,
        small_language_model=roberta_model,
        n_classes=N_CLASSES,
        LLM_layers=LLM_layers,
        pooling=pooling,
    )

## Dataloaders creation 

In [None]:
def find_max_encoded_utterance_len(data, tokenizer=TOKENIZER):
    """Return the largest number of token ids produced by ``tokenizer`` over ``data``.

    Args:
        data: iterable of sentences.
        tokenizer: tokenizer providing ``encode_plus``.
    """
    token_counts = [len(tokenizer.encode_plus(text)["input_ids"]) for text in data]
    longest = max(token_counts)
    return longest


def create_dataloader(df, tokenizer=TOKENIZER, n_classes = N_CLASSES, batch_size = BATCH_SIZE, x="text", y="label", shuffle = True, max_length=512):
    """Tokenize a DataFrame and wrap it in a ``DataLoader``.

    Args:
        df: DataFrame holding the text column ``x`` and the label column ``y``.
        tokenizer: tokenizer providing ``batch_encode_plus`` and ``model_max_length``.
        n_classes: not used by this function.
        batch_size: batch size of the returned loader.
        x: name of the text column.
        y: name of the label column.
        shuffle: whether the loader shuffles the samples.
        max_length: padding/truncation length, capped by ``tokenizer.model_max_length``.

    Returns:
        A ``DataLoader`` yielding ``(input_ids, attention_mask, labels)`` batches.
    """
    sentences, targets = df[x].tolist(), df[y].tolist()

    # Never exceed what the tokenizer's model can handle
    padded_length = min(max_length, tokenizer.model_max_length)

    # Every sample is padded/truncated to the same fixed length
    encoded = tokenizer.batch_encode_plus(
        sentences,
        padding="max_length",
        truncation=True,
        max_length=padded_length,
        return_tensors='pt'
    )

    tensors = (encoded['input_ids'], encoded['attention_mask'], torch.tensor(targets))
    return DataLoader(TensorDataset(*tensors), batch_size=batch_size, shuffle=shuffle)


# Longest tokenized sentence of each split, used as that split's padding length
train_max_tokenized_length = find_max_encoded_utterance_len(train_df["text"])
val_max_tokenized_length = find_max_encoded_utterance_len(val_df["text"])
test_max_tokenized_length = find_max_encoded_utterance_len(test_df["text"])

# creating the dataloaders for the selected TASK (only the training one is shuffled)
train_dataloader = create_dataloader(train_df, shuffle=True, max_length=train_max_tokenized_length)
val_dataloader = create_dataloader(val_df, shuffle=False, max_length=val_max_tokenized_length)
test_dataloader = create_dataloader(test_df, shuffle=False, max_length=test_max_tokenized_length)


# Sanity check: print the shape of the first batch of each loader
for dataloader, name in zip([train_dataloader, val_dataloader, test_dataloader], ["train", "val", "test"]):
    for input_ids, attention_mask, labels in dataloader:
        print(f"Shape of {name} batch:", input_ids.shape)
        break

## Train and Evaluate functions 

In [None]:
# Binary tasks use a single logit with BCEWithLogitsLoss; every other task uses CrossEntropyLoss
criterion = nn.BCEWithLogitsLoss() if N_CLASSES == 1 else nn.CrossEntropyLoss()
# All model parameters are handed to Adam
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=EPOCHS, device=device, n_classes=N_CLASSES):
    """Train ``model`` and print train/validation loss and accuracy after each epoch.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``
            and returning logits.
        train_loader: loader yielding ``(input_ids, attention_mask, labels)``
            batches used for optimization.
        val_loader: loader evaluated with ``evaluate_model`` after each epoch.
        criterion: loss function (``BCEWithLogitsLoss`` in the single-logit
            case, ``CrossEntropyLoss`` otherwise).
        optimizer: optimizer stepping the model parameters.
        num_epochs: number of passes over ``train_loader``.
        device: device the model and the batches are moved to.
        n_classes: number of output units; ``1`` selects the binary branch.
    """
    model.to(device)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        correct_predictions = 0
        total_predictions = 0

        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False):
            input_ids, attention_mask, labels = [x.to(device) for x in batch]

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            if n_classes == 1:
                # Squeeze only the class axis: a bare squeeze() would also drop
                # the batch axis of a size-1 batch and break the loss.
                logits = outputs.squeeze(-1)
                loss = criterion(logits, labels.float())
                # The outputs are logits, so probability > 0.5 means logit > 0
                predictions = (logits > 0).int()
            else:
                loss = criterion(outputs, labels)
                predictions = torch.argmax(outputs, dim=1)

            loss.backward()
            optimizer.step()

            # Weight the batch-mean loss by the batch size to get a per-sample average
            running_loss += loss.item() * input_ids.size(0)
            correct_predictions += (predictions == labels).sum().item()
            total_predictions += labels.size(0)

        epoch_loss = running_loss / len(train_loader.dataset)
        epoch_acc = correct_predictions / total_predictions

        val_loss, val_acc = evaluate_model(model, val_loader, criterion, device, n_classes)

        print(f"Epoch {epoch+1}/{num_epochs} - Loss: {epoch_loss:.4f} - Accuracy: {epoch_acc:.7f} - Val Loss: {val_loss:.7f} - Val Accuracy: {val_acc:.7f}")

def evaluate_model(model, val_loader, criterion, device = device, n_classes = N_CLASSES):
    """Compute the average loss and the accuracy of ``model`` on ``val_loader``.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``
            and returning logits.
        val_loader: loader yielding ``(input_ids, attention_mask, labels)`` batches.
        criterion: loss function matching the output layout of the model.
        device: device the model and the batches are moved to.
        n_classes: number of output units; ``1`` selects the binary branch.

    Returns:
        A ``(loss, accuracy)`` tuple, both averaged over the samples.
    """
    model.to(device)

    # Leaves the model in eval mode
    model.eval()
    running_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = [x.to(device) for x in batch]
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            if n_classes == 1:
                # Squeeze only the class axis: a bare squeeze() would also drop
                # the batch axis of a size-1 batch and break the loss.
                logits = outputs.squeeze(-1)
                loss = criterion(logits, labels.float())
                # The outputs are logits, so probability > 0.5 means logit > 0
                predictions = (logits > 0).int()
            else:
                loss = criterion(outputs, labels)
                predictions = torch.argmax(outputs, dim=1)

            # Weight the batch-mean loss by the batch size to get a per-sample average
            running_loss += loss.item() * input_ids.size(0)
            correct_predictions += (predictions == labels).sum().item()
            total_predictions += labels.size(0)

    epoch_loss = running_loss / len(val_loader.dataset)
    epoch_acc = correct_predictions / total_predictions

    return epoch_loss, epoch_acc

## Training and Testing on MTEB Dataset

In [None]:
# Do not train at all if ONLY_LLM
if not ONLY_LLM:
    # Fit on the training split and monitor on the validation split; the test
    # split is only touched by the final evaluation cells.
    train_model(model, train_dataloader, val_dataloader, criterion, optimizer, num_epochs = EPOCHS)

In [None]:
# Score the model's own classification head on the test split
test_loss, test_acc = evaluate_model(model, test_dataloader, criterion, device, N_CLASSES)
print(" --- Evaluation by using a classification head --- ")
print(f"Loss on the test set: {test_loss}")
print(f"Accuracy on the test set: {test_acc}")

In [None]:
# Configure model.encode(), which MTEB uses to obtain the sentence embeddings
model.set_encoder_parameters(BATCH_SIZE, test_max_tokenized_length, TOKENIZER, device)
# Run the MTEB benchmark on the test split; results are written to ./results
evaluation = evaluation_pipeline.run(model,
                                     eval_splits=["test"],
                                     output_folder="results",
                                     overwrite_results=True)

print(" --- MTEB Evaluation --- ")
# NOTE(review): this indexing assumes the result layout of the installed mteb
# version (one result object per task with per-split score lists) — confirm.
print("Average accuracy", evaluation[0].scores["test"][0]["accuracy"])
print("Average f1", evaluation[0].scores["test"][0]["f1"])