Loosely following:
 
https://www.datacamp.com/tutorial/fine-tuning-large-language-models

In [46]:
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, Gemma3ForCausalLM,  TrainingArguments, Trainer
from huggingface_hub import login
from dotenv import load_dotenv
import os
import torch
import torch.nn as nn
import evaluate
import numpy as np

In [2]:
load_dotenv()

HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
MODEL = "google/gemma-3-4b-it"
SEED = 69
device = 'mps'

login(token=HUGGINGFACE_TOKEN)

In [3]:
raw_dataset = load_dataset("mteb/tweet_sentiment_extraction")
df = pd.DataFrame(raw_dataset['train'])

In [4]:
df.iloc[26730]

id                             ed167662a5
text           But it was worth it  ****.
label                                   2
label_text                       positive
Name: 26730, dtype: object

In [5]:
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)

In [6]:
text = ['hello world', 'bobby like to eat pizza']
vec = tokenizer(text, padding=True)
print("encoding: ",vec)

print("decoding: ",tokenizer.batch_decode(vec['input_ids']))

encoding:  {'input_ids': [[0, 0, 0, 0, 2, 23391, 1902], [2, 236763, 13990, 1133, 531, 9039, 19406]], 'attention_mask': [[0, 0, 0, 0, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]]}
decoding:  ['<pad><pad><pad><pad><bos>hello world', '<bos>bobby like to eat pizza']


In [7]:
def tokenize_dataset(data):
    return tokenizer(data['text'], padding="max_length", truncation=True, max_length=128)

In [8]:
dataset = raw_dataset.map(tokenize_dataset, batched=True)

In [9]:
train = dataset['train'].shuffle(SEED).select(range(2))
test = dataset['test'].shuffle(SEED).select(range(2))

In [57]:
#since we are using gemma we need to def a model for seq classification

baseModel = Gemma3ForCausalLM.from_pretrained(MODEL, device_map="auto", output_hidden_states=True, attn_implementation="eager")

The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Loading checkpoint shards: 100%|██████████| 2/2 [00:15<00:00,  7.76s/it]
The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['output_hidden_states']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [58]:
baseModel.config.output_hidden_states = True          
baseModel.gradient_checkpointing_enable()     

In [65]:
class Gemma3Classifier(nn.Module):
    def __init__(self, bmodel, hiddensize, dropout=0.1):
        super().__init__()
        self.bmodel = bmodel
        self.dropout = nn.Dropout(dropout) 
        self.head = nn.Linear(hiddensize, 3)
    
    def forward(self, input_ids, attention_mask, labels):
        out = self.bmodel(input_ids, attention_mask)
        hidden_state = out.hidden_states[-1]
        embeddings = hidden_state[:, -1, :]  
        logits = self.head(self.dropout(embeddings))

        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits, labels)
        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}


In [66]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
   logits, labels = eval_pred
   predictions = np.argmax(logits, axis=-1)
   return metric.compute(predictions=predictions, references=labels)

In [67]:
model = Gemma3Classifier(bmodel=baseModel, dropout=0.1, hiddensize=baseModel.config.hidden_size).to(device)

In [68]:
train

Dataset({
    features: ['id', 'text', 'label', 'label_text', 'input_ids', 'attention_mask'],
    num_rows: 2
})

In [69]:
out = model(input_ids=torch.tensor(train['input_ids']).to(device), attention_mask = torch.tensor(train['attention_mask']).to(device), labels = torch.tensor(train['label']).to(device))

In [70]:

training_args = TrainingArguments(
   output_dir="test_trainer",
   #evaluation_strategy="epoch",
   per_device_train_batch_size=4,  # Reduce batch size here
   per_device_eval_batch_size=4,    # Optionally, reduce for evaluation as well
   gradient_accumulation_steps=4
   )


trainer = Trainer(
   model=model,
   args=training_args,
   train_dataset=train,
   eval_dataset=test,
   compute_metrics=compute_metrics,)

trainer.train()

Step,Training Loss,Validation Loss,Accuracy
3,No log,0.610603,0.5


Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Insufficient Memory (00000008:kIOGPUCommandBufferCallbackErrorOutOfMemory)
	<AGXG13XFamilyCommandBuffer: 0x12ce7aa20>
    label = <none> 
    device = <AGXG13XDevice: 0x11d584800>
        name = Apple M1 Max 
    commandQueue = <AGXG13XFamilyCommandQueue: 0x121ca2a00>
        label = <none> 
        device = <AGXG13XDevice: 0x11d584800>
            name = Apple M1 Max 
    retainedReferences = 1
Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Insufficient Memory (00000008:kIOGPUCommandBufferCallbackErrorOutOfMemory)
	<AGXG13XFamilyCommandBuffer: 0x156209c00>
    label = <none> 
    device = <AGXG13XDevice: 0x11d584800>
        name = Apple M1 Max 
    commandQueue = <AGXG13XFamilyCommandQueue: 0x121ca2a00>
        label = <none> 
        device =

RuntimeError: 
            Some tensors share memory, this will lead to duplicate memory on disk and potential differences when loading them again: [{'bmodel.model.embed_tokens.weight', 'bmodel.lm_head.weight'}].
            A potential way to correctly save your model is to use `save_model`.
            More information at https://huggingface.co/docs/safetensors/torch_shared_tensors
            

In [71]:
trainer.evaluate()

Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Insufficient Memory (00000008:kIOGPUCommandBufferCallbackErrorOutOfMemory)
	<AGXG13XFamilyCommandBuffer: 0xd2c0a4470>
    label = <none> 
    device = <AGXG13XDevice: 0x11d584800>
        name = Apple M1 Max 
    commandQueue = <AGXG13XFamilyCommandQueue: 0x121ca2a00>
        label = <none> 
        device = <AGXG13XDevice: 0x11d584800>
            name = Apple M1 Max 
    retainedReferences = 1


{'eval_loss': 0.6106026768684387, 'eval_accuracy': 0.5}