Loosely following:
 
https://www.datacamp.com/tutorial/fine-tuning-large-language-models

In [1]:
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, Gemma3Model,  TrainingArguments, Trainer
from huggingface_hub import login
from dotenv import load_dotenv
import os
import torch
import torch.nn as nn

from tqdm import tqdm
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()

HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
MODEL = "google/gemma-3-4b-it"
SEED = 69

login(token=HUGGINGFACE_TOKEN)

In [3]:
# Lets get tha dataset
# For us the dataset will be 
raw_dataset = load_dataset("mteb/tweet_sentiment_extraction")
df_train = pd.DataFrame(raw_dataset['train'])
df_test = pd.DataFrame(raw_dataset['test'])

In [4]:
# each segment of text "tweet" has a class 0 (negative), 1 (neutral), or 2 (positive)
df_train['label'].unique()

array([1, 0, 2])

In [5]:
df_train

Unnamed: 0,id,text,label,label_text
0,cb774db0d1,"I`d have responded, if I were going",1,neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,0,negative
2,088c60f138,my boss is bullying me...,0,negative
3,9642c003ef,what interview! leave me alone,0,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",0,negative
...,...,...,...,...
26727,4eac33d1c0,wish we could come see u on Denver husband l...,0,negative
26728,4f4c4fc327,I`ve wondered about rake to. The client has ...,0,negative
26729,f67aae2310,Yay good for both of you. Enjoy the break - y...,2,positive
26730,ed167662a5,But it was worth it ****.,2,positive


In [6]:
# we need this to format the input so model can understand
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)

In [7]:
# test of the tokenizer
text = ['hello world', 'bobby like to eat pizza']
vec = tokenizer(text, padding=True)
print("encoding: ",vec)

print("decoding: ",tokenizer.batch_decode(vec['input_ids']))

encoding:  {'input_ids': [[0, 0, 0, 0, 2, 23391, 1902], [2, 236763, 13990, 1133, 531, 9039, 19406]], 'attention_mask': [[0, 0, 0, 0, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]]}
decoding:  ['<pad><pad><pad><pad><bos>hello world', '<bos>bobby like to eat pizza']


In [8]:
# we jsut define this so be used with the 'dataset' map function so apply to the data
def tokenize_dataset(data):
    return tokenizer(data['text'], padding="max_length", truncation=True, max_length=128)

In [9]:
# apply tokanizeion to the dataset
dataset = raw_dataset.map(tokenize_dataset, batched=True)

In [10]:
# shuffle the dataset and split into smaller part sow e can run on laptop
train = dataset['train'].shuffle(SEED).select(range(4))
test = dataset['test'].shuffle(SEED).select(range(4))

In [11]:
X_train = torch.tensor(train['input_ids'])
y_train = torch.tensor(train['label'])
X_test = torch.tensor(test['input_ids'])
y_test = torch.tensor(test['label'])

X_train.shape, y_train.shape, X_test.shape, y_test.shape

(torch.Size([4, 128]), torch.Size([4]), torch.Size([4, 128]), torch.Size([4]))

In [12]:
# Since we are using gemma we need to def a model for seq classification
# To do so we will import the base model then construct our model using output from the base model
baseModel = Gemma3Model.from_pretrained(MODEL, device_map='auto', 
                                        output_hidden_states=True, 
                                        attn_implementation="eager", 
                                            max_memory = {
                                            0: "8GiB",        # Conservative GPU 0 - reserves memory for training
                                            1: "8GiB",        # More aggressive GPU 1 - stores most model
                                            "cpu": "80GiB"     # CPU buffer for offloading vision components
                                            }
                                        )

Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.97s/it]
Some parameters are on the meta device because they were offloaded to the cpu.
We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.


In [13]:
baseModel.config.output_hidden_states = True          
baseModel.gradient_checkpointing_enable()     

In [14]:
class Gemma3Classifier(nn.Module):
    def __init__(self, bmodel, hiddensize, dropout=0.1):
        super().__init__()
        self.bmodel = bmodel
        self.dropout = nn.Dropout(dropout) 
        self.head = nn.Linear(hiddensize, 3).to('cuda:1')
        self.device_placement = True
    
    def forward(self, input_ids):
        out = self.bmodel(input_ids)
        hidden_state = out.hidden_states[-1]
        embeddings = hidden_state[:, -1, :]  

        embeddings = embeddings.to('cuda:1')

        logits = self.head(self.dropout(embeddings))

        return logits 

In [15]:
# from torch.utils.data import DataLoader, TensorDataset
# dataset_pt = TensorDataset(X_train, y_train)

In [16]:

# dataloader = DataLoader(dataset_pt, batch_size=2)

In [17]:
model = Gemma3Classifier(bmodel=baseModel, dropout=0.1, hiddensize=baseModel.config.text_config.hidden_size)

In [18]:
# from accelerate import Accelerator
# accelerator = Accelerator()
# model, dataloader = accelerator.prepare(model, X_train)

In [19]:
out = model(input_ids=X_train)

In [20]:
out

tensor([[-4.4074,  0.7933,  1.4498],
        [ 0.7192,  0.7687,  1.4797],
        [-0.3027,  4.2175,  0.1307],
        [ 0.8562, -0.4328,  1.5695]], device='cuda:1',
       grad_fn=<AddmmBackward0>)

In [21]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.head.parameters() ,lr=0.0003)

In [22]:
lossi = []
y_train = y_train.to('cuda:1')
for epoch in tqdm(range(100)):
    optimizer.zero_grad()
    out = model(input_ids=X_train)

    loss = criterion(out, y_train.to('cuda:1'))
    loss.backward()
    optimizer.step()
    lossi.append(loss.item())

  0%|          | 0/100 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 100.00 MiB. GPU 1 has a total capacity of 23.65 GiB of which 46.12 MiB is free. Process 2536225 has 8.11 GiB memory in use. Including non-PyTorch memory, this process has 15.47 GiB memory in use. Of the allocated memory 14.88 GiB is allocated by PyTorch, and 137.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)