Loosely following:
 
https://www.datacamp.com/tutorial/fine-tuning-large-language-models

In [1]:
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, Gemma3Model,  TrainingArguments, Trainer
from huggingface_hub import login
from dotenv import load_dotenv
import os
import torch
import torch.nn as nn

from tqdm import tqdm
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()

HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
MODEL = "google/gemma-3-4b-it"
SEED = 69

login(token=HUGGINGFACE_TOKEN)

In [3]:
# Lets get tha dataset
# For us the dataset will be 
raw_dataset = load_dataset("mteb/tweet_sentiment_extraction")
df_train = pd.DataFrame(raw_dataset['train'])
df_test = pd.DataFrame(raw_dataset['test'])

In [4]:
# each segment of text "tweet" has a class 0 (negative), 1 (neutral), or 2 (positive)
df_train['label'].unique()

array([1, 0, 2])

In [5]:
df_train

Unnamed: 0,id,text,label,label_text
0,cb774db0d1,"I`d have responded, if I were going",1,neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,0,negative
2,088c60f138,my boss is bullying me...,0,negative
3,9642c003ef,what interview! leave me alone,0,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...",0,negative
...,...,...,...,...
26727,4eac33d1c0,wish we could come see u on Denver husband l...,0,negative
26728,4f4c4fc327,I`ve wondered about rake to. The client has ...,0,negative
26729,f67aae2310,Yay good for both of you. Enjoy the break - y...,2,positive
26730,ed167662a5,But it was worth it ****.,2,positive


In [6]:
# we need this to format the input so model can understand
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)

In [7]:
# test of the tokenizer
text = ['hello world', 'bobby like to eat pizza']
vec = tokenizer(text, padding=True)
print("encoding: ",vec)

print("decoding: ",tokenizer.batch_decode(vec['input_ids']))

encoding:  {'input_ids': [[0, 0, 0, 0, 2, 23391, 1902], [2, 236763, 13990, 1133, 531, 9039, 19406]], 'attention_mask': [[0, 0, 0, 0, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1]]}
decoding:  ['<pad><pad><pad><pad><bos>hello world', '<bos>bobby like to eat pizza']


In [8]:
# we jsut define this so be used with the 'dataset' map function so apply to the data
def tokenize_dataset(data):
    return tokenizer(data['text'], padding="max_length", truncation=True, max_length=128)

In [9]:
# apply tokanizeion to the dataset
dataset = raw_dataset.map(tokenize_dataset, batched=True)

In [10]:
# shuffle the dataset and split into smaller part sow e can run on laptop
train = dataset['train'].shuffle(SEED).select(range(4))
test = dataset['test'].shuffle(SEED).select(range(4))

In [11]:
#make data into a tensor
X_train = torch.tensor(train['input_ids'])
y_train = torch.tensor(train['label'])
X_test = torch.tensor(test['input_ids'])
y_test = torch.tensor(test['label'])

X_train.shape, y_train.shape, X_test.shape, y_test.shape

(torch.Size([4, 128]), torch.Size([4]), torch.Size([4, 128]), torch.Size([4]))

In [12]:
def check_gpu_memory():
    if torch.cuda.is_available():
        for i in range(torch.cuda.device_count()):
            print(f"\nGPU {i}:")
            print(f"  Allocated: {torch.cuda.memory_allocated(i) / 1024**3:.2f} GB")
            print(f"  Cached: {torch.cuda.memory_reserved(i) / 1024**3:.2f} GB")
            print(f"  Total: {torch.cuda.get_device_properties(i).total_memory / 1024**3:.2f} GB")


In [13]:
# Since we are using gemma we need to def a model for seq classification
# To do so we will import the base model then construct our model using output from the base model
baseModel = Gemma3Model.from_pretrained(MODEL, device_map='auto', 
                                        output_hidden_states=True, 
                                        attn_implementation="eager", 
                                            max_memory = {
                                            0: "20GiB",        # CGPU 0 - more memory training
                                            1: "8GiB",        # GPU 1 - less of the model since it will have outpus and y 
                                            "cpu": "80Gib"
                                            }
                                        )

check_gpu_memory()

Loading checkpoint shards: 100%|██████████| 2/2 [00:03<00:00,  1.93s/it]
Some parameters are on the meta device because they were offloaded to the cpu.
We've detected an older driver with an RTX 4000 series GPU. These drivers have issues with P2P. This can affect the multi-gpu inference when using accelerate device_map.Please make sure to update your driver to the latest version which resolves this.



GPU 0:
  Allocated: 7.93 GB
  Cached: 7.94 GB
  Total: 23.65 GB

GPU 1:
  Allocated: 7.74 GB
  Cached: 7.74 GB
  Total: 23.65 GB


In [14]:

baseModel.vision_tower  = baseModel.vision_tower.to("cpu")
for param in baseModel.vision_tower.parameters():
                param.requires_grad = False
    
check_gpu_memory()


GPU 0:
  Allocated: 6.38 GB
  Cached: 7.94 GB
  Total: 23.65 GB

GPU 1:
  Allocated: 7.74 GB
  Cached: 7.74 GB
  Total: 23.65 GB


In [15]:
baseModel.config.output_hidden_states = True          
baseModel.gradient_checkpointing_enable()     

In [16]:
class Gemma3Classifier(nn.Module):
    def __init__(self, bmodel, hiddensize, dropout=0.1):
        super().__init__()
        self.bmodel = bmodel
        self.dropout = nn.Dropout(dropout) 
        self.head = nn.Linear(hiddensize, 3).to('cuda:1')
        self.device_placement = True
    
    def forward(self, input_ids):
        out = self.bmodel(input_ids)
        hidden_state = out.hidden_states[-1]
        embeddings = hidden_state[:, -1, :]  

        embeddings = embeddings.to('cuda:1')

        logits = self.head(self.dropout(embeddings))
        del embeddings

        return logits 

In [17]:
model = Gemma3Classifier(bmodel=baseModel, dropout=0.1, hiddensize=baseModel.config.text_config.hidden_size)

In [18]:
out = model(input_ids=X_train)

In [19]:
out

tensor([[-2.4767, -1.7002, -4.2100],
        [ 0.4149, -0.5600,  0.5343],
        [-1.9854, -1.8955, -3.7252],
        [ 0.2192,  0.1666, -0.7553]], device='cuda:1',
       grad_fn=<AddmmBackward0>)

In [20]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.head.parameters() ,lr=0.0003)

In [21]:
lossi = []
y_train = y_train.to('cuda:1')
torch.cuda.empty_cache()

for epoch in tqdm(range(100)):
    optimizer.zero_grad()
    out = model(input_ids=X_train)

    loss = criterion(out, y_train.to('cuda:1'))
    loss.backward()
    optimizer.step()
    lossi.append(loss.item())
    torch.cuda.empty_cache()


100%|██████████| 100/100 [00:55<00:00,  1.81it/s]


In [22]:
lossi

[1.415424108505249,
 0.6559700965881348,
 0.3055573105812073,
 0.10778151452541351,
 0.07022865116596222,
 0.0264415442943573,
 0.010756859555840492,
 0.007234584540128708,
 0.005657943431288004,
 0.0035776859149336815,
 0.002383651677519083,
 0.0028499371837824583,
 0.0009930935921147466,
 0.0007010772242210805,
 0.0005219571758061647,
 0.0009688568534329534,
 0.0005515862721949816,
 0.00031373760430142283,
 0.00044760460150428116,
 0.00021824135910719633,
 0.00013935935567133129,
 0.0001674848172115162,
 0.00015520457236561924,
 0.00012592134589795023,
 0.00015762449766043574,
 0.00014960102271288633,
 0.00011769855336751789,
 0.00014391285367310047,
 0.00010238455433864146,
 0.00010527321137487888,
 9.487610805081204e-05,
 0.00011474950588308275,
 0.00016279389092233032,
 3.6625457141781226e-05,
 0.00011057771916966885,
 0.00011489531607367098,
 6.14162054262124e-05,
 3.32279087160714e-05,
 6.144627695903182e-05,
 6.079075683373958e-05,
 9.660367504693568e-05,
 4.5475542719941586e-0