Loosely following:
 
https://www.datacamp.com/tutorial/fine-tuning-large-language-models

In [None]:
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, Gemma3Model,  TrainingArguments, Trainer
from huggingface_hub import login
from dotenv import load_dotenv
import os
import torch
import torch.nn as nn
import evaluate
import numpy as np

In [None]:
# Load environment variables (via python-dotenv) so the token can live outside the notebook.
load_dotenv()

HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")  # None when the variable is not set
MODEL = "google/gemma-3-4b-it"
SEED = 69

# Pick the best available accelerator instead of assuming Apple-silicon MPS:
# a hard-coded 'mps' makes every later `.to(device)` fail on other machines.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Authenticate against the Hugging Face Hub with the token read above.
login(token=HUGGINGFACE_TOKEN)

In [None]:
# Load the tweet sentiment dataset ("mteb/tweet_sentiment_extraction")
# and keep a pandas copy of the train and test splits for inspection.
raw_dataset = load_dataset("mteb/tweet_sentiment_extraction")
df_train, df_test = (pd.DataFrame(raw_dataset[split]) for split in ("train", "test"))

In [None]:
# Each piece of text (a "tweet") has a class label: 0 (negative), 1 (neutral), or 2 (positive).
# Show the distinct label values present in the training split.
df_train['label'].unique()

In [None]:
# Display the training DataFrame for a quick look at the raw rows.
df_train

In [None]:
# Load the tokenizer that belongs to MODEL; it converts raw text into the
# token ids / attention mask format the model understands.
# NOTE(review): trust_remote_code=True permits running custom code shipped with
# the model repo; presumably not needed for this tokenizer — confirm and drop it.
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)

In [None]:
# Quick sanity check of the tokenizer: encode a small batch of two sentences
# (with padding enabled) and print the resulting encoding.
text = ['hello world', 'bobby like to eat pizza']
vec = tokenizer(text, padding=True)
print("encoding: ",vec)

# Decode the token ids back to text to verify the round trip.
print("decoding: ",tokenizer.batch_decode(vec['input_ids']))

In [None]:
# Helper meant to be passed to the dataset's `map` method so the tokenizer is applied to the data.
def tokenize_dataset(data):
    """Tokenize the ``text`` field of ``data`` to a fixed length of 128 tokens.

    Args:
        data: Mapping with a ``text`` entry (a batch of examples when used
            with ``map(..., batched=True)``).

    Returns:
        The tokenizer output for ``data['text']``, padded to ``max_length``
        and truncated at 128 tokens.
    """
    tokenizer_options = {"padding": "max_length", "truncation": True, "max_length": 128}
    return tokenizer(data["text"], **tokenizer_options)

In [None]:
# Apply tokenization to the whole dataset, processing examples in batches.
dataset = raw_dataset.map(tokenize_dataset, batched=True)

In [None]:
# Shuffle each split with a fixed seed and keep only the first 2 examples,
# so the experiment is small enough to run on a laptop.
train = dataset['train'].shuffle(SEED).select(range(2))
test = dataset['test'].shuffle(SEED).select(range(2))

In [None]:
# Since we are using Gemma, we need to define our own model for sequence classification.
# To do so, load the base model here; the classifier is then built on top of the base model's output.
# Hidden states are requested because the classifier head reads them.
baseModel = Gemma3Model.from_pretrained(MODEL, device_map="auto", output_hidden_states=True, attn_implementation="eager")

In [None]:
# Make sure the base model returns its hidden states; Gemma3Classifier.forward reads `out.hidden_states`.
baseModel.config.output_hidden_states = True
# Turn on gradient checkpointing (presumably to reduce training memory on a laptop — confirm).
baseModel.gradient_checkpointing_enable()

In [None]:
class Gemma3Classifier(nn.Module):
    """Sequence classifier: a linear head on top of a backbone's final hidden state.

    The backbone is called with ``input_ids`` and ``attention_mask`` and must
    return an object exposing ``hidden_states``; the last entry is used. For
    every sequence, the hidden state of the last attended (non-padding) token
    goes through dropout and a linear layer to produce the class logits.

    Args:
        bmodel: Backbone model whose output exposes ``hidden_states``.
        hiddensize: Width of the backbone's hidden states (input size of the head).
        dropout: Dropout probability applied before the head.
        num_labels: Number of output classes; defaults to 3.
    """

    def __init__(self, bmodel, hiddensize, dropout=0.1, num_labels=3):
        super().__init__()
        self.bmodel = bmodel
        self.dropout = nn.Dropout(dropout)
        self.head = nn.Linear(hiddensize, num_labels)

    @staticmethod
    def _last_token_embeddings(hidden_state, attention_mask):
        """Return the hidden state of the last attended token of each sequence.

        Works for both left- and right-padded batches. Without a mask it falls
        back to the final position of every sequence.
        """
        if attention_mask is None:
            return hidden_state[:, -1, :]
        device = hidden_state.device
        positions = torch.arange(hidden_state.shape[1], device=device, dtype=torch.int32)
        mask = attention_mask.to(device=device, dtype=torch.int32)
        # Padding positions contribute 0, so the arg-max is the right-most attended position.
        last_index = (mask * positions).argmax(dim=1)
        rows = torch.arange(hidden_state.shape[0], device=device)
        return hidden_state[rows, last_index]

    def forward(self, input_ids, attention_mask, labels=None):
        """Compute class logits and, when ``labels`` is given, the cross-entropy loss.

        Args:
            input_ids: Token ids forwarded to the backbone.
            attention_mask: Mask forwarded to the backbone and used to locate
                the last non-padding token of each sequence.
            labels: Optional class indices; when provided a loss is returned.

        Returns:
            ``{"logits": logits}``, plus a ``"loss"`` entry when ``labels`` is given.
        """
        # Keyword arguments on purpose: the multimodal Gemma3Model.forward takes
        # `pixel_values` as its second positional parameter, so a positional
        # attention mask would be bound to the wrong argument.
        out = self.bmodel(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = out.hidden_states[-1]
        embeddings = self._last_token_embeddings(hidden_state, attention_mask)
        logits = self.head(self.dropout(embeddings))

        if labels is None:
            return {"logits": logits}
        loss = nn.CrossEntropyLoss()(logits, labels)
        return {"loss": loss, "logits": logits}


In [None]:
# Accuracy metric from the `evaluate` library, loaded once and reused by compute_metrics.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score predictions with the accuracy metric.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row
        of logits, compared against the labels.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

In [None]:
# Build the classifier on top of the base model and move it to the target device.
# Gemma3Model uses a composite (text + vision) config that has no top-level
# `hidden_size`, so the width is read from the text sub-config; for a plain
# text config `get_text_config()` returns the config itself.
model = Gemma3Classifier(
    bmodel=baseModel,
    dropout=0.1,
    hiddensize=baseModel.config.get_text_config().hidden_size,
).to(device)

In [None]:
# Smoke test: run one forward pass over the tiny training subset.
# Each model argument is built from the matching dataset column and moved to `device`.
out = model(**{
    argument: torch.tensor(train[column]).to(device)
    for argument, column in (
        ("input_ids", "input_ids"),
        ("attention_mask", "attention_mask"),
        ("labels", "label"),
    )
})

In [None]:

# Training configuration: small per-device batches combined with gradient
# accumulation. No evaluation strategy is set here (the original
# `evaluation_strategy="epoch"` option was left commented out).
training_args = TrainingArguments(
    output_dir="test_trainer",
    per_device_train_batch_size=4,  # reduced batch size for training
    per_device_eval_batch_size=4,  # reduced batch size for evaluation as well
    gradient_accumulation_steps=4,
)

# Wire the classifier, the data subsets and the accuracy callback into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=test,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Evaluate the trained model; the Trainer was given `test` as its eval dataset
# and `compute_metrics` for scoring.
trainer.evaluate()