In [None]:
# Install Pytorch & other libraries
%pip -q install torch tensorboard

# Install Hugging Face libraries
%pip -q install transformers datasets accelerate evaluate trl protobuf sentencepiece

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/564.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m564.7/564.7 kB[0m [31m36.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import subprocess
import sys
import torch

def install_flash_attn_conditionally():
    """
    Install flash-attn when the first CUDA GPU meets the capability threshold.

    The compute capability of GPU 0 is read from ``torch.cuda`` and compared
    against 8.0, the threshold this notebook uses for Ampere, Ada, Hopper or
    newer architectures (FlashAttention 2).

    Returns:
        bool: True if flash-attn was installed by this call. False if no CUDA
        GPU is available, the GPU is below compute capability 8.0, or the
        capability query / pip installation raised an exception.
    """
    if not torch.cuda.is_available():
        print("No CUDA-enabled GPU found. Skipping flash-attn installation.")
        # Always hand back a real bool so callers never have to deal with None.
        return False

    try:
        # Get the compute capability of the first available GPU
        major, minor = torch.cuda.get_device_capability(0)
        compute_capability = f"{major}.{minor}"
        gpu_name = torch.cuda.get_device_name(0)
        print(f"Found GPU: {gpu_name} with Compute Capability: {compute_capability}")

        # Check for Ampere, Ada, Hopper, or newer architectures (for FlashAttention 2).
        # Compare as an integer tuple: parsing "major.minor" as a float would
        # order a two-digit minor version incorrectly (8.10 would become 8.1).
        if (major, minor) >= (8, 0):
            print(f"Installing flash-attn for {gpu_name}...")
            # A plain pip invocation through the running interpreter is enough on Colab
            subprocess.check_call([
                sys.executable, "-m", "pip", "install",
                "flash-attn", "--no-build-isolation", "-q"
            ])
            print("Successfully installed.")
            return True

        print(f"GPU compute capability {compute_capability} not fully supported.")
        return False

    except Exception as e:
        # Best effort: a failed install must not stop the notebook, the caller
        # simply falls back to another attention implementation.
        print(f"Installation failed: {e}")
        return False

# Remember the installer's result; the model-loading cell reads this flag to
# choose between "flash_attention_2" and the eager attention implementation.
is_flash_attn_available = install_flash_attn_conditionally()

Found GPU: NVIDIA A100-SXM4-40GB with Compute Capability: 8.0
Installing flash-attn for A100 GPU...
Successfully installed.


In [None]:
import os
# Expose only GPU 0 to CUDA.
# NOTE(review): CUDA_VISIBLE_DEVICES is normally read when CUDA is first
# initialized, and an earlier cell already queries torch.cuda, so this
# assignment may have no effect in this kernel. Confirm, or move it above
# that cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Turn off the tokenizers library's own parallelism.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and numerical warnings raised by
# the libraries used below; consider a narrower filter while debugging.
warnings.filterwarnings("ignore")

In [None]:
# General imports
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import torch

# Hugging Face imports
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
from datasets import Dataset, load_dataset
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig

# Scikit-learn for evaluation
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [None]:
# Record the installed transformers version alongside the results
print(f"transformers=={transformers.__version__}")

transformers==4.56.1


In [None]:
def set_deterministic(seed):
    """Seed every RNG in use and switch cuDNN to deterministic mode.

    Args:
        seed (int): Value applied to Python's ``random``, NumPy, PyTorch
            (CPU and all CUDA devices) and passed on to ``set_seed``.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # covers every GPU; silently ignored without CUDA
    set_seed(seed)

    # The CUDA settings the name and summary promise: seeding alone does not
    # stop cuDNN from picking a different (possibly non-deterministic) kernel
    # on each run, so disable autotuning and request deterministic kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Single seed for the whole notebook: applied to the RNGs here and passed to
# SFTConfig(seed=SEED) when the trainer is configured.
SEED = 0
set_deterministic(SEED)

In [None]:
# Hugging Face token setup
from huggingface_hub import login
from google.colab import userdata
hf_token = userdata.get('HF_Token')

login(token=hf_token)

# Hugging Face Hub id of the Gemma 3 4B instruction-tuned checkpoint.
GEMMA_PATH = "google/gemma-3-4b-it"

# Determine the attention implementation.
# Use the faster "flash_attention_2" if installed, otherwise fall back to the eager implementation.
attn_implementation = "flash_attention_2" if is_flash_attn_available else "eager"

# Pass a concrete dtype when the GPU supports bfloat16: with dtype="auto"
# transformers warns that Flash Attention 2 is used "without specifying a
# torch dtype". Other hardware keeps the previous "auto" behaviour.
if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
    model_dtype = torch.bfloat16
else:
    model_dtype = "auto"

model = AutoModelForCausalLM.from_pretrained(
    GEMMA_PATH,
    dtype=model_dtype,
    device_map="auto",
    attn_implementation=attn_implementation,
    token=hf_token
)

max_seq_length = 2048
# model_max_length is the tokenizer keyword for the length cap;
# max_seq_length is not a tokenizer argument and was silently ignored.
tokenizer = AutoTokenizer.from_pretrained(GEMMA_PATH, model_max_length=max_seq_length)

# Explicitly enable use_cache for faster inference
model.config.use_cache = True

# We use the end-of-sequence token as the padding token.
# Padding on the left is a common practice for decoder-only models.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
# Keep the model config and the generation config in sync with the tokenizer.
model.config.pad_token_id = tokenizer.pad_token_id
model.generation_config.pad_token_id = tokenizer.pad_token_id
model.config.bos_token_id = tokenizer.bos_token_id
model.generation_config.bos_token_id = tokenizer.bos_token_id

# Store the End-Of-Sequence token for use in prompt formatting
EOS_TOKEN = tokenizer.eos_token

print(f"Device: {model.device}")
print(f"DType: {model.dtype}")
print(f"Attention Implementation: {attn_implementation}")

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

You are attempting to use Flash Attention 2 without specifying a torch dtype. This might lead to unexpected behaviour


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/215 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Device: cuda:0
DType: torch.bfloat16
Attention Implementation: flash_attention_2


In [None]:
filename = "all-data.csv"

# The CSV has no header row: first column is the label, second the headline.
df = pd.read_csv(filename,
                 names=["sentiment", "text"],
                 encoding="utf-8", encoding_errors="replace")

# Draw 300 training and 300 test rows per sentiment so both sets are balanced
X_train, X_test = [], []
for sentiment in ["positive", "neutral", "negative"]:
    train, test = train_test_split(df[df.sentiment == sentiment],
                                   train_size=300,
                                   test_size=300,
                                   random_state=42)
    X_train.append(train)
    X_test.append(test)

# Concatenate and shuffle the training data
X_train = pd.concat(X_train).sample(frac=1, random_state=10)
X_test = pd.concat(X_test)

# Create a balanced evaluation set from the remaining data.
# The exclusion uses the indices of the *complete* train and test frames.
# The loop variables `train`/`test` only hold the last sentiment's split, so
# using them let rows of the other classes leak into the evaluation set.
used_idx = X_train.index.union(X_test.index)
eval_idx = df.index.difference(used_idx)
X_eval = df[df.index.isin(eval_idx)]
# Sample 50 rows per class; replace=True lets a class with fewer than 50
# remaining rows still contribute 50 (with repeats). Sampling each group
# explicitly keeps the `sentiment` column regardless of how the installed
# pandas treats grouping columns in groupby.apply.
X_eval = pd.concat(
    [group.sample(n=50, random_state=10, replace=True)
     for _, group in X_eval.groupby('sentiment')]
)
assert X_eval.index.intersection(used_idx).empty, "evaluation rows overlap train/test"
X_train = X_train.reset_index(drop=True)

In [None]:
# Prompt engineering for training and inference

def create_training_prompt(data_point):
    """Format a labelled data point for training.

    The text is exactly the inference prompt from ``create_test_prompt``
    followed by a space, the gold sentiment and the EOS token. The model is
    therefore fine-tuned on the same prompt it is later queried with; the
    earlier version carried a stray "generate_prompt " prefix and different
    line breaks.

    Args:
        data_point: Mapping or row with "text" (headline) and "sentiment".

    Returns:
        str: Prompt, answer and ``EOS_TOKEN``.
    """
    prompt = create_test_prompt(data_point)
    return f'{prompt} {data_point["sentiment"]}'.strip() + EOS_TOKEN

def create_test_prompt(data_point):
    """Format a data point for inference, leaving the sentiment for the model to generate.

    Args:
        data_point: Mapping or row with a "text" entry (the headline).

    Returns:
        str: Prompt ending in "[<headline>] =" (trailing whitespace stripped).
    """
    return f"""Analyze the sentiment of the news headline enclosed in square brackets, determine if it is positive, neutral, or negative,
    and return the answer as the corresponding sentiment label "positive" or "neutral" or "negative"

            [{data_point["text"]}] = """.strip()

In [None]:
# Replace each headline with its full prompt; the labelled splits keep the answer
for labelled_split in (X_train, X_eval):
    labelled_split["text"] = labelled_split.apply(create_training_prompt, axis=1)

# Set the gold labels aside first, then reduce the test frame to bare prompts
y_true = X_test["sentiment"]
test_prompts = X_test.apply(create_test_prompt, axis=1)
X_test = pd.DataFrame(test_prompts, columns=["text"])

# Wrap the labelled frames as Hugging Face Dataset objects
train_data = Dataset.from_pandas(X_train)
eval_data = Dataset.from_pandas(X_eval)

In [None]:
# Report how many rows ended up in each split
for split_name, split in (("Training", train_data),
                          ("Evaluation", eval_data),
                          ("Test", X_test)):
    print(f"{split_name} samples: {len(split)}")

Training samples: 900
Evaluation samples: 150
Test samples: 900


In [None]:
def evaluate(y_true, y_pred):
    """Print overall accuracy, per-class accuracy, a classification report and a confusion matrix.

    Labels are encoded as negative=0, neutral=1, positive=2. Any other string,
    including the "none" fallback, is counted as neutral (1).

    Args:
        y_true: Iterable of gold sentiment strings.
        y_pred: Iterable of predicted sentiment strings, in the same order.
    """
    class_ids = [0, 1, 2]
    class_names = ['negative', 'neutral', 'positive']  # position matches the encoded id
    mapping = {'positive': 2, 'neutral': 1, 'none':1, 'negative': 0}

    def encode(labels):
        # Unknown strings fall back to 1 (neutral), same as the explicit 'none' entry.
        return np.array([mapping.get(label, 1) for label in labels])

    y_true = encode(y_true)
    y_pred = encode(y_pred)

    # Calculate accuracy
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    print(f'Accuracy: {accuracy:.3f}')

    # Per-class accuracy, i.e. the share of each gold class predicted correctly.
    # Sorted so the lines always come out in the same order.
    for label in sorted(set(y_true.tolist())):
        mask = y_true == label
        accuracy = accuracy_score(y_true[mask], y_pred[mask])
        print(f'Accuracy for label {label}: {accuracy:.3f}')

    # Generate classification report; passing labels and target_names prints
    # the class names instead of the bare ids 0/1/2.
    class_report = classification_report(y_true=y_true, y_pred=y_pred,
                                         labels=class_ids, target_names=class_names)
    print('\nClassification Report:')
    print(class_report)

    # Generate confusion matrix (rows: gold class, columns: predicted class)
    conf_matrix = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=class_ids)
    print('\nConfusion Matrix:')
    print(conf_matrix)

In [None]:
def predict(X_test, model, tokenizer, batch_size=8):
    """Performs batch inference on the test set.

    Args:
        X_test: DataFrame whose "text" column holds the inference prompts.
        model: Causal language model exposing ``generate`` and ``device``.
        tokenizer: Tokenizer matching the model.
        batch_size (int): Prompts per generation call; set depending on GPU
            memory. Defaults to 8.

    Returns:
        list[str]: One of "positive", "negative", "neutral" or "none" per
        prompt, in input order.
    """
    y_pred = []
    # Convert DataFrame column to a list of prompts
    prompts = X_test["text"].tolist()

    for i in tqdm(range(0, len(prompts), batch_size)):
        batch = prompts[i:i+batch_size]
        # Send the inputs to wherever the model lives instead of assuming "cuda".
        inputs = tokenizer(batch,
                           return_tensors="pt",
                           padding=True,
                           truncation=True,
                           max_length=max_seq_length).to(model.device)

        outputs = model.generate(
            **inputs,
            # Set a higher max_new_tokens to ensure the model can generate full words
            max_new_tokens=10,
            do_sample=False, # Use greedy decoding for deterministic output
            top_p=1.0,
            top_k=50,
            pad_token_id=tokenizer.eos_token_id
        )

        # generate() returns prompt and continuation; keep only the new tokens.
        # Splitting the full text on "=" breaks as soon as the continuation
        # itself contains an "=".
        prompt_len = inputs["input_ids"].shape[1]
        decoded_outputs = tokenizer.batch_decode(outputs[:, prompt_len:],
                                                 skip_special_tokens=True)

        y_pred.extend(_parse_sentiment(text) for text in decoded_outputs)

    return y_pred


def _parse_sentiment(answer):
    """Return the sentiment label mentioned first in `answer`, or "none" if there is none."""
    answer = answer.lower()
    hits = [(answer.find(label), label)
            for label in ("positive", "negative", "neutral")
            if label in answer]
    # min() picks the earliest position, so "neutral, not positive" is neutral.
    return min(hits)[1] if hits else "none"

In [None]:
# Evaluate the base model
# Runs batched inference over the test prompts before any fine-tuning, which
# gives a reference point for the run after training.
y_pred = predict(X_test, model, tokenizer)

100%|██████████| 113/113 [01:45<00:00,  1.07it/s]


In [None]:
# Score the base-model predictions against the held-out test labels
evaluate(y_true, y_pred)

Accuracy: 0.769
Accuracy for label 0: 0.963
Accuracy for label 1: 0.647
Accuracy for label 2: 0.697

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.96      0.88       300
           1       0.73      0.65      0.68       300
           2       0.76      0.70      0.73       300

    accuracy                           0.77       900
   macro avg       0.76      0.77      0.76       900
weighted avg       0.76      0.77      0.76       900


Confusion Matrix:
[[289   9   2]
 [ 42 194  64]
 [ 27  64 209]]


In [None]:
# LoRA configuration: rank-64 adapters on the attention and MLP projections
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj",],
)

# Pick the mixed-precision mode that matches the hardware. The model is loaded
# in bfloat16 on GPUs that support it, so training in fp16 there mixes two
# half-precision formats; fp16 is kept only for GPUs without bfloat16 support.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

# SFT (Supervised Fine-tuning) configuration
training_arguments = SFTConfig(
    output_dir="logs",
    seed=SEED,
    num_train_epochs=5,
    gradient_checkpointing=True,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,  # effective batch size of 8 per device
    optim="adamw_torch_fused",
    # NOTE(review): presumably meant to disable periodic checkpoints;
    # save_strategy="no" would state that explicitly. Confirm.
    save_steps=0,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=not use_bf16,
    bf16=use_bf16,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=False,
    eval_strategy='steps',
    # 900 training rows / (1 x 8 accumulation) is 112.5 optimizer steps per
    # epoch on one device, so this evaluates roughly once per epoch.
    eval_steps = 112,
    eval_accumulation_steps=1,
    lr_scheduler_type="cosine",
    dataset_text_field="text",
    packing=False,
    max_length=max_seq_length,
    report_to="tensorboard",
)

# Initialize the trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=eval_data,
    peft_config=peft_config,
    processing_class=tokenizer,
    args=training_arguments,

)

Adding EOS to train dataset:   0%|          | 0/900 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/900 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/900 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/150 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/150 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/150 [00:00<?, ? examples/s]

In [None]:
# Train model
# NOTE(review): if this call is interrupted or raises, the save below does not
# run and no adapter is written.
trainer.train()

# Save the fine-tuned LoRA adapter to the local "trained-model" directory
trainer.model.save_pretrained("trained-model")

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 1}.
Casting fp32 inputs back to torch.float16 for flash-attn compatibility.


Step,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
112,7.0719,0.837998,0.83796,79519.0,0.827394


KeyboardInterrupt: 

In [None]:
# Metric records collected by the trainer during the run
log_history = trainer.state.log_history

# Split the records into a training-loss series and a validation-loss series
train_records = [record for record in log_history if "loss" in record]
eval_records = [record for record in log_history if "eval_loss" in record]

train_losses = [record["loss"] for record in train_records]
epoch_train = [record["epoch"] for record in train_records]
eval_losses = [record["eval_loss"] for record in eval_records]
epoch_eval = [record["epoch"] for record in eval_records]

# Draw both curves on one set of axes
fig, ax = plt.subplots()
ax.plot(epoch_train, train_losses, label="Training Loss")
ax.plot(epoch_eval, eval_losses, label="Validation Loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.set_title("Training and Validation Loss per Epoch")
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Load the TensorBoard notebook extension and open the dashboard on logs/runs.
# NOTE(review): presumably where SFTConfig(output_dir="logs",
# report_to="tensorboard") writes its event files. Confirm the path.
%load_ext tensorboard
%tensorboard --logdir logs/runs

In [None]:
# Set model configuration for inference
model.gradient_checkpointing_disable()
model.config.use_cache = True
# The trainer leaves the model in training mode; switch to eval mode so
# train-only behaviour (e.g. dropout) is off while generating.
model.eval()

# Re-run the same test prompts through the fine-tuned model and score them
y_pred = predict(X_test, model, tokenizer)
evaluate(y_true, y_pred)

In [None]:
# Put prompt, gold label and prediction side by side and write them to disk
prediction_columns = {'text': X_test["text"], 'y_true': y_true, 'y_pred': y_pred}
evaluation_df = pd.DataFrame(prediction_columns)
output_path = "test_predictions.csv"
evaluation_df.to_csv(output_path, index=False)

print("Predictions saved to test_predictions.csv")
evaluation_df.head()