<a href="https://colab.research.google.com/github/IdaCy/jailbreak-alert-research/blob/main/notebooks/ReNeLLM_gemma3-4b-it_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Check Python version (optional):
import sys
print("Python version:", sys.version)

# Get installations
!pip install --quiet torch numpy matplotlib scikit-learn pandas
!pip install --quiet huggingface_hub transformers

import os
import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# If you want to check GPU usage:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)

In [None]:
import torch
# Release unoccupied cached GPU memory held by torch's CUDA allocator.
torch.cuda.empty_cache()
# List the contents of the current working directory.
%ls

In [None]:
# Mount Google Drive at /content/drive; later cells read and write paths under it.
from google.colab import drive
drive.mount('/content/drive')

# After running this cell, follow the link to grant Colab access to your Google Drive.

In [None]:
!git clone https://github.com/IdaCy/jailbreak-alert-research.git
%cd jailbreak-alert-research

In [None]:
!pip install huggingface_hub --quiet

from huggingface_hub import notebook_login

# This will prompt you in Colab to enter your HF token or log in directly
notebook_login()

# First run only: load the model and tokenizer here

In [None]:
# Configure the root logger at INFO level before the project modules are imported.
import logging

logging.basicConfig(level=logging.INFO)

from functions.load_csv_prompts import load_prompts
from functions.csv_inference import load_model_and_tokenizer, run_inference

# Step 1: load the model and tokenizer a single time; later cells reuse both objects.
MODEL_NAME = "google/gemma-2-9b-it"

tokenizer, model = load_model_and_tokenizer(
    model_name=MODEL_NAME, use_bfloat16=True, hf_token=None
)

In [None]:
# Save model and tokenizer to Google Drive so a later session can load them
# from that directory.
import os

SAVE_PATH = "/content/drive/MyDrive/models/gemma-2-9b-it"

# Create the target directory from Python: no shell quoting is involved and
# an already-existing directory is not an error.
os.makedirs(SAVE_PATH, exist_ok=True)

print(f"Saving model and tokenizer to {SAVE_PATH}...")
model.save_pretrained(SAVE_PATH)
tokenizer.save_pretrained(SAVE_PATH)
print("Save complete!")


# On later runs, skip the load and save cells above and load the saved model from Google Drive here

In [None]:
# On later runs, use this cell to load the model and tokenizer saved on Google Drive.
# The cell imports everything it uses so it does not depend on which earlier
# cells were executed in this session.
import os

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

LOAD_PATH = "/content/drive/MyDrive/models/gemma-2-9b-it"

# Fail early with a direct message when the saved directory is not there
# (for example because Google Drive is not mounted), instead of letting the
# load call fail with a less specific model-lookup error.
if not os.path.isdir(LOAD_PATH):
    raise FileNotFoundError(
        f"No saved model directory at {LOAD_PATH}; "
        "mount Google Drive and save the model there first."
    )

print("Loading saved model from Google Drive...")
model = AutoModelForCausalLM.from_pretrained(
    LOAD_PATH,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(LOAD_PATH)
print("Model loaded successfully!")

In [None]:
import os

from functions.load_csv_prompts import load_prompts

# 2. Load prompts from CSV
csv_path = "data/code_chameleon/formatted_none_820jb.csv"  # Path to your jailbreak prompts
prompts = load_prompts(
    csv_path=csv_path,
    num_samples=None,  # Set a number here if you want to test with fewer samples
)

print(f"Loaded {len(prompts)} prompts")

# Create the extraction output directory, including missing parent
# directories; re-running the cell is not an error when it already exists.
os.makedirs("output/extractions/gemma-2-9b-it", exist_ok=True)

In [None]:
### just some logging ###
import logging
import sys

# Name used to recognise the handler installed by this cell on later runs.
CONSOLE_HANDLER_NAME = "notebook-console"
LOG_FORMAT = "[%(asctime)s] [%(levelname)s] %(message)s"


def attach_console_handler(target_logger, level=logging.INFO):
    """Attach one stdout handler to ``target_logger`` and return it.

    If a handler previously attached by this function is already present on
    the logger, that handler is returned unchanged. Re-running the cell
    therefore does not stack additional handlers, which would print every
    log record once per handler.

    Args:
        target_logger: the ``logging.Logger`` to configure.
        level: threshold applied to the newly created handler.

    Returns:
        The ``logging.StreamHandler`` attached to ``target_logger``.
    """
    for existing in target_logger.handlers:
        if existing.get_name() == CONSOLE_HANDLER_NAME:
            return existing

    handler = logging.StreamHandler(sys.stdout)
    handler.set_name(CONSOLE_HANDLER_NAME)
    handler.setLevel(level)
    handler.setFormatter(logging.Formatter(LOG_FORMAT))
    target_logger.addHandler(handler)
    return handler


# Create (or get) a logger and set the level
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Console handler that prints to stdout; the same names as before stay
# available at cell level.
console_handler = attach_console_handler(logger)
formatter = console_handler.formatter


In [None]:
# 3. Run inference & capture activations
# Imported in this cell so it also works in a session where the first-time
# model-loading cell (the only other place that imports it) was not executed.
from functions.csv_inference import run_inference

# The same layer indices are requested for hidden states and for attention.
EXTRACTION_LAYERS = [0, 5, 10, 15, 20, 25]

run_inference(
    model,
    tokenizer,
    prompts,
    output_dir="output/extractions/gemma-2-9b-it",
    batch_size=8,
    max_new_tokens=50,
    extract_hidden_layers=list(EXTRACTION_LAYERS),
    extract_attention_layers=list(EXTRACTION_LAYERS),
    top_k_logits=10,
)

In [None]:
# Copy the local extraction outputs to Google Drive.
import shutil

LOCAL_PATH = "output/extractions/gemma-2-9b-it"
GDRIVE_PATH = "/content/drive/MyDrive/jailbreak_storing/output/extractions/gemma-2-9b-it"

# copytree creates GDRIVE_PATH (and missing parents), merges into it when it
# already exists, and raises if LOCAL_PATH is missing or a file cannot be
# copied, so the message below is only printed after a successful copy.
shutil.copytree(LOCAL_PATH, GDRIVE_PATH, dirs_exist_ok=True)

print("got .pt files over to:", GDRIVE_PATH)

In [None]:
%ls GDRIVE_PATH
%mkdir output/extractions/gemma3-4bit/attack_jailbreak

In [None]:
# 2. Load one set of prompts
# NOTE(review): load_json_prompts is not imported in any cell visible in this
# notebook (only load_prompts from functions.load_csv_prompts is) — confirm
# which module provides it and import it before this call.
prompts_file = "data/renellm/attacks_all.json"
# The loaded prompts are stored as prompts_jb.
prompts_jb = load_json_prompts(
    prompts_file,
    prompt_key="attack_jailbreak",
    num_samples=None
)

In [None]:
# 3. Run inference & capture activations
# Imported in this cell so it also works in a session where the first-time
# model-loading cell (the only other place that imports it) was not executed.
from functions.csv_inference import run_inference

# The same layer indices are requested for hidden states and for attention.
EXTRACTION_LAYERS = [0, 5, 10, 15, 20, 25]

run_inference(
    model,
    tokenizer,
    # prompts_jb is the variable produced by the prompt-loading cell; the
    # previously referenced prompts_bad is not defined anywhere in the notebook.
    prompts_jb,
    # Matches the directory the copy-to-Drive cell reads from
    # (output/extractions/gemma3-4b/attack_jailbreak).
    output_dir="output/extractions/gemma3-4b/attack_jailbreak",
    batch_size=8,
    max_new_tokens=50,
    extract_hidden_layers=list(EXTRACTION_LAYERS),
    extract_attention_layers=list(EXTRACTION_LAYERS),
    top_k_logits=10,
)

In [None]:
# Copy the local attack_jailbreak extraction outputs to Google Drive.
import shutil

LOCAL_PATH = "output/extractions/gemma3-4b/attack_jailbreak"
GDRIVE_PATH = "/content/drive/MyDrive/jailbreak_storing/output/extractions/gemma3-4b/attack_jailbreak"

# copytree creates GDRIVE_PATH (and missing parents), merges into it when it
# already exists, and raises if LOCAL_PATH is missing or a file cannot be
# copied, so the message below is only printed after a successful copy.
shutil.copytree(LOCAL_PATH, GDRIVE_PATH, dirs_exist_ok=True)

print("got .pt files over to:", GDRIVE_PATH)

In [None]:
# List the gemma3-4b extraction directory on Google Drive, then its attack_jailbreak subdirectory.
%ls /content/drive/MyDrive/jailbreak_storing/output/extractions/gemma3-4b/
%ls /content/drive/MyDrive/jailbreak_storing/output/extractions/gemma3-4b/attack_jailbreak

In [None]:
import sys

# Put the current directory on the import path so the `functions` package resolves.
sys.path.append(".")

from functions.read_predictions import read_predictions

# NOTE(review): this reads from ".../gemma3-4b/attack", while the copy cell in
# this notebook uses ".../gemma3-4b/attack_jailbreak" — confirm the directory
# is the intended one.
preds = read_predictions(
    read_output_dir="output/extractions/gemma3-4b/attack",
    max_predictions=50,
    write_predictions_file="logs/pred_out.txt",
    log_file="logs/readpred.log",
)

# One constant drives both the label and the slice so the printed count
# always matches what is shown (the label used to say 3 while 15 were shown).
PREVIEW_COUNT = 15

print("Number of preds:", len(preds))
print(f"First {PREVIEW_COUNT} preds:", preds[:PREVIEW_COUNT])