In [None]:
import os
from google.colab import userdata
# Copy the HF_TOKEN secret from Colab's userdata store into the process
# environment (presumably so the Hugging Face libraries can authenticate).
os.environ.update({"HF_TOKEN": userdata.get("HF_TOKEN")})

# Install dependencies

In [None]:
! pip install --upgrade --quiet bitsandbytes datasets peft transformers trl rdkit

# Load model from HF

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

base_model = "google/txgemma-"
CHAT_VARIANT = "9b-chat" # @param ["9b-chat", "27b-chat"]

# Full Hub identifier, e.g. "google/txgemma-9b-chat"
model_id = f"{base_model}{CHAT_VARIANT}"

# 4-bit NF4 quantization with bfloat16 compute, to reduce memory usage
quantization_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Options for loading the quantized causal LM
model_load_options = dict(
    quantization_config=quantization_config,
    device_map={"": 0},  # the "" key maps the whole model to device 0
    torch_dtype="auto",
    attn_implementation="eager",
)
model = AutoModelForCausalLM.from_pretrained(model_id, **model_load_options)

# Load the dataset and prepare the train/test split

In [None]:
import json

# Read the JSONL dataset: one JSON object per line.
# The encoding is stated explicitly so decoding does not depend on the
# platform default, and blank lines (e.g. a trailing empty line) are skipped
# instead of crashing json.loads.
with open("train_hif_binding.jsonl", "r", encoding="utf-8") as f:
    binders = [json.loads(line) for line in f if line.strip()]

# Formatting function used later for LoRA fine-tuning
def formatting_func(example):
    """Return the training text for one record.

    The record's 'prompt' value and its 'bind' value are placed on two
    consecutive lines (prompt first, then bind).
    """
    return "{}\n{}".format(example["prompt"], example["bind"])

# Preview the training text produced for the first record
preview_text = formatting_func(binders[0])
print(preview_text)

In [None]:
import pandas as pd

# Re-key every record (prompt -> input, bind -> output) and collect the
# results into a two-column DataFrame.
records = [dict(input=ex["prompt"], output=ex["bind"]) for ex in binders]
data = pd.DataFrame(records)

data

## Splitting into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as the test set (fixed random_state so the split
# is repeatable) and renumber each split's index from 0.
train_data, test_data = (
    split.reset_index(drop=True)
    for split in train_test_split(data, test_size=0.1, random_state=42)
)

train_data

# Fine-tuning the model (finally 😱)

In [None]:
from peft import LoraConfig

# Names of the modules that receive LoRA adapters
lora_target_modules = [
    "q_proj",
    "o_proj",
    "k_proj",
    "v_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]

# Rank-8 LoRA configuration for causal language modelling
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    target_modules=lora_target_modules,
)

In [None]:
from peft import prepare_model_for_kbit_training, get_peft_model

# Preprocess the quantized model for training, then wrap the result in a
# PeftModel built from lora_config.
model = get_peft_model(prepare_model_for_kbit_training(model), lora_config)

In [None]:
import transformers
from trl import SFTTrainer, SFTConfig

import dataclasses

from datasets import Dataset

# SFTTrainer consumes Hugging Face `datasets` objects, so convert the pandas
# training split before handing it over.
train_dataset = Dataset.from_pandas(train_data, preserve_index=False)


def _format_training_row(row):
    """Build the training text for one row of `train_data`.

    `train_data` stores its fields under "input"/"output", while
    `formatting_func` reads "prompt"/"bind"; map the keys here so the text
    layout stays defined in `formatting_func` only.
    """
    return formatting_func({"prompt": row["input"], "bind": row["output"]})


# Newer TRL releases name this SFTConfig option `max_length` instead of
# `max_seq_length`; the install cell upgrades TRL, so use whichever name the
# installed version declares.
_sft_config_fields = {field.name for field in dataclasses.fields(SFTConfig)}
_max_length_option = (
    "max_length" if "max_length" in _sft_config_fields else "max_seq_length"
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    args=SFTConfig(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,  # 1 x 4 = 4 examples per optimizer step
        warmup_steps=2,
        max_steps=50,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=5,
        output_dir="/content/outputs",
        optim="paged_adamw_8bit",
        report_to="none",
        **{_max_length_option: 512},
    ),
    # `model` was already wrapped with get_peft_model(model, lora_config), so
    # `peft_config` is intentionally not passed a second time.
    formatting_func=_format_training_row,
)


In [None]:
trainer.train()

# Save the trained model (no path given, so the trainer's own output
# location is used).
trainer.save_model()

# `Trainer.tokenizer` is deprecated in favour of `Trainer.processing_class`;
# prefer the new attribute and fall back to the old one on older releases.
processing_class = getattr(trainer, "processing_class", None)
if processing_class is None:
    processing_class = trainer.tokenizer
processing_class.save_pretrained(trainer.args.output_dir)

# Test the fine-tuned model

In [None]:
# Take the first test prompt, tokenize it and move the tensors to the GPU
prompt = test_data["input"].iloc[0]
encoded_prompt = tokenizer(prompt, return_tensors="pt")
inputs = encoded_prompt.to("cuda")

# Generate at most 8 new tokens and print the decoded sequence
outputs = model.generate(max_new_tokens=8, **inputs)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

In [None]:
# Show the reference output stored for the first test row
expected_output = test_data.iloc[0]["output"]
print(f"The correct output should be:\n{expected_output}")