In [None]:
!pip install transformers datasets peft accelerate bitsandbytes


In [None]:
import json
import os

import torch
from datasets import load_dataset, Dataset
from peft import get_peft_model, LoraConfig, TaskType
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Load JSONL dataset (one JSON object per line)
data_path = "formatted_polymer_dataset.jsonl"  # Ensure this file is in your working directory
with open(data_path, "r", encoding="utf-8") as f:
    # Iterate the file lazily and skip blank lines (e.g. a trailing newline at
    # the end of the file), which json.loads would otherwise reject.
    data = [json.loads(line) for line in f if line.strip()]

# Fail early with a clear message instead of a confusing error from the split below.
if not data:
    raise ValueError(f"No records found in {data_path}")

dataset = Dataset.from_list(data)
# Fixed seed so the 90/10 train/test split is the same after a kernel restart.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

# Base checkpoint: fetch the tokenizer first, then the causal-LM weights in bfloat16.
model_name = "google/gemma-2b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# LoRA adapter settings, gathered in one mapping before being handed to PEFT.
lora_settings = {
    "task_type": TaskType.CAUSAL_LM,
    "r": 8,
    "lora_alpha": 32,
    "lora_dropout": 0.1,
    "bias": "none",
}
peft_config = LoraConfig(**lora_settings)

# Rebind `model` to the PEFT-wrapped version of the base model loaded above.
model = get_peft_model(model, peft_config)

# Tokenization
def tokenize_function(examples):
    """Tokenize a batch of examples for causal-LM fine-tuning.

    Args:
        examples: Batch supplied by ``dataset.map(..., batched=True)``; its
            ``"text"`` entry is passed to the tokenizer.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        per example and left unpadded.
    """
    # No padding here: the data collator below pads each batch to the length of
    # its longest sequence, so short examples are not inflated to 512 tokens.
    return tokenizer(examples["text"], truncation=True, max_length=512)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
# mlm=False selects causal-LM collation; this collator also pads each batch.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)


In [None]:
# Settings for the fine-tuning run, collected in one mapping so they can be
# inspected or tweaked before the TrainingArguments object is built.
run_config = {
    "output_dir": "./gemma2b-polymer-lora",
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 4,
    "max_steps": 100,
    "save_steps": 50,
    "logging_steps": 10,
    "learning_rate": 2e-4,
    "fp16": False,
    "bf16": True,
    "report_to": "none",
}
training_args = TrainingArguments(**run_config)

# Wire the LoRA-wrapped model, the tokenized splits and the collator into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

trainer.train()


In [None]:
# Publish the LoRA adapter and the tokenizer to the Hugging Face Hub.
#
# The access token is read from the HF_TOKEN environment variable instead of being
# embedded in the notebook. If the variable is unset, None is passed, which leaves
# it to the library to use locally stored credentials (e.g. from `huggingface-cli login`).
# NOTE(security): a token literal used to live in this cell; treat it as leaked
# and revoke it in the Hugging Face account settings.
hub_repo_id = "JishnuAkula/gemma2b-polymer-lora"
hf_token = os.environ.get("HF_TOKEN")

# `token` is the current keyword; `use_auth_token` is deprecated.
model.push_to_hub(hub_repo_id, token=hf_token)
tokenizer.push_to_hub(hub_repo_id, token=hf_token)


In [None]:
from transformers import pipeline
from peft import PeftModel

# Reload model with LoRA adapter: the tokenizer and adapter weights come from the
# fine-tuned repo, the base weights from the original Gemma checkpoint.
model_name = "JishnuAkula/gemma2b-polymer-lora"
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto", torch_dtype=torch.bfloat16)
model = PeftModel.from_pretrained(base_model, model_name)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto")

prompt = "Polymer SMILES: *OCOCC*, Solvent SMILES: O\nPredict Properties:"
# do_sample=True is required for temperature/top_k to take effect; without it
# generation is greedy and the sampling parameters are ignored.
output = pipe(prompt, max_new_tokens=100, do_sample=True, temperature=0.7, top_k=50)

print(output[0]['generated_text'])
