# New Section

In [1]:
from dataclasses import dataclass, field
from typing import Optional

import torch

from transformers import AutoTokenizer, HfArgumentParser, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
from accelerate import Accelerator
import os

In [2]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
@dataclass
class ScriptArguments:
    """
    Arguments for the fine_tuning
    """
    per_device_train_batch_size: Optional[int] = field(default=1)
    per_device_eval_batch_size: Optional[int] = field(default=1)
    gradient_accumulation_steps: Optional[int] = field(default=4)
    evaluation_strategy: Optional[str] = field(default="steps")
    evaluation_accumulation_steps: Optional[int] = field(default=5)
    learning_rate: Optional[float] = field(default=2e-4)
    max_grad_norm: Optional[float] = field(default=0.3)
    weight_decay: Optional[int] = field(default=0.001)
    lora_alpha= 64,
    lora_dropout =  0.5,
    lora_r = 32
    max_seq_length: Optional[int] = field(default=3500)
    model_name = "google/gemma-2b"
    fp16 = True
    bf16 = False
    gradient_checkpointing: Optional[bool] = field(
        default=True,
        metadata={"help": "Enables gradient checkpointing."},
    )
    use_flash_attention_2: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables Flash Attention 2."},
    )
    optim: Optional[str] = field(
        default="paged_adamw_32bit",
        metadata={"help": "The optimizer to use."},
    )
    lr_scheduler_type: str = field(
        default="constant",
        metadata={"help": "Learning rate schedule. Constant a bit better than cosine, and has advantage for analysis"},
    )
    max_steps: int = field(default=100, metadata={"help": "How many optimizer update steps to take"}),
    epochs : int = field(default=5, metadata={"help": "How many epochs to train for"})
    warmup_ratio: float = field(default=0.03, metadata={"help": "Fraction of steps to do a warmup for"})
    save_steps: int = field(default=100, metadata={"help": "Save checkpoint every X updates steps."})
    logging_steps: int = field(default=100, metadata={"help": "Log every X updates steps."})
    output_dir: str = field(
        default="./gemma/results",
        metadata={"help": "The output directory where the model predictions and checkpoints will be written."},
    )
    logging_dir: str = field(
        default="./gemma-2b/logs",
        metadata={"help": "The output directory where the logs will be written."},
    )
    eval_steps: int = field(default=100, metadata={"help": "How often to evaluate the model"})

parser = HfArgumentParser(ScriptArguments)
# Parse the arguments, ignoring unrecognized ones
script_args, remaining_args = parser.parse_args_into_dataclasses(return_remaining_strings=True)


In [4]:
# Load the GG model - this is the local one, update it to the one on the Hub
model_id = "google/gemma-2b"
access_token = "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"

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

In [5]:
# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map ="auto",
    attn_implementation="eager"
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token_id = tokenizer.eos_token_id

In [6]:
from peft import prepare_model_for_kbit_training

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [7]:
#Lora config
lora_config = LoraConfig(
    r=64,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)

D:\LLM\thesisPractical\fine-tuning_llms\Fine_tuning_gemma.ipynb
Fine_tuning_gemma.ipynb


In [6]:
# Load dataset
org_path = "D:\LLM\\thesisPractical\\datasets\\structural_removal_non_contiguous\\processed_4000"

train_dataset_url = org_path + "\\train.jsonl"
test_dataset_url =org_path + "\\test.jsonl"
validation_dataset_url =org_path + "\\validation.jsonl"

#train_dataset_url = "small_dataset/train.jsonl"
#test_dataset_url ="small_dataset/test.jsonl"
#validation_dataset_url ="small_dataset/validation.jsonl"

data_files = {
    'train': train_dataset_url,
    'test': test_dataset_url,
    'validation': validation_dataset_url
}

dataset = load_dataset('json', data_files=data_files)
train_dataset = dataset['train']
test_dataset = dataset['test']
validation_dataset = dataset['validation']

In [7]:
# Tokenize the data
def tokenize_function(examples):
    instruction = "Complete the following software model by finding the missing part: "
    inputs = [instruction + inp for inp in examples['input']]
    targets = examples['output']
    max_length = script_args.max_seq_length
    model_input = tokenizer(inputs, max_length=max_length, padding="max_length", truncation=True)

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_length, padding="max_length", truncation=True)

    model_input['labels'] = labels['input_ids']
    return model_input

trained_data = train_dataset.map(tokenize_function, batched=True)
validation_data = validation_dataset.map(tokenize_function, batched=True)
test_data = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/97 [00:00<?, ? examples/s]



In [10]:
accelerator = Accelerator()
model = accelerator.prepare_model(model)

In [11]:
sft_config = SFTConfig(
    output_dir=script_args.output_dir,
    per_device_train_batch_size=script_args.per_device_train_batch_size,
    per_device_eval_batch_size=script_args.per_device_eval_batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
    save_steps=script_args.save_steps,
    logging_steps=script_args.logging_steps,
    optim=script_args.optim,
    num_train_epochs=script_args.epochs,
    lr_scheduler_type=script_args.lr_scheduler_type,
    gradient_checkpointing=script_args.gradient_checkpointing,
    eval_strategy="steps",
    eval_steps=script_args.eval_steps,
    eval_accumulation_steps=script_args.evaluation_accumulation_steps,
    logging_dir=script_args.logging_dir,
    warmup_ratio=script_args.warmup_ratio,
    logging_strategy="steps",
    learning_rate=script_args.learning_rate,
    max_seq_length= script_args.max_seq_length,
    fp16=script_args.fp16,
    bf16=script_args.bf16,

)

In [36]:
type(trained_data)

datasets.arrow_dataset.Dataset

In [None]:
from datasets import Dataset

# Assuming `original_dataset` is your Dataset object
first_element = trained_data[0]

# Convert the first element into a new Dataset object
new_dataset_train = Dataset.from_dict({key: [value] for key, value in first_element.items()})

# Assuming `original_dataset` is your Dataset object
first_element = validation_data[0]

# Convert the first element into a new Dataset object
new_dataset_validation = Dataset.from_dict({key: [value] for key, value in first_element.items()})


In [None]:
'''from evaluate import load
import numpy as np

perplexity = load("perplexity", module_type="metric")
def compute_metrics(eval_pred):
    metrics, labels = eval_pred
    predictions = np.argmax(metrics, axis=-1)

    return perplexity.compute(predictions=predictions, model_id='gemma-2b')'''
import evaluate
import numpy as np

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)


def preprocess_logits_for_metrics(logits, labels):
    """
    Original Trainer may have a memory leak.
    This is a workaround to avoid storing too many tensors that are not needed.
    """
    print(type(logits))
    pred_ids = torch.argmax(logits, dim=-1)

    return pred_ids, labels

Evaluation metrics

In [12]:
#train
trainer = SFTTrainer(
    model=model,
    train_dataset=trained_data,
    eval_dataset=validation_data,
    peft_config=lora_config,
    tokenizer=tokenizer,
    args=sft_config,
    max_seq_length=script_args.max_seq_length,
    #compute_metrics=compute_metrics,
    #preprocess_logits_for_metrics=preprocess_logits_for_metrics
)




Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


In [13]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss
100,0.1701,0.112299
200,0.0972,0.098592
300,0.0868,0.096122
400,0.0755,0.092949
500,0.0615,0.097147
600,0.0578,0.10128
700,0.0481,0.101515




TrainOutput(global_step=725, training_loss=0.08407538578428071, metrics={'train_runtime': 16844.9451, 'train_samples_per_second': 0.172, 'train_steps_per_second': 0.043, 'total_flos': 1.254741583872e+17, 'train_loss': 0.08407538578428071, 'epoch': 5.0})

In [14]:
model_path = 'gemma-2b_model'
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('gemma-2b_model\\tokenizer_config.json',
 'gemma-2b_model\\special_tokens_map.json',
 'gemma-2b_model\\tokenizer.model',
 'gemma-2b_model\\added_tokens.json',
 'gemma-2b_model\\tokenizer.json')

In [44]:

model_directory = "D:\\LLM\\thesisPractical\\fine-tuning_llms\\gemma\\results\\checkpoint-500"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_directory)

# Load the model
model = AutoModelForCausalLM.from_pretrained(model_directory,
    quantization_config=quantization_config,
    device_map ="auto",
    attn_implementation="eager")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [42]:
model_id = "google/gemma-2b"
# Load model
model2 = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map ="auto",
    attn_implementation="eager"
)

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
org_path = "D:\LLM\\thesisPractical\\datasets\\structural_removal_non_contiguous\\processed_4000"

test_dataset_url = org_path + "\\test.jsonl"

data_files = {
    'test' : test_dataset_url
}

dataset = load_dataset('json', data_files=data_files)
test_dataset = dataset['test']

In [55]:
data ="Complete the following software model by finding the missing part: " +  test_dataset[0]['input']
#data = "How are you ?"
#print(data)
output = test_dataset[0]['output']

print(output)

input_ids = tokenizer(data, padding=True,return_tensors='pt', truncation=True, max_length=3500)

#decoded_output = tokenizer.decode(input_ids["input_ids"], #skip_special_tokens=True)
print(input_ids)


{"nodes":[{"visibility":"PUBLIC_LITERAL","id":1,"eClass":"PackageImport"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"ControlFlow10","name":"ControlFlow10","id":10,"isLeaf":false,"eClass":"ControlFlow"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"ControlFlow18","name":"ControlFlow18","id":17,"isLeaf":false,"eClass":"ControlFlow"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"DecisionNode","name":"DecisionNode","id":25,"isLeaf":false,"eClass":"DecisionNode"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"ActivityFinalNode","name":"ActivityFinalNode","id":26,"isLeaf":false,"eClass":"ActivityFinalNode"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"DecisionNode2","name":"DecisionNode2","id":35,"isLeaf":false,"eClass":"DecisionNode"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"ActivityFinalNode4","name":"ActivityFinalNode4","id":39,"isLeaf":false,"eClass":"ActivityFinalNode"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"ActivityFinalNode5","name":"ActivityFinalNode5","i

In [57]:
print(input_ids["input_ids"])
input_ids.to("cuda")
print("model 1: ")
outputs = model.generate(**input_ids, max_length=3500)
decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded_output)
print("model 2:\n ")
output2 = model2.generate(**input_ids, max_length=3500)

decoded_output = tokenizer.decode(output2[0], skip_special_tokens=True)
print(decoded_output)

tensor([[     2,  15818,    573,  ...,   1192,   3271, 235270]],
       device='cuda:0')
model 1: 
Complete the following software model by finding the missing part: {"directed":true,"nodes":[{"viewpoint":null,"visibility":"PUBLIC_LITERAL","qualifiedName":"model","name":"model","id":0,"URI":null,"eClass":"Model"},{"isSingleExecution":false,"isReadOnly":false,"visibility":"PUBLIC_LITERAL","qualifiedName":"model::Activity","name":"Activity","id":2,"isActive":false,"isReentrant":true,"isLeaf":false,"isAbstract":false,"isFinalSpecialization":false,"eClass":"Activity"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"ControlFlow","name":"ControlFlow","id":3,"isLeaf":false,"eClass":"ControlFlow"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"ControlFlow2","name":"ControlFlow2","id":4,"isLeaf":false,"eClass":"ControlFlow"},{"visibility":"PUBLIC_LITERAL","qualifiedName":"ControlFlow3","name":"ControlFlow3","id":5,"isLeaf":false,"eClass":"ControlFlow"},{"visibility":"PUBLIC_LITERAL","qualified

In [None]:
import shutil

shutil.make_archive("/content/gemma-2b_model", 'zip', "/content/gemma-2b_model")

In [None]:
#from google.colab import drive
#drive.mount('/content/drive')

In [None]:
# Replace 'yourfile.zip' with the name of your zip file
shutil.move('/content/gemma-2b_model.zip', '/content/drive/My Drive/gemma2b_first_file.zip')