In [None]:
import os
import shutil

def unzip_folder(folder_path):
    """Extract an archive into a directory named after the archive file.

    Args:
        folder_path: Path to the archive, e.g. ``"processed_2000.zip"``.

    The destination is ``folder_path`` with its trailing ``.zip`` removed
    (any other extension is dropped via ``os.path.splitext``). Only the suffix
    is stripped, so a ``.zip`` that appears elsewhere in the path - for
    instance in a parent directory name - is left untouched.
    """
    if folder_path.endswith(".zip"):
        destination = folder_path[: -len(".zip")]
    else:
        destination = os.path.splitext(folder_path)[0]
    shutil.unpack_archive(folder_path, destination)


This is structural contiguous

In [None]:
# Extract both pre-processed dataset archives.
for archive_name in ("processed_2000.zip", "processed_4000.zip"):
    unzip_folder(archive_name)

In [None]:
!pip install datasets
!pip install peft
!pip install accelerate
!pip install bitsandbytes
!pip install --upgrade transformers
!pip install trl
!pip install evaluate

# Imports and Hugging Face login

In [None]:
from dataclasses import dataclass, field
from typing import Optional

import torch

from transformers import AutoTokenizer, HfArgumentParser, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
from accelerate import Accelerator

In [None]:
# Authenticate this session with the Hugging Face Hub.
# NOTE(review): presumably needed because the Gemma checkpoint loaded below is gated - confirm.
from huggingface_hub import login
login()

In [None]:
@dataclass
class ScriptArguments:
    """
    Hyperparameters and paths for the fine-tuning run.

    Every attribute is declared as an annotated dataclass field, so
    ``ScriptArguments()`` yields plain scalar defaults and the argument parser
    built from this class can see all of them.
    """
    # --- batching / optimisation -------------------------------------------
    per_device_train_batch_size: Optional[int] = field(default=1)
    per_device_eval_batch_size: Optional[int] = field(default=1)
    gradient_accumulation_steps: Optional[int] = field(default=4)
    evaluation_strategy: Optional[str] = field(default="steps")
    evaluation_accumulation_steps: Optional[int] = field(default=5)
    learning_rate: Optional[float] = field(default=2e-4)
    max_grad_norm: Optional[float] = field(default=0.3)
    # Weight decay is fractional, so it is typed as float (it was annotated int).
    weight_decay: Optional[float] = field(default=0.001)

    # --- LoRA ----------------------------------------------------------------
    # These used to be un-annotated class attributes, two of them with a
    # trailing comma that silently turned the value into a 1-tuple.
    lora_alpha: Optional[int] = field(default=64)
    lora_dropout: Optional[float] = field(default=0.5)
    lora_r: Optional[int] = field(default=32)

    # --- model / precision ---------------------------------------------------
    max_seq_length: Optional[int] = field(default=600)
    model_name: Optional[str] = field(default="google/gemma-2b")
    fp16: Optional[bool] = field(default=True)
    bf16: Optional[bool] = field(default=False)
    gradient_checkpointing: Optional[bool] = field(
        default=True,
        metadata={"help": "Enables gradient checkpointing."},
    )
    use_flash_attention_2: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables Flash Attention 2."},
    )
    optim: Optional[str] = field(
        default="paged_adamw_32bit",
        metadata={"help": "The optimizer to use."},
    )
    lr_scheduler_type: str = field(
        default="constant",
        metadata={"help": "Learning rate schedule. Constant a bit better than cosine, and has advantage for analysis"},
    )

    # --- schedule / logging --------------------------------------------------
    # No trailing comma here: with one, the default became a tuple holding the
    # Field object instead of the integer 100.
    max_steps: int = field(default=100, metadata={"help": "How many optimizer update steps to take"})
    epochs: int = field(default=5, metadata={"help": "How many epochs to train for"})
    warmup_ratio: float = field(default=0.03, metadata={"help": "Fraction of steps to do a warmup for"})
    save_steps: int = field(default=100, metadata={"help": "Save checkpoint every X updates steps."})
    logging_steps: int = field(default=100, metadata={"help": "Log every X updates steps."})
    output_dir: str = field(
        default="./gemma/results",
        metadata={"help": "The output directory where the model predictions and checkpoints will be written."},
    )
    logging_dir: str = field(
        default="./gemma-2b/logs",
        metadata={"help": "The output directory where the logs will be written."},
    )
    eval_steps: int = field(default=5, metadata={"help": "How often to evaluate the model"})

# Build an argument parser from the ScriptArguments dataclass.
parser = HfArgumentParser(ScriptArguments)
# Unrecognized arguments are handed back separately instead of being rejected.
script_args, remaining_args = parser.parse_args_into_dataclasses(
    return_remaining_strings=True,
)


In [None]:
# Identifier of the base checkpoint that gets fine-tuned.
model_id = "google/gemma-2b"

# SECURITY: never hard-code access tokens in a notebook. The token that used to
# be written here must be considered leaked and should be revoked. The value is
# now read from the HF_TOKEN environment variable (None when it is not set).
access_token = os.environ.get("HF_TOKEN")

# bitsandbytes settings: 4-bit NF4 weights with double quantization, float16 compute.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

In [None]:
# Instantiate the quantized base model from `model_id`.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    attn_implementation="eager",
    device_map="auto",
    quantization_config=quantization_config,
)

# Matching tokenizer; the end-of-sequence id doubles as the padding id.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token_id = tokenizer.eos_token_id

In [None]:
from peft import prepare_model_for_kbit_training

# Turn on gradient checkpointing, then pass the model through PEFT's
# k-bit training preparation helper.
# NOTE: this cell rebinds `model`, so re-running it feeds the already-prepared
# model through the helper again - run it once per freshly loaded model.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
# LoRA adapter settings used for fine-tuning.
# NOTE(review): ScriptArguments also declares lora_r / lora_alpha / lora_dropout
# with different values that are not consumed here - confirm which set is intended.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    bias="none",
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=[
        'up_proj',
        'down_proj',
        'gate_proj',
        'k_proj',
        'q_proj',
        'v_proj',
        'o_proj',
    ],
)

In [None]:
# Locations of the three JSONL splits.
# For quick experiments, point these at small_dataset/{train,test,validation}.jsonl instead.
train_dataset_url = "processed_2000/train.jsonl"
test_dataset_url = "processed_2000/test.jsonl"
validation_dataset_url = "processed_2000/validation.jsonl"

# Map split name -> file, then read everything with the JSON loader.
data_files = dict(
    train=train_dataset_url,
    test=test_dataset_url,
    validation=validation_dataset_url,
)
dataset = load_dataset('json', data_files=data_files)

# Expose each split under its own name.
train_dataset, test_dataset, validation_dataset = (
    dataset[split_name] for split_name in ('train', 'test', 'validation')
)

In [None]:
# Tokenize the data
def tokenize_function(examples):
    """Tokenize a batch of examples into model inputs and labels.

    Args:
        examples: Batch mapping with an ``'input'`` and an ``'output'`` entry,
            each a list of strings.

    Returns:
        The tokenizer output for the instruction-prefixed inputs, with the
        token ids of the outputs stored under ``'labels'``. Both are padded
        and truncated to ``script_args.max_seq_length``.
    """
    instruction = "Complete the following software model by finding the missing part: "
    inputs = [instruction + inp for inp in examples['input']]
    targets = examples['output']
    max_length = script_args.max_seq_length

    # `text_target=` is the supported replacement for the deprecated
    # `tokenizer.as_target_tokenizer()` context manager: the tokenizer encodes
    # the targets with the same settings and stores their ids under 'labels'.
    model_input = tokenizer(
        inputs,
        text_target=targets,
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )

    # NOTE(review): labels are tokenized independently of the prompt (the
    # seq2seq convention) and padding is not masked with -100. For a causal LM
    # the labels normally have to line up with `input_ids` - confirm this is
    # what the trainer/collator in use expects.
    return model_input

# Run the batched tokenizer over the train, validation and test splits (in that order).
trained_data, validation_data, test_data = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, validation_dataset, test_dataset)
)

In [None]:
# Create an Accelerator and let it prepare the model.
# NOTE: `model` is rebound to the prepared object, so re-running this cell
# prepares an already-prepared model.
accelerator = Accelerator()
model = accelerator.prepare_model(model)

In [None]:
# Training configuration, driven entirely by the parsed ScriptArguments.
sft_config = SFTConfig(
    # --- output / logging ---
    output_dir=script_args.output_dir,
    logging_dir=script_args.logging_dir,
    logging_strategy="steps",
    logging_steps=script_args.logging_steps,
    save_steps=script_args.save_steps,
    # --- batching ---
    per_device_train_batch_size=script_args.per_device_train_batch_size,
    per_device_eval_batch_size=script_args.per_device_eval_batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
    gradient_checkpointing=script_args.gradient_checkpointing,
    # --- optimisation ---
    optim=script_args.optim,
    learning_rate=script_args.learning_rate,
    lr_scheduler_type=script_args.lr_scheduler_type,
    warmup_ratio=script_args.warmup_ratio,
    num_train_epochs=script_args.epochs,
    # These two were declared in ScriptArguments but never forwarded, so the
    # library defaults were silently used instead of the configured values.
    weight_decay=script_args.weight_decay,
    max_grad_norm=script_args.max_grad_norm,
    # --- evaluation ---
    # Uses the parsed value (default "steps") instead of a hard-coded literal.
    eval_strategy=script_args.evaluation_strategy,
    eval_steps=script_args.eval_steps,
    eval_accumulation_steps=script_args.evaluation_accumulation_steps,
    # --- sequence length / precision ---
    max_seq_length=script_args.max_seq_length,
    fp16=script_args.fp16,
    bf16=script_args.bf16,
)

In [None]:
# Quick inspection: show which type the tokenized training split has.
type(trained_data)

In [None]:
from datasets import Dataset

# Wrap the first tokenized training example in its own one-row Dataset.
first_element = trained_data[0]
new_dataset_train = Dataset.from_dict(
    {column: [cell] for column, cell in first_element.items()}
)

# Same for the first tokenized validation example.
first_element = validation_data[0]
new_dataset_validation = Dataset.from_dict(
    {column: [cell] for column, cell in first_element.items()}
)


In [None]:
# Accuracy is the metric reported during evaluation.
# (A disabled perplexity-based draft that was parked here inside a string
# literal has been removed; it was never executed.)
import evaluate
import numpy as np

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute next-token accuracy for a causal language model.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is either
            raw logits (one more dimension than ``labels``, vocabulary last),
            already-reduced token ids with the same shape as ``labels``, or a
            tuple whose first element is one of those.

    Returns:
        The result of ``metric.compute`` on the flattened, aligned tokens.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        # e.g. the (pred_ids, labels) pair returned by preprocess_logits_for_metrics
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Reduce over the vocabulary axis (the last one). The previous axis=1
    # reduced over the sequence positions instead.
    if predictions.ndim == labels.ndim + 1:
        predictions = np.argmax(predictions, axis=-1)

    # The output at position i predicts the token at position i + 1, so drop
    # the last prediction and the first label before comparing.
    shifted_predictions = predictions[..., :-1].reshape(-1)
    shifted_labels = labels[..., 1:].reshape(-1)

    # Positions labelled -100 are ignored by the loss; ignore them here as well.
    keep = shifted_labels != -100
    return metric.compute(
        predictions=shifted_predictions[keep],
        references=shifted_labels[keep],
    )


def preprocess_logits_for_metrics(logits, labels):
    """
    Reduce raw logits to predicted token ids before they are accumulated.

    Original Trainer may have a memory leak.
    This is a workaround to avoid storing too many tensors that are not needed:
    only the argmax over the last (vocabulary) dimension is kept.

    Args:
        logits: Logits tensor, or a tuple whose first element is the logits.
        labels: Label tensor; returned unchanged.

    Returns:
        Tuple ``(pred_ids, labels)``.
    """
    if isinstance(logits, tuple):
        # Some models return extra tensors alongside the logits; logits come first.
        logits = logits[0]
    pred_ids = torch.argmax(logits, dim=-1)

    return pred_ids, labels

Evaluation metrics

In [None]:
# Build the supervised fine-tuning trainer.
# The maximum sequence length is taken from `sft_config`; it is no longer
# passed a second time as a trainer keyword (that duplicate only re-applied
# the same value and is a deprecated trainer argument).
trainer = SFTTrainer(
    model=model,
    train_dataset=trained_data,
    eval_dataset=validation_data,
    peft_config=lora_config,
    tokenizer=tokenizer,
    args=sft_config,
    compute_metrics=compute_metrics,
    #preprocess_logits_for_metrics=preprocess_logits_for_metrics
)



In [None]:
# Run the fine-tuning loop configured on `trainer`.
trainer.train()

In [None]:
# Persist the fine-tuned weights and the tokenizer side by side.
model_path = 'gemma-2b_model'
# Save through the trainer's model: that is the object the trainer wrapped with
# the LoRA adapter and trained, whereas the bare `model` variable still refers
# to the base model object that was handed to the trainer.
trainer.model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

In [None]:

# Directory to reload the fine-tuned model and tokenizer from.
# NOTE(review): the save cell writes to 'gemma-2b_model', not "fine-tuning" -
# confirm this directory exists and holds the intended checkpoint.
model_directory = "fine-tuning"

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_directory)

# Load the model
model = AutoModelForCausalLM.from_pretrained(model_directory)

In [None]:
# Load only the test split used for the generation check below.
# NOTE(review): the archive extracted at the top of the notebook is
# 'processed_4000', while this reads from 'json_4000' - confirm the path.
data_files = {
    'test' : 'json_4000/test.jsonl'
}

dataset = load_dataset('json', data_files=data_files)
test_dataset = dataset['test']

In [None]:
# Generate a completion for the first test example and print it.
# NOTE(review): training prepends an instruction sentence to every input, while
# this prompt is the raw input - confirm whether the prefix should be added here.
test_prompt = test_dataset[0]['input']

# Named `model_inputs` so the builtin `input()` is not shadowed, and moved to
# the model's device so generation also works when the model is on a GPU.
model_inputs = tokenizer(test_prompt, return_tensors="pt", padding=True, truncation=True, max_length=4000)
model_inputs = model_inputs.to(model.device)

output = model.generate(**model_inputs, max_new_tokens=4100, do_sample=True)
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_output)

In [None]:
import shutil

# Zip the directory /content/gemma-2b_model into /content/gemma-2b_model.zip.
# NOTE(review): absolute /content paths assume the Colab working directory - confirm when running elsewhere.
shutil.make_archive("/content/gemma-2b_model", 'zip', "/content/gemma-2b_model")

In [None]:
# Mount Google Drive so the archive can be moved there.
# The import has to be active: with it commented out, `drive` is undefined on a
# fresh kernel and this cell raises NameError.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Move the zipped model from /content into the mounted Drive folder, renaming it on the way.
shutil.move('/content/gemma-2b_model.zip', '/content/drive/My Drive/gemma2b_first_file.zip')