In [None]:
# Show the GPU(s) visible to this runtime before installing or loading anything.
!nvidia-smi

# Fine-tune Llama-2-7b on Google Colab
Welcome to this Google Colab notebook, which shows how to fine-tune the recent Llama-2-7b model on a single Google Colab instance and turn it into a chatbot.

We will leverage the PEFT library from the Hugging Face ecosystem, as well as QLoRA, for more memory-efficient fine-tuning.

# Setup
Run the cells below to set up and install the required libraries. For our experiment we will need `accelerate`, `peft`, `transformers`, `datasets` and `TRL` to leverage the recent `SFTTrainer`. We will use `bitsandbytes` to [quantize the base model into 4-bit](https://huggingface.co/blog/4bit-transformers-bitsandbytes). We will also install `einops`, as it is a requirement to load Falcon models.

In [None]:
# Install the training stack. PEFT is installed straight from its GitHub repository.
# NOTE(review): nothing here is version-pinned (-U pulls the newest releases), so a
# later re-run may resolve to different library versions — consider pinning.
!pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
!pip install -q datasets bitsandbytes einops wandb

In [None]:
# from huggingface_hub import login
# login()

# Dataset

In [None]:
import os

def extract_data(base_path, language):
    """Collect (description, solution) pairs from a description2code-style tree.

    The expected layout below ``base_path`` is::

        codechef/<difficulty>/<problem>/description/description.txt
        codechef/<difficulty>/<problem>/<solutions folder>/<solution files>

    Args:
        base_path: Directory that contains the ``codechef`` folder.
        language: Language key. ``"python"`` maps to ``solutions_python`` and
            ``"cpp"`` to ``solutions_c++``; any other value maps to
            ``solutions_<language>``.

    Returns:
        A list of ``{"description": str, "solution": str}`` dicts, one per
        solution file. Difficulties are visited in the fixed order below, and
        problems / solution files in sorted name order, so the result is
        reproducible across runs and file systems.

    Entries that cannot yield a pair are skipped instead of raising: missing
    difficulty folders, hidden or stray entries, problems without a
    ``description.txt`` or without a solutions folder for ``language``, and
    anything inside a solutions folder that is hidden or not a regular file.
    """
    difficulties = ["easy", "external", "hard", "harder", "hardest", "medium"]

    # Define a mapping for the language to the folder name
    language_folder_map = {
        "python": "solutions_python",
        "cpp": "solutions_c++"
    }
    solution_folder_name = language_folder_map.get(language, f"solutions_{language}")

    data = []
    for difficulty in difficulties:
        difficulty_path = os.path.join(base_path, "codechef", difficulty)
        if not os.path.isdir(difficulty_path):
            continue

        # os.listdir() order is arbitrary; sort so the dataset order is stable.
        for problem in sorted(os.listdir(difficulty_path)):
            # Skip system files or folders starting with dot
            if problem.startswith('.'):
                continue

            problem_path = os.path.join(difficulty_path, problem)
            description_path = os.path.join(problem_path, "description", "description.txt")
            solution_folder_path = os.path.join(problem_path, solution_folder_name)

            # A pair needs both halves; this also filters out stray files that
            # sit next to the problem folders.
            if not os.path.isfile(description_path) or not os.path.isdir(solution_folder_path):
                continue

            # Read the description once and close it before touching solutions.
            with open(description_path, 'r', encoding="utf-8", errors='ignore') as desc_file:
                description = desc_file.read()

            for solution_file in sorted(os.listdir(solution_folder_path)):
                solution_path = os.path.join(solution_folder_path, solution_file)
                # Hidden files (e.g. .DS_Store) and sub-directories are not solutions.
                if solution_file.startswith('.') or not os.path.isfile(solution_path):
                    continue

                with open(solution_path, 'r', encoding="utf-8", errors='ignore') as sol_file:
                    solution = sol_file.read()

                data.append({
                    "description": description,
                    "solution": solution
                })

    return data



# Pull the raw (description, solution) records for each target language
data_python = extract_data("/kaggle/input/description2code/description2code_current/", "python")
data_cpp = extract_data("/kaggle/input/description2code/description2code_current/", "cpp")

# Re-key every record into the prompt/output schema used by the rest of the notebook
structured_data_python = [
    {"prompt": record["description"], "output": record["solution"]}
    for record in data_python
]
structured_data_cpp = [
    {"prompt": record["description"], "output": record["solution"]}
    for record in data_cpp
]

In [None]:
# Preview up to five prompt/solution pairs from each language-specific list.
# Requires structured_data_python and structured_data_cpp to be defined already.
# The C++ heading carries a leading newline so a blank line separates the two groups.
for heading, samples in (
    ("Python Dataset Samples:", structured_data_python),
    ("\nC++ Dataset Samples:", structured_data_cpp),
):
    print(heading)
    for i, entry in enumerate(samples[:5]):
        print(f"Sample {i+1}:")
        print("Prompt (Description):\n", entry["prompt"])
        print("Output (Solution):\n", entry["output"])
        print("---------------------------------------------------------")

# Report how many records each list holds
print(f"\nTotal entries in Python dataset: {len(structured_data_python)}")
print(f"Total entries in C++ dataset: {len(structured_data_cpp)}")

In [None]:
from datasets import Dataset

In [None]:
def list_of_dicts_to_dict_of_lists(list_of_dicts):
    """Convert a list of dictionaries to a dictionary of lists.

    Args:
        list_of_dicts: Sequence of dicts. The keys of the first dict define
            the columns; every dict must contain those keys.

    Returns:
        ``{key: [d[key] for each dict]}`` with values in input order, or an
        empty dict when ``list_of_dicts`` is empty.

    Raises:
        KeyError: If a later dict lacks one of the first dict's keys.
    """
    # Nothing to take the column names from: return no columns rather than
    # failing with IndexError on list_of_dicts[0].
    if not list_of_dicts:
        return {}
    return {key: [d[key] for d in list_of_dicts] for key in list_of_dicts[0].keys()}

# Pivot the Python records into column form ({"prompt": [...], "output": [...]})
# and wrap them in a Hugging Face Dataset. Only the Python split is used from here on.
data_dict_python = list_of_dicts_to_dict_of_lists(structured_data_python)
dataset = Dataset.from_dict(data_dict_python)



In [None]:
# from datasets import load_dataset

# dataset_name = 'nisaar/Articles_Constitution_3300_Instruction_Set'
# dataset = load_dataset(dataset_name, split="train")

In [None]:
# Inspect a single record (index 1) of the dataset.
dataset[1]

In [None]:
# Print the Dataset object's summary representation.
print(dataset)

In [None]:
# from datasets import Dataset

# # Assuming `dataset` is your Dataset object
# dataset = dataset.map(lambda example: {'text': example['prompt'] + example['output']})

# Loading the Model

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer

# Hugging Face Hub id of the checkpoint; passed to both the model and tokenizer loaders.
model_name = "TinyPixel/Llama-2-7B-bf16-sharded"

In [None]:
# 4-bit quantization settings handed to the model loader:
# NF4 quantization type with float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

In [None]:
# Load the base checkpoint with the quantization config above and let
# device_map="auto" decide the placement of the weights.
# NOTE(review): trust_remote_code=True permits code shipped in the hub repo to
# run locally — confirm the repository is trusted.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config,
)

# Turn off the config's use_cache flag for this model.
model.config.use_cache = False

In [None]:
# Load the tokenizer that belongs to the same checkpoint as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA hyperparameters; forwarded to LoraConfig as lora_alpha, lora_dropout and r.
lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

In [None]:
# Adapter configuration for causal-LM fine-tuning, built from the LoRA
# hyperparameters defined earlier; no bias terms are trained.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",
)

In [None]:
from transformers import TrainingArguments

In [None]:
# Training hyperparameters; each one is forwarded unchanged to TrainingArguments.
output_dir = "./results"
per_device_train_batch_size = 1
gradient_accumulation_steps = 2
optim = "paged_adamw_32bit"
# NOTE(review): save_steps = 1 presumably writes a checkpoint after every step —
# confirm this is intended given the disk usage it implies.
save_steps = 1
# NOTE(review): both num_train_epochs and max_steps are set. Per the transformers
# documentation a positive max_steps takes precedence — confirm which is intended.
num_train_epochs = 4
logging_steps = 1
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 20
warmup_ratio = 0.03
lr_scheduler_type = "linear"

In [None]:
# Bundle the hyperparameters into a TrainingArguments object for the trainer.
training_arguments = TrainingArguments(
    # where results are written and how often to save / log
    output_dir=output_dir,
    save_steps=save_steps,
    logging_steps=logging_steps,
    # batching
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    group_by_length=True,
    # optimizer and learning-rate schedule
    optim=optim,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    max_grad_norm=max_grad_norm,
    # run length
    num_train_epochs=num_train_epochs,
    max_steps=max_steps,
    # mixed precision
    fp16=True,
)

In [None]:
from trl import SFTTrainer

In [None]:
# Passed to SFTTrainer as its max_seq_length argument.
max_seq_length = 2048

In [None]:
# SFTTrainer trains on a single text column. Pointing it at "prompt" alone would
# only ever show the model the problem descriptions and never the solutions, so
# join each description with its solution into a "text" column and train on that.
sft_dataset = dataset.map(
    lambda example: {"text": example["prompt"] + "\n" + example["output"]}
)

trainer = SFTTrainer(
    model=model,
    train_dataset=sft_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)

In [None]:
# Cast every sub-module whose qualified name contains "norm" to float32.
for name, module in trainer.model.named_modules():
    if "norm" not in name:
        continue
    module = module.to(torch.float32)

In [None]:
# Run the fine-tuning loop configured on the trainer.
trainer.train()

In [None]:
# When the trained model is wrapped (distributed/parallel training) the real
# model sits on its `.module` attribute; otherwise use the model itself.
model_to_save = getattr(trainer.model, "module", trainer.model)

# Write the result to the "outputs" directory.
model_to_save.save_pretrained("outputs")

In [None]:
# Read back the adapter configuration that was saved to "outputs" (kept so the
# saved settings can be inspected).
lora_config = LoraConfig.from_pretrained('outputs')

# get_peft_model() builds a *new* adapter from a config and does not load the
# trained weights from disk, so calling it here would discard the fine-tuning.
# Use the model the trainer just trained instead.
model = trainer.model

# Leave training mode so LoRA dropout is disabled during generation.
model.eval()

In [None]:
# model.push_to_hub("ashishpatel26/Llama2_Finetuned_Articles_Constitution_3300_Instruction_Set",create_pr=1)

In [None]:
# Prompt (problem description) of the first record.
dataset[0]["prompt"]

In [None]:
# Reference solution of the first record.
dataset[0]["output"]

In [None]:
# Use the first dataset prompt as the text to generate from.
text = dataset['prompt'][0]
# Device the tokenized inputs are moved to before generation.
device = "cuda:0"

In [None]:
# Tokenize the prompt into PyTorch tensors and move them to the target device.
inputs = tokenizer(text, return_tensors="pt").to(device)
# Generate up to 1000 new tokens following the prompt.
outputs = model.generate(**inputs, max_new_tokens=1000)
# Decode the first returned sequence, dropping special tokens, and print it.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))