## Layer extraction + finetuning of small Llama 1.4B 

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git 
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

In [None]:
!pip install numpy==1.22.0

## Llama distillation

### loading model reduced to 11 hidden layers

**Note: Loading the model with fewer hidden layers does not give you an option to select the layers to keep, but loading the full model and cutting away some layers manually will throw index errors because layers try to access attention scores from previous layers**

In [None]:
from transformers import AutoConfig
import transformers
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from torch import nn
from datasets import Dataset

In [None]:
# Load the configuration of the Llama-2-7b-chat checkpoint so that it can be
# modified before a model is instantiated from it.
llama_config=AutoConfig.from_pretrained("/kaggle/input/llama-2/pytorch/7b-chat-hf/1")

In [None]:
# Reduce the architecture to 11 decoder layers; this modified config is what the
# reduced model is built from when it is passed to from_pretrained.
llama_config.num_hidden_layers=11

In [None]:


# Path of the Llama-2-7b-chat checkpoint used for both the reduced and the full model.
model_id = "/kaggle/input/llama-2/pytorch/7b-chat-hf/1"
# 4-bit NF4 quantization with double quantization; computations run in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Instantiate the checkpoint with the modified 11-layer config (config=llama_config)
# and map the whole model to device 0.
model_first_11 = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0}, config=llama_config)



### Loading the full model (32 layers) and extracting wanted layers into custom ModuleList

In [None]:
# NOTE(review): model_id, bnb_config and tokenizer repeat the definitions of the
# previous loading cell with identical values; only the model call differs.
model_id = "/kaggle/input/llama-2/pytorch/7b-chat-hf/1"
# Same 4-bit NF4 / double-quantization / bfloat16-compute setup as for the reduced model.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# No config override here, so the checkpoint's own configuration (all layers) is used.
model_full = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})



In [None]:
# Sanity check: display the first decoder layer of the full model to confirm
# that individual layers can be reached via model.layers[index].
model_full.model.layers[0]

In [None]:


# == selecting distinct Llama layers (always attention + mlp + InputLayerNorm + PostAttentionLayerNorm) == #
# Leftover from the earlier attempt to collect the layers in a custom ModuleList
# (to strictly follow Llama's layer tree hierarchy). The weight copy below does
# not use it; the name is kept defined only for backward compatibility.
layers = nn.ModuleList()

# Indices of the full model's decoder layers whose weights are copied, in order,
# into layers 0..N-1 of the reduced model.
distil_layers = [0,1,2,6,10,14,18,22,26,30,31]

target_layers = model_first_11.model.layers
source_layers = model_full.model.layers

# Fail loudly instead of silently leaving some reduced-model layers with their
# originally loaded weights (list too short) or hitting an IndexError halfway
# through the copy (list too long / index out of range).
if len(distil_layers) != len(target_layers):
    raise ValueError(
        f"distil_layers has {len(distil_layers)} entries but the reduced model has {len(target_layers)} layers"
    )
invalid_indices = [idx for idx in distil_layers if not -len(source_layers) <= idx < len(source_layers)]
if invalid_indices:
    raise ValueError(
        f"distil_layers contains indices outside the full model's {len(source_layers)} layers: {invalid_indices}"
    )

# Overwrite the weights of each reduced-model layer with those of the selected full-model layer.
for number, n in enumerate(distil_layers):
    target_layers[number].load_state_dict(source_layers[n].state_dict())



### Replacing the first 11 layers of the 11 layer config model with the custom layers

**Note: Doing this causes the same index out of range issue since the layer numbers have remained the same, so a renaming might be needed in the ModuleList**

**Update: Renaming does not solve the issue, so each layer probably has its index hardcoded at some point and needs to access its initial number - 1.**

**Solution idea: Keeping only the first 11 layers but replacing the weights with weights of other layers, this hopefully keeps numberings the same and still works**

**Update: Works :)**

In [None]:
# Display one decoder layer of the reduced model to inspect its structure.
model_first_11.model.layers[4]

#### Evaluation

In [None]:
# Build a text-generation pipeline around the reduced 11-layer model.
# NOTE(review): this assignment rebinds the name `pipeline`, which the imports
# cell bound to the transformers `pipeline` factory; the next cell relies on the
# rebound object, so rename both together if this is cleaned up.
pipeline = transformers.pipeline(
    "text-generation",
    model=model_first_11,
    torch_dtype=torch.float16,
    device_map="auto",
    tokenizer=tokenizer
)


In [None]:
# Sample a single completion (top-k sampling, k=10, at most 400 tokens in total)
# for a chat-style prompt to eyeball the output of the reduced model.
sequences = pipeline(
   '[INST]I liked "Breaking Bad" and "Band of Brothers". Do you have any recommendations of other shows I might like?\n[/INST]',
   do_sample=True,
   top_k=10,
   num_return_sequences=1,
   eos_token_id=tokenizer.eos_token_id,
   max_length=400,
)
# Print the generated text of every returned sequence (one, given num_return_sequences=1).
for seq in sequences:
   print(f"Result: {seq['generated_text']}")

## finetuning distilled model
##### to hopefully establish meaningful connections between rearranged layers and regain ability of producing at least full words of finetuning domain as output

finetuning configuration

In [None]:
from peft import prepare_model_for_kbit_training

# Enable gradient checkpointing on the reduced model, then let PEFT prepare the
# quantized model for training. `model` is the name used by the finetuning cells below.
model_first_11.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model_first_11)

In [None]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.

    Iterates over ``model.named_parameters()`` and sums ``numel()`` for all
    parameters and for those with ``requires_grad`` set, then prints both
    totals together with the trainable share in percent.

    Args:
        model: Any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs, where each parameter provides
            ``numel()`` and ``requires_grad``.

    Returns:
        None. The summary is written to stdout.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A model without any parameters would otherwise raise ZeroDivisionError.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {trainable_pct}"
    )

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter setup: rank 8, alpha 32 and 5% dropout, applied to the attention
# projections (q/k/v/o) and the MLP projections (gate/down/up); bias terms are
# not trained and the task type is causal language modeling.
config = LoraConfig(
    r=8, 
    lora_alpha=32, 
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj"],
    lora_dropout=0.05, 
    bias="none", 
    task_type="CAUSAL_LM"
)

# Wrap the prepared model with the adapters and report how many parameters are trainable.
model = get_peft_model(model, config)
print_trainable_parameters(model)

input preparation

In [None]:
# Read the Big Data lecture text and cut it at every occurrence of the lecturer's
# name, which separates the sections of the document.
with open("/kaggle/input/edulllamafinetuning/Big Data - Pfisterer.txt", 'r', encoding='utf-8') as file:
    bigdata = file.read()
sections = bigdata.split("Prof. Dr.-Ing. habil. Dennis Pfisterer")

import re
inp_bigdata= []

# Only sections that start with "\n1" are processed. Such a section is cut at the
# first occurrence of "\n2", the remainder at the first "\n3", and so on; every
# piece is stored with its newlines replaced by spaces. The walk ends with the
# piece in which the next number no longer occurs.
for section in sections:
    i = 1
    if not section.startswith("\n"+str(i)):
        continue
    remainder = section
    while True:
        # partition() reports via `separator` whether the next number was found,
        # which replaces the former bare `except:` used as loop control (it also
        # hid any unrelated error raised inside the loop).
        piece, separator, remainder = remainder.partition("\n"+str(i+1))
        inp_bigdata.append(piece.replace("\n"," "))
        if not separator:
            break
        i += 1

#inp_bigdata

In [None]:
# Read the complete "Datenbanken" (databases) lecture text into memory.
with open("/kaggle/input/edulllamafinetuning/Datenbanken.txt", 'r', encoding='utf-8') as file:
        databases = file.read()

import re

def split_text_on_numbers(text):
    """Split `text` at lines that consist solely of a number.

    The separator is a newline, one or more digits and another newline.
    Pieces that are empty after stripping are dropped; in the remaining pieces
    newlines become spaces and the sequence en dash + non-breaking space is removed.

    Returns:
        list[str]: the cleaned pieces in document order.
    """
    cleaned_sections = []
    for piece in re.split(r'\n\d+\n', text):
        trimmed = piece.strip()
        if not trimmed:
            # Skip pieces that contain nothing but whitespace.
            continue
        cleaned_sections.append(trimmed.replace("\n"," ").replace("–\xa0",""))
    return cleaned_sections

# Cut the databases lecture text at its number-only lines and keep the cleaned pieces.
inp_database = split_text_on_numbers(databases)
#inp_database

In [None]:
# Read the complete "Introduction to Data Science" lecture text into memory.
with open("/kaggle/input/edulllamafinetuning/Introduction_to_Data_Science_eng.txt", 'r', encoding='utf-8') as file:
        ds = file.read()

import re

def split_text_on_name(text):
    """Split `text` at every occurrence of the name "Bernhard Drabant".

    Only the bare name is used as separator (a stricter pattern that also
    matched the "Prof. Dr." title was tried and dropped). Pieces that are empty
    after stripping are discarded; in the others newlines become spaces and
    form-feed characters are removed.

    Returns:
        list[str]: the cleaned pieces in document order.
    """
    cleaned_sections = []
    for piece in re.split(r'Bernhard Drabant', text):
        trimmed = piece.strip()
        if trimmed:
            cleaned_sections.append(trimmed.replace("\n", " ").replace("\x0c",""))
    return cleaned_sections

# Cut the Data Science lecture text at the lecturer's name and keep the cleaned pieces.
inp_ds = split_text_on_name(ds)
#inp_ds

In [None]:
# Read the complete "Kommunikations- und Betriebssysteme" lecture text into memory.
with open("/kaggle/input/edulllamafinetuning/Kommunikations_und_Betriebssysteme_eng.txt",'r', encoding='utf-8') as file:
        kub = file.read()

import re

def merge_items_with_previous(lst):
    """Join continuation lines onto the line that precedes them.

    An item counts as a continuation when it is non-empty and its first
    character is a lowercase letter or not alphanumeric; it is appended to the
    running entry with a single space. Any other item (including an empty
    string) closes the running entry and starts a new one.

    Args:
        lst: iterable of strings, e.g. the lines of a text section.

    Returns:
        list[str]: the merged entries, each stripped of surrounding whitespace.
    """
    def is_continuation(entry):
        if not entry:
            return False
        first_char = entry[0]
        return first_char.islower() or not first_char.isalnum()

    merged = []
    pending = ''

    for entry in lst:
        if is_continuation(entry):
            # Glue the continuation onto whatever has been collected so far.
            pending = pending + ' ' + entry
            continue
        # A new entry begins: flush the collected one first (if there is any).
        if pending:
            merged.append(pending.strip())
        pending = entry

    # Flush the entry that was still open when the input ended.
    if pending:
        merged.append(pending.strip())

    return merged

# Cut the lecture text at every course marker; the text in front of the first
# marker (index 0) is not used.
split_sections = re.split(r'Course: WWI21DSB', kub)
inp_KuB = []

for index, section in enumerate(split_sections):
    if index == 0:
        continue

    # The first real section drops its first 58 lines, every later one its first 13.
    # NOTE(review): presumably these are header/title lines — the counts are
    # specific to this input file; confirm if the file changes.
    skipped_lines = 58 if index == 1 else 13
    section = section.split("\n")[skipped_lines:]

    sec_result = merge_items_with_previous(section)
    inp_KuB.extend(sec_result)

def filter_items_with_one_word(lst):
    """Return the items of `lst` that contain at least two whitespace-separated words."""
    multi_word_items = []
    for entry in lst:
        if len(entry.split()) >= 2:
            multi_word_items.append(entry)
    return multi_word_items

# Drop every entry that consists of fewer than two words.
inp_KuB = filter_items_with_one_word(inp_KuB) # removed 1000 single words 
#inp_KuB

In [None]:
# Read the complete "ML fundamentals" lecture text into memory.
with open("/kaggle/input/edulllamafinetuning/ML_fundamentals.txt", 'r', encoding='utf-8') as file:
        ml = file.read()

import re

def split_text_on_name(text):
    """Split the ML lecture text at its recurring footer line and drop each section's heading.

    The text is cut at every literal occurrence of
    "Daniel Wehner, M.Sc. (SAP SE), Winter term 2023/2024". Sections that are
    empty after stripping are discarded and form-feed characters are removed.
    From every remaining section only the part after the first blank line is
    kept (the whole section if it has no blank line), with newlines replaced by
    spaces.

    NOTE: this definition reuses the name of the Data Science splitter defined
    earlier in the notebook and replaces it once this cell has run.

    Args:
        text: the full lecture text.

    Returns:
        list[str]: one cleaned string per section, in document order.
    """
    # The dots of "M.Sc." are escaped so they match a literal "." only instead of
    # acting as regex wildcards.
    footer_pattern = r'Daniel Wehner, M\.Sc\. \(SAP SE\), Winter term 2023/2024'

    # Remove surrounding whitespace and empty sections
    split_sections = [s.strip().replace("\x0c","") for s in re.split(footer_pattern, text) if s.strip()]

    truncated_sections = []
    for section in split_sections:
        # maxsplit is passed by keyword: positional use is deprecated since Python 3.13.
        truncated_section = re.split(r'\n\n', section, maxsplit=1)[-1]
        truncated_sections.append(truncated_section.strip().replace("\n"," "))

    return truncated_sections

# Cut the ML lecture text at its footer line and keep the cleaned section bodies.
inp_ml = split_text_on_name(ml)
#inp_ml

In [None]:
# Concatenate the text snippets of all five lectures into one list and show how many there are.
final = inp_bigdata+inp_database+inp_ds+inp_KuB+inp_ml
len(final)

In [None]:
# Length in characters of every collected text snippet
text_lengths = list(map(len, map(str, final)))

# Mean snippet length (in characters), displayed as the cell output
average_length = sum(text_lengths) / len(text_lengths)
average_length

In [None]:
split_entries = []

for text in final:
    words = text.split()

    # Texts with at most 600 words are kept unchanged
    if len(words) <= 600:
        split_entries.append(text)
        continue

    # Longer texts are cut into consecutive runs of int(average_length) words.
    # NOTE(review): average_length is computed from character counts but is used
    # here as a number of words — confirm that this chunk size is intended.
    chunk_size = int(average_length)
    for start in range(0, len(words), chunk_size):
        split_entries.append(" ".join(words[start:start + chunk_size]))

len(split_entries)

In [None]:
# Collects the combined (longer) training texts
loonger_entries = []

def combine_entries(a,b,c,d):
    """Return the four given strings joined by single spaces, in the order a, b, c, d."""
    return " ".join((a, b, c, d))

# Merge consecutive entries into longer training texts, four at a time.
# The previous loop started at index 1 and looked back three positions, so its
# first group wrapped around to the last two entries of the list (negative
# indices) and entries after the last complete stride were dropped. Slicing
# fixed-size windows keeps the document order, uses every entry exactly once and
# keeps a shorter final group instead of discarding it.
entries_per_group = 4
for index in range(0, len(split_entries), entries_per_group):
    inputs = " ".join(split_entries[index:index + entries_per_group])
    loonger_entries.append(inputs)

#loonger_entries

In [None]:
# Wrap the combined texts in a Dataset with a single "text" column.
data_dict = {"text":loonger_entries}
data = Dataset.from_dict(data_dict)
# Tokenize in batches; the tokenizer is called without any truncation or padding arguments here.
data = data.map(lambda samples: tokenizer(samples["text"]), batched=True)
data

finetuning training process

In [None]:
from kaggle_secrets import UserSecretsClient
import wandb

# Fetch the Weights & Biases API key from the Kaggle secrets store (so it is not
# hardcoded in the notebook) and log in with it.
user_secrets = UserSecretsClient()

my_secret = user_secrets.get_secret("new_wandb") 

wandb.login(key=my_secret)

In [None]:
### === huggingface saving has been tried for saving the whole model in one go, but has been dismissed again ===
# from huggingface_hub import notebook_login, login
# from huggingface_hub import HfFolder
# repo_name = "Marcus02W/extracted-llama-finetuned"

In [None]:
# login(user_secrets.get_secret("Llama_pushing"))

In [None]:
#wandb.finish()

In [None]:
import transformers
from torch.utils.data import DataLoader

# NOTE(review): the DataLoader is only used to read back its `.dataset` attribute
# below, which is presumably the same `data` object passed in here — confirm
# before simplifying to train_dataset=data.
dataloader = DataLoader(data, batch_size=1, shuffle=False)

# Use the end-of-sequence token for padding as well.
tokenizer.pad_token = tokenizer.eos_token

# Number of training epochs ("Epochen" is German for epochs).
epochen = 7

trainer = transformers.Trainer(
    model=model,
    #train_dataset=data,
    train_dataset=dataloader.dataset,
    # Batch size 1 with 3 gradient-accumulation steps, 2 warmup steps, logging every step.
    # NOTE(review): fp16=True is set here while the quantization config computes
    # in bfloat16 — confirm that this mix is intended.
    args=transformers.TrainingArguments(
        num_train_epochs=epochen,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=3,
        warmup_steps=2,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="/kaggle/working/halfinput",
        optim="paged_adamw_8bit",
#         # push to hub parameters
#         push_to_hub=True,
#         hub_strategy="every_save",
#         hub_model_id=repo_name,
#         hub_token=HfFolder.get_token()
    ),
    # mlm=False selects the causal (next-token) language-modeling collator mode.
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

In [None]:
# Save the trained model to the working directory. Per the note in the
# "Model loading from datasets" section, after QLoRA finetuning this stores only
# the adapter matrices.
trainer.save_model("/kaggle/working/")

In [None]:
# Turn sampling on in the generation config that gets saved with the model.
# NOTE(review): presumably required so the saved generation config is consistent
# with the sampling parameters of the checkpoint — confirm.
model_first_11.generation_config.do_sample=True

In [None]:
# Save the 11-layer model itself so it can serve as the base model when the adapter is reloaded.
# NOTE(review): get_peft_model was applied to a model derived from this object
# earlier; confirm the saved checkpoint contains the plain base weights as intended.
model_first_11.save_pretrained("/kaggle/working/full_model")

In [None]:
# Text-generation pipeline around the finetuned (adapter-wrapped) model.
pipeline_distil = transformers.pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.float16,
    device_map="auto",
    tokenizer=tokenizer
)

In [None]:
# Sampling settings shared by all three test prompts (previously repeated
# verbatim in every call).
distil_generation_kwargs = dict(
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_length=200,
)


def ask_distilled_model(question):
    """Generate a sampled completion for `question` with the finetuned model.

    The question is wrapped in [INST]...[/INST] tags and passed to
    `pipeline_distil` together with the shared sampling settings.

    Args:
        question: the question text, without the instruction tags.

    Returns:
        Whatever `pipeline_distil` returns for the prompt.
    """
    return pipeline_distil(f'[INST]{question}[/INST]', **distil_generation_kwargs)


# One question per lecture domain the model was finetuned on.
sequences1 = ask_distilled_model("What is big data?")
sequences2 = ask_distilled_model("Explain the RDBMS?")
sequences3 = ask_distilled_model("What is Data Science?")


In [None]:
# Show the model's answer to the big-data question.
sequences1

In [None]:
# Show the model's answer to the RDBMS question.
sequences2

In [None]:
# Show the model's answer to the data-science question.
sequences3

### Model loading from datasets

**Note: Calling trainer.save_model after QLoRA finetuning only saves the adapter matrices. This causes problems, because when just the adapter is loaded, the base model with 32 layers is automatically used again. Therefore we also saved the 11-layer model with the custom weights (extracted from the defined layers) and changed the base model path in the config JSON to the path of the 11-layer model dataset.**

In [None]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM

# Rebuild the finetuned model: load the saved 11-layer base model first, then
# attach the finetuned adapter from the dataset directory on top of it.
# NOTE(review): the base model is loaded here without the quantization_config
# that was used during training — confirm this is intended.
#config = PeftConfig.from_pretrained("/kaggle/input/distil-11-finetuned")
model = AutoModelForCausalLM.from_pretrained("/kaggle/working/full_model")
model = PeftModel.from_pretrained(model, "/kaggle/input/distil-11-finetuned")

In [None]:
# Text-generation pipeline around the reloaded model (saved base + adapter).
pipeline2 = transformers.pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.float16,
    device_map="auto",
    tokenizer=tokenizer
)

In [None]:
# Ask the reloaded model the big-data question with the sampling settings used
# for the earlier tests (top-k sampling, one sequence, at most 200 tokens).
reload_generation_kwargs = {
    "do_sample": True,
    "top_k": 10,
    "num_return_sequences": 1,
    "eos_token_id": tokenizer.eos_token_id,
    "max_length": 200,
}
sequences2 = pipeline2('[INST]What is Big Data?[/INST]', **reload_generation_kwargs)

In [None]:
# Show the reloaded model's answer to the big-data question.
sequences2