## Core distillation process

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git 
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q datasets

In [4]:
!pip install numpy==1.22.0

Collecting numpy==1.22.0
  Downloading numpy-1.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.8/16.8 MB[0m [31m44.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.24.3
    Uninstalling numpy-1.24.3:
      Successfully uninstalled numpy-1.24.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cuml 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.7 which is incompatible.
apache-beam 2.46.0 requires pyarrow<10.0.0,>=3.0.0, but you hav

In [5]:
from transformers import AutoConfig
import transformers
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
from torch import nn
from datasets import Dataset



### Loading the teacher model (our finetuned full-size Llama 7B)

In [6]:
model_id = "/kaggle/input/finetuned-llama/model_7_4Entries"
llama_base_id = "/kaggle/input/llama-2/pytorch/7b-chat-hf/1"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(llama_base_id)
teacher_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"":0})

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



### Loading the student model (our 11-layer Llama 1.5B)
Note: In our process here the extracted model or the extracted & already finetuned model can be loaded depending on the preferred order of training operations in our distil model pipeline

In [7]:
student_model = AutoModelForCausalLM.from_pretrained("/kaggle/input/base-11-model/")

In [8]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM

student_model = PeftModel.from_pretrained(student_model, "/kaggle/input/distil-11-finetuned")

In [9]:
# merging the lora matrices with the base weights
student_model = student_model.merge_and_unload()



In [10]:
student_model.model

LlamaModel(
  (embed_tokens): Embedding(32000, 4096, padding_idx=0)
  (layers): ModuleList(
    (0-10): 11 x LlamaDecoderLayer(
      (self_attn): LlamaAttention(
        (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        (k_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        (v_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        (rotary_emb): LlamaRotaryEmbedding()
      )
      (mlp): LlamaMLP(
        (gate_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
        (up_proj): Linear4bit(in_features=4096, out_features=11008, bias=False)
        (down_proj): Linear4bit(in_features=11008, out_features=4096, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): LlamaRMSNorm()
      (post_attention_layernorm): LlamaRMSNorm()
    )
  )
  (norm): LlamaRMSNorm()
)

# *IMPORANT: For only loading the model and testing jump directly to model loading section from here*

##### Preparing student for QLoRa (also used in our distillation process)

In [10]:
student_model.gradient_checkpointing_enable()

**The code line below has been caused a fatal error for the custom DistillationTrainer defined later on regarding some "inf checks ...", exact causal relation could not have been determined due to time reasons, interesting subject for further theoretical investigation :)**

In [11]:
# from peft import prepare_model_for_kbit_training
# student_model = prepare_model_for_kbit_training(student_model)

In [12]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [13]:
from peft import LoraConfig, get_peft_model

config = LoraConfig(
    r=8, 
    lora_alpha=32, 
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "down_proj", "up_proj"],
    lora_dropout=0.05, 
    bias="none", 
    task_type="CAUSAL_LM",
    inference_mode=False
)

student_model = get_peft_model(student_model, config)
print_trainable_parameters(student_model)

trainable params: 6871040 || all params: 1382172672 || trainable%: 0.4971187854595348


### Implementing custom trainer for distilloss calculation 
(code adapted from tiny-bert project https://github.com/philschmid/knowledge-distillation-transformers-pytorch-sagemaker/blob/master/knowledge-distillation.ipynb and significantly rewritten and adapted to fit our goal for Llama distillation)

In [14]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("CUDA is available")
else:
    device = torch.device("cpu")
    print("CUDA is not available")


CUDA is available


In [15]:
from transformers import TrainingArguments, Trainer
import torch.nn as nn
import torch.nn.functional as F

class DistillationTrainingArguments(TrainingArguments):
    def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
        super().__init__(*args, **kwargs)
        
        self.alpha = alpha
        self.temperature = temperature
        
class DistillationTrainer(Trainer):
    def __init__(self, *args, teacher_model=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher = teacher_model
        # place teacher on same device as student
        #self._move_model_to_device(self.teacher,self.model.device)
        self.teacher.eval()

    def compute_loss(self, model, inputs, return_outputs=False):
        ### trials from the debugging process, but not needed
        #print(inputs)
        # compute student output
#         if hasattr(model, "enable_input_require_grads"):
#             model.enable_input_require_grads()
#         else:
#             def make_inputs_require_grad(module, input, output):
#                  output.requires_grad_(True)

#             model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

        inputs = inputs.to(device)
    
        # requires_grad statements resulted from debugging process, probably mostly not needed 
        # also from a theoretical side this does probably not make sense totally
        # further investigation not possible though because of Kaggle GPU limit reached
        inputs.requires_grad=True
        model.enable_input_require_grads()
        model.requires_grad=True
        
        # student output
        outputs_student = model.generate(input_ids = inputs.input_ids, max_length=200, output_scores=True,return_dict_in_generate=True)
        outputs_student = outputs_student
        student_logits = outputs_student.scores # tensor of probit tensors (for every output subword)
        
        # teacher output
        outputs_teacher = self.teacher.generate(inputs.input_ids, max_length=200, output_scores=True,return_dict_in_generate=True)
        outputs_teacher = outputs_teacher
        teacher_logits = outputs_teacher.scores # tensor of probit tensors (for every output subword)
        
        
        # Soften probabilities and compute distillation loss
        loss_function = nn.KLDivLoss(reduction="batchmean")

        total_loss = torch.zeros(1).to(device)
        
        # adding up loss over all subwords
        for i in range(len(teacher_logits)):
            current_loss = (
                loss_function(
                    F.softmax(teacher_logits[i].requires_grad_(True) / self.args.temperature, dim=-1),
                    F.softmax(student_logits[i].requires_grad_(True) / self.args.temperature, dim=-1)) * (self.args.temperature ** 2)).to(device)
            #print(current_loss)
            current_loss = current_loss.reshape(1)
            #print(current_loss)
            
            # new tensor to store the accumulated loss 
            # (not possible to overwrite because the tensor object requires grad and is tracked by torch)
            new_total_loss = total_loss + current_loss

            # updating the initial total_loss variable
            total_loss = new_total_loss
            
        loss = total_loss.squeeze().float().abs() # .abs() for positive loss values instead of negative ones returned by custom loss fct
        #print(loss)
        return (loss, outputs_student) if return_outputs else loss

### Defining training params and running training

#### dataset preprocessing

**Note: These dataset questions have been custom generated by utilizing a base Llama 7B prompted to generate general questions to a topic (for instance for Big Data), so in fact our proposed distillation process requires no ground data at all**

In [16]:
data_big_data = pd.read_csv("/kaggle/input/qa-distillation-set/qa_big_data.csv")
data_ml_ds = pd.read_csv("/kaggle/input/qa-distillation-set/qa_ds_ml.csv")
data_sql = pd.read_csv("/kaggle/input/qa-distillation-set/qa_sql.csv")
data_sys = pd.read_csv("/kaggle/input/qa-distillation-set/qa_sys.csv")

In [17]:
data_sql.rename(columns={'0': 'Question'}, inplace=True)


In [18]:
data_ml_ds.rename(columns={'question': 'Question'}, inplace=True)

In [19]:
data_sql.head()

Unnamed: 0.1,Unnamed: 0,Question
0,1,What is the difference between a table and a d...
1,2,What is the purpose of a primary key in a rela...
2,3,How does a foreign key work in a relational da...
3,4,What is the difference between SQL and NoSQL d...
4,5,What is the purpose of a database schema in SQL?


In [20]:
data = pd.concat([data_big_data, data_ml_ds, data_sql, data_sys], axis=0)

In [21]:
data = data.reset_index()

In [22]:
data = data["Question"]

In [23]:
data = data.apply(tokenizer)

In [24]:
data = np.array(data)

**input transformed to embedding ids and mask (but mask is not there basically and will not further be used since learning takes place on the generation**

In [25]:
data[0]

{'input_ids': [1, 1724, 338, 7997, 3630, 29973], 'attention_mask': [1, 1, 1, 1, 1, 1]}

In [26]:
len(data)

180

#### Training

In [27]:
from torch.utils.data import DataLoader

dataloader = DataLoader(data, batch_size=2, shuffle=False)

tokenizer.pad_token = tokenizer.eos_token

In [28]:
# define training args
training_args = DistillationTrainingArguments(
    
    # general training params
    gradient_accumulation_steps=1,
    warmup_steps=2,
    num_train_epochs=0.4,
    per_device_train_batch_size=2,
    fp16=False,
    learning_rate=2e-5,
    seed=33,
    output_dir="/kaggle/working/halfinput",
    optim="paged_adamw_8bit",
    logging_steps=1,

    # distilation parameters
    #alpha=0.5, # alpha not needed since we only use teacher soft targets for loss, so no weighting factor needed
    temperature=3.0 # Hinton et. al. propose a high temperature for distillation
    )

In [29]:
trainer = DistillationTrainer(
    student_model,
    training_args,
    teacher_model=teacher_model,
    train_dataset=dataloader.dataset,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

In [30]:
from kaggle_secrets import UserSecretsClient
import wandb

user_secrets = UserSecretsClient()

my_secret = user_secrets.get_secret("new_wandb") 

wandb.login(key=my_secret)

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [31]:
#wandb.finish()

In [32]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mw02marcus[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
1,16828.0273
2,16831.1074
3,16336.2021
4,16626.2305
5,16622.2383
6,16720.4414
7,16902.8984
8,17246.7168
9,17050.1348
10,16813.0898


TrainOutput(global_step=36, training_loss=16717.99867078993, metrics={'train_runtime': 5740.2112, 'train_samples_per_second': 0.013, 'train_steps_per_second': 0.006, 'total_flos': 13986394791936.0, 'train_loss': 16717.99867078993, 'epoch': 0.4})

In [34]:
trainer.save_model("/kaggle/working/student_lora_adapters")

## Model loading and testing

**Only run the cell below if no training has taken place in this run**

the distil-11-final dataset contains the lora matrices we got from the qa distillation process, those are loaded onto the unified model (base model + finetuning lora matrices)

In [12]:
student_model = PeftModel.from_pretrained(student_model, "/kaggle/input/distil-11-final")

### Evaluation

In [13]:
pipeline_distil = transformers.pipeline(
    "text-generation",
    model=student_model,
    torch_dtype=torch.float16,
    device_map="auto",
    tokenizer=tokenizer
)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonF

In [14]:
sequences1 = pipeline_distil(
   f'[INST]At what point can we call something big data?[/INST]',
   do_sample=True,
   top_k=10,
   num_return_sequences=1,
   eos_token_id=tokenizer.eos_token_id,
   max_length=200,
)
# sequences2 = pipeline_distil(
#    f'[INST]Explain the RDBMS?[/INST]',
#    do_sample=True,
#    top_k=10,
#    num_return_sequences=1,
#    eos_token_id=tokenizer.eos_token_id,
#    max_length=200,
# )
# sequences3 = pipeline_distil(
#    f'[INST]What is Data Science?[/INST]',
#    do_sample=True,
#    top_k=10,
#    num_return_sequences=1,
#    eos_token_id=tokenizer.eos_token_id,
#    max_length=200,
# )

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [15]:
sequences1

[{'generated_text': '[INST]At what point can we call something big data?[/INST]The big data in the world nowaday holds this information: What are the data and what means are actually big data? Data was created after the 270 year programming. sierpion s values for the human data file 128482, with the next 8916 year with this data, the world is called "Data in the world nowadays". a database contains many more numbers than their numbers.  Data is stored for the future. This is one process called to release data from the entire universe. The entire database is stored under the entire view of the entire database, with its entire file stored in one full record. The world has been able to create an entire data value with a view of the world. Databases are now databases from the entire universe. The world is the world and all over there are. Datar Worlds have access to'}]

## ======================== ##

### Trials for investigating the inner structure of Llama forward pass, inputs, generation function and huggingface adaptions of those functions 

### IMPORTANT: Code execution might or will fail in some cells below, this is intended, because this is just for documentation of the dev and debugging process

**this section documents all the trials made to generate insights and overcome certain issues**

During the implementation of our custom trainer and the custom loss calculation several errors occured along the way. A couple of major important fixes have been added as comments along the way already.

Because we wanted to run a distillation without a ground dataset based only on a catalogue of questions, we had to redesign the compute_loss() function of the distillation trainer in a way that would allow us to generate whole answers in one pass and calculate the loss as a sum of differences between the subword probit tensors.

Since at first we could not find a way to make the .generate() function also return logits, because the huggingface .generate() differs from the Llama GitHub defined .generate() we also spend some substiantal amount of time in investigating the forward pass logic of Llama to somehow find a way to implement the generation logic ourselfes, so to basically recursively to forward passes. This approach had some issues though and we could thankfully dismiss it because of the capability to output the logits directly from the .generate()

tokenizer investigation

In [None]:
torch.tensor(tokenizer("What is big data?")["input_ids"])

In [266]:
prompt_template = "[INST]What is big data?[/INST]"

investigating input processing to embedding tensors

In [267]:
input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()

In [275]:
input_ids

tensor([[    1,   518, 25580, 29962,  5618,   338,  4802,   848, 29973, 29961,
         29914, 25580, 29962]], device='cuda:0')

investigation of generation function

In [1]:
# this fails, because only the .generate() defined in the Llama 2 Github features logprobs=True
# huggingface implementation features a standard .generate() method with different parameters
teacher_model.generate(input_ids, logprobs=True)

NameError: name 'teacher_model' is not defined

In [None]:
# output_scores=True has been the final solution to get all the logits
output_teacher = teacher_model.generate(input_ids, max_length=200, output_scores=True,return_dict_in_generate=True)

In [None]:
output_student.scores

shape investigation of logits returned

In [None]:
output_teacher.scores[0].shape

regenerating text out of the generation function outputs

In [None]:
generated_text = tokenizer.batch_decode(output_teacher, skip_special_tokens=True)

In [None]:
generated_text

In [271]:
student_model.generate(input_ids=input_ids, max_length=200)

tensor([[    1,   518, 25580, 29962,  5618,   338,  4802,   848, 29973, 29961,
         29914, 25580, 29962,  6970,   848,   338,   263,   376,  3752,   848,
         29908,   376,  3752,   848, 29908,   376,  3752,   848, 29908,   376,
          3752,   848, 29908,   376,  3752,   848, 29908,   376,  3752,   848,
         29908,   376,  3752,   848, 29908,   376,  3752,   848, 29908,   376,
          3752,   848, 29908,   376,  3752,   848, 29908,   376,  3752,   848,
         29908,   376,  3752,   848, 29908,   376,  3752,   848, 29908,   376,
          3752,   848, 29908,   376,  3752,   848, 29908,   376,  3752,   848,
         29908,   376,  3752,   848, 29908,   376,  3752,   848, 29908,   376,
          3752,   848, 29908,   376,  3752,   848, 29908,   376,  3752,   848,
         29908,   376,  3752,   848, 29908,   376,  3752,   848, 29908,   376,
          3752,   848, 29908,   376,  3752,   848, 29908,   376,  3752,   848,
         29908,   376,  3752,   848, 29908,   376,  

forward pass investigation

In [None]:
with torch.no_grad():
    result_teacher=teacher_model.forward(input_ids)

In [None]:
with torch.no_grad():
    result=student_model.forward(input_ids)

In [None]:
teacher_probits = F.softmax(result_teacher.logits, dim=-1)

In [None]:
tokenizer.decode(torch.argmax(teacher_probits))

In [None]:
tokenizer.decode(torch.argmax(torch.argmax(teacher_probits, dim=-1), dim=-1))

In [None]:
concatenated_tensor = torch.cat((input_ids, new_tensor), 1)

In [None]:
tokenizer.batch_decode(torch.argmax(teacher_probits, dim=-1))

In [None]:
input_ids

In [None]:
result_teacher=teacher_model.forward(input_ids)
teacher_probits = F.softmax(result_teacher.logits / 2.0, dim=-1)
new_tensor = torch.argmax(teacher_probits, dim=-1)
for i in range(0,5):
    with torch.no_grad():
        result_teacher=teacher_model.forward(new_tensor)
        teacher_probits = F.softmax(result_teacher.logits / 2.0, dim=-1)
        new_tensor = torch.argmax(teacher_probits, dim=-1)
        #concat_tensor = torch.cat((concat_tensor, new_tensor), 1)
        print(tokenizer.batch_decode(new_tensor))

In [None]:
tokenizer.batch_decode(concat_tensor)

In [None]:
result.logits

In [None]:
student_probits = F.log_softmax(result.logits / 2.0, dim=-1)

In [None]:
tokenizer.batch_decode(torch.argmax(student_probits, dim=-1))

In [None]:
loss_function = nn.KLDivLoss(reduction="batchmean")

In [None]:
loss_function(student_probits, teacher_probits)

In [None]:
generated_text_student = tokenizer.batch_decode(output_student, skip_special_tokens=True)

In [None]:
generated_text_student

In [None]:
student_model.generation_config