# **SETUP**

## **Installs**

In [1]:
! pip install torch==2.0.1 transformers datasets peft accelerate trl bitsandbytes optimum auto-gptq

Collecting torch==2.0.1
  Downloading torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl (619.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m619.9/619.9 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-2.17.0-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.6/536.6 kB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.8.2-py3-none-any.whl (183 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.4/183.4 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m32.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting trl
  Downloading trl-0.7.11-py3-none-any.whl (155 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m155.3/155.3 kB[0m [31m19.7 MB/s[0m eta 

## **Import**

In [2]:
import torch
from datasets import Dataset, load_dataset
from peft import AutoPeftModelForCausalLM, LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoTokenizer, TrainingArguments, AutoModelForCausalLM, GPTQConfig, GenerationConfig
from trl import DPOTrainer
import pandas as pd
import time

## **HuggingFace Login**

In [3]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## **Params**

In [4]:
HG_MODEL_NAME = "TheBloke/OpenHermes-2-Mistral-7B-GPTQ"
HG_TOKENIZER_NAME = HG_MODEL_NAME
HG_DATASET_NAME = "HuggingFaceH4/ultrafeedback_binarized"
#TOKEN = 'ENTER TOKEN HERE'
TOKEN = "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"
#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

# **DATASET, MODEL, CONFIGS**

## **Dataset and Preprocessing**

In [5]:
def hg_data(hg_dataset_name, split, token):
   """Loads and prepares a Hugging Face dataset for training and validation.

   Args:
       hg_dataset_name (str): Name of the Hugging Face dataset to load.
       split (str): Split of the dataset to load (e.g., "train", "test").
       token (str): Hugging Face access token.

   Returns:
       tuple: Tuple containing the prepared train and validation datasets.
   """

   # Load the dataset from Hugging Face
   dataset = load_dataset(hg_dataset_name, split=split, token=token)

   # Create a list of prompts from the original dataset
   original_columns = dataset.column_names
   dataset = dataset.map(
       lambda sample: {
           "prompt": [prompt for prompt in sample["prompt"]],
           "chosen": sample["chosen"],
           "rejected": sample["rejected"],
       },
       batched=True,
       remove_columns=original_columns,
   )

   # Convert to pandas DataFrame, handle missing values, and extract content
   train_df = dataset.to_pandas().dropna()
   train_df["chosen"] = train_df["chosen"].str.get(1).str.get("content")
   train_df["rejected"] = train_df["rejected"].str.get(1).str.get("content")

   print(train_df.iloc[0])
   print(train_df.columns)

   # Create a small validation set from the training data
   val_df = train_df.sample(10)

   # Convert back to Hugging Face datasets for training and validation
   train_data = Dataset.from_pandas(train_df)
   val_data = Dataset.from_pandas(val_df)

   return train_data, val_data

In [6]:
from sklearn.model_selection import train_test_split

def socratic_data(path_to_file, split, limit):
  name_mapping = {"Prompt": "prompt", "Good answer": "chosen", "Bad answer": "rejected"}
  df = pd.read_csv(path_to_file, names=['prompt', 'rejected', 'chosen'], header=0)

  df['rejected'] = "The answer is, here is the solution"

  df = df.dropna()

  df = df.sample(n=limit)

  train_df, test_df = train_test_split(df, test_size=split, random_state=42)

  print(train_df.iloc[5])
  print(test_df.iloc[3])

  train_data = Dataset.from_pandas(train_df)
  val_data = Dataset.from_pandas(test_df)
  return train_data, val_data

In [7]:
#train_data, val_data = hg_data(HG_DATASET_NAME, "test_prefs", TOKEN)
socratic_data, socratic_val_data = socratic_data("/content/dataset.csv", 0.1, 5000)

prompt      Student: Professor, I have a problem. Calculat...
rejected                  The answer is, here is the solution
chosen                           Teacher: Exactly! Well done!
Name: 19826, dtype: object
prompt      Student: Professor, how much profit will the t...
rejected                  The answer is, here is the solution
chosen      Teacher: (smiling) Ah, I see! That's a good at...
Name: 8549, dtype: object


## **Models and Tokenizer**

In [8]:
model = AutoModelForCausalLM.from_pretrained(HG_MODEL_NAME, torch_dtype=torch.float16, low_cpu_mem_usage=True, quantization_config=GPTQConfig(bits=4, disable_exllama=True))

model_ref = AutoModelForCausalLM.from_pretrained(HG_MODEL_NAME, torch_dtype=torch.float16, low_cpu_mem_usage=True, quantization_config=GPTQConfig(bits=4, disable_exllama=True))

tokenizer = AutoTokenizer.from_pretrained(HG_TOKENIZER_NAME)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute and has already quantized weights. However, loading attributes (e.g. use_exllama, exllama_config, use_cuda_fp16, max_input_length) will be overwritten with the one you passed to `from_pretrained`. The rest will be ignored.


model.safetensors:   0%|          | 0.00/4.16G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/136 [00:00<?, ?B/s]

Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.
You passed `quantization_config` to `from_pretrained` but the model you're loading already has a `quantization_config` attribute and has already quantized weights. However, loading attributes (e.g. use_exllama, exllama_config, use_cuda_fp16, max_input_length) will be overwritten with the one you passed to `from_pretrained`. The rest will be ignored.


tokenizer_config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/174 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## **Peft Config and Model Setup**

In [9]:
peft_config = LoraConfig(
        r=8,
        lora_alpha=8,
        lora_dropout=0.1,
        target_modules=["q_proj", "v_proj"],
        bias="none",
        task_type="CAUSAL_LM",
    )
peft_config.inference_mode = False

In [10]:
model = prepare_model_for_kbit_training(model)
model.config.use_cache=False
model.gradient_checkpointing_enable()
model.config.pretraining_tp=1
model = get_peft_model(model, peft_config)

# **TRAINING**

## **Training**

In [14]:
training_args = TrainingArguments(
        per_device_train_batch_size=1,
        max_steps=30,
        remove_unused_columns=False,
        gradient_accumulation_steps=1,
        learning_rate=2e-4,
        evaluation_strategy="steps",
        logging_first_step=True,
        logging_steps=10,
        output_dir="openhermes-mistral-dpo-gptq",
        optim="paged_adamw_32bit",
        warmup_steps=2,
        fp16=True,
        push_to_hub=True
    )

In [15]:
dpo_trainer = DPOTrainer(
        model,
        model_ref,
        args=training_args,
        beta=0.1,
        train_dataset=socratic_data,
        eval_dataset=socratic_val_data,
        tokenizer=tokenizer,
        max_length=512,
        max_target_length=256,
        max_prompt_length=256
    )

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [28]:
dpo_trainer.train()

Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [29]:
dpo_trainer.save_model("/models")

events.out.tfevents.1708211737.1827e93c0501.461.1:   0%|          | 0.00/5.50k [00:00<?, ?B/s]

# **INFERENCE**

In [30]:
input = tokenizer("""Student: I would like to know how many pieces of 12% I can fit in a 100%, Teacher: """, return_tensors="pt").to("cuda")

trained_model = AutoPeftModelForCausalLM.from_pretrained(
    "openhermes-mistral-dpo-gptq",
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="cuda")

Using `disable_exllama` is deprecated and will be removed in version 4.37. Use `use_exllama` instead and specify the version with `exllama_config`.The value of `use_exllama` will be overwritten by `disable_exllama` passed in `GPTQConfig` or stored in your config file.


OutOfMemoryError: CUDA out of memory. Tried to allocate 28.00 MiB (GPU 0; 14.75 GiB total capacity; 14.29 GiB already allocated; 3.06 MiB free; 14.60 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [19]:
generation_config = GenerationConfig(
    do_sample=True,
    top_k=1,
    temperature=0.1,
    max_new_tokens=256,
    pad_token_id=tokenizer.eos_token_id
)

st_time = time.time()
trained_output = trained_model.generate(**input, generation_config=generation_config)
print(f"Trained: {tokenizer.decode(trained_output[0], skip_special_tokens=True)}")


st_time = time.time()
ref_output = model_ref.generate(**input, generation_config=generation_config)
print(f"Untrained: {tokenizer.decode(ref_output[0], skip_special_tokens=True)}")

Trained: I would like to know how many pieces of 12% I can fit in a 100% pie chart.

I know that 12% is 1/8 of the whole, so I can fit 8 pieces of 12% in a 100% pie chart.

I can also divide 100% by 12% to find out how many times 12% goes into 100%.

100% ÷ 12% = 8.333333333333333

So, I can fit 8.333333333333333 pieces of 12% in a 100% pie chart.

However, since I can only have whole pieces in a pie chart, I can only fit 8 pieces of 12% in a 100% pie chart.
Untrained: I would like to know how many pieces of 12% I can fit in a 100% solution.

I am trying to find the number of 12% solutions that can be mixed to make a 100% solution.

I know that the 12% solution is 12/100 = 0.12.

I also know that the 100% solution is 100/100 = 1.

I want to find the number of 12% solutions that can be mixed to make a 100% solution.

I can use the formula for mixing two solutions:

100% = x * 12% + (100 - x) * 100%

Simplifying this equation gives me:

x * 12% = 100% - 100%

x * 12% = 0%

Dividing both 