In [1]:
!pip install transformers==4.44.0
!pip install -U bitsandbytes



In [None]:
hf_token = "xd"
print(hf_token)

In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
import torch
import pickle
import json


set_seed(42)

def test_dola(model_name, hf_token=None):
    print(f"\nTesting {model_name}")
    
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, token=hf_token)
    
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    
    text = "On what date was the Declaration of Independence officially signed?"
    inputs = tokenizer(text, return_tensors="pt").to(device)
    
    results = {
        "model_name": model_name,
        "input_text": text,
        "outputs": {}
    }

    # Vanilla greedy decoding
    vanilla_output = model.generate(**inputs, do_sample=False, max_new_tokens=50)
    vanilla_text = tokenizer.decode(vanilla_output[0, inputs.input_ids.shape[-1]:], skip_special_tokens=True)
    results["outputs"]["vanilla"] = vanilla_text
    print("Vanilla output:", vanilla_text)
    
    # DoLa decoding with contrasting higher part of layers (layers 16,18,...,30)
    try:
        dola_high_output = model.generate(**inputs, do_sample=False, max_new_tokens=50, dola_layers='high')
        dola_high_text = tokenizer.decode(dola_high_output[0, inputs.input_ids.shape[-1]:], skip_special_tokens=True)
        results["outputs"]["dola_high"] = dola_high_text
        print("\nDoLA high output:", dola_high_text)
    except Exception as e:
        error_message = str(e)
        results["outputs"]["dola_high"] = f"Error: {error_message}"
        print(f"\nError occurred during DoLA high decoding: {error_message}")

    # DoLa decoding with contrasting specific layers (layers 28 and 30)
    try:
        dola_custom_output = model.generate(**inputs, do_sample=False, max_new_tokens=50, dola_layers=[28,30], repetition_penalty=1.2)
        dola_custom_text = tokenizer.decode(dola_custom_output[0, inputs.input_ids.shape[-1]:], skip_special_tokens=True)
        results["outputs"]["dola_custom"] = dola_custom_text
        print("\nDoLA custom output:", dola_custom_text)
    except Exception as e:
        error_message = str(e)
        results["outputs"]["dola_custom"] = f"Error: {error_message}"
        print(f"\nError occurred during DoLA custom decoding: {error_message}")

    filename = f"{model_name.replace('/', '_')}_results.json"

    # Save results to a JSON file
    with open(filename, 'w') as f:
        json.dump(results, f, indent=2)

    print(f"\nResults saved to {filename}")
    del model
    del tokenizer
    torch.cuda.empty_cache()
    import gc
    gc.collect()
    return results

In [13]:
test_dola("huggyllama/llama-7b", hf_token=hf_token)


Testing huggyllama/llama-7b


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:  88%|########7 | 8.75G/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Vanilla output: 
The Declaration of Independence was signed on July 4, 1776.
What was the date of the signing of the Declaration of Independence?
The Declaration of Independence was signed on July 4,

DoLA high output: 
July 4, 1776, when the Continental Congress voted to separate from Great Britain. The 56 delegates to the Continental Congress signed the Declaration on August 2, 1776.

DoLA custom output: 
It was officially signed on 2 August 1776, when 56 members of the Second Continental Congress, representing the original 13 American colonies, voted unanimously for the resolution for independence. The 2

Results saved to huggyllama_llama-7b_results.json


{'model_name': 'huggyllama/llama-7b',
 'input_text': 'On what date was the Declaration of Independence officially signed?',
 'outputs': {'vanilla': '\nThe Declaration of Independence was signed on July 4, 1776.\nWhat was the date of the signing of the Declaration of Independence?\nThe Declaration of Independence was signed on July 4,',
  'dola_high': '\nJuly 4, 1776, when the Continental Congress voted to separate from Great Britain. The 56 delegates to the Continental Congress signed the Declaration on August 2, 1776.',
  'dola_custom': '\nIt was officially signed on 2 August 1776, when 56 members of the Second Continental Congress, representing the original 13 American colonies, voted unanimously for the resolution for independence. The 2'}}

In [16]:
test_dola("google/gemma-2-2b-it", hf_token=hf_token)


Testing google/gemma-2-2b-it


tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

Vanilla output: 

The Declaration of Independence was adopted on **July 4, 1776**.  

However, it wasn't officially signed until **August 2, 1776**. 


DoLA high output: 

**Answer:**  **July 4, 1776** 


However, it's important to remember: 
* **The Declaration was adopted on July 4th.** The Continental Congress voted on July 4th

Error occurred during DoLA custom decoding: stack expects a non-empty TensorList

Results saved to google_gemma-2-2b-it_results.json


{'model_name': 'google/gemma-2-2b-it',
 'input_text': 'On what date was the Declaration of Independence officially signed?',
 'outputs': {'vanilla': "\n\nThe Declaration of Independence was adopted on **July 4, 1776**.  \n\nHowever, it wasn't officially signed until **August 2, 1776**. \n",
  'dola_high': "\n\n**Answer:**  **July 4, 1776** \n\n\nHowever, it's important to remember: \n* **The Declaration was adopted on July 4th.** The Continental Congress voted on July 4th",
  'dola_custom': 'Error: stack expects a non-empty TensorList'}}

In [14]:
test_dola("meta-llama/Meta-Llama-3.1-8B-Instruct", hf_token=hf_token)


Testing meta-llama/Meta-Llama-3.1-8B-Instruct


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Vanilla output:  August 2, 1776
A. August 2, 1776
B. July 4, 1776
C. August 2, 1776
D. July 4, 1776

Answer:


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



DoLA high output:  August 2, 1776
A. August 2, 1776
B. July 4, 1776
C. August 2, 1776
D. July 4, 1776
Answer:

DoLA custom output:  August 2, 1776
The Declaration of Independence was officially adopted and signed by the Continental Congress on July 4, 1776. The document was written primarily by Thomas Jefferson, with contributions from John Adams, Benjamin Franklin, Robert Livingston

Results saved to meta-llama_Meta-Llama-3.1-8B-Instruct_results.json


{'model_name': 'meta-llama/Meta-Llama-3.1-8B-Instruct',
 'input_text': 'On what date was the Declaration of Independence officially signed?',
 'outputs': {'vanilla': ' August 2, 1776\nA. August 2, 1776\nB. July 4, 1776\nC. August 2, 1776\nD. July 4, 1776\n\nAnswer:',
  'dola_high': ' August 2, 1776\nA. August 2, 1776\nB. July 4, 1776\nC. August 2, 1776\nD. July 4, 1776\nAnswer:',
  'dola_custom': ' August 2, 1776\nThe Declaration of Independence was officially adopted and signed by the Continental Congress on July 4, 1776. The document was written primarily by Thomas Jefferson, with contributions from John Adams, Benjamin Franklin, Robert Livingston'}}

In [15]:
test_dola("mistralai/Mistral-7B-Instruct-v0.1", hf_token=hf_token)


Testing mistralai/Mistral-7B-Instruct-v0.1


tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Vanilla output: 

July 4, 1776


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



DoLA high output: 

July 4, 1776

DoLA custom output: 
July 4, 1776

Results saved to mistralai_Mistral-7B-Instruct-v0.1_results.json


{'model_name': 'mistralai/Mistral-7B-Instruct-v0.1',
 'input_text': 'On what date was the Declaration of Independence officially signed?',
 'outputs': {'vanilla': '\n\nJuly 4, 1776',
  'dola_high': '\n\nJuly 4, 1776',
  'dola_custom': '\nJuly 4, 1776'}}

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
import torch
import pickle
import json


set_seed(42)

def test_dola_u(model_name, hf_token=None):
    print(f"\nTesting {model_name}")
    
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
    model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, token=hf_token)
    
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    model
    
    text = "On what date was the Declaration of Independence officially signed?"
    inputs = tokenizer(text, return_tensors="pt")
    
    results = {
        "model_name": model_name,
        "input_text": text,
        "outputs": {}
    }

    # Vanilla greedy decoding
    vanilla_output = model.generate(**inputs, do_sample=False, max_new_tokens=50)
    vanilla_text = tokenizer.decode(vanilla_output[0, inputs.input_ids.shape[-1]:], skip_special_tokens=True)
    results["outputs"]["vanilla"] = vanilla_text
    print("Vanilla output:", vanilla_text)
    print("Device of inputs:", inputs["input_ids"].device)
    inputs.to(device)
    print("Device of inputs:", inputs["input_ids"].device)

    
    # DoLa decoding with contrasting higher part of layers (layers 16,18,...,30)
    try:
        dola_high_output = model.generate(**inputs, do_sample=False, max_new_tokens=50, dola_layers='high')
        dola_high_text = tokenizer.decode(dola_high_output[0, inputs.input_ids.shape[-1]:], skip_special_tokens=True)
        results["outputs"]["dola_high"] = dola_high_text
        print("\nDoLA high output:", dola_high_text)
    except Exception as e:
        error_message = str(e)
        results["outputs"]["dola_high"] = f"Error: {error_message}"
        print(f"\nError occurred during DoLA high decoding: {error_message}")

    # DoLa decoding with contrasting specific layers (layers 28 and 30)
    try:
        dola_custom_output = model.generate(**inputs, do_sample=False, max_new_tokens=50, dola_layers=[28,30], repetition_penalty=1.2)
        dola_custom_text = tokenizer.decode(dola_custom_output[0, inputs.input_ids.shape[-1]:], skip_special_tokens=True)
        results["outputs"]["dola_custom"] = dola_custom_text
        print("\nDoLA custom output:", dola_custom_text)
    except Exception as e:
        error_message = str(e)
        results["outputs"]["dola_custom"] = f"Error: {error_message}"
        print(f"\nError occurred during DoLA custom decoding: {error_message}")

    filename = f"{model_name.replace('/', '_')}_results.json"

    # Save results to a JSON file
    with open(filename, 'w') as f:
        json.dump(results, f, indent=2)

    
    print(f"\nResults saved to {filename}")

    del model
    del tokenizer
    torch.cuda.empty_cache()
    import gc
    gc.collect()
    return results

In [5]:
# check unsloth lib
test_dola_u("unsloth/Meta-Llama-3.1-8B-bnb-4bit", hf_token=hf_token)


Testing unsloth/Meta-Llama-3.1-8B-bnb-4bit


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]



Vanilla output:  The Declaration of Independence was signed on August 2, 1776. The Declaration of Independence was signed on August 2, 1776. The Declaration of Independence was signed on August 2, 1776. The Declaration of Independence was
Device of inputs: cpu
Device of inputs: cuda:0





DoLA high output:  On July 4, 1776, the Continental Congress adopted the Declaration of Independence, a historic document asserting America’s independence from Great Britain. More than two months had passed since Thomas Jefferson wrote the first draft of the Declaration, on June 28

DoLA custom output:  The answer is July 4, 1776. But the truth is the document was not actually signed by the delegates to the Second Continental Congress on that day.
The Declaration of Independence was written in June, 1776, and the Second Continental

Results saved to unsloth_Meta-Llama-3.1-8B-bnb-4bit_results.json


{'model_name': 'unsloth/Meta-Llama-3.1-8B-bnb-4bit',
 'input_text': 'On what date was the Declaration of Independence officially signed?',
 'outputs': {'vanilla': ' The Declaration of Independence was signed on August 2, 1776. The Declaration of Independence was signed on August 2, 1776. The Declaration of Independence was signed on August 2, 1776. The Declaration of Independence was',
  'dola_high': ' On July 4, 1776, the Continental Congress adopted the Declaration of Independence, a historic document asserting America’s independence from Great Britain. More than two months had passed since Thomas Jefferson wrote the first draft of the Declaration, on June 28',
  'dola_custom': ' The answer is July 4, 1776. But the truth is the document was not actually signed by the delegates to the Second Continental Congress on that day.\nThe Declaration of Independence was written in June, 1776, and the Second Continental'}}

In [20]:
import torch
torch.cuda.empty_cache()
import gc
gc.collect()

0

In [21]:
!kill -9 3010    

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


/bin/bash: line 0: kill: (3010) - No such process


# Results

In [17]:
import os
import json
import pandas as pd

path_with_json = "/kaggle/working"

json_files = [pos_json for pos_json in os.listdir(path_with_json) if pos_json.endswith('.json')]

# Initialize an empty list to store the data
data = []

# Loop through each JSON file and load the data
for js in json_files:
    with open(os.path.join(path_with_json, js)) as json_file:
        json_text = json.load(json_file)
        for key in json_text['outputs']:
            json_text[f"{key}_output"] = json_text['outputs'][key]
        
        del json_text["outputs"]
        
        data.append(json_text)

# Convert list of dictionaries to DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print(df)


                              model_name  \
0                   google/gemma-2-2b-it   
1     unsloth/Meta-Llama-3.1-8B-bnb-4bit   
2     mistralai/Mistral-7B-Instruct-v0.1   
3                    huggyllama/llama-7b   
4  meta-llama/Meta-Llama-3.1-8B-Instruct   

                                          input_text  \
0  On what date was the Declaration of Independen...   
1  On what date was the Declaration of Independen...   
2  On what date was the Declaration of Independen...   
3  On what date was the Declaration of Independen...   
4  On what date was the Declaration of Independen...   

                                      vanilla_output  \
0  \n\nThe Declaration of Independence was adopte...   
1   The Declaration of Independence was signed on...   
2                                   \n\nJuly 4, 1776   
3  \nThe Declaration of Independence was signed o...   
4   August 2, 1776\nA. August 2, 1776\nB. July 4,...   

                                    dola_high_output  \
0  \n

In [23]:
!pip install 'unsloth[cu121]'

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [24]:
!pip install torch==2.1.0

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [8]:
from unsloth import FastLanguageModel
import torch

model_name = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"  # Replace with your actual model name
max_seq_length = 512  # Adjust as necessary
dtype = torch.float16  # Assuming you're using fp16
load_in_4bit = True  # Set based on your needs

# Initialize the FastLanguageModel
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=hf_token
)


==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.0.
   \\   /|    GPU: Tesla P100-PCIE-16GB. Max memory: 15.888 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.1.0+cu121. CUDA = 6.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.22.post7. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [10]:
text = "On what date was the Declaration of Independence officially signed?"
inputs = tokenizer(text, return_tensors="pt")
results = {
    "model_name": model_name,
    "input_text": text,
    "outputs": {}
}

# Vanilla greedy decoding
vanilla_output = model.generate(**inputs, do_sample=False, max_new_tokens=50)
vanilla_text = tokenizer.decode(vanilla_output[0, inputs.input_ids.shape[-1]:], skip_special_tokens=True)
results["outputs"]["vanilla"] = vanilla_text
print("Vanilla output:", vanilla_text)

KeyError: 'Cache only has 0 layers, attempted to access layer with index 0'