## import packages

In [2]:
# import packages
import os
from tqdm.notebook import tqdm
import warnings
import json
import torch.nn.functional as F
import torch
import gc
from transformers import AutoTokenizer, AutoModelForCausalLM
from datetime import datetime

## load data

In [3]:
# load from json file

data_path = "./arxiv_pdfs_cs_24_1_2000_to_7000.json"


def load_list_from_json(file_path):
    """
    Loads a list of strings from a JSON file.

    :param file_path: Path of the JSON file to be loaded.
    :return: List of strings loaded from the JSON file.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)
    

extracted_texts = load_list_from_json(data_path)

print(len(extracted_texts))
# print([len(x) for x in extracted_texts])

1000


In [4]:
for text in extracted_texts[:100]:
    print(text)
    print('-' * 100)

nlocation is compared with a database of reference imagesfrom known locations in order to localize the query image.The location of the query image is estimated by identifyingthe closest matching image in the reference image database.This task is challenging due to variations in seasons, illumi-nation, viewpoint, and occlusions. Typically, two types ofimage representations are used in VPR tasks: global andpatch-level descriptors. Global descriptors [2]–[4] provide asuccinct image representation in a single vector, facilitatingefficient large-scale searches. Patch-level or local descriptors[5]–[7] encode details about specific regions or key points ofthe image and are used for performing geometric verificationbetween image pairs.To enhance performance, VPR is commonly executed intwo distinct phases. Initially, a global retrieval is conductedThe authors are with SMART Lab, Department of Computer and In-formation Technology, Purdue University, West Lafayette, IN 47907, USAkannan9@purdue.ed

## Now evaluating the models

In [5]:
chunk_size = 1024
log_folder_path = './logs/'

In [6]:
def calculate_log_sum(logits, target_token_ids):
    shifted_logits = logits[:-1, :]
    shifted_targets = target_token_ids[1:]
    
    log_probs = F.log_softmax(shifted_logits, dim=-1)
    
    target_log_probs = -log_probs.gather(1, shifted_targets.unsqueeze(1)).squeeze()
    # print(target_log_probs)
    
    log_sum = torch.sum(target_log_probs, dim=-1)
    # print(perplexity_sum)

    return log_sum.item()


def print_model_parameters_in_billions(model):
    
    total_params = sum(p.numel() for p in model.parameters())
    
    total_params_billion = total_params / 1e9
    
    print(f"Model parameters: {total_params_billion:.3f} billion")
    
    
def log(data_dict, folder_path):
    if not os.path.exists(folder_path):
        try:
            os.makedirs(folder_path)
            print(f"Directory created at {folder_path}")
        except Exception as e:
            print(f"Error creating directory: {e}")
            return

    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    file_name = f"{timestamp}.json"
    file_path = os.path.join(folder_path, file_name)

    try:
        with open(file_path, 'w') as file:
            json.dump(data_dict, file, indent=4)
        print(f"Dictionary saved successfully to {file_path}")
    except Exception as e:
        print(f"Error saving dictionary: {e}")

## Evaluate RWKV(v5/v6)

In [7]:
# load rwkv model
model_name_or_path = r'../models/rwkv_5_3b/RWKV-5-World-3B-v2-20231113-ctx4096.pth'

os.environ['RWKV_JIT_ON'] = '1'
os.environ["RWKV_CUDA_ON"] = '1'

from rwkv.model import RWKV
from rwkv.utils import PIPELINE

model = RWKV(model=model_name_or_path, strategy='cuda fp16')
pipeline = PIPELINE(model, r"rwkv_vocab_v20230424")
# pipeline = PIPELINE(model, "./models/20B_tokenizer.json")  # v4
tokenizer = pipeline.tokenizer

Using /root/.cache/torch_extensions/py38_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py38_cu118/wkv_cuda/build.ninja...
Building extension module wkv_cuda...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module wkv_cuda...


ninja: no work to do.
RWKV_JIT_ON 1 RWKV_CUDA_ON 1 RESCALE_LAYER 6

Loading ../models/rwkv_5_3b/RWKV-5-World-3B-v2-20231113-ctx4096.pth ...
Model detected: v5.2
Strategy: (total 32+1=33 layers)
* cuda [float16, float16], store 33 layers
0-cuda-float16-float16 1-cuda-float16-float16 2-cuda-float16-float16 3-cuda-float16-float16 4-cuda-float16-float16 5-cuda-float16-float16 6-cuda-float16-float16 7-cuda-float16-float16 8-cuda-float16-float16 9-cuda-float16-float16 10-cuda-float16-float16 11-cuda-float16-float16 12-cuda-float16-float16 13-cuda-float16-float16 14-cuda-float16-float16 15-cuda-float16-float16 16-cuda-float16-float16 17-cuda-float16-float16 18-cuda-float16-float16 19-cuda-float16-float16 20-cuda-float16-float16 21-cuda-float16-float16 22-cuda-float16-float16 23-cuda-float16-float16 24-cuda-float16-float16 25-cuda-float16-float16 26-cuda-float16-float16 27-cuda-float16-float16 28-cuda-float16-float16 29-cuda-float16-float16 30-cuda-float16-float16 31-cuda-float16-float16 32-cu

Using /root/.cache/torch_extensions/py38_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py38_cu118/rwkv5/build.ninja...
Building extension module rwkv5...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


ninja: no work to do.


Loading extension module rwkv5...


In [8]:
# eval rwkv
rwkv_test_data = []
rwkv_token_length_list = []

for idx, sample in tqdm(enumerate(extracted_texts), total=len(extracted_texts)):
    
    with torch.no_grad():
        
        input_seq = tokenizer.encode(sample)
        # input_seq = tokenizer.encode(sample).ids # v4
        input_length = len(input_seq)
        
        neg_log_prob_temp = 0
        for begin in range(0, input_length, chunk_size):
            input_chunk = input_seq[begin: begin + chunk_size]
            

            logit = model.forward(input_chunk, None, full_output=True)[0]
            
            if len(input_chunk) == 1:
                logit = logit.unsqueeze(0)

            log_sum = calculate_log_sum(logit, torch.tensor(input_chunk).cuda())
            
            neg_log_prob_temp += log_sum

        rwkv_token_length_list.append(input_length)
        rwkv_test_data.append(neg_log_prob_temp)
        
data_dict = {
    'model_name_or_path': model_name_or_path,
    'data_path': data_path,
    'neg_log_prob_sum': sum(rwkv_test_data) / len(rwkv_test_data),
    'avg tokens': sum(rwkv_token_length_list) / len(rwkv_token_length_list),
       }

log(data_dict, log_folder_path)
        
print(f'log probability sum: {sum(rwkv_test_data) / len(rwkv_test_data):.2f}')
print(f'avg tokens: {sum(rwkv_token_length_list) / len(rwkv_token_length_list):.0f}')

  0%|          | 0/1000 [00:00<?, ?it/s]

 does not have profile information (Triggered internally at ../third_party/nvfuser/csrc/graph_fuser.cpp:104.)
  r, k, v, g, xxx, ss = self.v5_2_before(x, sx, s, ln_w, ln_b, lx_w, lx_b, k_mix, v_mix, r_mix, g_mix, t_decay, t_first, kw, vw, rw, gw, ow, kmx, krx, kmy, kry, vmx, vrx, vmy, vry, rmx, rrx, rmy, rry, gmx, grx, gmy, gry, omx, orx, omy, ory)


Dictionary saved successfully to ./logs/2024-02-29_23-54-19.json
log probability sum: 2900.97
avg tokens: 1143


In [9]:
del model, pipeline, tokenizer, logit

gc.collect()
torch.cuda.empty_cache()

## Evaluate Hugging Face models

In [10]:
# load model

model_name_or_path = r"stabilityai/stablelm-3b-4e1t"
cache_dir = '../models/temp/'

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, 
                                             device_map="cuda", 
                                             trust_remote_code=True, 
                                             cache_dir=cache_dir).eval()

print_model_parameters_in_billions(model)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Model parameters: 2.795 billion


In [11]:
# eval
data = []
token_length_list = []

for idx, sample in tqdm(enumerate(extracted_texts), total=len(extracted_texts)):
    
    with torch.no_grad():
    
        inputs = tokenizer(sample, return_tensors='pt')
        inputs = inputs.to(model.device)

        seq_length = inputs['input_ids'].shape[-1]
        
        neg_log_prob_temp = 0
        for begin in range(0, seq_length, chunk_size):
            
            input_chunk = inputs['input_ids'][:, begin: begin + chunk_size]

            logit = model.forward(input_ids=input_chunk).logits[0, :, :]

            log_sum = calculate_log_sum(logit, input_chunk.squeeze(0))
            neg_log_prob_temp += log_sum

        token_length_list.append(seq_length)
        data.append(neg_log_prob_temp)
        
data_dict = {
    'model_name_or_path': model_name_or_path,
    'data_path': data_path,
    'neg_log_prob_sum': sum(data) / len(data),
    'avg tokens': sum(token_length_list) / len(token_length_list),
       }

log(data_dict, log_folder_path)

print(f'log probability sum: {sum(data) / len(data):.2f}')
print(f'avg tokens: {sum(token_length_list) / len(token_length_list):.0f}')

  0%|          | 0/1000 [00:00<?, ?it/s]

Dictionary saved successfully to ./logs/2024-02-29_23-59-02.json
log probability sum: 2894.56
avg tokens: 1149


In [12]:
del model, tokenizer, logit, inputs

gc.collect()
torch.cuda.empty_cache()

## Evaluate Mamba

In [13]:
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel

model_name_or_path = "state-spaces/mamba-2.8b-slimpj"

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
model = MambaLMHeadModel.from_pretrained(model_name_or_path, device="cuda", dtype=torch.float16)
device = torch.device('cuda')

print_model_parameters_in_billions(model)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Model parameters: 2.768 billion


In [14]:
# eval
data = []
token_length_list = []

for idx, sample in tqdm(enumerate(extracted_texts), total=len(extracted_texts)):
    
    with torch.no_grad():
    
        inputs = tokenizer(sample, return_tensors='pt')
        inputs = inputs.to(device)

        seq_length = inputs['input_ids'].shape[-1]
        
        neg_log_prob_temp = 0
        for begin in range(0, seq_length, chunk_size):
            
            input_chunk = inputs['input_ids'][:, begin: begin + chunk_size]

            logit = model.forward(input_ids=input_chunk).logits[0, :, :]

            log_sum = calculate_log_sum(logit, input_chunk.squeeze(0))
            neg_log_prob_temp += log_sum

        token_length_list.append(seq_length)
        data.append(neg_log_prob_temp)
        
data_dict = {
    'model_name_or_path': model_name_or_path,
    'data_path': data_path,
    'neg_log_prob_sum': sum(data) / len(data),
    'avg tokens': sum(token_length_list) / len(token_length_list),
       }

log(data_dict, log_folder_path)

print(f'log probability sum: {sum(data) / len(data):.2f}')
print(f'avg tokens: {sum(token_length_list) / len(token_length_list):.0f}')

  0%|          | 0/1000 [00:00<?, ?it/s]

Dictionary saved successfully to ./logs/2024-03-01_00-01-53.json
log probability sum: 3165.90
avg tokens: 1149


In [15]:
del model, tokenizer, logit, inputs

gc.collect()
torch.cuda.empty_cache()