## import packages

In [1]:
# import pkg
import os
from tqdm.notebook import tqdm
import warnings
import json
import torch.nn.functional as F
import torch
import gc
from transformers import AutoTokenizer, AutoModelForCausalLM

## load data

In [2]:
# load from json file

file_path = "./arxiv_pdfs_cs_24_1_2000_to_7000.json"


def load_list_from_json(file_path):
    """
    Loads a list of strings from a JSON file.

    :param file_path: Path of the JSON file to be loaded.
    :return: List of strings loaded from the JSON file.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)
    

extracted_texts = load_list_from_json(file_path)

print(len(extracted_texts))
# print([len(x) for x in extracted_texts])

1000


In [None]:
for text in extracted_texts[:100]:
    print(text)
    print('-' * 100)

## Now evaluating the models

In [3]:
def calculate_log_sum(logits, target_token_ids):
    shifted_logits = logits[:-1, :]
    shifted_targets = target_token_ids[1:]
    
    log_probs = F.log_softmax(shifted_logits, dim=-1)
    
    target_log_probs = -log_probs.gather(1, shifted_targets.unsqueeze(1)).squeeze()
    # print(target_log_probs)
    
    log_sum = torch.sum(target_log_probs, dim=-1)
    # print(perplexity_sum)

    return log_sum.item()

## Evaluate RWKV

In [6]:
# load rwkv model
rwkv5_7b_path = r'../rwkv5_7b/RWKV-v5-Eagle-World-7B-v2-20240128-ctx4096.pth'
max_length = 4096
truncate = True

os.environ['RWKV_JIT_ON'] = '1'
os.environ["RWKV_CUDA_ON"] = '1'

from rwkv.model import RWKV
from rwkv.utils import PIPELINE

rwkv5_7b = RWKV(model=rwkv5_7b_path, strategy='cuda fp16')
pipeline = PIPELINE(rwkv5_7b, r"rwkv_vocab_v20230424")
rwkv_tokenizer = pipeline.tokenizer

Using /root/.cache/torch_extensions/py38_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py38_cu118/wkv_cuda/build.ninja...
Building extension module wkv_cuda...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)


ninja: no work to do.
RWKV_JIT_ON 1 RWKV_CUDA_ON 1 RESCALE_LAYER 6

Loading ../rwkv5_7b/RWKV-v5-Eagle-World-7B-v2-20240128-ctx4096.pth ...


Loading extension module wkv_cuda...


Model detected: v5.2
Strategy: (total 32+1=33 layers)
* cuda [float16, float16], store 33 layers
0-cuda-float16-float16 1-cuda-float16-float16 2-cuda-float16-float16 3-cuda-float16-float16 4-cuda-float16-float16 5-cuda-float16-float16 6-cuda-float16-float16 7-cuda-float16-float16 8-cuda-float16-float16 9-cuda-float16-float16 10-cuda-float16-float16 11-cuda-float16-float16 12-cuda-float16-float16 13-cuda-float16-float16 14-cuda-float16-float16 15-cuda-float16-float16 16-cuda-float16-float16 17-cuda-float16-float16 18-cuda-float16-float16 19-cuda-float16-float16 20-cuda-float16-float16 21-cuda-float16-float16 22-cuda-float16-float16 23-cuda-float16-float16 24-cuda-float16-float16 25-cuda-float16-float16 26-cuda-float16-float16 27-cuda-float16-float16 28-cuda-float16-float16 29-cuda-float16-float16 30-cuda-float16-float16 31-cuda-float16-float16 32-cuda-float16-float16 
emb.weight                        f16      cpu  65536  4096 
blocks.0.ln1.weight               f16   cuda:0   4096      

Using /root/.cache/torch_extensions/py38_cu118 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py38_cu118/rwkv5/build.ninja...
Building extension module rwkv5...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module rwkv5...


ninja: no work to do.


In [9]:
# eval rwkv
rwkv_test_data = []
rwkv_token_length_list = []
overlong_seq = 0

for idx, sample in tqdm(enumerate(extracted_texts), total=len(extracted_texts)):
    
    with torch.no_grad():
        
        input_seq = rwkv_tokenizer.encode(sample)

        if len(input_seq) > max_length:
            overlong_seq += 1
            warnings.warn(f'seq-{idx} length > {max_length}')
            
            if truncate:
                input_seq = input_seq[:max_length]
                factor = len(input_seq) / max_length
        else:
            factor = 1
            

        logit = rwkv5_7b.forward(input_seq, None, full_output=True)[0]

        log_sum = calculate_log_sum(logit, torch.tensor(input_seq).cuda())
        log_sum *= factor

        rwkv_token_length_list.append(len(input_seq))
        rwkv_test_data.append(log_sum)
        
print(f'log probability sum: {sum(rwkv_test_data) / len(rwkv_test_data)}')
print(f'avg tokens: {sum(rwkv_token_length_list) / len(rwkv_token_length_list)}')
print(f'overlong_seq: {overlong_seq}')

  0%|          | 0/1000 [00:00<?, ?it/s]



log probability sum: 5171.20875805664
avg tokens: 2368.164
overlong_seq: 2


In [10]:
del rwkv5_7b, pipeline, rwkv_tokenizer, logit

gc.collect()
torch.cuda.empty_cache()

## Evaluate Hugging Face models

In [4]:
# load model


# llama2
llama2_7b_path = r"../Llama-2-7b-hf/"
max_length = 4096
truncate = True

tokenizer = AutoTokenizer.from_pretrained(llama2_7b_path)
model = AutoModelForCausalLM.from_pretrained(llama2_7b_path, device_map="auto", trust_remote_code=True).eval()


# mistral
# mistral_7b_path = r"../mistral_7b/"
# max_length = 8192
# truncate = True

# tokenizer = AutoTokenizer.from_pretrained(mistral_7b_path)
# model = AutoModelForCausalLM.from_pretrained(mistral_7b_path, device_map="auto", trust_remote_code=True).eval()


# mpt
# mpt_7b_path = r"../mpt_7b/"
# max_length = 2048
# truncate = True

# tokenizer = AutoTokenizer.from_pretrained(mpt_7b_path)
# model = AutoModelForCausalLM.from_pretrained(mpt_7b_path, device_map="auto", trust_remote_code=True).eval()


# yi
# yi_6b_path = r"../yi_6b/"
# max_length = 4096
# truncate = True

# tokenizer = AutoTokenizer.from_pretrained(yi_6b_path)
# model = AutoModelForCausalLM.from_pretrained(yi_6b_path, device_map="auto", trust_remote_code=True).eval()


# falcon
# falcon_7b_path = r"../falcon_7b/"
# max_length = 2048
# truncate = True

# tokenizer = AutoTokenizer.from_pretrained(falcon_7b_path)
# model = AutoModelForCausalLM.from_pretrained(falcon_7b_path, device_map="auto", trust_remote_code=False).eval()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# eval
data = []
token_length_list = []
overlong_seq = 0

for idx, sample in tqdm(enumerate(extracted_texts), total=len(extracted_texts)):
    
    with torch.no_grad():
    
        inputs = tokenizer(sample, return_tensors='pt')
        inputs = inputs.to(model.device)

        seq_length = inputs['input_ids'].shape[-1]

        if seq_length > max_length:
            overlong_seq += 1
            if truncate:
                inputs = tokenizer(sample, return_tensors='pt', max_length=max_length, truncation=True)
                inputs = inputs.to(model.device)
                warnings.warn(f'seq-{idx} length {seq_length} > truncation_length({max_length}) truncated')
                factor = (seq_length / max_length)
        else:
            factor = 1
                

        logit = model.forward(**inputs).logits[0, :, :]

        log_sum = calculate_log_sum(logit, inputs['input_ids'].squeeze(0))
        log_sum *= factor
        # print(log_sum)

        token_length_list.append(seq_length)
        data.append(log_sum)

print(f'log probability sum: {sum(data) / len(data)}')
print(f'avg tokens: {sum(token_length_list) / len(token_length_list)}')
print(f'overlong_seq: {overlong_seq}')

  0%|          | 0/1000 [00:00<?, ?it/s]

In [None]:
del model, tokenizer, logit, inputs

gc.collect()
torch.cuda.empty_cache()

## Evaluate Mamba

In [None]:
# from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel

# model_name = "state-spaces/mamba-1.4b"
# max_length = 2048

# mamba_tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
# mamba_1b4 = MambaLMHeadModel.from_pretrained(model_name, device="cuda", dtype=torch.float16)

In [None]:
# mamba_test_data = []
# mamba_token_length_list = []
# overlong_seq = 0

# for idx, sample in tqdm(enumerate(extracted_texts), total=len(extracted_texts)):
    
#     with torch.no_grad():
        
#         inputs = mamba_tokenizer(sample, return_tensors="pt").input_ids.to(device=device)

#         seq_length = inputs.shape[-1]

#         if seq_length > max_length:
#             # print(f'length > {max_length}')
#             overlong_seq += 1
#             warnings.warn(f'seq-{idx} length {seq_length} > truncation_length({max_length})')

#         mamba_output = mamba_1b4.forward(inputs)
#         logit = mamba_output.logits[0, :, :]

#         log_sum = calculate_log_sum(logit, inputs[0])

#         mamba_token_length_list.append(seq_length)
#         mamba_test_data.append(log_sum)
        

# mamba_1b4_log_sum = sum(mamba_test_data) / len(mamba_test_data)
# mamba_1b4_avg_length = sum(mamba_token_length_list) / len(mamba_token_length_list)
# print(f'log probability sum: {mamba_1b4_log_sum}')
# print(f'avg tokens: {mamba_1b4_avg_length}')
# print(f'overlong_seq: {overlong_seq}')

In [None]:
# del mamba_1b4, inputs, logit

# gc.collect()
# torch.cuda.empty_cache()