## import packages

In [None]:
# import packages
import os
from tqdm.notebook import tqdm
import warnings
import json
import torch.nn.functional as F
import torch
import gc
from transformers import AutoTokenizer, AutoModelForCausalLM
from datetime import datetime

## load data

In [None]:
# Path of the JSON file that holds the evaluation texts.
# It is read by load_list_from_json() and also recorded in every result log.

data_path = "./arxiv_pdfs_cs_24_2_2000_to_7000.json"


def load_list_from_json(file_path):
    """
    Reads a UTF-8 encoded JSON file and returns its parsed content.

    The notebook uses it for a file that stores a list of strings, but the
    parsed JSON value is returned as-is without any validation.

    :param file_path: Path of the JSON file to read.
    :return: The decoded JSON content (expected to be a list of strings).
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
    

# Load the whole corpus into memory; every evaluation cell below iterates
# over this list and scores each entry as one text.
extracted_texts = load_list_from_json(data_path)

# Show how many texts were loaded.
print(len(extracted_texts))
# print([len(x) for x in extracted_texts])

In [None]:
# Preview the first 100 texts, each followed by a 100-dash separator line.
for text in extracted_texts[:100]:
    print(text, '-' * 100, sep='\n')

## Evaluate the models

In [None]:
# Maximum number of tokens passed to a model in a single forward call;
# longer texts are split into consecutive chunks of this size.
chunk_size = 1024
# Folder in which log() writes one timestamped JSON result file per evaluation.
log_folder_path = './logs/'

In [None]:
def calculate_log_sum(logits, target_token_ids):
    """
    Sums the negative log-probabilities assigned to each next token.

    Row ``i`` of ``logits`` is scored against token ``i + 1`` of
    ``target_token_ids``. The first token is therefore never scored and the
    last row of ``logits`` is not used.

    :param logits: 2-D tensor (positions x vocabulary) of unnormalized scores.
    :param target_token_ids: 1-D integer tensor of token ids, one per row of
        ``logits`` and on the same device.
    :return: Python float, the sum of ``-log p(token[i + 1] | logits[i])``;
        ``0.0`` when there are fewer than two positions.
    """
    shifted_logits = logits[:-1, :]
    shifted_targets = target_token_ids[1:]

    # Normalize in float32: with half-precision logits the log-probabilities
    # and their sum would otherwise be rounded to half precision.
    log_probs = F.log_softmax(shifted_logits, dim=-1, dtype=torch.float32)

    # Squeeze only the gathered column, so a single scored position stays a
    # 1-D tensor instead of collapsing to a 0-D scalar.
    target_log_probs = -log_probs.gather(1, shifted_targets.unsqueeze(1)).squeeze(1)

    log_sum = torch.sum(target_log_probs, dim=-1)

    return log_sum.item()


def print_model_parameters_in_billions(model):
    """
    Prints the total parameter count of a model, expressed in billions.

    :param model: Object whose ``parameters()`` yields items exposing ``numel()``.
    :return: None; the count is printed with three decimal places.
    """
    param_count = 0
    for param in model.parameters():
        param_count += param.numel()

    print(f"Model parameters: {param_count / 1e9:.3f} billion")
    
    
def log(data_dict, folder_path):
    """
    Saves a dictionary as a timestamped JSON file inside a folder.

    The folder is created when it does not exist yet. The file is named after
    the current local time (``YYYY-MM-DD_HH-MM-SS.json``), so two calls within
    the same second write to the same file. Failures are reported with
    ``print`` instead of being raised, so a logging problem does not abort the
    calling cell.

    :param data_dict: JSON-serializable dictionary to save.
    :param folder_path: Folder in which the JSON file is written.
    :return: None.
    """
    if not os.path.exists(folder_path):
        try:
            # exist_ok avoids a spurious failure if the folder appears between
            # the existence check above and this call.
            os.makedirs(folder_path, exist_ok=True)
            print(f"Directory created at {folder_path}")
        except Exception as e:
            print(f"Error creating directory: {e}")
            return

    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    file_name = f"{timestamp}.json"
    file_path = os.path.join(folder_path, file_name)

    try:
        # Serialize before opening the file, so a value that cannot be
        # serialized does not leave an empty or truncated JSON file behind.
        payload = json.dumps(data_dict, indent=4)
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(payload)
        print(f"Dictionary saved successfully to {file_path}")
    except Exception as e:
        print(f"Error saving dictionary: {e}")

## Evaluate RWKV(v4)

In [None]:
# Load the RWKV v4 checkpoint and the tokenizer that goes with it.
model_name_or_path = r'../models/rwkv-4-3b/RWKV-4-Pile-3B-20221110-ctx4096.pth'

# These environment variables are set before the rwkv imports below.
# NOTE(review): presumably the rwkv package reads them at import time to enable
# its JIT mode and custom CUDA kernel - confirm against the rwkv documentation.
os.environ['RWKV_JIT_ON'] = '1'
os.environ["RWKV_CUDA_ON"] = '1'

from rwkv.model import RWKV
from rwkv.utils import PIPELINE

# The strategy string requests CUDA execution in fp16.
model = RWKV(model=model_name_or_path, strategy='cuda fp16')
# Alternative vocabulary (used by the v5/v6 cell):
# pipeline = PIPELINE(model, r"rwkv_vocab_v20230424")
pipeline = PIPELINE(model, "./support/20B_tokenizer.json")  # v4
# The evaluation cell only needs the tokenizer held by the pipeline.
tokenizer = pipeline.tokenizer

In [None]:
# Score every text with the RWKV v4 model, then log and print corpus averages.
rwkv_test_data = []           # summed negative log-probability of each text
rwkv_token_length_list = []   # token count of each text

with torch.no_grad():
    for sample in tqdm(extracted_texts, total=len(extracted_texts)):
        token_ids = tokenizer.encode(sample).ids  # v4: ids are read from .ids
        num_tokens = len(token_ids)

        text_neg_log_prob = 0
        # Texts are scored in chunks of at most chunk_size tokens. The second
        # forward() argument is None for every chunk (presumably the recurrent
        # state, i.e. chunks are scored independently - confirm).
        for start in range(0, num_tokens, chunk_size):
            input_chunk = token_ids[start:start + chunk_size]
            logit = model.forward(input_chunk, None, full_output=True)[0]

            # A one-token chunk gets a leading dimension added so that
            # calculate_log_sum can index it like the multi-token case.
            if len(input_chunk) == 1:
                logit = logit.unsqueeze(0)

            text_neg_log_prob += calculate_log_sum(logit, torch.tensor(input_chunk).cuda())

        rwkv_token_length_list.append(num_tokens)
        rwkv_test_data.append(text_neg_log_prob)

# Corpus-level averages over all texts.
avg_neg_log_prob = sum(rwkv_test_data) / len(rwkv_test_data)
avg_token_count = sum(rwkv_token_length_list) / len(rwkv_token_length_list)

data_dict = {
    'model_name_or_path': model_name_or_path,
    'data_path': data_path,
    'neg_log_prob_sum': avg_neg_log_prob,
    'avg tokens': avg_token_count,
}

log(data_dict, log_folder_path)

print(f'log probability sum: {avg_neg_log_prob:.2f}')
print(f'avg tokens: {avg_token_count:.0f}')

In [None]:
# Drop the references to the model, pipeline, tokenizer and the last logits
# tensor, then run the garbage collector and ask PyTorch to release its cached
# CUDA memory (presumably to make room for the next model).
del model, pipeline, tokenizer, logit

gc.collect()
torch.cuda.empty_cache()

## Evaluate RWKV(v5/v6)

In [None]:
# Load the RWKV v5 checkpoint and the tokenizer that goes with it.
model_name_or_path = r'../models/rwkv_5_3b/RWKV-5-World-3B-v2-20231113-ctx4096.pth'

# These environment variables are set before the rwkv imports below.
# NOTE(review): presumably the rwkv package reads them at import time to enable
# its JIT mode and custom CUDA kernel - confirm against the rwkv documentation.
os.environ['RWKV_JIT_ON'] = '1'
os.environ["RWKV_CUDA_ON"] = '1'

from rwkv.model import RWKV
from rwkv.utils import PIPELINE

# The strategy string requests CUDA execution in fp16.
model = RWKV(model=model_name_or_path, strategy='cuda fp16')
pipeline = PIPELINE(model, r"rwkv_vocab_v20230424")
# Alternative tokenizer file for v4 checkpoints:
# pipeline = PIPELINE(model, "./models/20B_tokenizer.json")  # v4
# The evaluation cell only needs the tokenizer held by the pipeline.
tokenizer = pipeline.tokenizer

In [None]:
# Score every text with the RWKV v5/v6 model, then log and print corpus averages.
rwkv_test_data = []           # summed negative log-probability of each text
rwkv_token_length_list = []   # token count of each text

with torch.no_grad():
    for sample in tqdm(extracted_texts, total=len(extracted_texts)):
        # With this vocabulary encode() is used directly as the id sequence;
        # the v4 tokenizer needs `.ids` instead:
        # token_ids = tokenizer.encode(sample).ids # v4
        token_ids = tokenizer.encode(sample)
        num_tokens = len(token_ids)

        text_neg_log_prob = 0
        # Texts are scored in chunks of at most chunk_size tokens. The second
        # forward() argument is None for every chunk (presumably the recurrent
        # state, i.e. chunks are scored independently - confirm).
        for start in range(0, num_tokens, chunk_size):
            input_chunk = token_ids[start:start + chunk_size]
            logit = model.forward(input_chunk, None, full_output=True)[0]

            # A one-token chunk gets a leading dimension added so that
            # calculate_log_sum can index it like the multi-token case.
            if len(input_chunk) == 1:
                logit = logit.unsqueeze(0)

            text_neg_log_prob += calculate_log_sum(logit, torch.tensor(input_chunk).cuda())

        rwkv_token_length_list.append(num_tokens)
        rwkv_test_data.append(text_neg_log_prob)

# Corpus-level averages over all texts.
avg_neg_log_prob = sum(rwkv_test_data) / len(rwkv_test_data)
avg_token_count = sum(rwkv_token_length_list) / len(rwkv_token_length_list)

data_dict = {
    'model_name_or_path': model_name_or_path,
    'data_path': data_path,
    'neg_log_prob_sum': avg_neg_log_prob,
    'avg tokens': avg_token_count,
}

log(data_dict, log_folder_path)

print(f'log probability sum: {avg_neg_log_prob:.2f}')
print(f'avg tokens: {avg_token_count:.0f}')

In [None]:
# Drop the references to the model, pipeline, tokenizer and the last logits
# tensor, then run the garbage collector and ask PyTorch to release its cached
# CUDA memory (presumably to make room for the next model).
del model, pipeline, tokenizer, logit

gc.collect()
torch.cuda.empty_cache()

## Evaluate Hugging Face models

In [None]:
# Load a Hugging Face causal language model and its tokenizer.

model_name_or_path = r"stabilityai/stablelm-3b-4e1t"
# Directory handed to from_pretrained() as the cache location for the model.
cache_dir = '../models/temp/'

# NOTE(review): cache_dir is passed for the model only; the tokenizer call
# does not receive it - confirm whether that is intended.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
# The model is placed on CUDA and switched to eval mode.
# NOTE(review): trust_remote_code=True presumably allows code shipped with the
# model repository to run locally - confirm the repository is trusted.
model = AutoModelForCausalLM.from_pretrained(model_name_or_path, 
                                             device_map="cuda", 
                                             trust_remote_code=True, 
                                             cache_dir=cache_dir).eval()

print_model_parameters_in_billions(model)

In [None]:
# Score every text with the Hugging Face model, then log and print corpus averages.
data = []                # summed negative log-probability of each text
token_length_list = []   # token count of each text

with torch.no_grad():
    for sample in tqdm(extracted_texts, total=len(extracted_texts)):
        # Tokenize the text and move the tensors to the model's device.
        inputs = tokenizer(sample, return_tensors='pt')
        inputs = inputs.to(model.device)

        all_token_ids = inputs['input_ids']
        num_tokens = all_token_ids.shape[-1]

        text_neg_log_prob = 0
        # Texts are scored in chunks of at most chunk_size tokens; each chunk
        # is passed to the model on its own.
        for start in range(0, num_tokens, chunk_size):
            input_chunk = all_token_ids[:, start:start + chunk_size]

            # Keep the logits of the first (only) sequence in the batch.
            logit = model.forward(input_ids=input_chunk).logits[0, :, :]

            text_neg_log_prob += calculate_log_sum(logit, input_chunk.squeeze(0))

        token_length_list.append(num_tokens)
        data.append(text_neg_log_prob)

# Corpus-level averages over all texts.
avg_neg_log_prob = sum(data) / len(data)
avg_token_count = sum(token_length_list) / len(token_length_list)

data_dict = {
    'model_name_or_path': model_name_or_path,
    'data_path': data_path,
    'neg_log_prob_sum': avg_neg_log_prob,
    'avg tokens': avg_token_count,
}

log(data_dict, log_folder_path)

print(f'log probability sum: {avg_neg_log_prob:.2f}')
print(f'avg tokens: {avg_token_count:.0f}')

In [None]:
# Drop the references to the model, tokenizer, last logits tensor and last
# tokenized inputs, then run the garbage collector and ask PyTorch to release
# its cached CUDA memory (presumably to make room for the next model).
del model, tokenizer, logit, inputs

gc.collect()
torch.cuda.empty_cache()

## Evaluate Mamba

In [None]:
# Load the Mamba language model and the tokenizer used with it.
from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel

model_name_or_path = "state-spaces/mamba-2.8b-slimpj"

# The tokenizer comes from a different repository than the model weights.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b")
# The weights are loaded onto CUDA in float16.
model = MambaLMHeadModel.from_pretrained(model_name_or_path, device="cuda", dtype=torch.float16)
# Device the tokenized inputs are moved to in the evaluation cell.
device = torch.device('cuda')

print_model_parameters_in_billions(model)

In [None]:
# Score every text with the Mamba model, then log and print corpus averages.
data = []                # summed negative log-probability of each text
token_length_list = []   # token count of each text

with torch.no_grad():
    for sample in tqdm(extracted_texts, total=len(extracted_texts)):
        # Tokenize the text and move the tensors to the CUDA device.
        inputs = tokenizer(sample, return_tensors='pt')
        inputs = inputs.to(device)

        all_token_ids = inputs['input_ids']
        num_tokens = all_token_ids.shape[-1]

        text_neg_log_prob = 0
        # Texts are scored in chunks of at most chunk_size tokens; each chunk
        # is passed to the model on its own.
        for start in range(0, num_tokens, chunk_size):
            input_chunk = all_token_ids[:, start:start + chunk_size]

            # Keep the logits of the first (only) sequence in the batch.
            logit = model.forward(input_ids=input_chunk).logits[0, :, :]

            text_neg_log_prob += calculate_log_sum(logit, input_chunk.squeeze(0))

        token_length_list.append(num_tokens)
        data.append(text_neg_log_prob)

# Corpus-level averages over all texts.
avg_neg_log_prob = sum(data) / len(data)
avg_token_count = sum(token_length_list) / len(token_length_list)

data_dict = {
    'model_name_or_path': model_name_or_path,
    'data_path': data_path,
    'neg_log_prob_sum': avg_neg_log_prob,
    'avg tokens': avg_token_count,
}

log(data_dict, log_folder_path)

print(f'log probability sum: {avg_neg_log_prob:.2f}')
print(f'avg tokens: {avg_token_count:.0f}')

In [None]:
# Drop the references to the model, tokenizer, last logits tensor and last
# tokenized inputs, then run the garbage collector and ask PyTorch to release
# its cached CUDA memory.
del model, tokenizer, logit, inputs

gc.collect()
torch.cuda.empty_cache()