In [1]:
import os
# Restrict this process to the GPU with index 1.
# NOTE(review): presumably this has to run before any CUDA-using library is
# imported/initialised, which is why it sits above the FlagEmbedding import —
# keep this ordering.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Load embedding model
from FlagEmbedding import BGEM3FlagModel

embedding_model = BGEM3FlagModel('/data/aicuserData/guoqizhi/embedding_model/bge-m3',  use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation
import numpy
# Precomputed embedding matrices, read from the current working directory.
# The same two files are loaded again in the "Extract embedding result" cell.
# NOTE(review): presumably one row per QA pair, in dataset order — confirm
# against the script that produced the .npy files.
Query_embedding = numpy.load('query_embedding.npy')
Answer_embedding = numpy.load('answer_embedding.npy')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Recursively collect every file and folder path under a given directory
import os
def get_filepath_list(root_dir):
    """Recursively list everything below ``root_dir``.

    Every entry is rendered as ``f'{parent}/{name}'``, where ``parent`` is the
    directory string yielded by ``os.walk`` (so a trailing slash on
    ``root_dir`` shows up as a double slash in the results).

    Args:
        root_dir: Directory to walk.

    Returns:
        A ``(file_list, folder_list)`` tuple of path strings. Both lists are
        empty when ``root_dir`` does not exist.
    """
    collected_files, collected_folders = [], []
    for parent, subdirs, filenames in os.walk(root_dir):
        collected_files.extend(f'{parent}/{name}' for name in filenames)
        collected_folders.extend(f'{parent}/{name}' for name in subdirs)
    return collected_files, collected_folders

# Classify file type
def analyse_file_type(f_list):
    """Group file paths by their extension.

    The extension is the text after the last ``.`` of the *file name* (the
    final path component). Splitting the whole path on ``.`` would produce
    bogus keys such as ``'d/README'`` for ``/data/v1.d/README``, so directory
    names are excluded before looking for the dot.

    Args:
        f_list: Iterable of path strings.

    Returns:
        Dict mapping extension (without the dot, e.g. ``'json'``) to the list
        of paths carrying it, in input order. Files whose name contains no dot
        are collected under the empty-string key ``''``.
    """
    file_type_list = {}
    for file in f_list:
        base_name = os.path.basename(file)
        # A dot-less file name has no extension; do not fall back to the path.
        file_type = base_name.rsplit('.', 1)[-1] if '.' in base_name else ''
        file_type_list.setdefault(file_type, []).append(file)

    return file_type_list

# Root folder that holds the generated chat data. It is named ``data_dir``
# rather than ``dir`` so the built-in ``dir()`` is not shadowed for the rest of
# the kernel session.
data_dir = '/home/guoqizhi/deepseek-chat/'
file_list, folder_list = get_filepath_list(data_dir)
# Maps file extension -> list of paths; the merge cell reads the 'json' entry.
file_type_list = analyse_file_type(file_list)

In [3]:
# Merge different data
from transformers import AutoTokenizer
embedding_tokenizer = AutoTokenizer.from_pretrained('/data/aicuserData/guoqizhi/embedding_model/bge-m3')

import json

# Answers longer than this many tokens are cut down to exactly this many.
# The check previously used 256 while the slice used 512, so answers of
# 257-512 tokens were re-decoded without being shortened. One constant keeps
# the test and the cut in agreement; 512 preserves the truncation length that
# was actually applied.
# NOTE(review): lower this to 256 if that was the intended cap.
MAX_ANSWER_TOKENS = 512

qa_dataset = []       # flat list of every QA pair from every JSON file
passage_dataset = []  # one 'script' entry per block, in block_id order
i = 0                 # running block id, continues across files
for file in file_type_list['json']:
    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(file, encoding='utf-8') as f:
        data = json.load(f)
    for d in data:
        d['file'] = file.split('/')[-1]
        d['block_id'] = i
        i += 1
        # Truncate extremely long answers (queries are left untouched)
        for qa_pair in d['qa_pairs']:
            encoding_result = embedding_tokenizer(qa_pair['Gura'], add_special_tokens=False)['input_ids']
            if len(encoding_result) > MAX_ANSWER_TOKENS:
                qa_pair['Gura'] = embedding_tokenizer.decode(encoding_result[:MAX_ANSWER_TOKENS])
        qa_dataset += d['qa_pairs']
        passage_dataset.append(d['script'])

In [4]:
# Extract embedding result
import numpy
Query = [d['User'] for d in qa_dataset]
Answer = [d['Gura'] for d in qa_dataset]

Query_embedding = numpy.load('query_embedding.npy')
Answer_embedding = numpy.load('answer_embedding.npy')

# The search loop looks up qa_dataset[j] for row j of Query_embedding, so the
# two must have the same length. Fail loudly here instead of returning
# misaligned search results (or an IndexError deep inside the loop) when the
# .npy file was produced from a different version of the dataset.
# NOTE(review): Answer_embedding presumably follows the same layout, but it is
# not used by the cells visible here, so it is not checked.
if len(Query_embedding) != len(Query):
    raise ValueError(
        f'query_embedding.npy has {len(Query_embedding)} rows but the dataset '
        f'contains {len(Query)} QA pairs; regenerate the embeddings.'
    )

In [5]:
# Evaluation questions put to the character; each one is embedded and searched
# against the QA dataset in the cells that follow.
user_input = [
    'What is your favorite food?',
    'Can you introduce yourself? Tell me about your name, age and hobbies. ',
    'Do you remember when your debut took place? What is the exact date and time?',
    'Can you give 3 shark facts that you mentioned in your stream?',
    'What do you call your fans?',
    'Which of the following is not your nickname? a) gooba b)goomba c)goob d)goola',
    'Which of the following are you bad at?  a)English b)Mathematics c)Video games d)Drawing',
    'You only speak English and Chinese, but not Japanese, is that correct? ',
    'Which Vtuber agency do you belong to?  a) Hololive  b)Nijisanji c)VShojo d)Youtube',
    'What is your favorite type of shark?',
    'Do you know who Watson Amelia is? If yes,  tell me more about your relationship with her.',
    'What is the name of your first original song?',
    'Can you describe your outfit and its unique features?',
    'What games do you often play on your streams?',
    'Have you ever collaborated with other Hololive members? If so, who were they?',
    'How tall are you?',
    'When is your birthday?',
    'How old are you?',
]

# Only the dense vectors are read from the result, so the sparse (lexical)
# output is switched off instead of being computed and thrown away.
user_input_embedding = embedding_model.encode(user_input, return_dense=True, return_sparse=False, return_colbert_vecs=False)['dense_vecs']

In [6]:
# Load rank model
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def get_inputs(pairs, tokenizer, prompt=None, max_length=1024):
    """Tokenize (query, passage) pairs into one padded reranker batch.

    Each sequence is laid out as
    ``[bos] "A: {query}" "\\n" "B: {passage}" "\\n" {prompt}``.
    The ``A``/``B`` parts are combined by ``tokenizer.prepare_for_model`` with
    ``truncation='only_second'``; the separator and prompt ids are appended
    afterwards.

    Args:
        pairs: Iterable of ``(query, passage)`` pairs.
        tokenizer: Tokenizer object; it is called directly and must also offer
            ``bos_token_id``, ``prepare_for_model`` and ``pad``.
        prompt: Instruction appended to every sequence. ``None`` selects the
            default Yes/No relevance instruction.
        max_length: Token limit passed to each tokenizer call and to
            ``prepare_for_model``.

    Returns:
        Whatever ``tokenizer.pad`` returns for the batch, requested with
        ``return_tensors='pt'`` and ``pad_to_multiple_of=8``.
    """
    if prompt is None:
        prompt = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'."
    newline = "\n"

    def _plain_ids(text):
        # Untruncated token ids with no special tokens added.
        return tokenizer(text, return_tensors=None, add_special_tokens=False)['input_ids']

    def _truncated(text):
        # Encoding capped at max_length, no special tokens added.
        return tokenizer(
            text,
            return_tensors=None,
            add_special_tokens=False,
            max_length=max_length,
            truncation=True,
        )

    prompt_ids = _plain_ids(prompt)
    sep_ids = _plain_ids(newline)
    suffix_ids = sep_ids + prompt_ids

    batch = []
    for query, passage in pairs:
        query_enc = _truncated(f'A: {query}')
        passage_enc = _truncated(f'B: {passage}')
        item = tokenizer.prepare_for_model(
            [tokenizer.bos_token_id] + query_enc['input_ids'],
            sep_ids + passage_enc['input_ids'],
            truncation='only_second',
            max_length=max_length,
            padding=False,
            return_attention_mask=False,
            return_token_type_ids=False,
            add_special_tokens=False,
        )
        # The suffix is appended after prepare_for_model has run.
        item['input_ids'] = item['input_ids'] + suffix_ids
        item['attention_mask'] = [1] * len(item['input_ids'])
        batch.append(item)

    padded_limit = max_length + len(sep_ids) + len(prompt_ids)
    return tokenizer.pad(
        batch,
        padding=True,
        max_length=padded_limit,
        pad_to_multiple_of=8,
        return_tensors='pt',
    )

# The reranker's tokenizer and causal-LM weights are read from the same folder.
_reranker_dir = '/data/aicuserData/guoqizhi/embedding_model/bge-reranker-v2-gemma/'
tokenizer = AutoTokenizer.from_pretrained(_reranker_dir)
model = AutoModelForCausalLM.from_pretrained(_reranker_dir)
# Cast the weights to fp16 and move the model to the GPU.
model = model.half().cuda()
# First token id of 'Yes'; its logit at the last position is used as the rank score.
yes_loc = tokenizer('Yes', add_special_tokens=False)['input_ids'][0]
model.eval()

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.
Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  1.85it/s]


GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
      )
    )
    (norm): GemmaR

In [11]:
# Calculate rank score
def compute_score(tokenizer, model, match_list, batch_size=20):
    """Score (query, passage) pairs with the reranker.

    For each pair the score is the logit of the ``yes_loc`` token (a
    module-level id) at the last sequence position of the model output.

    Args:
        tokenizer: Tokenizer forwarded to ``get_inputs``.
        model: Causal LM whose output exposes ``.logits``; its inputs are moved
            to the GPU with ``.cuda()``.
        match_list: Sequence of ``[query, passage]`` pairs.
        batch_size: Pairs per forward pass; tune to the available GPU memory.

    Returns:
        List of float scores, one per pair, in input order. An empty
        ``match_list`` yields ``[]`` (previously ``torch.cat`` raised on it).

    Raises:
        ValueError: If ``batch_size`` is not a positive integer.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be >= 1, got {batch_size}')
    if not match_list:
        return []

    batch_scores = []
    with torch.no_grad():
        # Stepping the range by batch_size covers a short final batch without
        # separate "extra batch" bookkeeping.
        for start in range(0, len(match_list), batch_size):
            inputs = get_inputs(match_list[start:start + batch_size], tokenizer)
            inputs['input_ids'] = inputs['input_ids'].cuda()
            inputs['attention_mask'] = inputs['attention_mask'].cuda()
            logits = model(**inputs, return_dict=True).logits
            scores = logits[:, -1, yes_loc].view(-1, ).float()
            batch_scores.append(scores.detach().cpu())
            # Release cached GPU blocks between batches.
            torch.cuda.empty_cache()
    return torch.cat(batch_scores).tolist()

# Limit repeated questions: keep at most two QA pairs per identical question
def deduplicate(top_embedding_result):
    """Keep at most two QA pairs for every distinct ``'User'`` question.

    Args:
        top_embedding_result: Iterable of dicts that each have a ``'User'`` key.

    Returns:
        A new list holding the first two occurrences of each question, in the
        original order. The dict objects themselves are not copied.
    """
    occurrences = {}
    kept = []
    for entry in top_embedding_result:
        question = entry['User']
        occurrences[question] = occurrences.get(question, 0) + 1
        if occurrences[question] > 2:
            continue
        kept.append(entry)
    return kept

# Decide how many top-ranked QA pairs to keep, using a top-p cutoff on the rank scores
def top_p_sampling(logits, p=0.8):
    """Return how many leading items make up the top-``p`` probability mass.

    ``logits`` are turned into probabilities with a softmax. The result is the
    size of the smallest prefix whose cumulative probability reaches ``p``,
    clamped to the range 5..20. The caller uses the value as a slice length.

    Fixes over the previous version:
      * When the first item alone exceeded ``p`` the fallback returned
        ``len(logits)``, so a single dominant hit selected the *maximum*
        number of items rather than the minimum.
      * The index of the last item with cumulative probability <= ``p`` was
        returned as if it were a count (one too few).
      * The softmax now subtracts the maximum first, so large scores no longer
        overflow to ``inf``/``nan``.

    Args:
        logits: 1-D sequence of scores. The caller passes them sorted from
            highest to lowest, which is what makes the prefix meaningful.
        p: Cumulative probability threshold.

    Returns:
        Integer between 5 and 20 inclusive. It may exceed ``len(logits)``,
        which is harmless when used as a slice bound.
    """
    scores = numpy.asarray(logits, dtype=numpy.float64)
    if scores.size == 0:
        # Nothing to rank; fall back to the lower clamp as before.
        return 5

    # Shifting by the maximum leaves the softmax unchanged but keeps exp() finite.
    weights = numpy.exp(scores - scores.max())
    cum_probs = numpy.cumsum(weights / weights.sum())

    # searchsorted gives the first index whose cumulative probability is >= p;
    # +1 turns that index into a count.
    keep = int(numpy.searchsorted(cum_probs, p, side='left')) + 1
    # Rounding can leave the final cumulative value just below p.
    keep = min(keep, scores.size)
    return min(max(keep, 5), 20)

# Get Rag content
def llm_input(sample_result):
    """Flatten ranked QA pairs into a single context string.

    Items at even positions are written first, in order; items at odd
    positions follow in reverse order. The best-ranked pairs therefore sit at
    the two ends of the string and the weakest ones in the middle.

    Args:
        sample_result: List of dicts with ``'User'`` and ``'Gura'`` keys,
            ordered from most to least relevant.

    Returns:
        The concatenated `` User: ... Gura: ...`` segments, or ``''`` for an
        empty list.
    """
    def _render(pair):
        return ' User: ' + pair['User'] + ' Gura: ' + pair['Gura']

    head = ''.join(_render(pair) for pair in sample_result[0::2])
    tail = ''.join(_render(pair) for pair in reversed(sample_result[1::2]))
    return head + tail

In [12]:
import copy

# Number of nearest neighbours (by embedding similarity) passed to the
# deduplication and rerank stages for each question.
EMBEDDING_TOP_K = 200

search_record = []
for i in range(len(user_input)):
    print(i)
    question_embedding = user_input_embedding[i]
    similarity = question_embedding @ Query_embedding.T

    # Pick the best EMBEDDING_TOP_K rows directly instead of copying and
    # sorting the whole dataset for every question. A stable argsort on the
    # negated scores gives descending order with ties kept in dataset order,
    # the same order a stable list sort with reverse=True produces.
    top_indices = numpy.argsort(-similarity, kind='stable')[:EMBEDDING_TOP_K]
    # Build fresh dicts so the shared qa_dataset entries are never modified.
    search_result = [
        {**qa_dataset[j], 'embedding_score': float(similarity[j])}
        for j in top_indices
    ]
    top_embedding_result = deduplicate(search_result)

    match_list = [[user_input[i], text['User'] + text['Gura']] for text in top_embedding_result]
    scores = compute_score(tokenizer, model, match_list)
    #scores = similarity
    for qa_pair, score in zip(top_embedding_result, scores):
        qa_pair['rank_score'] = score

    # Shallow copy: the same dicts, reordered by reranker score.
    top_rank_result = copy.copy(top_embedding_result)
    top_rank_result.sort(key=lambda x: x['rank_score'], reverse=True)

    rank_score = [qa_pair['rank_score'] for qa_pair in top_rank_result]
    sample_result = top_rank_result[:top_p_sampling(rank_score, p=0.8)]
    rag_content = llm_input(sample_result)

    search_record.append({'input_id':i, 'input_text': user_input[i], 'embedding_search_result': top_embedding_result[:40], 
                          'rank_search_result':top_rank_result[:10], 'rag_content': rag_content })

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17


In [14]:
# Show the full record for the first test question: the embedding hits, the
# reranked hits and the assembled RAG context string.
search_record[0]

{'input_id': 0,
 'input_text': 'What is your favorite food?',
 'embedding_search_result': [{'User': 'What is your favorite food?',
   'Gura': 'I love all kinds of food, but I especially love fish! And spanakopita, that spinach puff stuff, yummy!',
   'embedding_score': 1.0,
   'rank_score': 6.22265625},
  {'User': 'What is your favorite food?',
   'Gura': 'I like all kinds of food, but I especially love shrimp!',
   'embedding_score': 1.0,
   'rank_score': 5.6171875},
  {'User': "What's your favorite food?",
   'Gura': 'I love all kinds of food, but I especially love fish!',
   'embedding_score': 0.99853515625,
   'rank_score': 5.71875},
  {'User': "What's your favorite food?",
   'Gura': 'I love all kinds of food, but I especially love fish!',
   'embedding_score': 0.99853515625,
   'rank_score': 5.71875}],
 'rank_search_result': [{'User': 'What is your favorite food?',
   'Gura': 'I love all kinds of food, but I especially love fish! And spanakopita, that spinach puff stuff, yummy!',