In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch


# Report whether a CUDA device is visible and release any cached GPU memory
# left over from a previous run in this kernel.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    torch.cuda.empty_cache()

# `from_pretrained` rejects negative integers for `device_map` (-1 is the
# `pipeline(device=...)` convention), so the CPU fallback must be spelled "cpu".
device = 0 if cuda_available else "cpu"

model_id = "lmsys/vicuna-13b-v1.5"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map=device,
)



True


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]



In [2]:
# Number of the answer shard this notebook run works on; it is spliced into
# the CSV path that is both read from and written back to.
file_id = 1
answer_df_path = f"../../vicuna13b_data/answer_df_part{file_id}.csv"

In [3]:
import pandas as pd
import numpy as np
# Load the (possibly partially answered) shard and the topic definitions.
answer_df = pd.read_csv(answer_df_path)
query_df = pd.read_csv("../../data/fea_df.csv")

# A fresh shard has no answer column yet; NaN marks rows still to be answered.
if 'answer_string' not in answer_df.columns:
    answer_df['answer_string'] = np.nan
# A column that is entirely NaN is float64; writing strings into it later is a
# deprecated upcast in recent pandas, so hold it as object from the start.
answer_df['answer_string'] = answer_df['answer_string'].astype(object)

print(answer_df.shape)
print(answer_df.iloc[4280:4283,])

(4824, 3)
        sm_id                                         text_w_eos  \
4280  13wviyb  i first joined to just lose weight but after t...   
4281  13wvpld  no one else in my life would understand but to...   
4282  13wwakl  currently i enjoy coffemate french vanilla alt...   

                                          answer_string  
4280  \n(1) relation: yes, related phrases in the pa...  
4281  \n(1) relation: yes, related phrases in the pa...  
4282  \n(1) relation: yes, related phrases in the pa...  


In [4]:
# Instruction preamble placed at the start of every prompt. The text begins
# and ends with a newline, and keeps the space before the closing newline.
system_message = (
    "\n"
    "You are an AI assistant designed to answer questions. "
    "Restrict your answer to the exact question and use the exact answer format asked. "
    "If the answer is yes, always provide related phrases. "
    "\n"
)

def format_user_message(text, topics=None, system=None):
    """Build the full prompt asking which topics a paragraph mentions.

    The prompt is the system preamble, a numbered list of topics with their
    descriptions, the quoted paragraph, and a numbered answer template with
    one "yes/no" line per topic.

    Args:
        text: Paragraph to be analysed; interpolated into the prompt as-is.
        topics: DataFrame with `fea` and `description` columns. Defaults to
            the notebook-level `query_df`.
        system: Preamble placed before the question. Defaults to the
            notebook-level `system_message`.

    Returns:
        The prompt as a single string.
    """
    if topics is None:
        topics = query_df
    if system is None:
        system = system_message

    # Iterate by position so numbering does not depend on the frame's index labels.
    numbered_topics = list(enumerate(zip(topics["fea"], topics["description"]), start=1))

    question_content = "Question: Does the paragraph mention any of the following topics:\n"
    question_content += "".join(
        f"  ({number}) {fea}: {description}.\n"
        for number, (fea, description) in numbered_topics
    )

    answer_content = "Return answer in format: Answer:\n"
    answer_content += "".join(
        f"  ({number}) {fea}: yes/no, related phrases in the paragraph if any: \n"
        for number, (fea, _description) in numbered_topics
    )

    paragraph_content = f"Paragraph: '{text}' \n"
    return system + question_content + paragraph_content + answer_content



In [5]:
from tqdm import tqdm

# Answer the still-unanswered rows of `answer_df` in small batches, writing the
# CSV back after every batch so an interrupted run can resume where it stopped.
BATCH_SIZE = 10       # rows answered between two saves
MAX_BATCHES = 1000    # upper bound on batches handled in one run
MAX_ATTEMPTS = 10     # generations tried per row before giving up
# Label of the last item in the requested answer format; a generation that
# lacks it is treated as incomplete.
# NOTE(review): presumably matches the last row of query_df - confirm.
LAST_ITEM_MARKER = "(19) depressedmood:"
FAILURE_PLACEHOLDER = "Answer not found"

for _ in tqdm(range(MAX_BATCHES), desc="Processing batch"):
    # Rows whose 'answer_string' is NaN have not been answered yet.
    pending = answer_df[answer_df['answer_string'].isna()].iloc[:BATCH_SIZE]
    if pending.empty:
        break

    for row_index, text in pending['text_w_eos'].items():
        # The prompt is the same for every retry, so tokenise it once.
        input_ids = tokenizer(format_user_message(text), return_tensors='pt').input_ids.to(model.device)
        prompt_length = input_ids.shape[-1]

        answer = FAILURE_PLACEHOLDER
        for _attempt in range(MAX_ATTEMPTS):
            output = model.generate(inputs=input_ids,
                                    temperature=0.1,
                                    do_sample=True,
                                    max_new_tokens=512)
            # The returned sequence starts with the prompt, which itself contains
            # "Answer:" and the item labels; validate only the new tokens so the
            # echoed prompt cannot satisfy the completeness check.
            generated = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
            candidate = generated.split("Answer:")[-1]
            if LAST_ITEM_MARKER in candidate:
                answer = candidate
                break

        # A row that never produced a complete answer gets the placeholder, so
        # it is no longer NaN and will not be picked up again.
        answer_df.loc[row_index, 'answer_string'] = answer

    print(answer_df.loc[pending.index, ['sm_id','answer_string']])
    answer_df.to_csv(answer_df_path, index=False)
    

Processing batch:   0%|          | 0/1000 [00:00<?, ?it/s]
