In [1]:
# pip install -q transformers accelerate
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

print(torch.cuda.is_available())
if torch.cuda.is_available():
    torch.cuda.empty_cache()

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

checkpoint = "bigscience/bloomz-7b1"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, 
                                             torch_dtype=torch.float16, 
                                             device_map=device)



True


In [2]:
inputs = tokenizer.encode("Translate to English: Je t’aime.", return_tensors="pt").to("cuda")
outputs = model.generate(inputs)
print(tokenizer.decode(outputs[0]))




Translate to English: Je t’aime. I love you.</s>


In [3]:
file_id = 1
answer_df_path = "../../bloom_data/answer_df_part"+str(file_id)+".csv"

In [4]:
import pandas as pd
import numpy as np
answer_df = pd.read_csv(answer_df_path)
query_df = pd.read_csv("../../data/fea_df.csv")

if not 'answer_string' in answer_df.columns:
    answer_df['answer_string'] = np.nan
    
print(answer_df.shape)
print(answer_df.iloc[0:10,])

(9647, 3)
     sm_id                                         text_w_eos  answer_string
0  10002yt  20 pound down after 22 week s and 17 week s ou...            NaN
1  1003i3b  tw ana body dysmorphia describing body potenti...            NaN
2  1003xw2  hello fellow brawearing friends. this is going...            NaN
3  10042jf  according to cronometer 100 gram of canola oil...            NaN
4  1004j21  its been two week s since i started strength t...            NaN
5  1008hih  i made panfried chicken thighs ovenroasted pot...            NaN
6  100ash9  i have aversive arfid so i have a very bad fea...            NaN
7  100aza8  original post here httpswww. reddit. comrcicoc...            NaN
8  100bwe6  for context i am early 30s attending a corpora...            NaN
9  100c6y2  i got this from a thrift store and was pleasan...            NaN


In [5]:
system_message = """
You are an AI assistant designed to answer questions.
Please restrict your answer to the exact question and use the exact answer format asked.
Answer should not have implied information. If the answer is yes, always provide related phrases. 
"""

def format_user_message(text):
    question_content = "Does the paragraph mention any of the following topics:\n"
    for i in range(len(query_df)):
        question_content += f"  ({i+1}) {query_df.fea[i]}: {query_df.description[i]}.\n"
    answer_content = "Return answer in format:\n"
    for i in range(len(query_df)): 
        answer_content += f"  ({i+1}) {query_df.fea[i]}: [yes/no], related phrases if any: \n"
    paragragh_content = f"Paragraph: '{text}' \n"
    user_message = question_content + answer_content + paragragh_content
    #print(user_message)
    
    return user_message




In [6]:
from tqdm import tqdm
for k in tqdm(range(1), desc="Processing batch"):
    batch_size = 10

    # Filter for rows where 'answer_string' is NaN
    unanswered_df = answer_df[answer_df['answer_string'].isna()]

    # Get the indices of these NaN entries in the original DataFrame
    indices_to_update = unanswered_df.index[:batch_size]

    # Prepare prompt content for the first 10 entries with NaN answer_string
    user_messages = [format_user_message(text) for text in unanswered_df['text_w_eos'].iloc[:batch_size]]

    # Save the indices list if needed for later use
    indices_to_update_list = list(indices_to_update)

    # query process
    messages = [ [ {"role": "system", "content": system_message}, {"role": "user", "content": user_message}] for user_message in user_messages]
    inputs = [tokenizer.apply_chat_template(
        message,
        tokenize=False,
        add_generation_prompt=True ) for message in messages]
    
    
    responses = []
    for i in range(len(inputs)):
        # inputs = tokenizer.encode("Translate to English: Je t’aime.", return_tensors="pt").to("cuda")
        # outputs = model.generate(inputs)
        # print(tokenizer.decode(outputs[0]))

        model_inputs = tokenizer.encode(inputs[i], return_tensors="pt").to(model.device)
        outputs = model.generate(model_inputs,
                                 max_new_tokens=512,
                                 temperature=0.3)
        
        responses.append(tokenizer.decode(outputs[0], skip_special_tokens=True)[0])
    
    # Display and process responses in a loop
    for i, response in enumerate(responses):
        #display(Markdown(colorize_text(f"{response}")))
        # Extract answer if available
        answer = response
        # Use the original index from indices_to_update_list
        answer_df.loc[indices_to_update_list[i], 'answer_string'] = answer

    print(answer_df.loc[indices_to_update_list, ['sm_id','answer_string']])
    # answer_df.to_csv(answer_df_path, index=False)
    


Processing batch:   0%|          | 0/1 [00:00<?, ?it/s]


ValueError: Cannot use chat template functions because tokenizer.chat_template is not set and no template argument was passed! For information about writing templates and setting the tokenizer.chat_template attribute, please see the documentation at https://huggingface.co/docs/transformers/main/en/chat_templating

In [7]:
inputs

tensor([[153772,    427,   9522,   6395,  76721,  68258,     17]],
       device='cuda:0')

In [11]:
batch_size = 1

# Filter for rows where 'answer_string' is NaN
unanswered_df = answer_df[answer_df['answer_string'].isna()]

# Get the indices of these NaN entries in the original DataFrame
indices_to_update = unanswered_df.index[:batch_size]

# Prepare prompt content for the first 10 entries with NaN answer_string
user_messages = [format_user_message(text) for text in unanswered_df['text_w_eos'].iloc[:batch_size]]

# Save the indices list if needed for later use
indices_to_update_list = list(indices_to_update)

messages = [ system_message+" \n "+user_message for user_message in user_messages]
 

responses = []
for i in range(len(messages)):
    # inputs = tokenizer.encode("Translate to English: Je t’aime.", return_tensors="pt").to("cuda")
    # outputs = model.generate(inputs)
    # print(tokenizer.decode(outputs[0]))

    inputs = tokenizer.encode(messages[i], return_tensors="pt").to(model.device)
    outputs = model.generate(inputs,
                             max_new_tokens=512,
                             do_sample=True,
                             temperature=0.3)

    responses.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

# Display and process responses in a loop
for i, response in enumerate(responses):
    print(response)
    





In [16]:
tokenizer.decode(outputs[0], skip_special_tokens=True)

"\nYou are an AI assistant designed to answer questions.\nPlease restrict your answer to the exact question and use the exact answer format asked.\nAnswer should not have implied information. If the answer is yes, always provide related phrases. \n \n Does the paragraph mention any of the following topics:\n  (1) relation: Family and social relationships.\n  (2) protein: High protein diet, carbohydrate-reduced(low-carb) high-protein diet.\n  (3) ed: Eating disorders(ED) diagnosis or recovery, ED includes anorexia nervosa, anorexic, bulimia, bulimic, binge eating disorders, arfid, osfed, pica.\n  (4) exercise: Physical exercise.\n  (5) meal: Routine of meals.\n  (6) crave: Craving for high calorie food or carbs.\n  (7) restrict: Restrict nutrition or calorie intake.\n  (8) binge: Binge eating.\n  (9) loss: Body weight loss.\n  (10) gain: Body weight gain.\n  (11) calorie: Count calorie.\n  (12) thinspo: Drive for thinness, want to be thinner or skinny.\n  (13) leanbody: Drive for lean b