In [1]:
# %pip install -U transformers accelerate

In [2]:
from time import time
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import torch
from IPython.display import display, Markdown
import pandas as pd

In [3]:
from transformers import AutoTokenizer,AutoModelForCausalLM,pipeline
import torch

# Local Kaggle copy of the Llama 3.1 8B Instruct checkpoint.
base_model = "/kaggle/input/llama-3.1/transformers/8b-instruct/2"

tokenizer = AutoTokenizer.from_pretrained(base_model)

# Load the weights in fp16 and let `device_map="auto"` decide device placement.
# `trust_remote_code` is intentionally NOT enabled: the Llama architecture is
# implemented inside transformers itself, so there is no reason to allow
# Python code shipped alongside a checkpoint to be executed.
model = AutoModelForCausalLM.from_pretrained(
        base_model,
        return_dict=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        device_map="auto",
)

# Text-generation pipeline shared by every query helper below (`pipe`).
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [4]:
def colorize_text(text):
    """Wrap the known section labels in coloured, bold Markdown/HTML markup.

    Every occurrence of ``"<label>:"`` for the labels Reasoning, Question,
    Answer and Total time is replaced by a blank line followed by the same
    label rendered bold inside a ``<font>`` tag of that label's colour.
    Text without any of these labels is returned unchanged.
    """
    palette = {
        "Reasoning": "blue",
        "Question": "red",
        "Answer": "green",
        "Total time": "magenta",
    }
    for label, hue in palette.items():
        marker = label + ":"
        styled = "\n\n**<font color='" + hue + "'>" + marker + "</font>**"
        text = text.replace(marker, styled)
    return text


def query_model(
        system_message,
        user_message,
        temperature=0,
        max_length=1024
        ):
    """Send one question to the chat model and return a formatted transcript.

    Args:
        system_message: Content of the system turn.
        user_message: The question; it is wrapped as
            ``"Question: <user_message> Answer:"`` before being sent.
        temperature: Sampling temperature. A positive value enables nucleus
            sampling (``top_p=0.9``) at that temperature. ``0``, a negative
            value or ``None`` selects greedy decoding, because sampling with
            a non-positive temperature is rejected by transformers.
        max_length: Maximum number of newly generated tokens.

    Returns:
        A single string: the wrapped question, the generated answer and a
        ``"Total time: <seconds> sec."`` suffix, separated by single spaces.
    """
    start_time = time()
    user_message = "Question: " + user_message + " Answer:"
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
        ]
    prompt = pipe.tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
        )
    # Stop on either the tokenizer's EOS token or the chat end-of-turn token.
    terminators = [
        pipe.tokenizer.eos_token_id,
        pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]
    generation_kwargs = dict(
        num_return_sequences=1,
        eos_token_id=terminators,
        max_new_tokens=max_length,
        return_full_text=False,  # only the completion, not the prompt
        pad_token_id=terminators[0],
    )
    if temperature is not None and temperature > 0:
        # The temperature must be a strictly positive float when sampling.
        generation_kwargs.update(
            do_sample=True,
            top_p=0.9,
            temperature=float(temperature),
        )
    else:
        # Greedy decoding; clear the sampling knobs so that values coming from
        # the checkpoint's generation config are not combined with
        # do_sample=False.
        generation_kwargs.update(do_sample=False, top_p=None, temperature=None)

    sequences = pipe(prompt, **generation_kwargs)
    answer = sequences[0]['generated_text']
    end_time = time()
    ttime = f"Total time: {round(end_time-start_time, 2)} sec."

    return user_message + " " + answer  + " " +  ttime


# System prompt passed as the system turn of every query: it asks the model to
# stay on the question and to follow the answer format requested by the user.
system_message = """
You are an AI assistant designed to answer questions.
Please restrict your answer to the exact question and use the exact answer format asked.
"""


In [10]:
# # test
# t1 = time()
# response = query_model(
#     system_message,
#     user_message="What is the surface temperature of the Moon?",
#     temperature=0.1,
#     max_length=256)
# display(Markdown(colorize_text(f"{response}")))



**<font color='red'>Question:</font>** What is the surface temperature of the Moon? 

**<font color='green'>Answer:</font>** The surface temperature of the Moon varies greatly between day and night. The average temperature during the lunar day (when the Sun is shining on the surface) is around 107°C (225°F), while the average temperature during the lunar night (when the Sun is not shining on the surface) is around -173°C (-279°F). 

**<font color='magenta'>Total time:</font>** 41.63 sec.

In [12]:
# text = "29 m with anorexia nervosa hasnt dealt with the reality of i have an e EOS day EOS who has a hard time opening up about itim in a mental block of how to communicate with my boss who has had my back since hire have communicated that i have a e EOS day 4 month s ago that my e EOS day has to put it lightly hit a spike if i dont quit im scared ill be in inpatient again EOS i feel guilty due to the timing of me going on vacation for january my boss approved it but i also havent had a proper meal in month s due to the mentality this job puts me in because of the location im at EOS i work tomorrow until tues day straight but i dont think if i continue with this job my body will let me im a little a scared im close to inpatient again but i cant financially handle that again im currently paying off a 2000 debt"
# query_text = "high protein food, food contain high protein"

# prompt_content = (
#     f"Does this paragraph '{text}' mention '{query_text}'? "
#     "Return answer in format: [yes/no], phrases related to "
#     f"{query_text} is [...]"
# )


# response = query_model(
#     system_message,
#     user_message=prompt_content,
#     temperature=0.1,
#     max_length=50)
# display(Markdown(colorize_text(f"{response}")))




**<font color='red'>Question:</font>** Does this sentence '29 m with anorexia nervosa hasnt dealt with the reality of i have an e EOS day EOS who has a hard time opening up about itim in a mental block of how to communicate with my boss who has had my back since hire have communicated that i have a e EOS day 4 month s ago that my e EOS day has to put it lightly hit a spike if i dont quit im scared ill be in inpatient again EOS i feel guilty due to the timing of me going on vacation for january my boss approved it but i also havent had a proper meal in month s due to the mentality this job puts me in because of the location im at EOS i work tomorrow until tues day straight but i dont think if i continue with this job my body will let me im a little a scared im close to inpatient again but i cant financially handle that again im currently paying off a 2000 debt' mention 'high protein food, food contain high protein'? Return answer in format: [yes/no], phrases related to 29 m with anorexia nervosa hasnt dealt with the reality of i have an e EOS day EOS who has a hard time opening up about itim in a mental block of how to communicate with my boss who has had my back since hire have communicated that i have a e EOS day 4 month s ago that my e EOS day has to put it lightly hit a spike if i dont quit im scared ill be in inpatient again EOS i feel guilty due to the timing of me going on vacation for january my boss approved it but i also havent had a proper meal in month s due to the mentality this job puts me in because of the location im at EOS i work tomorrow until tues day straight but i dont think if i continue with this job my body will let me im a little a scared im close to inpatient again but i cant financially handle that again im currently paying off a 2000 debt is [...] 

**<font color='green'>Answer:</font>** [no] 

**<font color='magenta'>Total time:</font>** 3.51 sec.

In [15]:
# query_text = "29 m with anorexia nervosa hasnt dealt with the reality of i have an e EOS day EOS who has a hard time opening up about itim in a mental block of how to communicate with my boss who has had my back since hire have communicated that i have a e EOS day 4 month s ago that my e EOS day has to put it lightly hit a spike if i dont quit im scared ill be in inpatient again EOS i feel guilty due to the timing of me going on vacation for january my boss approved it but i also havent had a proper meal in month s due to the mentality this job puts me in because of the location im at EOS i work tomorrow until tues day straight but i dont think if i continue with this job my body will let me im a little a scared im close to inpatient again but i cant financially handle that again im currently paying off a 2000 debt"
# anchor_topic_text = "eating disorder, binging, purging, recovery, treatment, anorexia nervosa, anorexic, bulimia, bulimic, binge eating disorders, arfid, osfed, pica"

# prompt_content = (
#     f"Does this sentence '{query_text}' mention '{anchor_topic_text}'? "
#     "Return answer in format: [yes/no], phrases related to "
#     f"{query_text} is [...]"
# )


# response = query_model(
#     system_message,
#     user_message=prompt_content,
#     temperature=0.1,
#     max_length=50)
# display(Markdown(colorize_text(f"{response}")))




**<font color='red'>Question:</font>** Does this sentence '29 m with anorexia nervosa hasnt dealt with the reality of i have an e EOS day EOS who has a hard time opening up about itim in a mental block of how to communicate with my boss who has had my back since hire have communicated that i have a e EOS day 4 month s ago that my e EOS day has to put it lightly hit a spike if i dont quit im scared ill be in inpatient again EOS i feel guilty due to the timing of me going on vacation for january my boss approved it but i also havent had a proper meal in month s due to the mentality this job puts me in because of the location im at EOS i work tomorrow until tues day straight but i dont think if i continue with this job my body will let me im a little a scared im close to inpatient again but i cant financially handle that again im currently paying off a 2000 debt' mention 'eating disorder, binging, purging, recovery, treatment and recovery of anorexia nervosa, anorexic, bulimia, bulimic, binge eating disorders, arfid, osfed, pica'? Return answer in format: [yes/no], phrases related to 29 m with anorexia nervosa hasnt dealt with the reality of i have an e EOS day EOS who has a hard time opening up about itim in a mental block of how to communicate with my boss who has had my back since hire have communicated that i have a e EOS day 4 month s ago that my e EOS day has to put it lightly hit a spike if i dont quit im scared ill be in inpatient again EOS i feel guilty due to the timing of me going on vacation for january my boss approved it but i also havent had a proper meal in month s due to the mentality this job puts me in because of the location im at EOS i work tomorrow until tues day straight but i dont think if i continue with this job my body will let me im a little a scared im close to inpatient again but i cant financially handle that again im currently paying off a 2000 debt is [...] 

**<font color='green'>Answer:</font>** [yes]

Phrases related to eating disorders mentioned: 
- anorexia nervosa
- EOS day (likely referring to an eating disorder episode)
- anorexic
- bulimia (implied by "purging")
- bul 

**<font color='magenta'>Total time:</font>** 31.8 sec.

# load data and try examples

In [5]:
# Paragraphs to label; later cells read the `text_w_eos` column.
text_df = pd.read_csv("/kaggle/input/eddata/sm_eos.csv")
# Topic table; format_prompt reads its `topic` and `description` columns.
query_df = pd.read_csv("/kaggle/input/eddata/query_df.csv")

In [6]:
def format_prompt(text, queries=None):
    """Build the multi-topic yes/no question for one paragraph.

    Args:
        text: The paragraph to be classified.
        queries: Optional DataFrame with ``topic`` and ``description``
            columns. Defaults to the notebook-level ``query_df``.

    Returns:
        A string made of three parts: a numbered list of
        ``topic: description.`` lines, a numbered answer template with one
        ``[yes/no], related phrases if any:`` line per topic, and the
        paragraph itself wrapped in single quotes.
    """
    if queries is None:
        queries = query_df

    # Iterate over the column values by position instead of looking rows up
    # by label, so a filtered or re-indexed topic table still works.
    topics = list(zip(queries["topic"], queries["description"]))

    question_lines = [
        f"  ({number}) {topic}: {description}.\n"
        for number, (topic, description) in enumerate(topics, start=1)
    ]
    answer_lines = [
        f"  ({number}) {topic}: [yes/no], related phrases if any: \n"
        for number, (topic, _) in enumerate(topics, start=1)
    ]

    return (
        "Does the paragraph mention any of the following topics:\n"
        + "".join(question_lines)
        + "Return answer in format:\n"
        + "".join(answer_lines)
        + f"Paragraph: '{text}' \n"
    )

In [7]:
# Try the full multi-topic prompt on the first paragraph.
t1 = time()
prompt_content = format_prompt(text_df["text_w_eos"][0])
response = query_model(system_message, prompt_content, temperature=0.5, max_length=512)
display(Markdown(colorize_text(response)))


Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)




**<font color='red'>Question:</font>** Does the paragraph mention any of the following topics:
  (1) relation: Family and social relationships.
  (2) protein: High protein food or diet.
  (3) thinspo: Drive for thinness, want to be skinny or underweight (thinspiration).
  (4) health: Physical or mental health issues.
  (5) body: Body dissatisfaction.
  (6) ed: Eating disorders(ED) diagnosis or recovery, ED includes anorexia nervosa, anorexic, bulimia, bulimic, binge eating disorders, arfid, osfed, pica.
  (7) exercise: Physical exercise.
  (8) meal: Routine of meals.
  (9) crave: Craving for high calorie food or carbs.
  (10) restrict: Restrict nutrition or calorie intake.
  (11) binge: Binge eating.
  (12) loss: Weight loss.
  (13) gain: Weight gain.
  (14) fear: Fear of weight gain.
  (15) calorie: Count calorie.
Return answer in format:
  (1) relation: [yes/no], related phrases if any: 
  (2) protein: [yes/no], related phrases if any: 
  (3) thinspo: [yes/no], related phrases if any: 
  (4) health: [yes/no], related phrases if any: 
  (5) body: [yes/no], related phrases if any: 
  (6) ed: [yes/no], related phrases if any: 
  (7) exercise: [yes/no], related phrases if any: 
  (8) meal: [yes/no], related phrases if any: 
  (9) crave: [yes/no], related phrases if any: 
  (10) restrict: [yes/no], related phrases if any: 
  (11) binge: [yes/no], related phrases if any: 
  (12) loss: [yes/no], related phrases if any: 
  (13) gain: [yes/no], related phrases if any: 
  (14) fear: [yes/no], related phrases if any: 
  (15) calorie: [yes/no], related phrases if any: 
Paragraph: '20 pound down after 22 week s and 17 week s out from my first wellness show. check the posing progress too starting fat burners and cardio again this week after being sick so excited to see what comes on show day. ' 
 

**<font color='green'>Answer:</font>** (1) relation: no, 
(2) protein: no, 
(3) thinspo: yes, related phrases if any: '20 pound down', 'fat burners', 
(4) health: yes, related phrases if any: 'being sick', 'wellness show', 
(5) body: yes, related phrases if any: 'posing progress', 
(6) ed: no, 
(7) exercise: yes, related phrases if any: 'cardio', 
(8) meal: no, 
(9) crave: no, 
(10) restrict: no, 
(11) binge: no, 
(12) loss: yes, related phrases if any: '20 pound down', 
(13) gain: no, 
(14) fear: no, 
(15) calorie: no 

**<font color='magenta'>Total time:</font>** 112.02 sec.

In [9]:

# Same multi-topic prompt on the second paragraph, with temperature 0.
prompt_content = format_prompt(text_df["text_w_eos"][1])
response = query_model(system_message, prompt_content, temperature=0, max_length=512)
display(Markdown(colorize_text(response)))




**<font color='red'>Question:</font>** Does the paragraph mention any of the following topics:
  (1) relation: Family and social relationships.
  (2) protein: High protein food or diet.
  (3) thinspo: Drive for thinness, want to be skinny or underweight (thinspiration).
  (4) health: Physical or mental health issues.
  (5) body: Body dissatisfaction.
  (6) ed: Eating disorders(ED) diagnosis or recovery, ED includes anorexia nervosa, anorexic, bulimia, bulimic, binge eating disorders, arfid, osfed, pica.
  (7) exercise: Physical exercise.
  (8) meal: Routine of meals.
  (9) crave: Craving for high calorie food or carbs.
  (10) restrict: Restrict nutrition or calorie intake.
  (11) binge: Binge eating.
  (12) loss: Weight loss.
  (13) gain: Weight gain.
  (14) fear: Fear of weight gain.
  (15) calorie: Count calorie.
Return answer in format:
  (1) relation: [yes/no], related phrases if any: 
  (2) protein: [yes/no], related phrases if any: 
  (3) thinspo: [yes/no], related phrases if any: 
  (4) health: [yes/no], related phrases if any: 
  (5) body: [yes/no], related phrases if any: 
  (6) ed: [yes/no], related phrases if any: 
  (7) exercise: [yes/no], related phrases if any: 
  (8) meal: [yes/no], related phrases if any: 
  (9) crave: [yes/no], related phrases if any: 
  (10) restrict: [yes/no], related phrases if any: 
  (11) binge: [yes/no], related phrases if any: 
  (12) loss: [yes/no], related phrases if any: 
  (13) gain: [yes/no], related phrases if any: 
  (14) fear: [yes/no], related phrases if any: 
  (15) calorie: [yes/no], related phrases if any: 
Paragraph: 'tw ana body dysmorphia describing body potentially triggering adjectives to describe body calorie s intakecounting nonrecoveryall of my friends my bf and my bfs family tell me im skinny. and im not. i dont have a flat stomach. i have a slight double chin. my legs arent thin and my arms have fat on them. im trying to lose 35 pounds in 6 month s. i originally wanted to lose that much in about 34 month s but pushed it back farther because my boyfriend was afraid i was going to die. im not going to. im eating abt 1400 calorie s a day. and he recently did research and told me that that is pretty average for weight loss. and that triggered me and made me feel like i wasnt doing enough. he forces me to eat every time im with him. even in front of his family and it makes me so uncomfortable. it makes me partially dread seeing him bcuz ik he will end up making me eat something. ik i set a goal to eat around 1400 calorie s but i want to eat nothing. my bf just doesnt understand how it feels to have ana. he tells me abt his concerns for me and i keep starving myself and he gets hurt that i wont take his feelings into consideration. but he doesnt understand that i cant just change. bcuz the thing is that i dont want to get better and i wont want to recover until im skinny. ' 
 

**<font color='green'>Answer:</font>** (1) relation: yes, related phrases if any:'my friends','my bf','my bfs family','my boyfriend'
(2) protein: no
(3) thinspo: yes, related phrases if any: 'im skinny', 'im not', 'i want to be skinny', 'until im skinny'
(4) health: yes, related phrases if any: 'im going to die', 'ana', 'bcuz the thing is that i dont want to get better and i wont want to recover'
(5) body: yes, related phrases if any: 'body dysmorphia','my body','my legs arent thin and my arms have fat on them','my stomach', 'double chin'
(6) ed: yes, related phrases if any: 'ana', 'ana body dysmorphia', 'i keep starving myself', 'i wont want to recover'
(7) exercise: no
(8) meal: yes, related phrases if any: 'i want to eat nothing', 'i set a goal to eat around 1400 calorie s', 'he forces me to eat every time im with him'
(9) crave: no
(10) restrict: yes, related phrases if any: 'i want to eat nothing', 'i set a goal to eat around 1400 calorie s'
(11) binge: no
(12) loss: yes, related phrases if any: 'im trying to lose 35 pounds in 6 month s'
(13) gain: no
(14) fear: yes, related phrases if any: 'fear of weight gain', 'fear of not being skinny'
(15) calorie: yes, related phrases if any: '1400 calorie s a day', 'i set a goal to eat around 1400 calorie s' 

**<font color='magenta'>Total time:</font>** 234.3 sec.

In [24]:
# # sequential processing 

# answer_df = text_df[['sm_id', 'text_w_eos']].copy()
# # Create a new column 'answer_string'
# answer_df['answer_string'] = ""


# for i in text_df.index:
#     prompt_content = format_prompt(text_df.text_w_eos[i])
#     response = query_model(
#         system_message,
#         user_message=prompt_content,
#         temperature=0.1,
#         max_length=512)
#     display(Markdown(colorize_text(f"{response}")))
#     # Check if "Answer:" is in the response to avoid errors
#     if "Answer:" in response:
#         answer = response.split("Answer:")[1]
#         answer_df.loc[i, 'answer_string'] = answer
#     else:
#         answer_df.loc[i, 'answer_string'] = "Answer not found"

# answer_df.to_csv('/kaggle/working/answer_df.csv', index=False)


# batch processing

In [39]:
def query_model_batch(
        system_message,
        user_messages,
        temperature=0,
        max_length=1024,
        batch_size=1
    ):
    """Send several questions to the chat model and return one transcript each.

    Args:
        system_message: Content of the system turn, shared by all questions.
        user_messages: Iterable of question strings; each one is wrapped as
            ``"Question: <message> Answer:"`` before being sent.
        temperature: Sampling temperature. A positive value enables nucleus
            sampling (``top_p=0.5``) at that temperature. ``0``, a negative
            value or ``None`` selects greedy decoding, because sampling with
            a non-positive temperature is rejected by transformers.
        max_length: Maximum number of newly generated tokens per question.
        batch_size: Number of prompts the pipeline runs per forward pass.
            The default of 1 keeps the previous one-prompt-at-a-time
            behaviour. Values above 1 enable real batching and configure the
            shared tokenizer for left padding.

    Returns:
        A list with one string per input message, in input order: the wrapped
        question, the generated answer and a ``"Total time: <seconds> sec."``
        suffix. The time is the elapsed time of the whole call, not of the
        individual question. An empty input yields an empty list.
    """
    start_time = time()
    user_messages = list(user_messages)
    if not user_messages:
        return []

    # Add "Question: ... Answer:" to each user message for clarity
    batched_messages = [
        "Question: " + message + " Answer:" for message in user_messages
    ]

    # Construct the chat-formatted prompt for each message
    all_prompts = [
        pipe.tokenizer.apply_chat_template(
            [
                {"role": "system", "content": system_message},
                {"role": "user", "content": wrapped_message}
            ],
            tokenize=False,
            add_generation_prompt=True
        ) for wrapped_message in batched_messages
    ]

    # Stop on either the tokenizer's EOS token or the chat end-of-turn token.
    terminators = [
        pipe.tokenizer.eos_token_id,
        pipe.tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    generation_kwargs = dict(
        num_return_sequences=1,
        eos_token_id=terminators,
        max_new_tokens=max_length,
        return_full_text=False,  # only the completion, not the prompt
        pad_token_id=terminators[0],
    )
    if temperature is not None and temperature > 0:
        # The temperature must be a strictly positive float when sampling.
        generation_kwargs.update(
            do_sample=True,
            top_p=0.5,
            temperature=float(temperature),
        )
    else:
        # Greedy decoding; clear the sampling knobs so that values coming from
        # the checkpoint's generation config are not combined with
        # do_sample=False.
        generation_kwargs.update(do_sample=False, top_p=None, temperature=None)

    if batch_size is not None and batch_size > 1:
        # Batching pads prompts to a common length, which needs a pad token;
        # decoder-only generation needs that padding on the left so each
        # completion continues directly from its prompt.
        # NOTE: this mutates the tokenizer shared through `pipe`.
        if pipe.tokenizer.pad_token_id is None:
            pipe.tokenizer.pad_token_id = pipe.tokenizer.eos_token_id
        pipe.tokenizer.padding_side = "left"
        generation_kwargs["batch_size"] = batch_size

    # Run inference over all prompts
    sequences = pipe(all_prompts, **generation_kwargs)

    total_time = f"Total time: {round(time() - start_time, 2)} sec."

    # For list input the pipeline returns one list of candidates per prompt;
    # num_return_sequences=1 means the first candidate is the only one.
    return [
        wrapped_message + " " + candidates[0]['generated_text'] + " " + total_time
        for wrapped_message, candidates in zip(batched_messages, sequences)
    ]


In [46]:

batch_size = 10

# Load the previous answer DataFrame
answer_df = pd.read_csv("/kaggle/working/answer_df.csv")
# An all-NaN column is read as float; make it object so strings can be stored.
answer_df['answer_string'] = answer_df['answer_string'].astype(object)

# Filter for rows where 'answer_string' is NaN
unanswered_df = answer_df[answer_df['answer_string'].isna()]

# Row labels (in answer_df) of the next `batch_size` unanswered entries
indices_to_update = unanswered_df.index[:batch_size]
indices_to_update_list = list(indices_to_update)

# Prepare prompt content for those entries
prompts = [format_prompt(text) for text in unanswered_df['text_w_eos'].iloc[:batch_size]]

# Run the model over all prompts (skipped when nothing is left to answer)
responses = []
if prompts:
    responses = query_model_batch(
        system_message=system_message,
        user_messages=prompts,
        temperature=0.1,
        max_length=512
    )

# Store each answer back on the row it belongs to
for row_label, prompt, response in zip(indices_to_update_list, prompts, responses):
    #display(Markdown(colorize_text(f"{response}")))

    # query_model_batch echoes "Question: <prompt> Answer:" before the model
    # output. Cutting off exactly that prefix keeps the whole answer even when
    # the paragraph or the answer itself contains the text "Answer:".
    echoed_question = "Question: " + prompt + " Answer:"
    if response.startswith(echoed_question):
        answer = response[len(echoed_question):]
    elif "Answer:" in response:
        answer = response.split("Answer:", 1)[1]
    else:
        answer = "Answer not found"
    answer_df.loc[row_label, 'answer_string'] = answer

#answer_df.to_csv('/kaggle/working/answer_df.csv', index=False)


KeyboardInterrupt: 

In [44]:
# Preview the first 20 rows of the answers collected so far.
answer_df.head(20)

Unnamed: 0,sm_id,text_w_eos,answer_string
0,10002yt,20 pound down after 22 week s and 17 week s ou...,"(1) relation: no, \n(2) protein: no, \n(3) th..."
1,1003i3b,tw ana body dysmorphia describing body potenti...,"(1) relation: yes, related phrases if any:'my..."
2,1003xw2,hello fellow brawearing friends. this is going...,"(1) relation: yes, related phrases if any: fa..."
3,10042jf,according to cronometer 100 gram of canola oil...,"(1) relation: no, \n(2) protein: no, \n(3) th..."
4,1004j21,its been two week s since i started strength t...,"(1) relation: no, \n(2) protein: no, \n(3) th..."
5,1008hih,i made panfried chicken thighs ovenroasted pot...,"(1) relation: no, \n(2) protein: yes, related..."
6,100ash9,i have aversive arfid so i have a very bad fea...,"(1) relation: no, \n(2) protein: no, \n(3) th..."
7,100aza8,original post here httpswww. reddit. comrcicoc...,"(1) relation: yes, related phrases if any:'my..."
8,100bwe6,for context i am early 30s attending a corpora...,"(1) relation: no, \n(2) protein: no, \n(3) th..."
9,100c6y2,i got this from a thrift store and was pleasan...,"(1) relation: no, \n(2) protein: no, \n(3) th..."


In [45]:
# Write the answers back to the working directory; index=False leaves the
# DataFrame's row index out of the file.
answer_df.to_csv('/kaggle/working/answer_df.csv', index=False)
