In [28]:
import pickle

import pandas as pd

from utils import *

  from .autonotebook import tqdm as notebook_tqdm


In [29]:
# Load the pickled attention scores. NOTE: unpickling can execute arbitrary
# code, so only load files that were produced by this project.
attention_path = '../processed_data/attention_scores.pkl'
with open(attention_path, 'rb') as attention_file:
    attention_scores = pickle.load(attention_file)

# Load the saved graph object. `torch` is not imported explicitly in this
# notebook; presumably it arrives via `from utils import *` — TODO confirm.
graph_path = '../processed_data/heterogeneous_graph_768_no_med_balanced_with_prompt.pt'
graph = torch.load(graph_path)

In [30]:
graph

HeteroData(
  user={
    node_id=[4826],
    x=[4826, 37],
    y=[4826],
    prompt=[4826],
  },
  food={
    node_id=[5896],
    prompt=[5896],
    x=[5896, 814],
  },
  ingredient={
    node_id=[2792],
    prompt=[2792],
    x=[2792, 768],
  },
  category={
    node_id=[174],
    prompt=[174],
    x=[174, 768],
  },
  habit={
    node_id=[54],
    prompt=[54],
    x=[54, 768],
  },
  (user, eats, food)={ edge_index=[2, 136967] },
  (food, contains, ingredient)={ edge_index=[2, 19410] },
  (food, belongs_to, category)={ edge_index=[2, 19410] },
  (user, has, habit)={ edge_index=[2, 46947] },
  (food, eaten, user)={ edge_index=[2, 136967] },
  (ingredient, in, food)={ edge_index=[2, 19410] },
  (category, contains, food)={ edge_index=[2, 19410] },
  (habit, from, user)={ edge_index=[2, 46947] }
)

In [31]:
# Node counts per type, read once from the feature matrices.
num_users = graph['user'].x.shape[0]
num_foods = graph['food'].x.shape[0]
num_ingredients = graph['ingredient'].x.shape[0]
num_categories = graph['category'].x.shape[0]

# Global index at which each node type starts when the types are laid out
# back to back in the order user, food, ingredient, category, habit.
offset_dict = {
    'user': 0,
    'food': num_users,
    'ingredient': num_users + num_foods,
    'category': num_users + num_foods + num_ingredients,
    'habit': num_users + num_foods + num_ingredients + num_categories,
}

# The source indices in the raw attention keys do not follow the offsets above
# (original note: the offset in PyG is not the same as the offset in the
# original graph), so they are remapped. With U = num_users, F = num_foods:
#   * src in [U+F, U+2F)       -> src - F         (lands in the food range)
#   * src in [U+2F, 2(U+F)-1]  -> src - (2F + U)  (lands in the user range)
#   * src >  2(U+F)-1, or tgt below the user offset -> entry is dropped
# Smaller src values and all kept tgt values are left unchanged.
max_source_index = 2 * (num_users + num_foods) - 1
first_shifted_food = offset_dict['ingredient']
first_shifted_user = first_shifted_food + num_foods

new_attention_scores = {}
for key in attention_scores:
    src, tgt = int(key[0]), int(key[1])
    if src > max_source_index or tgt < offset_dict['user']:
        continue
    if first_shifted_food <= src < first_shifted_user:
        src -= num_foods
    elif src >= first_shifted_user:
        src -= 2 * num_foods + num_users

    # Each raw entry is reduced to its mean before being stored.
    new_attention_scores[(src, tgt)] = np.mean(attention_scores[key])

# Global index -> (node type, node id, index within that type).
node_dict = {
    type_offset + local_index: (type_name, node_id, local_index)
    for type_name, type_offset in offset_dict.items()
    for local_index, node_id in enumerate(graph[type_name].node_id)
}

In [32]:
def find_top_k(attention_scores, source_node_id, node_type, k, node_dict, offset_dict, is_filter=True):
    """Return the highest-attention targets of one node type for a source node.

    The target's type is taken from ``node_dict`` rather than from the
    module-level ``graph``, so the function depends only on its arguments.

    Args:
        attention_scores: Mapping of ``(source_index, target_index)`` to a score.
        source_node_id: Global index of the source node whose edges are ranked.
        node_type: Node type that returned targets must have.
        k: Maximum number of targets returned when ``is_filter`` is true.
            ``k <= 0`` yields an empty list; ``None`` means no limit.
        node_dict: Mapping of global index to ``(node_type, node_id, local_index)``.
        offset_dict: Mapping of node type to its starting global index; used
            here only to validate ``node_type``.
        is_filter: When false, every matching target is returned, ignoring ``k``.

    Returns:
        A list of ``(node_type, node_id, local_index)`` tuples ordered by
        descending score. Equal scores keep the iteration order of
        ``attention_scores``.

    Raises:
        KeyError: If ``node_type`` is not a key of ``offset_dict``.
    """
    if node_type not in offset_dict:
        raise KeyError(node_type)

    limit = k if (is_filter and k is not None) else None
    if limit is not None and limit <= 0:
        return []

    # Keep only edges that leave the source node and end in a node of the
    # requested type; targets missing from node_dict are ignored.
    matches = []
    for (src_id, target_id), score in attention_scores.items():
        if src_id != source_node_id:
            continue
        entry = node_dict.get(target_id)
        if entry is not None and entry[0] == node_type:
            matches.append((score, entry))

    # list.sort is stable even with reverse=True, so ties keep their original order.
    matches.sort(key=lambda pair: pair[0], reverse=True)
    if limit is not None:
        matches = matches[:limit]

    return [(node_type, entry[1], entry[2]) for _, entry in matches]

In [43]:
# 4826 is the number of user nodes shown in the graph summary, which is also
# offset_dict['food'] — so this asks for the top ingredients of the first food node.
find_top_k(new_attention_scores, 4826, 'ingredient', 5, node_dict, offset_dict)

[('ingredient', '00014006', 2738)]

In [34]:
# Mappings built by the utils helper; indexed below by user index to obtain the
# food / habit indices passed to extract_descriptions.
user2food = generate_edge_mapping(graph, ('user', 'eats', 'food'))
user2habit = generate_edge_mapping(graph, ('user', 'has', 'habit'))

user_columns = ['user_id', 'top_habits_desc', 'top_food_desc', 'all_habit_desc', 'all_food_desc', 'label']
user_rows = []
for user_id in tqdm(range(len(graph['user'].node_id))):
    # The user offset is 0, so the local user index is used as the global id.
    habit_hits = find_top_k(new_attention_scores, user_id, 'habit', 10, node_dict, offset_dict)
    food_hits = find_top_k(new_attention_scores, user_id, 'food', 10, node_dict, offset_dict)

    # Element 2 of each hit is the index within its node type.
    descriptions = [
        extract_descriptions(graph, [hit[2] for hit in habit_hits], prompt_type='habit'),
        extract_descriptions(graph, [hit[2] for hit in food_hits], prompt_type='food'),
        extract_descriptions(graph, user2habit[user_id], prompt_type='habit'),
        extract_descriptions(graph, user2food[user_id], prompt_type='food'),
    ]

    record = [user_id]
    record.extend('; '.join(desc) for desc in descriptions)
    record.append(graph['user'].y[user_id].item())
    user_rows.append(record)

df = pd.DataFrame(user_rows, columns=user_columns)

100%|██████████| 4826/4826 [01:18<00:00, 61.77it/s]


In [50]:
food2ingredient = generate_edge_mapping(graph, ('food', 'contains', 'ingredient'))

# A food's global index is its local index shifted by the number of user nodes.
food_index_shift = graph['user'].node_id.shape[0]

food_columns = ['food_id', 'top_ingredient_desc', 'all_ingredient_desc']
food_rows = []
for food_id in tqdm(range(len(graph['food'].node_id))):
    if food_id in food2ingredient:
        ingredient_hits = find_top_k(new_attention_scores, food_id + food_index_shift, 'ingredient', 3, node_dict, offset_dict)
        top_ingredient_desc = extract_descriptions(graph, [hit[2] for hit in ingredient_hits], prompt_type='ingredient')
        all_ingredient_desc = extract_descriptions(graph, food2ingredient[food_id], prompt_type='ingredient')
        food_rows.append([food_id, ';'.join(top_ingredient_desc), ';'.join(all_ingredient_desc)])
    else:
        # Some food nodes do not have any ingredients; keep a blank row for them.
        food_rows.append([food_id, '', ''])

df_food = pd.DataFrame(food_rows, columns=food_columns)

100%|██████████| 5896/5896 [00:40<00:00, 145.52it/s]


In [30]:
# Write the per-user summary table to CSV; index=False leaves out the row index.
df.to_csv('../processed_data/attention_result_user.csv', index=False)

In [52]:
# Write the per-food summary table to CSV; index=False leaves out the row index.
df_food.to_csv('../processed_data/attention_result_food.csv', index=False)

In [138]:
# Fixed seed so the same 1000 users are drawn on every run. The GPT responses
# generated from this sample are stored as a plain list in sample order, so an
# unseeded draw could not be matched back to them after a kernel restart.
df_sample = df.sample(1000, random_state=42)

In [139]:
# Shared instruction block that is prepended to every per-user prompt.
base_text = """Opioid users tend to have unhealthy dietary patterns different from regular people. We have developed a model to predict if a user is an opioid user based on his dietary habits and food intake history and give back recommendations on which foods and habits most support the decision. Your task is to act as a nutritionist to help me further filter the list: First, check if this user is an opioid user or a non-opioid user. Then, based on the finding, use your general knowledge to find the foods and habits that support the decision most. For example, chronic opioid users often have unhealthy dietary patterns, favoring salty, sweet, and fried foods and being more susceptible to alcohol, and tobacco, while non-opioid users can have various healthy diet patterns. Finally, the most important thing to bear in mind is that you MUST select a portion (about 5) of foods and habits that you think can most support the decision, use as few words as you can to provide two concise lists of full food or habit names without any other explanation and with limited options each. Be aware, you should use your knowledge to consider all foods and habits names for the final candidates but only select a portion of it.\n """

In [140]:
# One prompt per sampled user: the shared instructions followed by the user's
# label, the model-selected foods and habits, and the user's full lists.
prompted_text_list = []
for _, row in df_sample.iterrows():
    if row['label'] == 1:
        opioid_flag = 'opioid'
    else:
        opioid_flag = 'non-opioid'
    user_text = f"""
    User {row['user_id']} is an {opioid_flag} user. The model think the foods support the decisions are: {row['top_food_desc']}.  \n
    The model think the habits support the decisions are: {row['top_habits_desc']}. \n
    The rest of foods include: {row['all_food_desc']}. The rest of habits include: {row['all_habit_desc']}.
    """
    prompted_text_list.append(base_text + user_text)

In [141]:
prompted_text_list[9]

"Opioid users tend to have unhealthy dietary patterns different from regular people. We have developed a model to predict if a user is an opioid user based on his dietary habits and food intake history and give back recommendations on which foods and habits most support the decision. Your task is to act as a nutritionist to help me further filter the list: First, check if this user is an opioid user or a non-opioid user. Then, based on the finding, use your general knowledge to find the foods and habits that support the decision most. For example, chronic opioid users often have unhealthy dietary patterns, favoring salty, sweet, and fried foods and being more susceptible to alcohol, and tobacco, while non-opioid users can have various healthy diet patterns. Finally, the most important thing to bear in mind is that you MUST select a portion (about 5) of foods and habits that you think can most support the decision, use as few words as you can to provide two concise lists of full food or

In [142]:
from openai import OpenAI
# No credentials are passed here; presumably the client reads the API key from
# the environment (e.g. OPENAI_API_KEY) — TODO confirm.
client = OpenAI()

In [144]:
def query(prompt, model="gpt-3.5-turbo", system_prompt="You are a nutritionist."):
    """Send one prompt through the chat-completions API and return the reply text.

    Uses the module-level ``client``.

    Args:
        prompt: Text sent as the user message.
        model: Chat model name; the default is the value that was previously
            hard-coded in this function.
        system_prompt: Text sent as the system message; the default is the
            value that was previously hard-coded in this function.

    Returns:
        The ``content`` of the message in the first choice of the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
    )
    return response.choices[0].message.content

In [145]:
import time

# Pause between consecutive requests, in seconds.
REQUEST_DELAY_SECONDS = 20

GPT_reasoning = []
for prompt in tqdm(prompted_text_list):
    # Store each reply before pausing, so interrupting the cell during the
    # sleep does not discard a response that was already received.
    GPT_reasoning.append(query(prompt))
    time.sleep(REQUEST_DELAY_SECONDS)

100%|██████████| 1000/1000 [7:07:13<00:00, 25.63s/it] 


In [146]:
# Displays the whole list of responses (one per prompt); slice it, e.g.
# GPT_reasoning[:3], for a shorter preview.
GPT_reasoning

["Based on the given information, here are the selected foods and habits that most support the decision that the user is a non-opioid user:\n\nFoods:\n1. Milk\n2. Human milk\n\nHabits:\n1. Drinks little water\n2. Don't drink tap water\n3. Eats little or no fish\n4. Eats little or no shellfish\n5. Ate more food than usual",
 'Based on the information provided, here are the selected foods that support the decision that User 744 is an opioid user:\n\n1. Peppers, hot, cooked, NS as to form, fat not added in cooking\n2. Frankfurter or hot dog, beef and pork\n3. Crackers, sandwich, cheese filled\n4. Chili con carne without beans\n5. Ground beef, less than 80% lean, cooked (formerly regular)\n\nAnd here are the selected habits that support the decision:\n\n1. Heavy cigarette smoker\n2. Uses tobacco often\n3. Drinks little water\n4. Drinks Alcohol more than average\n5. Claims to have a poor diet',
 'For User 2839, a non-opioid user, the selected foods that support the decision are:\n\n1. Scram

In [2]:
import pickle

In [148]:
# Save the responses as a plain list, in the same order as prompted_text_list.
with open('../processed_data/GPT_reasoning_1000.pkl', 'wb') as out_file:
    pickle.dump(GPT_reasoning, out_file)

In [3]:
# Reload the saved responses. NOTE: unpickling can execute arbitrary code, so
# only load files that were produced by this project.
with open('../processed_data/GPT_reasoning_1000.pkl', 'rb') as in_file:
    GPT = pickle.load(in_file)

In [26]:
GPT[494:500]

['Based on the provided information, the foods that most support the decision of User 565 being an opioid user are:\n1. Egg and bacon on biscuit\n2. Pork roast, NS as to cut, cooked, lean only eaten\n3. Fruit juice drink (Sunny D)\n4. Cornbread, made from home recipe\n5. Bologna\n\nThe habits that most support the decision are:\n1. Uses tobacco often.\n2. Drinks lots of milk.\n3. Adds lots of salt at the table.\n4. Eats few to no meals outside the home.\n5. Claims to have a poor diet.',
 'Based on the provided information, we can identify the following foods and habits that most support the decision that User 1798 is an opioid user:\n\nFoods: \n1. Chicken thigh, fried, coated, skin / coating eaten, from raw\n2. Chicken drumstick, fried, coated, skin / coating eaten, from raw\n3. Potato, mashed, from fresh, made with milk, with gravy\n4. Pork roast, NS as to cut, cooked, lean only eaten\n5. Beans, string, green, cooked, from canned, made with butter\n\nHabits:\n1. Uses lots of salt in p

In [37]:
# NOTE(review): this cell's execution count (In [37]) is lower than that of the
# cell that builds df_food (In [50]), so the output shown below may come from an
# earlier version of df_food — re-run the notebook in order to confirm.
df_food

Unnamed: 0,food_id,top_ingredient_desc,all_ingredient_desc
0,0,,"Alcoholic beverage, beer, light."
1,1,,"Beverages, water, tap, municipal.;Beverages, C..."
2,2,,"Babyfood, cereal, mixed, with applesauce and b..."
3,3,,"Beans, white, mature seeds, cooked, boiled, wi..."
4,4,,"Ice cream, vanilla.;Ice cream, vanilla, with a..."
...,...,...,...
5096,5888,"Egg, white, raw, fresh.;Vegetable oil, NFS.;Sa...","Beverages, coffee, instant, regular, powder.;B..."
5097,5889,"Sausage, egg and cheese breakfast biscuit.","Beverages, Mixed vegetable and fruit juice dri..."
5098,5891,"Beverages, coffee, brewed, prepared with tap w...","Peanuts, all types, dry-roasted, with salt.;Su..."
5099,5894,"Potatoes, boiled, cooked without skin, flesh, ...","Sugars, granulated.;Miso.;Salad dressing, sesa..."
