In [1]:
from torch_geometric.data import HeteroData
from utils import *
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tqdm import tqdm
import random

import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Feature options -------------------------------------------------------
# When True, text embeddings are reduced to 100 PCA components before use.
PCA_FLAG = False
# When True, medical_table.csv measurements are merged into the user features.
ADD_MEDICAL = False
# When True, the local Llama-2-7b checkpoint replaces bert-base-uncased.
LLM_FLAG = False

# --- Class-balance options -------------------------------------------------
# When True, negative users are down-sampled relative to positive users.
LESS_IMBALANCE_FLAG = False
# Negative users drawn per positive user (only read when LESS_IMBALANCE_FLAG is set).
POSITIVE_RATIO = 1
# When True (and LESS_IMBALANCE_FLAG is set), positives are sub-sampled to SAMPLING_SIZE users.
SAMPLING_FLAG = False
SAMPLING_SIZE = 100

In [3]:
# Load datasets. Every column is read as text, so identifiers stay strings.
def _read_text_csv(path):
    """Read a CSV file with all columns parsed as ``str``."""
    return pd.read_csv(path, dtype=str)


food_ingredients_df = _read_text_csv('../processed_data/food_ingredients.csv')
user_food_df = _read_text_csv('../processed_data/user_food.csv')
df_demo = _read_text_csv('../processed_data/main_table.csv')
user_habit_df = _read_text_csv('../processed_data/user_habit_10.csv')
user_medicine_df = _read_text_csv('../processed_data/user_prescription_medicine.csv')

In [4]:
# Fill missing demographic values with 0, then normalise the label column to an
# integer-valued string (parsed as float first, so a value like '1.0' becomes '1').
df_demo = df_demo.fillna(0).assign(
    label=lambda frame: frame['label'].astype(float).astype(int).astype(str)
)

In [5]:
# Optional cohort rebalancing: keep positive users (optionally a random subset)
# and draw POSITIVE_RATIO negative users per retained positive user.
random.seed(42)  # fixed seed so the sampled cohort is the same on every run
if LESS_IMBALANCE_FLAG:
    # Food records of users labelled '1'.
    pos_label_ids = df_demo.loc[df_demo['label'] == '1', 'SEQN'].unique()
    user_food_df_pos = user_food_df.loc[user_food_df['SEQN'].isin(pos_label_ids)]
    if SAMPLING_FLAG:
        pos_pool = user_food_df_pos['SEQN'].unique().tolist()
        # random.sample raises ValueError when k exceeds the pool size, so cap it.
        n_pos = min(SAMPLING_SIZE, len(pos_pool))
        if n_pos < SAMPLING_SIZE:
            print(f'Requested {SAMPLING_SIZE} positive users but only {n_pos} are available.')
        unique_pos_SEQN = random.sample(pos_pool, k=n_pos)
        user_food_df_pos = user_food_df.loc[user_food_df['SEQN'].isin(unique_pos_SEQN)]

    df_demo_pos = df_demo.loc[df_demo['SEQN'].isin(user_food_df_pos['SEQN'].unique())]

    # Food records of users labelled '0'; a subset of these users is sampled.
    neg_label_ids = df_demo.loc[df_demo['label'] == '0', 'SEQN'].unique()
    user_food_df_neg = user_food_df.loc[user_food_df['SEQN'].isin(neg_label_ids)]
    neg_pool = user_food_df_neg['SEQN'].unique().tolist()
    n_neg_requested = POSITIVE_RATIO * len(df_demo_pos)
    # Same cap as above: never ask for more negatives than exist.
    n_neg = min(n_neg_requested, len(neg_pool))
    if n_neg < n_neg_requested:
        print(f'Requested {n_neg_requested} negative users but only {n_neg} are available.')
    unique_neg_SEQN = random.sample(neg_pool, k=n_neg)

    user_food_df_neg = user_food_df.loc[user_food_df['SEQN'].isin(unique_neg_SEQN)]
    user_food_df = pd.concat([user_food_df_pos, user_food_df_neg])
    # Restrict the demographic table to the users that survived sampling.
    df_demo = df_demo.loc[df_demo['SEQN'].isin(user_food_df['SEQN'].unique())]

In [6]:
# Use snake_case column names for the habit table.
user_habit_df = user_habit_df.rename(columns={'habitID': 'habit_id', 'habitDesc': 'habit_desc'})

# Keep habit and medicine rows only for users that have food records.
users_with_food = user_food_df['SEQN'].unique()
user_habit_df = user_habit_df[user_habit_df['SEQN'].isin(users_with_food)]
user_medicine_df = user_medicine_df[user_medicine_df['SEQN'].isin(users_with_food)]

In [7]:
# Keep ingredient rows only for foods that appear in the user food records.
# NOTE(review): ids are compared as raw strings here, before the zero-padding applied
# in the next cell — confirm both CSVs format food_id identically.
eaten_food_ids = user_food_df['food_id'].unique()
food_ingredients_df = food_ingredients_df[food_ingredients_df['food_id'].isin(eaten_food_ids)]

In [8]:
# Left-pad identifier strings with zeros to a fixed width per id type.
for frame, column, width in (
    (food_ingredients_df, 'WWEIA_id', 4),
    (food_ingredients_df, 'ingredient_id', 8),
    (food_ingredients_df, 'food_id', 10),
    (user_food_df, 'food_id', 10),
    (user_habit_df, 'habit_id', 2),
):
    frame[column] = frame[column].str.zfill(width)

In [9]:
# Create unique identifiers and map them to integers.
# Food ids are the union of ids seen in the user records and in the ingredient table.
# The union is sorted because Python randomises string hashing per process: iterating a
# set of str ids gives a different order (and thus different node indices) in every
# kernel session. Sorting makes the food node order reproducible.
unique_food_ids = np.array(
    sorted(set(user_food_df['food_id'].tolist()) | set(food_ingredients_df['food_id'].tolist()))
)
# Series.unique() keeps first-appearance order, so these are already deterministic.
unique_ingredient_ids = food_ingredients_df['ingredient_id'].unique()
unique_wweia_ids = food_ingredients_df['WWEIA_id'].unique()
unique_user_ids = user_food_df['SEQN'].unique()
unique_habit_ids = user_habit_df['habit_id'].unique()
unique_medicine_ids = user_medicine_df['RXDDRGID'].unique()

In [10]:
def _index_by_position(ids):
    """Return a dict mapping each identifier to its position in ``ids``."""
    return dict(zip(ids, range(len(ids))))


# One lookup table per node type: original identifier -> contiguous node index.
food_to_int = _index_by_position(unique_food_ids)
ingredient_to_int = _index_by_position(unique_ingredient_ids)
wweia_to_int = _index_by_position(unique_wweia_ids)
user_to_int = _index_by_position(unique_user_ids)
habit_to_int = _index_by_position(unique_habit_ids)
medicine_to_int = _index_by_position(unique_medicine_ids)

In [11]:
# # Create a mapping from original ID to new integer ID
# int_to_id = list(unique_user_ids) + list(unique_food_ids) + list(unique_ingredient_ids) +  list(unique_wweia_ids) + list(unique_habit_ids)
# id_to_int = {original_id: i for i, original_id in enumerate(int_to_id)}

In [12]:
"""
if a user doesn't have food records, we discard it.
"""


def _edge_index(src_ids, dst_ids, src_lookup, dst_lookup):
    """Translate two aligned id columns into a long tensor of node-index pairs.

    Each (source id, destination id) pair is mapped through its lookup table; the
    resulting pair list is transposed so that row 0 holds source indices and row 1
    holds destination indices. An id missing from a lookup raises ``KeyError``.
    """
    pairs = [[src_lookup[src], dst_lookup[dst]] for src, dst in zip(src_ids, dst_ids)]
    return torch.tensor(pairs, dtype=torch.long).t().contiguous()


# food -> ingredient
food_ingredient_edges = _edge_index(
    food_ingredients_df['food_id'], food_ingredients_df['ingredient_id'],
    food_to_int, ingredient_to_int,
)

# food -> category
# NOTE(review): built from the same rows as the ingredient edges, so a food gets one
# category edge per ingredient row — confirm the repeated edges are intended.
food_category_edges = _edge_index(
    food_ingredients_df['food_id'], food_ingredients_df['WWEIA_id'],
    food_to_int, wweia_to_int,
)

# user -> food
user_food_edges = _edge_index(
    user_food_df['SEQN'], user_food_df['food_id'],
    user_to_int, food_to_int,
)

# user -> habit
user_habit_edges = _edge_index(
    user_habit_df['SEQN'], user_habit_df['habit_id'],
    user_to_int, habit_to_int,
)

# user -> medicine
user_medicine_edges = _edge_index(
    user_medicine_df['SEQN'], user_medicine_df['RXDDRGID'],
    user_to_int, medicine_to_int,
)

In [13]:
graph = HeteroData()

# Store the original identifiers per node type; position i in each array is node i.
for node_type, ids in (
    ('user', unique_user_ids),
    ('food', unique_food_ids),
    ('ingredient', unique_ingredient_ids),
    ('category', unique_wweia_ids),
    ('habit', unique_habit_ids),
    ('medicine', unique_medicine_ids),
):
    graph[node_type]['node_id'] = ids

# Register the forward relations (source type, relation name, destination type).
for edge_type, edge_index in (
    (('user', 'eats', 'food'), user_food_edges),
    (('food', 'contains', 'ingredient'), food_ingredient_edges),
    (('food', 'belongs_to', 'category'), food_category_edges),
    (('user', 'has', 'habit'), user_habit_edges),
    (('user', 'takes', 'medicine'), user_medicine_edges),
):
    graph[edge_type].edge_index = edge_index

In [14]:
# Display the graph summary: node id arrays and edge_index shapes registered so far.
graph

HeteroData(
  user={ node_id=[83352] },
  food={ node_id=[9640] },
  ingredient={ node_id=[3355] },
  category={ node_id=[174] },
  habit={ node_id=[54] },
  medicine={ node_id=[1264] },
  (user, eats, food)={ edge_index=[2, 2322627] },
  (food, contains, ingredient)={ edge_index=[2, 32229] },
  (food, belongs_to, category)={ edge_index=[2, 32229] },
  (user, has, habit)={ edge_index=[2, 758227] },
  (user, takes, medicine)={ edge_index=[2, 117008] }
)

#### Add features and labels to user nodes

In [15]:
# Apply user_prompt_adding to every demographic row and store the result as 'user_prompt'.
# NOTE(review): user_prompt_adding is not defined in this notebook — presumably it comes
# from `from utils import *`; its output format cannot be confirmed from here.
df_demo['user_prompt'] = df_demo.apply(user_prompt_adding, axis=1, result_type='expand')

In [16]:
# Replace the raw age with an ordinal age-group code.
# Bin edges are right-closed: (-1, 10] -> '1', (10, 20] -> '2', ..., (60, 100] -> '7'.
age_edges = [-1, 10, 20, 30, 40, 50, 60, 100]
age_codes = ['1', '2', '3', '4', '5', '6', '7']
df_demo['age_group'] = pd.cut(
    df_demo['age'].astype(int), bins=age_edges, labels=age_codes, right=True
)
df_demo = df_demo.drop(columns=['age'])

In [17]:
# Demographic columns handed to the onehot_encoding helper.
# NOTE(review): onehot_encoding is not defined in this notebook (presumably from utils);
# it is assumed to return df_demo with these columns replaced by indicator columns — confirm.
categorical_columns = ['gender', 'race', 'household_income', 'education', 'age_group']
df_demo = onehot_encoding(df_demo, categorical_columns)

In [18]:
# Index the demographic table by user id so rows can be selected by SEQN later.
df_demo['SEQN'] = df_demo['SEQN'].astype(str)
df_demo = df_demo.set_index('SEQN')

# Extract integer labels and fold class 2 into class 0.
# The label column holds strings, so the earlier comparison with the integer 2 never
# matched, and it also wrote through a chained assignment. Converting to int first and
# remapping on the resulting Series applies the 2 -> 0 mapping reliably.
labels = df_demo['label'].astype(int).replace(2, 0)
user_prompt = df_demo['user_prompt']

# Drop everything that is not a model feature, then store the features as integers.
df_demo = df_demo.drop(columns=['years', 'weight_interview', 'weight_mec', 'label', 'user_prompt'])
df_demo = df_demo.astype(float).astype(int)

In [19]:
# Optionally append scaled medical measurements to the user feature table.
if ADD_MEDICAL:
    df_medical = pd.read_csv('../processed_data/medical_table.csv', index_col='SEQN')
    df_medical.index = df_medical.index.astype(str)
    medical_cols, medical_idx = df_medical.columns, df_medical.index

    # Standardise each column, then rescale the result with min-max scaling.
    standardized = StandardScaler().fit_transform(df_medical)
    rescaled = MinMaxScaler().fit_transform(standardized)
    df_medical = pd.DataFrame(rescaled, columns=medical_cols, index=medical_idx)

    # Left merge keeps every user; users without medical rows get 0 for those columns.
    df_demo = df_demo.merge(df_medical, left_index=True, right_index=True, how='left').fillna(0)

In [20]:
# Attach features, labels and prompts to user nodes, in node-index order.
user_ids = graph['user']['node_id'].tolist()
user_store = graph['user']

user_store.x = torch.tensor(df_demo.loc[user_ids].values, dtype=torch.float32)
user_store.y = torch.tensor(labels.loc[user_ids].values, dtype=torch.int64)
user_store.prompt = user_prompt.loc[user_ids].values.tolist()

In [21]:
from transformers import BertTokenizer, BertModel, LlamaTokenizer, LlamaModel

# Pick the text encoder: the local Llama-2-7b checkpoint when LLM_FLAG is set,
# otherwise bert-base-uncased.
if LLM_FLAG:
    tokenizer_cls, model_cls, checkpoint = LlamaTokenizer, LlamaModel, '../llama-2-7b'
else:
    tokenizer_cls, model_cls, checkpoint = BertTokenizer, BertModel, 'bert-base-uncased'

tokenizer = tokenizer_cls.from_pretrained(checkpoint)
model = model_cls.from_pretrained(checkpoint)

In [22]:
def get_bert_embedding(sentence):
    """Embed one sentence by mean-pooling the encoder's last hidden layer.

    Uses the module-level ``tokenizer`` and ``model``. Despite the name, these may
    be the Llama pair when ``LLM_FLAG`` is set.

    Args:
        sentence: Text to embed (a single string).

    Returns:
        A 1-D NumPy array: the last hidden states averaged over the token axis.
    """
    # truncation=True clips over-long text to the tokenizer's configured maximum
    # length instead of passing more tokens than the model accepts.
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True)
    with torch.no_grad():  # inference only; no gradients needed
        outputs = model(**inputs)
    # Last-layer hidden states, averaged over tokens (dim=1).
    last_hidden_state = outputs.last_hidden_state
    # squeeze(0) drops only the batch axis of the single-sentence batch.
    sentence_embedding = last_hidden_state.mean(dim=1).squeeze(0).numpy()

    return sentence_embedding

#### Add features to food nodes

In [23]:
# Reset the food feature slot before it is rebuilt in the cells below.
# NOTE(review): presumably a guard for re-running the notebook — confirm it is still needed.
graph['food'].x = None

In [24]:
# One row per food id with its description. Ingredient-table rows are concatenated
# last, so keep='last' prefers their description when a food appears in both tables.
food_descriptions = pd.concat([
    user_food_df[['food_id', 'food_desc']],
    food_ingredients_df[['food_id', 'food_desc']],
])
food = (
    food_descriptions
    .drop_duplicates(subset=['food_id'], keep='last')
    .set_index('food_id')
)

In [25]:
# Load per-food nutrition values and key them by the zero-padded 10-character food id.
df_food_nutrition = pd.read_csv('../processed_data/food_nutrition.csv')
df_food_nutrition['food_id'] = (
    df_food_nutrition['food_id'].astype(int).astype(str).str.zfill(10)
)
df_food_nutrition = df_food_nutrition.set_index('food_id')
nutrition_cols, nutrition_idx = df_food_nutrition.columns, df_food_nutrition.index

# Standardise each column, then rescale the result with min-max scaling.
standardized = StandardScaler().fit_transform(df_food_nutrition)
rescaled = MinMaxScaler().fit_transform(standardized)
df_food_nutrition = pd.DataFrame(rescaled, columns=nutrition_cols, index=nutrition_idx)

# Left merge keeps every food; foods without nutrition rows get 0 for those columns.
food = food.merge(df_food_nutrition, left_index=True, right_index=True, how='left').fillna(0)

In [26]:
# Apply food_prompt_adding to every food row and store the result as 'food_prompt'.
# The helper also receives the nutrition column names from the third column onward
# (the [2:] slice skips the first two).
# NOTE(review): food_prompt_adding is not defined in this notebook — presumably from utils.
food['food_prompt'] = food.apply(food_prompt_adding, args=(df_food_nutrition.columns.tolist()[2:],), axis=1, result_type='expand')

In [27]:
# Attach prompts and features to food nodes, in node-index order.
food_ids = graph['food']['node_id'].tolist()
graph['food'].prompt = food['food_prompt'].loc[food_ids].values.tolist()

# Select feature rows without mutating `food`, so this cell can be re-run safely.
ordered_features = food.drop(columns=['food_prompt']).loc[food_ids]

# Embed every description and stack the vectors into one 2-D array first.
# Passing the Series straight to torch.tensor relies on positional integer lookup on a
# string-indexed Series, which pandas has deprecated.
desc_embeddings = np.stack(ordered_features['food_desc'].apply(get_bert_embedding).to_list())
ordered_features_tensor = torch.tensor(desc_embeddings, dtype=torch.float32)

# Remaining columns are the scaled nutrition values.
nutrition_tensor = torch.tensor(
    ordered_features.drop(columns=['food_desc']).values, dtype=torch.float32
)

# The PCA is fitted on food embeddings only; later cells reuse this fitted `pca`
# for the other node types when PCA_FLAG is set.
pca = PCA(n_components=100)
if PCA_FLAG:
    embedding_matrix = ordered_features_tensor.numpy()
    pca.fit(embedding_matrix)
    text_features = torch.tensor(pca.transform(embedding_matrix), dtype=torch.float32)
else:
    text_features = ordered_features_tensor

# Final food features: text embedding followed by nutrition values.
ordered_features_tensor = torch.cat((text_features, nutrition_tensor), dim=1)
graph['food'].x = ordered_features_tensor

#### Add features to ingredient nodes

In [28]:
# Attach prompts and text-embedding features to ingredient nodes, in node-index order.
graph['ingredient'].x = None
ingredient = food_ingredients_df[['ingredient_id', 'ingredient_desc']].drop_duplicates(subset=['ingredient_id'], keep='last')
ingredient = ingredient.set_index('ingredient_id')
ingredient['ingredient_prompt'] = ingredient.apply(ingredient_prompt_adding, axis=1, result_type='expand')

ingredient_ids = graph['ingredient']['node_id'].tolist()
graph['ingredient'].prompt = ingredient['ingredient_prompt'].loc[ingredient_ids].values.tolist()
ingredient = ingredient.drop(columns=['ingredient_prompt'])

# Stack the per-description vectors into one 2-D array before building the tensor.
# torch.tensor on the Series itself relies on deprecated positional integer lookup
# on a string-indexed Series.
ordered_features = ingredient.loc[ingredient_ids]
desc_embeddings = np.stack(ordered_features['ingredient_desc'].apply(get_bert_embedding).to_list())
ordered_features_tensor = torch.tensor(desc_embeddings, dtype=torch.float32)
if PCA_FLAG:
    # Reuses the `pca` object fitted on the food embeddings in the food-feature cell.
    reduced_embeddings = pca.transform(ordered_features_tensor.numpy())
    graph['ingredient'].x = torch.tensor(reduced_embeddings, dtype=torch.float32)
else:
    graph['ingredient'].x = ordered_features_tensor

#### Add features to category nodes

In [29]:
# Attach prompts and text-embedding features to category nodes, in node-index order.
graph['category'].x = None
category = food_ingredients_df[['WWEIA_id', 'WWEIA_desc']].drop_duplicates(subset=['WWEIA_id'], keep='last')
category = category.set_index('WWEIA_id')
category['category_prompt'] = category.apply(category_prompt_adding, axis=1, result_type='expand')

category_ids = graph['category']['node_id'].tolist()
graph['category'].prompt = category['category_prompt'].loc[category_ids].values.tolist()
category = category.drop(columns=['category_prompt'])

# Stack the per-description vectors into one 2-D array before building the tensor.
# torch.tensor on the Series itself relies on deprecated positional integer lookup
# on a string-indexed Series.
ordered_features = category.loc[category_ids]
desc_embeddings = np.stack(ordered_features['WWEIA_desc'].apply(get_bert_embedding).to_list())
ordered_features_tensor = torch.tensor(desc_embeddings, dtype=torch.float32)
if PCA_FLAG:
    # Reuses the `pca` object fitted on the food embeddings in the food-feature cell.
    reduced_embeddings = pca.transform(ordered_features_tensor.numpy())
    graph['category'].x = torch.tensor(reduced_embeddings, dtype=torch.float32)
else:
    graph['category'].x = ordered_features_tensor

#### Add features to habit nodes

In [30]:
# Attach prompts and text-embedding features to habit nodes, in node-index order.
graph['habit'].x = None
habit = user_habit_df[['habit_id', 'habit_desc']].drop_duplicates(subset=['habit_id'], keep='last')
habit = habit.set_index('habit_id')
habit['habit_prompt'] = habit.apply(habit_prompt_adding, axis=1, result_type='expand')

habit_ids = graph['habit']['node_id'].tolist()
graph['habit'].prompt = habit['habit_prompt'].loc[habit_ids].values.tolist()
habit = habit.drop(columns=['habit_prompt'])

# Stack the per-description vectors into one 2-D array before building the tensor.
# torch.tensor on the Series itself relies on deprecated positional integer lookup
# on a string-indexed Series.
ordered_features = habit.loc[habit_ids]
desc_embeddings = np.stack(ordered_features['habit_desc'].apply(get_bert_embedding).to_list())
ordered_features_tensor = torch.tensor(desc_embeddings, dtype=torch.float32)
if PCA_FLAG:
    # Reuses the `pca` object fitted on the food embeddings in the food-feature cell.
    reduced_embeddings = pca.transform(ordered_features_tensor.numpy())
    graph['habit'].x = torch.tensor(reduced_embeddings, dtype=torch.float32)
else:
    graph['habit'].x = ordered_features_tensor

#### Add features to medicine nodes

In [31]:
# Attach text-embedding features to medicine nodes, in node-index order.
# Medicine nodes get no prompt list in this notebook.
graph['medicine'].x = None
# Uses its own `medicine` name rather than overwriting the `category` table.
medicine = user_medicine_df[['RXDDRGID', 'RXDDRUG']].drop_duplicates(subset=['RXDDRGID'], keep='last')
medicine = medicine.set_index('RXDDRGID')

medicine_ids = graph['medicine']['node_id'].tolist()
ordered_features = medicine.loc[medicine_ids]

# Drug names are lower-cased before embedding. The vectors are stacked into one 2-D
# array first: torch.tensor on the Series itself relies on deprecated positional
# integer lookup on a string-indexed Series.
drug_embeddings = np.stack(
    ordered_features['RXDDRUG'].str.lower().apply(get_bert_embedding).to_list()
)
ordered_features_tensor = torch.tensor(drug_embeddings, dtype=torch.float32)
if PCA_FLAG:
    # Reuses the `pca` object fitted on the food embeddings in the food-feature cell.
    reduced_embeddings = pca.transform(ordered_features_tensor.numpy())
    graph['medicine'].x = torch.tensor(reduced_embeddings, dtype=torch.float32)
else:
    graph['medicine'].x = ordered_features_tensor

In [32]:
## We hope to build meta-paths but this graph is too big for it. We need sampling techniques to overcome it.
# Add a reversed copy of each listed relation so messages can flow in both directions.
# NOTE(review): ('user', 'takes', 'medicine') gets no reverse relation here — confirm
# whether that omission is intentional.
for (src_type, relation, dst_type), reverse_relation in (
    (('user', 'eats', 'food'), 'eaten'),
    (('food', 'contains', 'ingredient'), 'in'),
    (('food', 'belongs_to', 'category'), 'contains'),
    (('user', 'has', 'habit'), 'from'),
):
    forward_src, forward_dst = graph[src_type, relation, dst_type].edge_index
    # Swap the two rows: destinations become sources and vice versa.
    graph[dst_type, reverse_relation, src_type].edge_index = torch.stack(
        [forward_dst, forward_src], dim=0
    )

In [33]:
# Display the finished graph: per-type features/labels/prompts plus forward and reverse relations.
graph

HeteroData(
  user={
    node_id=[83352],
    x=[83352, 38],
    y=[83352],
    prompt=[83352],
  },
  food={
    node_id=[9640],
    prompt=[9640],
    x=[9640, 814],
  },
  ingredient={
    node_id=[3355],
    prompt=[3355],
    x=[3355, 768],
  },
  category={
    node_id=[174],
    prompt=[174],
    x=[174, 768],
  },
  habit={
    node_id=[54],
    prompt=[54],
    x=[54, 768],
  },
  medicine={
    node_id=[1264],
    x=[1264, 768],
  },
  (user, eats, food)={ edge_index=[2, 2322627] },
  (food, contains, ingredient)={ edge_index=[2, 32229] },
  (food, belongs_to, category)={ edge_index=[2, 32229] },
  (user, has, habit)={ edge_index=[2, 758227] },
  (user, takes, medicine)={ edge_index=[2, 117008] },
  (food, eaten, user)={ edge_index=[2, 2322627] },
  (ingredient, in, food)={ edge_index=[2, 32229] },
  (category, contains, food)={ edge_index=[2, 32229] },
  (habit, from, user)={ edge_index=[2, 758227] }
)

In [34]:
# Show the graph schema as (node types, edge types).
graph.metadata()

(['user', 'food', 'ingredient', 'category', 'habit', 'medicine'],
 [('user', 'eats', 'food'),
  ('food', 'contains', 'ingredient'),
  ('food', 'belongs_to', 'category'),
  ('user', 'has', 'habit'),
  ('user', 'takes', 'medicine'),
  ('food', 'eaten', 'user'),
  ('ingredient', 'in', 'food'),
  ('category', 'contains', 'food'),
  ('habit', 'from', 'user')])

In [35]:
# Persist the assembled graph to disk.
# NOTE(review): the file name is fixed ("768_raw") regardless of PCA_FLAG / LLM_FLAG, which can
# change the embedding width — confirm the name still matches when those flags are enabled.
torch.save(graph, '../processed_data/heterogeneous_graph_768_raw.pt')

In [None]:
from transformers import BertTokenizer, BertModel, LlamaTokenizer, LlamaModel