In [None]:
from torch_geometric.data import Data
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from utils import *

import warnings
warnings.filterwarnings('ignore')

## Constructing a homogeneous graph for GCN baseline

In [None]:
# Load the processed tables. Every column is read as text (dtype=str);
# ID and numeric columns are normalised or cast in the cells that follow.
def _read_text_csv(path):
    """Read a CSV file with every column kept as a string."""
    return pd.read_csv(path, dtype=str)


food_ingredients_df = _read_text_csv('../processed_data/food_ingredients.csv')
user_food_df = _read_text_csv('../processed_data/user_food.csv')
df_demo = _read_text_csv('../processed_data/main_table.csv')
df_diet = _read_text_csv('../processed_data/diet_table.csv')

In [None]:
# Left-pad each code column with zeros to a fixed width so that every ID has
# one canonical string form (category codes: 6 characters, ingredient and
# food codes: 8 characters).
for column, width in (('WWEIA_id', 6), ('ingredient_id', 8), ('food_id', 8)):
    food_ingredients_df[column] = food_ingredients_df[column].str.zfill(width)

In [None]:
# Collect the distinct IDs of every node type.
unique_food_ids, unique_ingredient_ids, unique_wweia_ids = (
    food_ingredients_df[column].unique()
    for column in ('food_id', 'ingredient_id', 'WWEIA_id')
)
unique_user_ids = user_food_df['SEQN'].unique()

# Node order: users first, then foods, ingredients and WWEIA categories.
all_ids = [*unique_user_ids, *unique_food_ids, *unique_ingredient_ids, *unique_wweia_ids]

In [None]:
# Not all users have detailed food records, and some ingredient IDs are also food IDs.
# Number of distinct WWEIA category IDs:
len(unique_wweia_ids)

174

In [None]:
# Build the node list in a fixed order: users, foods, ingredients, WWEIA categories.
#
# The same ID can occur in more than one of these lists (for example an
# ingredient ID that is also a food ID). If the repeats were kept, the dict
# below would remember only the LAST position of a repeated ID, and every
# earlier position would turn into a feature row that no edge can reference.
# dict.fromkeys removes the repeats while preserving first-occurrence order,
# so there is exactly one node per distinct ID and user nodes keep their
# leading positions.
all_ids = list(dict.fromkeys(
    list(unique_user_ids) + list(unique_food_ids) + list(unique_ingredient_ids) + list(unique_wweia_ids)
))

# Map each original ID to its position in the node list (its node index).
id_to_int = {original_id: i for i, original_id in enumerate(all_ids)}

In [None]:
# Number of distinct IDs across all node types; this is smaller than
# len(all_ids) whenever an ID appears in more than one of the per-type lists.
len(set(all_ids))

95604

In [None]:
def _edge_tensor(sources, targets):
    """Translate paired original IDs into a long tensor of node-index pairs.

    The [source, target] pair list is transposed, so row 0 of the result holds
    the source indices and row 1 the target indices.
    """
    index_pairs = [[id_to_int[src], id_to_int[dst]] for src, dst in zip(sources, targets)]
    return torch.tensor(index_pairs, dtype=torch.long).t().contiguous()


# Food -> ingredient edges, one per row of the food/ingredient table.
food_ingredient_edges = _edge_tensor(
    food_ingredients_df['food_id'], food_ingredients_df['ingredient_id']
)

# Food -> WWEIA category edges, one per row of the same table.
food_category_edges = _edge_tensor(
    food_ingredients_df['food_id'], food_ingredients_df['WWEIA_id']
)

# Keep only consumption records whose food code appears in the food table.
# NOTE(review): user_food_df['food_id'] is not zero-padded the way
# food_ingredients_df['food_id'] is - confirm the CSV already stores
# 8-character codes, otherwise matching rows are silently dropped here.
valid_food_codes = set(food_ingredients_df['food_id'].unique())
has_known_food = user_food_df['food_id'].isin(valid_food_codes)
filtered_user_food_df = user_food_df[has_known_food]

# User -> food edges, one per retained consumption record.
user_food_edges = _edge_tensor(
    filtered_user_food_df['SEQN'], filtered_user_food_df['food_id']
)

In [None]:
# Join the three edge sets along the edge dimension into a single index tensor.
# NOTE(review): every edge is stored in one direction only (food -> ingredient,
# food -> category, user -> food); confirm that the consumer of this graph adds
# the reverse edges if it expects an undirected graph.
edge_index = torch.cat(
    [food_ingredient_edges, food_category_edges, user_food_edges],
    dim=1,
)

In [None]:
# Tokenize the distinct, lower-cased descriptions of each text-described node type.
tokenized_food_desc = [word_tokenize(desc.lower()) for desc in food_ingredients_df['food_desc'].unique()]
tokenized_ingredient_desc = [word_tokenize(desc.lower()) for desc in food_ingredients_df['ingredient_desc'].unique()]
tokenized_wweia_desc = [word_tokenize(desc.lower()) for desc in food_ingredients_df['WWEIA_desc'].unique()]
# Combine all tokenized descriptions into one training corpus.
all_tokenized_desc = tokenized_food_desc + tokenized_ingredient_desc + tokenized_wweia_desc

# Train a Word2Vec model on the descriptions.
# min_count=1 keeps every token, so rare food words still get a vector.
# workers=1 together with an explicit seed makes training repeatable: with
# several worker threads the update order (and hence the embeddings) changes
# from run to run. Reproducibility across interpreter launches additionally
# requires a fixed PYTHONHASHSEED (see the gensim Word2Vec documentation).
model = Word2Vec(
    sentences=all_tokenized_desc,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=1,  # gensim's default seed, made explicit
)
model.save("word2vec.model")
# All node feature vectors are padded to the embedding width.
max_feature_size = model.vector_size

# Function to get the mean vector for a description
def get_mean_vector(word2vec_model, words):
    """Average the Word2Vec vectors of the in-vocabulary tokens of a description.

    Args:
        word2vec_model: Trained gensim ``Word2Vec`` model; only its ``wv``
            keyed vectors are used.
        words: Iterable of tokens.

    Returns:
        The element-wise mean of the vectors of all tokens found in the
        vocabulary, or an empty list when no token is known.
    """
    # key_to_index is a dict, so each membership test is O(1); testing against
    # the index_to_key list would scan the whole vocabulary for every token.
    vocabulary = word2vec_model.wv.key_to_index
    known_words = [word for word in words if word in vocabulary]
    if known_words:
        return np.mean(word2vec_model.wv[known_words], axis=0)
    return []

In [None]:
# Demographic columns that are replaced by one-hot indicator columns
# (the encoding itself is done by the onehot_encoding helper from utils).
categorical_columns = [
    'gender',
    'race',
    'household_income',
    'education',
    'age_group',
]
df_demo = onehot_encoding(df_demo, categorical_columns)

In [None]:
# The table was loaded as text, so cast every column to integer. The cast goes
# through float first - presumably because some values are written like "1.0",
# which int() cannot parse directly (TODO confirm against the CSV).
df_demo = df_demo.astype(float).astype(int)
# SEQN is an identifier, not a number: turn it back into a string key.
df_demo['SEQN'] = df_demo['SEQN'].astype(str)
# Remove the columns that are not used as node features.
df_demo = df_demo.drop(columns=['years', 'weight_interview', 'weight_mec'])
# Recode label value 2 as 0. A single .loc[rows, column] assignment writes into
# df_demo itself; the chained form df_demo['label'].loc[mask] = 0 assigns
# through an intermediate Series, which pandas warns about and which leaves
# df_demo unchanged when copy-on-write is enabled.
df_demo.loc[df_demo['label'] == 2, 'label'] = 0
df_demo = df_demo.set_index('SEQN')

In [None]:
# Normalise SEQN to the string form of its integer value, the same
# representation that is used for the SEQN key of df_demo.
seqn_as_int = df_diet['SEQN'].astype(float).astype(int)
df_diet['SEQN'] = seqn_as_int.astype(str)

In [None]:
# Index the diet table by SEQN so that it can be joined to df_demo on the index.
df_diet = df_diet.set_index('SEQN')

In [None]:
# Attach the diet columns to every demographic row, matching on the SEQN index.
# A left join keeps users without a diet row (their diet columns become NaN).
# NOTE(review): df_diet was loaded with dtype=str and only SEQN is converted
# before this merge, so the merged diet columns are still strings - confirm
# they are cast before being used as numeric features.
df_demo = df_demo.merge(
    df_diet,
    how='left',
    left_index=True,
    right_index=True,
)

In [None]:
# Inspect the merged demographic + diet table, indexed by SEQN.
df_demo

Unnamed: 0_level_0,label,gender_1,gender_2,race_1,race_2,race_3,race_4,race_5,household_income_-1,household_income_1,...,DRQSDT9,DRQSDT10,DRQSDT11,DRQSDT12,DRQSDT91,DR1TNUMF,DR1TWSZ,DR1_300,DRD340,DRD360
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21005,0,1,0,0,0,0,1,0,0,0,...,-1,-1,-1,-1,-1,17,-1,2,-1,-1
21006,0,0,1,0,0,0,1,0,0,0,...,-1,-1,-1,-1,-1,10,-1,3,2,1
21007,0,0,1,0,0,1,0,0,1,0,...,-1,-1,-1,-1,-1,10,-1,2,-1,-1
21008,0,1,0,0,0,0,1,0,0,0,...,-1,-1,-1,-1,-1,4,-1,2,-1,-1
21009,0,1,0,0,0,1,0,0,0,0,...,-1,-1,-1,-1,-1,22,-1,2,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
124818,0,1,0,0,0,0,1,0,1,0,...,-1,-1,-1,-1,-1,9,4,3,1,1
124819,0,1,0,0,0,0,1,0,1,0,...,-1,-1,-1,-1,-1,14,1,2,2,2
124820,0,0,1,0,0,1,0,0,1,0,...,-1,-1,-1,-1,-1,8,91,2,2,2
124821,0,1,0,0,0,0,1,0,1,0,...,-1,-1,-1,-1,-1,10,4,2,1,1


In [None]:
# Class balance of the label column (after label value 2 was recoded to 0).
df_demo['label'].value_counts()

label
0    93144
1     2728
Name: count, dtype: int64

In [None]:
# Lookup tables from node ID to its text description. When an ID occurs on
# several rows, the description of its last row is the one that is kept.
food_dict = {
    food_id: desc
    for food_id, desc in zip(food_ingredients_df['food_id'], food_ingredients_df['food_desc'])
}
ingredient_dict = {
    ingredient_id: desc
    for ingredient_id, desc in zip(food_ingredients_df['ingredient_id'], food_ingredients_df['ingredient_desc'])
}
wweia_dict = {
    wweia_id: desc
    for wweia_id, desc in zip(food_ingredients_df['WWEIA_id'], food_ingredients_df['WWEIA_desc'])
}

# User ID (the SEQN index of df_demo) -> that user's complete row as a plain list.
# NOTE(review): the row still contains the 'label' column, so the label ends up
# inside the user feature vector - confirm this is intended or stripped later.
user_dict = {}
for seqn, demo_row in df_demo.iterrows():
    user_dict[seqn] = demo_row.tolist()

In [None]:
# Build one feature vector per node, in the order of all_ids, so that row i of
# the feature matrix belongs to the node with index i.
#   * food / ingredient / category nodes: mean Word2Vec vector of the description
#   * user nodes: the user's df_demo row
#   * anything else: a zero vector
# Vectors shorter than max_feature_size are right-padded with zeros.
description_lookups = (food_dict, ingredient_dict, wweia_dict)  # checked in this priority order

node_features = []
for original_id in all_ids:
    for lookup in description_lookups:
        if original_id in lookup:
            tokens = word_tokenize(lookup[original_id].lower())
            feature = get_mean_vector(model, tokens)
            break
    else:
        if original_id in user_dict:
            # Coerce to float explicitly: the row can mix integers with values
            # that were loaded as text, and a string array cannot become a tensor.
            feature = np.asarray(user_dict[original_id], dtype=float)
        else:
            feature = np.zeros(model.vector_size)  # fallback for unknown IDs

    # A vector longer than the common width would make the feature list ragged
    # and fail later with an unrelated-looking error, so report it right here.
    if len(feature) > max_feature_size:
        raise ValueError(
            f"Feature vector of node {original_id!r} has {len(feature)} values, "
            f"more than max_feature_size={max_feature_size}"
        )
    if len(feature) < max_feature_size:
        feature = np.pad(feature, (0, max_feature_size - len(feature)), 'constant', constant_values=0)
    node_features.append(feature)

In [None]:
# Stack the per-node vectors into a single array, then convert it to a
# float tensor (torch.float) for use as the node-feature matrix.
feature_matrix = np.array(node_features)
node_features = torch.tensor(feature_matrix, dtype=torch.float)

In [None]:
# Only 2413 records have dietary records, so they are included in the graph.

In [None]:
# Create the PyG data object: x holds one feature row per node (in all_ids
# order) and edge_index holds the concatenated node-index pairs built above.
data = Data(x=node_features, edge_index=edge_index)

In [None]:
# Show the graph summary (shapes of the node-feature matrix and the edge index).
data

Data(x=[96244, 100], edge_index=[2, 2172501])

In [None]:
# Persist the graph object next to the other processed data files.
torch.save(data, "../processed_data/simple_graph.pt")

In [None]:
# Number of distinct users (SEQN values) found in user_food_df; these IDs are
# placed first when the node list all_ids is assembled.
len(unique_user_ids)

83352