# In this notebook we prepare the data for training. We create 3 DataFrames, each holding the path to the dish image, an embedding of the dish's ingredients, and the dish calories. We use 3 different embedding styles: one-hot encoding, index embedding, and Word2Vec.

In [None]:
import pandas as pd
import numpy as np
import os
import shutil
from google.colab import drive
from sklearn.preprocessing import MultiLabelBinarizer
from gensim.models import Word2Vec
import pickle
# Mount Google Drive at /content/drive; the image directory and the output
# directory used further down both live under this mount point.
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
!gsutil -m cp -r "gs://nutrition5k_dataset/nutrition5k_dataset/metadata" .

Copying gs://nutrition5k_dataset/nutrition5k_dataset/metadata/ingredients_metadata.csv...
/ [0/3 files][    0.0 B/  2.2 MiB]   0% Done                                    Copying gs://nutrition5k_dataset/nutrition5k_dataset/metadata/dish_metadata_cafe1.csv...
Copying gs://nutrition5k_dataset/nutrition5k_dataset/metadata/dish_metadata_cafe2.csv...
/ [3/3 files][  2.2 MiB/  2.2 MiB] 100% Done                                    
Operation completed over 3 objects/2.2 MiB.                                      


In [None]:

def generate_df_with_id_calories(df):
    """Label the raw dish-metadata columns and collapse ingredient names into one string.

    The input frame is addressed purely by column position: 6 dish-level
    columns followed by zero or more 7-column ingredient blocks
    (ingredient_id, ingredient_name, amount, calories, fat, carb, protein).

    Only `dish_id`, `dish_calories` and the `ingredient_name_*` columns are
    used for the result; the remaining labels are positional placeholders.
    NOTE(review): the order of the dish-level macro-nutrient labels has not
    been checked against the dataset documentation - confirm before using them.

    Args:
        df: DataFrame loaded from the headerless dish-metadata CSV.
            It is NOT modified; the function works on a copy so the cell
            can be re-run on the same frame.

    Returns:
        A new DataFrame with the columns `dish_id`, `ingredients`
        (ingredient names joined with ", ", missing names skipped) and
        `dish_calories`.

    Raises:
        ValueError: if the number of columns is not 6 + 7 * k for some k >= 0.
    """
    # Dish-level columns present at the start of every row.
    base_columns = ['dish_id', 'dish_calories', 'serving_size', 'protein', 'fat', 'carbs']
    # Per-ingredient fields, repeated once per ingredient slot.
    ingredient_fields = ['ingredient_id', 'ingredient_name', 'amount', 'calories', 'fat', 'carb', 'protein']
    ingredient_block_size = len(ingredient_fields)

    extra_cols = df.shape[1] - len(base_columns)
    if extra_cols < 0 or extra_cols % ingredient_block_size != 0:
        raise ValueError(
            f"Expected {len(base_columns)} base columns plus a multiple of "
            f"{ingredient_block_size} ingredient columns, got {df.shape[1]} columns."
        )
    num_ingredients = extra_cols // ingredient_block_size

    ingredient_columns = [
        f'{field}_{i}'
        for i in range(1, num_ingredients + 1)
        for field in ingredient_fields
    ]

    # Work on a copy: relabelling the caller's frame in place would change its
    # width (extra 'ingredients' column) and break a second call.
    labeled = df.copy()
    labeled.columns = base_columns + ingredient_columns

    ingredient_name_cols = [f'ingredient_name_{i}' for i in range(1, num_ingredients + 1)]
    name_rows = labeled[ingredient_name_cols].to_numpy()
    # Unused ingredient slots are missing values; skip them when joining.
    labeled['ingredients'] = [
        ', '.join(str(name) for name in row if pd.notna(name))
        for row in name_rows
    ]

    # Keep only the columns needed downstream.
    return labeled[['dish_id', 'ingredients', 'dish_calories']]

def generate_df_with_paths_calories(df,path_to_images):
    """Keep only the dishes that have an image and attach each image's path.

    Args:
        df: DataFrame with a `dish_id` column (other columns are carried over).
        path_to_images: directory containing the dish images. Files ending in
            .jpg, .png or .jpeg are considered; the dish id is the file stem
            with "_rgb" removed.

    Returns:
        A new DataFrame (fresh 0..n-1 index) restricted to dishes whose id
        matches an image file, with a `path` column pointing at that file and
        the `dish_id` column removed.
    """
    image_extensions = (".jpg", ".png", ".jpeg")
    # Sorted so the result does not depend on the directory listing order.
    image_names = sorted(f for f in os.listdir(path_to_images) if f.endswith(image_extensions))

    # Map each dish id to the file that actually exists on disk, so the path
    # uses the given directory and the file's real name/extension instead of
    # assuming a global directory and a fixed "_rgb.png" suffix.
    id_to_path = {
        os.path.splitext(f)[0].replace("_rgb", ""): os.path.join(path_to_images, f)
        for f in image_names
    }
    print('the amount of images is:' , len(image_names))

    matched = df[df['dish_id'].isin(list(id_to_path))]
    print("Filtered DataFrame shape:", matched.shape)

    df_with_paths = matched.reset_index(drop=True)
    df_with_paths['path'] = df_with_paths['dish_id'].map(id_to_path)
    return df_with_paths.drop(columns=['dish_id'])

In [None]:
# Load the headerless cafe1 dish metadata.
# NOTE(review): on_bad_lines='skip' silently drops rows pandas considers
# malformed (e.g. rows with more fields than expected) - confirm that no
# dishes are lost this way.
df = pd.read_csv('/content/metadata/dish_metadata_cafe1.csv', on_bad_lines='skip',header=None)
df_with_id = generate_df_with_id_calories(df)

# Keep only the dishes that have an RGB image on Drive and attach its path.
image_dir = "/content/drive/MyDrive/Deep Final Work/Data/Nutrition5k_RGB"
df_with_paths_calories = generate_df_with_paths_calories(df_with_id,image_dir)

# Turn the ", "-joined ingredient string back into a list of names.
# Empty pieces are dropped so a dish without ingredients becomes [] rather
# than [''], which would otherwise add a phantom empty-string ingredient to
# every vocabulary built below.
df_with_paths_calories["ingredients"] = df_with_paths_calories["ingredients"].apply(
    lambda x: [name for name in x.split(", ") if name] if isinstance(x, str) else x
)

the amount of images is: 2895
Filtered DataFrame shape: (2895, 3)


## Prepare one-hot encoding

In [None]:
# Binarize the ingredient lists: one indicator column per distinct ingredient.
mlb = MultiLabelBinarizer()
one_hot_df = df_with_paths_calories.copy()
ingredients_dummies = mlb.fit_transform(one_hot_df["ingredients"])

# Wrap the indicator matrix in a frame that shares the dishes' row index,
# then swap it in for the original list-valued "ingredients" column.
ingredients_df = pd.DataFrame(
    data=ingredients_dummies,
    index=one_hot_df.index,
    columns=mlb.classes_,
)
non_ingredient_part = one_hot_df.drop(columns=["ingredients"])
one_hot_df = pd.concat([non_ingredient_part, ingredients_df], axis="columns")

print("Final DataFrame shape:", one_hot_df.shape)
print(one_hot_df.head())

Final DataFrame shape: (2895, 200)
   dish_calories                                               path  almonds  \
0     300.794281  /content/drive/MyDrive/Deep Final Work/Data/Nu...        0   
1     419.438782  /content/drive/MyDrive/Deep Final Work/Data/Nu...        0   
2     382.936646  /content/drive/MyDrive/Deep Final Work/Data/Nu...        0   
3      20.590000  /content/drive/MyDrive/Deep Final Work/Data/Nu...        0   
4      74.360001  /content/drive/MyDrive/Deep Final Work/Data/Nu...        0   

   apple  artichokes  arugula  asparagus  avocado  baby carrots  bacon  ...  \
0      1           0        0          0        0             0      0  ...   
1      1           0        0          0        0             0      0  ...   
2      0           0        0          0        0             0      0  ...   
3      0           0        0          0        0             1      0  ...   
4      0           0        0          0        0             0      0  ...   

   wheat 

## Prepare index embedding

In [None]:
def build_ingredient_vocab(df, ingredient_col='ingredients'):
    """
    Assign an integer id to every distinct ingredient.

    Each entry of df[ingredient_col] is expected to be an iterable of
    ingredient names (a list). Ids are handed out in sorted-name order and
    start at 1, so 0 never appears as an ingredient id.

    Returns a dict mapping ingredient name -> id.
    """
    unique_ingredients = set()
    for ingredient_list in df[ingredient_col]:
        unique_ingredients.update(ingredient_list)

    vocab = {}
    for position, name in enumerate(sorted(unique_ingredients), start=1):
        vocab[name] = position
    return vocab

# Start from a private copy so the other encodings are unaffected.
index_df = df_with_paths_calories.copy()

# Vocabulary: ingredient name -> integer id.
ingredient_to_index = build_ingredient_vocab(index_df, ingredient_col='ingredients')
print("Number of unique ingredients:", len(ingredient_to_index))

# Replace every ingredient name by its id, dish by dish.
index_df['ingredient_indices'] = index_df['ingredients'].map(
    lambda names: [ingredient_to_index[name] for name in names]
)

# Longest ingredient list; used below as the common padded length.
max_length = index_df['ingredient_indices'].map(len).max()
print("Maximum number of ingredients in a dish:", max_length)

def pad_sequence(seq, max_length, padding_value=0):
    """Return a new list: `seq` followed by `padding_value` up to `max_length` items.

    A sequence that is already `max_length` or longer is returned unpadded
    (it is not truncated). The input list is not modified.
    """
    missing = max_length - len(seq)
    filler = [padding_value] * missing  # empty when missing <= 0
    return seq + filler

# Pad every id list to the common length, store the result as
# 'ingredients_embedding', and drop the name list and the unpadded ids.
padded_indices = index_df['ingredient_indices'].map(
    lambda seq: pad_sequence(seq, max_length)
)
index_df = (
    index_df
    .assign(ingredients_embedding=padded_indices)
    .drop(columns=['ingredients', "ingredient_indices"])
)


Number of unique ingredients: 198
Maximum number of ingredients in a dish: 17


## Prepare Word2Vec encoding

In [None]:
# Start from a private copy so the other encodings are unaffected.
w2v_df = df_with_paths_calories.copy()
# Each dish's ingredient list is treated as one "sentence".
corpus = w2v_df['ingredients'].tolist()

# Train a Word2Vec model on the ingredient lists.
embedding_dim = 50
# Reproducibility: fix the seed explicitly and train with a single worker.
# With several worker threads the update order depends on OS scheduling, so
# the embeddings (and the CSVs saved from them) differ from run to run.
# The corpus is only a few thousand short lists, so one worker is fast enough.
# NOTE(review): the gensim docs also mention PYTHONHASHSEED for reproducibility
# across interpreter launches - confirm whether that matters for the installed version.
w2v_model = Word2Vec(
    sentences=corpus,
    vector_size=embedding_dim,
    window=3,
    min_count=1,   # keep every ingredient, even those seen once
    workers=1,
    seed=42,
)

# Function to compute the average embedding for a list of ingredients.
def compute_avg_embedding(ingredients, w2v_model):
    """Average the Word2Vec vectors of a dish's ingredients.

    Args:
        ingredients: iterable of ingredient names.
        w2v_model: object exposing `wv` (supports `name in wv` and `wv[name]`)
            and `vector_size`.

    Returns:
        A 1-D array of length `w2v_model.vector_size`: the element-wise mean
        of the ingredient vectors. Ingredients missing from the vocabulary
        contribute a zero vector to the mean; an empty ingredient list yields
        an all-zero vector.
    """
    vectors = [
        w2v_model.wv[name] if name in w2v_model.wv else np.zeros(w2v_model.vector_size)
        for name in ingredients
    ]
    if not vectors:
        # Nothing to average.
        return np.zeros(w2v_model.vector_size)
    return np.mean(vectors, axis=0)

# One averaged embedding vector per dish, then preview the first rows.
w2v_df['ingredient_embedding'] = w2v_df['ingredients'].apply(
    compute_avg_embedding, args=(w2v_model,)
)
w2v_df.head()

Unnamed: 0,ingredients,dish_calories,path,ingredient_embedding
0,"[soy sauce, garlic, white rice, parsley, onion...",300.794281,/content/drive/MyDrive/Deep Final Work/Data/Nu...,"[-0.03913052, -0.057547938, -0.0937923, 0.0345..."
1,"[pepper, white rice, mixed greens, garlic, soy...",419.438782,/content/drive/MyDrive/Deep Final Work/Data/Nu...,"[-0.039588775, -0.05838116, -0.09531968, 0.032..."
2,"[jalapenos, lemon juice, pork, wheat berry, ca...",382.936646,/content/drive/MyDrive/Deep Final Work/Data/Nu...,"[-0.03427486, -0.05676774, -0.09941951, 0.0276..."
3,"[cherry tomatoes, cucumbers, baby carrots]",20.59,/content/drive/MyDrive/Deep Final Work/Data/Nu...,"[-0.044774458, -0.0354941, -0.058514077, 0.005..."
4,[deprecated],74.360001,/content/drive/MyDrive/Deep Final Work/Data/Nu...,"[-0.0091026435, -0.02940884, -0.03603014, 0.02..."


## Save the training and test DataFrames

In [None]:
import os
import pickle
import numpy as np
from sklearn.model_selection import train_test_split

n_samples = len(one_hot_df)

# The split below is positional, so all three encodings must describe the
# same dishes in the same row order.
assert len(index_df) == n_samples and len(w2v_df) == n_samples, \
    "one_hot_df, index_df and w2v_df must have the same number of rows"

# Draw ONE train/test split over row positions and reuse it for every
# encoding, so the three train sets (and test sets) contain the same dishes.
indices = np.arange(n_samples)
train_indices, test_indices = train_test_split(indices, test_size=0.05, random_state=42)

one_hot_train_df = one_hot_df.iloc[train_indices].reset_index(drop=True)
one_hot_test_df  = one_hot_df.iloc[test_indices].reset_index(drop=True)

index_train_df = index_df.iloc[train_indices].reset_index(drop=True)
index_test_df  = index_df.iloc[test_indices].reset_index(drop=True)

w2v_train_df   = w2v_df.iloc[train_indices].reset_index(drop=True)
w2v_test_df    = w2v_df.iloc[test_indices].reset_index(drop=True)

# Directory to save the CSV files (adjust if needed).
save_dir = "/content/drive/MyDrive/Deep Final Work/Training dfs/"
# Create it on first use; to_csv does not create missing directories.
os.makedirs(save_dir, exist_ok=True)

# File stem -> frame, in the order they are written and reported.
saved_frames = {
    "one_hot_train_df": one_hot_train_df,
    "one_hot_test_df": one_hot_test_df,
    "index_train_df": index_train_df,
    "index_test_df": index_test_df,
    "w2v_train_df": w2v_train_df,
    "w2v_test_df": w2v_test_df,
}

# Save the DataFrames as CSV files.
# NOTE(review): list/array-valued cells (the padded index lists, the Word2Vec
# vectors, the ingredient lists) are written as their string representation;
# whoever reads these CSVs has to parse them back.
for frame_name, frame in saved_frames.items():
    frame.to_csv(os.path.join(save_dir, f"{frame_name}.csv"), index=False)

for frame in saved_frames.values():
    print(frame.shape)

# Save the ingredient_to_index mapping as a pickle file,
# replacing any copy left over from a previous run.
pkl_path = os.path.join(save_dir, "ingredient_to_index.pkl")
if os.path.exists(pkl_path):
    os.remove(pkl_path)
with open(pkl_path, "wb") as f:
    pickle.dump(ingredient_to_index, f)

print("DataFrames saved:")
for frame_name in saved_frames:
    print(f" - {frame_name}.csv")
print(" - ingredient_to_index.pkl")

(2750, 200)
(145, 200)
(2750, 3)
(145, 3)
(2750, 4)
(145, 4)
DataFrames saved:
 - one_hot_train_df.csv
 - one_hot_test_df.csv
 - index_train_df.csv
 - index_test_df.csv
 - w2v_train_df.csv
 - w2v_test_df.csv
 - ingredient_to_index.pkl


In [None]:
# Function to verify that a given column has the same values across multiple DataFrames.
def verify_same_paths(dfs, col="path"):
    """Return True when column `col` is identical in every DataFrame of `dfs`.

    The first frame's column is the reference; each remaining frame's column
    is compared to it with `Series.equals`. A single-frame list is trivially
    True. `dfs` must contain at least one frame.
    """
    reference = dfs[0][col]
    return all(reference.equals(other[col]) for other in dfs[1:])

# Sanity check: the three encodings must list the same image paths, in the
# same order, within the train split and within the test split.
train_dfs = [one_hot_train_df, index_train_df, w2v_train_df]
test_dfs  = [one_hot_test_df, index_test_df, w2v_test_df]

train_paths_same, test_paths_same = (
    verify_same_paths(group, col="path") for group in (train_dfs, test_dfs)
)

print("Train DataFrames have identical image paths:", train_paths_same)
print("Test DataFrames have identical image paths:", test_paths_same)


Train DataFrames have identical image paths: True
Test DataFrames have identical image paths: True


In [None]:
# Preview the first rows of the (unsplit) Word2Vec frame.
w2v_df.head()

Unnamed: 0,ingredients,dish_calories,path,ingredient_embedding
0,"[soy sauce, garlic, white rice, parsley, onion...",300.794281,/content/drive/MyDrive/Deep Final Work/Data/Nu...,"[-0.03913052, -0.057547938, -0.0937923, 0.0345..."
1,"[pepper, white rice, mixed greens, garlic, soy...",419.438782,/content/drive/MyDrive/Deep Final Work/Data/Nu...,"[-0.039588775, -0.05838116, -0.09531968, 0.032..."
2,"[jalapenos, lemon juice, pork, wheat berry, ca...",382.936646,/content/drive/MyDrive/Deep Final Work/Data/Nu...,"[-0.03427486, -0.05676774, -0.09941951, 0.0276..."
3,"[cherry tomatoes, cucumbers, baby carrots]",20.59,/content/drive/MyDrive/Deep Final Work/Data/Nu...,"[-0.044774458, -0.0354941, -0.058514077, 0.005..."
4,[deprecated],74.360001,/content/drive/MyDrive/Deep Final Work/Data/Nu...,"[-0.0091026435, -0.02940884, -0.03603014, 0.02..."
