<a href="https://colab.research.google.com/github/IamKishoreSreedharan/food-lens/blob/main/TME_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# # !pip install mlflow databricks
# !databricks configure --host https://community.cloud.databricks.com/

In [1]:
import os
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
from pathlib import Path
from tqdm import tqdm
import hashlib

In [2]:
DIR = '/content/drive/MyDrive/project/'

In [None]:

# Create organized subdirectories
os.makedirs(os.path.join(DIR, "images"), exist_ok=True)          # For downloaded images
os.makedirs(os.path.join(DIR, "data"), exist_ok=True)            # For CSVs
os.makedirs(os.path.join(DIR, "models/classifier"), exist_ok=True)          # For autoencoder weights
os.makedirs(os.path.join(DIR, "models/cbir"), exist_ok=True)
os.makedirs(os.path.join(DIR, "latent_features"), exist_ok=True) # For feature vectors
SRC_FILENAME = 'data/recipe.csv'

In [None]:
df = pd.read_csv(DIR + SRC_FILENAME, low_memory=False)

In [None]:
essential_cols = [
    "recipe_id", "title", "ingredients", "directions",
    "prep_time", "cook_time", "total_time", "servings",
    "image", "category",'instructions_list','calories', 'carbohydrates_g', 'sugars_g', 'fat_g',
    'saturated_fat_g', 'cholesterol_mg', 'protein_g', 'dietary_fiber_g', 'sodium_mg',  'url', "rating"
]

In [None]:
nutrition_cols = [
     'calories_from_fat',
    'calcium_mg', 'iron_mg', 'magnesium_mg', 'potassium_mg',
    'zinc_mg', 'phosphorus_mg', 'vitamin_a_iu_IU',
    'niacin_equivalents_mg', 'vitamin_b6_mg', 'vitamin_c_mg',
    'folate_mcg', 'thiamin_mg', 'riboflavin_mg',
    'vitamin_e_iu_IU', 'vitamin_k_mcg', 'biotin_mcg',
    'vitamin_b12_mcg', 'mono_fat_g', 'poly_fat_g',
    'trans_fatty_acid_g', 'omega_3_fatty_acid_g',
    'omega_6_fatty_acid_g',
    'author', 'description', 'yields', 'rating_count',
    'review_count'
]

df_clean = df.drop(columns=nutrition_cols)

In [None]:
# df_clean.isna().sum()

In [None]:
import hashlib

# Create a unique ID for each recipe
df_clean["recipe_id"] = df_clean["image"].astype(str).apply(
    lambda x: hashlib.md5(x.encode()).hexdigest()[:10]
)
df_clean = df_clean[df_clean['recipe_id'] != 'a3d2de7675']

In [None]:
nutritional_cols = [
    'calories', 'carbohydrates_g', 'sugars_g', 'fat_g',
    'saturated_fat_g', 'cholesterol_mg', 'protein_g',
    'dietary_fiber_g', 'sodium_mg'
]

# Calculate mean per category
category_means = df_clean.groupby('category')[nutritional_cols].mean()

# Impute missing values
for col in nutritional_cols:
    df_clean[col] = df_clean.apply(
        lambda row: category_means.loc[row['category'], col] if pd.isna(row[col]) else row[col],
        axis=1
    )

# Verify no nulls remain in core metrics
core_cols = ['calories', 'fat_g', 'saturated_fat_g', 'cholesterol_mg', 'sugars_g']
print("Null counts after imputation:")
print(df_clean[core_cols].isna().sum())

Null counts after imputation:
calories           0
fat_g              0
saturated_fat_g    0
cholesterol_mg     0
sugars_g           0
dtype: int64


In [None]:
# Classification function
def classify_health(row):
    # Check for nulls in core metrics
    core = ['calories', 'fat_g', 'saturated_fat_g', 'cholesterol_mg', 'sugars_g']
    if any(pd.isna(row[col]) for col in core):
        return 'Unknown'

    # Core thresholds
    healthy_conditions = [
        row['calories'] <= 200,
        row['fat_g'] <= 10,
        row['saturated_fat_g'] <= 3,
        row['cholesterol_mg'] <= 30,
        row['sugars_g'] <= 10
    ]
    unhealthy_conditions = [
        row['calories'] > 400,
        row['fat_g'] > 15,
        row['saturated_fat_g'] > 6,
        row['cholesterol_mg'] > 60,
        row['sugars_g'] > 25
    ]

    # Optional boosts
    protein_boost = row['protein_g'] >= 10 if not pd.isna(row['protein_g']) else False
    fiber_boost = row['dietary_fiber_g'] >= 5 if not pd.isna(row['dietary_fiber_g']) else False
    sodium_flag = row['sodium_mg'] > 600 if not pd.isna(row['sodium_mg']) else False

    # Count conditions
    healthy_count = sum(healthy_conditions)
    unhealthy_count = sum(unhealthy_conditions)

    # Classification
    if healthy_count >= 4 or (healthy_count >= 3 and (protein_boost or fiber_boost)):
        return 'Healthy'
    elif unhealthy_count >= 2 or (unhealthy_count >= 1 and sodium_flag):
        return 'Unhealthy'
    return 'Moderate'

In [None]:
# Scoring function
def classify_diet(row):
    cals = row['calories'] or 0
    carbs = row['carbohydrates_g'] or 0
    sugars = row['sugars_g'] or 0
    fat = row['fat_g'] or 0
    prot = row['protein_g'] or 0
    fiber = row['dietary_fiber_g'] or 0
    sodium = row['sodium_mg'] or 0

    scores = {
        'HCLF': 0, 'HPLC': 0, 'Balanced': 0, 'LCHF': 0, 'LCHFib': 0, 'Junk': 0
    }

    # Carbs
    if carbs >= 40: scores['HCLF'] += 2; scores['HPLC'] -= 2; scores['LCHF'] -= 2
    elif carbs <= 15: scores['HPLC'] += 2; scores['LCHF'] += 2; scores['HCLF'] -= 2
    elif 20 <= carbs <= 40: scores['Balanced'] += 2

    # Fat
    if fat <= 10: scores['HCLF'] += 2; scores['LCHFib'] += 2; scores['Balanced'] += 1; scores['LCHF'] -= 2
    elif fat >= 15: scores['LCHF'] += 2; scores['Junk'] += 1; scores['HCLF'] -= 2; scores['LCHFib'] -= 2
    elif 5 <= fat <= 15: scores['Balanced'] += 1

    # Protein
    if prot >= 20: scores['HPLC'] += 2; scores['HCLF'] -= 1
    elif 10 <= prot <= 20: scores['Balanced'] += 1; scores['LCHF'] += 1

    # Calories
    if cals <= 200: scores['LCHFib'] += 2; scores['Junk'] -= 2
    elif cals > 300: scores['Junk'] += 2; scores['LCHFib'] -= 2

    # Sugars
    if sugars >= 25: scores['Junk'] += 2; scores['HCLF'] -= 1; scores['HPLC'] -= 1; scores['Balanced'] -= 1; scores['LCHF'] -= 1; scores['LCHFib'] -= 1
    elif sugars <= 10: scores['LCHFib'] += 1; scores['Junk'] -= 1

    # Fiber
    if fiber >= 5: scores['HCLF'] += 1; scores['Balanced'] += 1; scores['LCHFib'] += 2; scores['Junk'] -= 1

    # Sodium
    if sodium > 600: scores['Balanced'] -= 1; scores['Junk'] += 1

    # Classify
    max_score = max(scores.values())
    if max_score <= 0: return 'Other'
    for diet, score in scores.items():
        if score == max_score: return diet
    return 'Other'

In [None]:
df_clean['health_level'] = df_clean.apply(classify_health, axis=1)
df_clean['diet'] = df_clean.apply(classify_diet, axis=1)
# df.to_csv('/MyDrive/NewDataset/recipes_classified.csv', index=False)
# print(df['HealthClassification'].value_counts())

In [None]:
df_clean['diet'].value_counts()

Unnamed: 0_level_0,count
diet,Unnamed: 1_level_1
LCHFib,7597
Junk,7081
Balanced,3502
HPLC,2867
LCHF,2442
HCLF,1926


In [None]:
df_clean.to_csv(os.path.join(DIR, "data/recipes_classified.csv"), index=False)

In [None]:
df = df_clean.copy()

In [None]:
df.head()

Unnamed: 0,title,url,category,rating,ingredients,directions,prep_time,cook_time,total_time,servings,...,saturated_fat_g,cholesterol_mg,protein_g,dietary_fiber_g,sodium_mg,instructions_list,image,recipe_id,health_level,diet
0,Simple Macaroni and Cheese,https://www.allrecipes.com/recipe/238691/simpl...,main-dish,4.42,1 (8 ounce) box elbow macaroni ; ¼ cup butter ...,Bring a large pot of lightly salted water to a...,10 mins,20 mins,30 mins,4,...,20.9,99.6,26.5,2.1,777.0,['Bring a large pot of lightly salted water to...,https://www.allrecipes.com/thmb/GZrTl8DBwmRuor...,148ecaf409,Unhealthy,Junk
1,Gourmet Mushroom Risotto,https://www.allrecipes.com/recipe/85389/gourme...,main-dish,4.8,"6 cups chicken broth, divided ; 3 tablespoons ...","In a saucepan, warm the broth over low heat. W...",20 mins,30 mins,50 mins,6,...,6.6,29.3,11.3,2.7,1130.8,"['Warm broth in a saucepan over low heat.', 'M...",https://www.allrecipes.com/thmb/xCk4IEjfAYBikO...,65fcb51062,Unhealthy,Junk
2,Dessert Crepes,https://www.allrecipes.com/recipe/19037/desser...,breakfast-and-brunch,4.8,"4 eggs, lightly beaten ; 1 ⅓ cups milk ; 2 ta...","In large bowl, whisk together eggs, milk, melt...",10 mins,10 mins,20 mins,8,...,3.4,111.1,6.4,0.4,234.5,"['Whisk together eggs, milk, flour, melted but...",https://www.allrecipes.com/thmb/VwULr05JFDluPI...,1191ef7390,Moderate,LCHFib
3,Pork Steaks,https://www.allrecipes.com/recipe/70463/pork-s...,meat-and-poultry,4.57,¼ cup butter ; ¼ cup soy sauce ; 1 bunch green...,"Melt butter in a skillet, and mix in the soy s...",15 mins,30 mins,45 mins,6,...,11.4,118.0,26.5,1.1,719.7,['Melt butter in a skillet over medium heat; s...,https://www.allrecipes.com/thmb/mYkvln7o9pb35l...,d6b54b7cce,Unhealthy,HPLC
4,Quick and Easy Pizza Crust,https://www.allrecipes.com/recipe/20171/quick-...,bread,4.7,1 (.25 ounce) package active dry yeast ; 1 tea...,Preheat oven to 450 degrees F (230 degrees C)....,,,,8,...,0.6,34.474707,4.8,1.1,292.8,['Preheat oven to 450 degrees F (230 degrees C...,https://www.allrecipes.com/thmb/V3Llo-ottudIs_...,6bddaec59a,Healthy,LCHFib


In [None]:
IMAGE_SIZE = (512, 512)

In [None]:
remove = []

In [None]:
# Download images with progress tracking
def download_save_image(row):
    try:
        img_path = Path(DIR + 'images/' + f"{row['recipe_id']}.jpg")
        # print(img_path)

        if img_path.exists():  # Skip already downloaded
            return True

        response = requests.get(row['image'], timeout=15)
        img = Image.open(BytesIO(response.content))
        img = img.convert('RGB').resize(IMAGE_SIZE)
        img.save(img_path, 'JPEG', quality=90)
        return True
    except Exception as e:
        print(f"Failed {row['recipe_id']}: {str(e)}")
        remove.append(row['recipe_id'])
        return False

In [None]:
import requests
from io import BytesIO
from PIL import Image
import pandas as pd
from pathlib import Path
import concurrent.futures
from tqdm import tqdm

# Constants
DIR = "/path/to/dataset/"
IMAGE_SIZE = (256, 256)  # Target size
df = pd.read_csv(DIR + 'data/final.csv')
remove = []  # List to store failed downloads
IMAGE_DIR = Path(DIR) / "images"
IMAGE_DIR.mkdir(parents=True, exist_ok=True)  # Ensure directory exists

# Function to download and save an image
def download_save_image(row):
    try:
        img_path = IMAGE_DIR / f"{row['recipe_id']}.jpg"
        if img_path.exists():  # Skip if already downloaded
            return row['recipe_id'], True

        response = requests.get(row['image'], timeout=15)
        if response.status_code == 200:
            img = Image.open(BytesIO(response.content))
            img = img.convert('RGB').resize(IMAGE_SIZE)
            img.save(img_path, 'JPEG', quality=90)
            return row['recipe_id'], True
        else:
            raise Exception(f"HTTP {response.status_code}")
    except Exception as e:
        return row['recipe_id'], False  # Return failed recipe ID

# Run concurrent downloading
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
    futures = {executor.submit(download_save_image, row): row['recipe_id'] for _, row in df.iterrows()}

    # Track progress
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Downloading Images"):
        recipe_id, success = future.result()
        if not success:
            remove.append(recipe_id)

# Save failed download list
if remove:
    pd.DataFrame(remove, columns=['recipe_id']).to_csv(DIR + "failed_downloads.csv", index=False)
    print(f"Failed downloads saved: {len(remove)} images")


In [None]:
# 7bc84360b0, 8a0a4ffe66, 15d879b56e, 6b1a599fcf, 25351ef3d6, 4090db5c8c, 8376f5af72, d637804a15, 59d80de8ad, a1e64da808, 4090db5c8c

In [None]:
# Batch processing with status tracking
success_mask = []
for _, row in tqdm(df.iterrows(), total=len(df)):
    success_mask.append(download_save_image(row))

  1%|          | 296/25415 [00:14<11:15, 37.17it/s]

Failed 7bc84360b0: cannot identify image file <_io.BytesIO object at 0x7d8a935e7b00>


 10%|█         | 2637/25415 [00:15<00:19, 1195.53it/s]

Failed 8a0a4ffe66: cannot identify image file <_io.BytesIO object at 0x7d8a935e74c0>


 20%|█▉        | 5032/25415 [05:35<1:04:10,  5.29it/s]

Failed 6b1a599fcf: cannot identify image file <_io.BytesIO object at 0x7d8a93706e30>


 33%|███▎      | 8330/25415 [19:15<1:11:08,  4.00it/s]

Failed 25351ef3d6: cannot identify image file <_io.BytesIO object at 0x7d8a93704ef0>


 41%|████      | 10402/25415 [27:32<1:04:52,  3.86it/s]

Failed 4090db5c8c: Invalid URL 'NaNname': No scheme supplied. Perhaps you meant https://NaNname?


 60%|██████    | 15302/25415 [47:29<38:42,  4.35it/s]

Failed d637804a15: cannot identify image file <_io.BytesIO object at 0x7d8a93706980>


 69%|██████▉   | 17575/25415 [56:43<27:09,  4.81it/s]

Failed 59d80de8ad: cannot identify image file <_io.BytesIO object at 0x7d8a935e5da0>


 70%|███████   | 17818/25415 [57:35<28:13,  4.49it/s]

Failed a1e64da808: cannot identify image file <_io.BytesIO object at 0x7d8a93707ab0>


 75%|███████▌  | 19171/25415 [1:02:36<13:59,  7.44it/s]

Failed 4090db5c8c: Invalid URL 'NaNname': No scheme supplied. Perhaps you meant https://NaNname?


100%|██████████| 25415/25415 [1:29:51<00:00,  4.71it/s]


In [None]:
print(remove)

['7bc84360b0', '8a0a4ffe66', '6b1a599fcf', '25351ef3d6', '4090db5c8c', 'd637804a15', '59d80de8ad', 'a1e64da808', '4090db5c8c']


In [None]:
# Create clean dataset
df_clean_downloaded = df[success_mask].reset_index(drop=True)
df_clean_downloaded = df_clean_downloaded[~df_clean_downloaded['recipe_id'].isin(remove)]
print(len(df_clean_downloaded))
df_clean_downloaded.to_csv(DIR + '/data/final.csv', index=False)

25406


In [None]:
print(f"Successfully downloaded {len(df_clean_downloaded)}/{len(df)} images")

Successfully downloaded 25406/25415 images


# Data Augmentation

In [None]:
os.makedirs(os.path.join(DIR, "images_aug"), exist_ok=True)

In [None]:
import pandas as pd
from pathlib import Path
from PIL import Image
import torchvision.transforms as transforms
from tqdm import tqdm
import numpy as np

In [None]:
# Image augmentation pipeline
augment = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2),
    transforms.ToTensor(),
    transforms.ToPILImage()
])

In [None]:
import pandas as pd
from pathlib import Path
from PIL import Image
import torchvision.transforms as transforms
from tqdm import tqdm
import numpy as np
import concurrent.futures

IMAGE_DIR = Path(DIR) / 'images/'
IMG_OUT_DIR = Path(DIR) / 'images_aug/'
# Image augmentation pipeline
augment = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(15),
    transforms.ColorJitter(brightness=0.2),
    transforms.ToTensor(),
    transforms.ToPILImage()
])

# Target size per class (match LCHFib)
TARGET_SIZE = 4000
classes = ['LCHFib', 'Junk', 'Balanced', 'HPLC', 'LCHF', 'HCLF']
augmented_data = []


def augment_image(row, i):
    """Function to augment a single image."""
    orig_img_path = IMAGE_DIR / f"{row['recipe_id']}.jpg"
    new_recipe_id = f"{row['recipe_id']}_aug{i}"
    new_img_path = IMG_OUT_DIR / f"{new_recipe_id}.jpg"

    try:
        if new_img_path.exists():
            # If augmented image already exists, reuse it
            new_row = row.copy()
            new_row['recipe_id'] = new_recipe_id
            new_row['image'] = f"{new_recipe_id}.jpg"
            return pd.DataFrame([new_row])

        if orig_img_path.exists():
            # Load and augment original image
            img = Image.open(orig_img_path).convert('RGB')
            aug_img = augment(img)
            aug_img.save(new_img_path)

            new_row = row.copy()
            new_row['recipe_id'] = new_recipe_id
            new_row['image'] = f"{new_recipe_id}.jpg"
            return pd.DataFrame([new_row])
        else:
            print(f"Original image not found: {orig_img_path}")
            return None
    except Exception as e:
        print(f"Error processing {orig_img_path}: {e}")
        return None


# Process each class with outer tqdm
for diet in classes:
    class_df = df[df['diet'] == diet]
    current_count = len(class_df)

    if current_count >= TARGET_SIZE:
        # No augmentation needed, keep as is
        augmented_data.append(class_df)
    else:
        # Add original rows
        augmented_data.append(class_df)
        extra_needed = TARGET_SIZE - current_count

        # Parallel processing for image augmentation
        with concurrent.futures.ProcessPoolExecutor(max_workers=8) as executor:  # Adjust workers based on CPU cores
            futures = [executor.submit(augment_image, class_df.sample(1, random_state=i).iloc[0], i)
                       for i in range(extra_needed)]

            for future in tqdm(concurrent.futures.as_completed(futures), total=extra_needed, desc=f"Augmenting {diet}"):
                result = future.result()
                if result is not None:
                    augmented_data.append(result)

    # Verify class size
    class_size = sum(len(df[df['diet'] == diet]) for df in augmented_data if diet in df['diet'].values)
    print(f"{diet}: {class_size} (Target: {TARGET_SIZE})")

LCHFib: 7597 (Target: 4000)
Junk: 7081 (Target: 4000)


Augmenting Balanced: 100%|██████████| 498/498 [00:16<00:00, 29.56it/s]


Balanced: 4000 (Target: 4000)


Augmenting HPLC: 100%|██████████| 1133/1133 [00:42<00:00, 26.57it/s]


HPLC: 4000 (Target: 4000)


Augmenting LCHF: 100%|██████████| 1558/1558 [01:12<00:00, 21.47it/s]


LCHF: 4000 (Target: 4000)


Augmenting HCLF: 100%|██████████| 2074/2074 [01:11<00:00, 29.11it/s]


HCLF: 4000 (Target: 4000)


In [None]:
# Combine and save
augmented_df = pd.concat(augmented_data).reset_index(drop=True)
augmented_df.to_csv(DIR + 'data/recipes_augmented_balanced.csv', index=False)

print("Final distribution:")
print(augmented_df['diet'].value_counts())

Final distribution:
diet
LCHFib      7597
Junk        7081
Balanced    4000
HPLC        4000
LCHF        4000
HCLF        4000
Name: count, dtype: int64


In [None]:
len(augmented_df)

30678

# Model

In [3]:
import torch
import torch.nn as nn

In [4]:


# class CBIRCAutoEncoder(nn.Module):
#     def __init__(self, latent_dim=128, num_classes=6):
#         super(CBIRCAutoEncoder, self).__init__()
#         # Encoder: 256x256 -> 128-D
#         self.encoder = nn.Sequential(
#             nn.Conv2d(3, 32, 4, stride=2, padding=1),  # [3, 256, 256] -> [32, 128, 128]
#             nn.ReLU(),
#             nn.Conv2d(32, 64, 4, stride=2, padding=1),  # [64, 64, 64]
#             nn.ReLU(),
#             nn.Conv2d(64, 128, 4, stride=2, padding=1),  # [128, 32, 32]
#             nn.ReLU(),
#             nn.Conv2d(128, 256, 4, stride=2, padding=1),  # [256, 16, 16]
#             nn.ReLU(),
#             nn.Flatten(),
#             nn.Linear(256 * 16 * 16, latent_dim)  # [128]
#         )
#         # Decoder: 128-D -> 256x256 (for training only)
#         self.decoder = nn.Sequential(
#             nn.Linear(latent_dim, 256 * 16 * 16),
#             nn.ReLU(),
#             nn.Unflatten(1, (256, 16, 16)),
#             nn.ConvTranspose2d(256, 128, 4, stride=2, padding=1),
#             nn.ReLU(),
#             nn.ConvTranspose2d(128, 64, 4, stride=2, padding=1),
#             nn.ReLU(),
#             nn.ConvTranspose2d(64, 32, 4, stride=2, padding=1),
#             nn.ReLU(),
#             nn.ConvTranspose2d(32, 3, 4, stride=2, padding=1),
#             nn.Sigmoid()
#         )
#         # Classifier: 128-D -> 6 classes
#         self.classifier = nn.Sequential(
#             nn.Linear(latent_dim, 64),
#             nn.ReLU(),
#             nn.Linear(64, num_classes)
#         )

#     def forward(self, x):
#         latent = self.encoder(x)
#         recon = self.decoder(latent)
#         class_logits = self.classifier(latent)
#         return recon, class_logits

#     def get_latent(self, x):
#         return self.encoder(x)  # For CBIR

#     def classify(self, x):
#         latent = self.encoder(x)
#         return self.classifier(latent)  # For diet_level

In [5]:
# Find if any accelerator is presented, if yes switch device to use CUDA or else use CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)

cuda:0


In [6]:
class CBIRCAutoEncoder(nn.Module):
    def __init__(self, latent_dim=128, num_classes=6):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 64, 3, stride=1, padding=1),  # [64, 256, 256]
            nn.ReLU(True),
            nn.Conv2d(64, 64, 3, stride=1, padding=1),
            nn.ReLU(True),
            nn.MaxPool2d(2, 2),  # [64, 128, 128]
            nn.Conv2d(64, 128, 3, stride=2, padding=1),  # [128, 64, 64]
            nn.ReLU(True),
            nn.Conv2d(128, 128, 3, stride=1),  # [128, 62, 62]
            nn.ReLU(True),
            nn.MaxPool2d(2, 2),  # [128, 31, 31]
            nn.Conv2d(128, 256, 3, stride=2, padding=1),  # [256, 16, 16]
            nn.ReLU(True),
            nn.Conv2d(256, 256, 3, stride=1, padding=1),
            nn.ReLU(True),
            nn.Conv2d(256, 256, 3, stride=1, padding=1),
            nn.ReLU(True),
        )
        self.fc_latent = nn.Linear(256 * 16 * 16, latent_dim)  # [65536, 128]
        self.fc_to_decoder = nn.Linear(latent_dim, 256 * 16 * 16)  # [128, 65536]
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(256, 256, 3, stride=1, padding=1),  # [256, 16, 16]
            nn.ReLU(True),
            nn.ConvTranspose2d(256, 256, 3, stride=1, padding=1),
            nn.ReLU(True),
            nn.ConvTranspose2d(256, 128, 3, stride=2),  # [128, 33, 33]
            nn.ReLU(True),
            nn.ConvTranspose2d(128, 64, 3, stride=2, padding=1),  # [64, 66, 66]
            nn.ReLU(True),
            nn.ConvTranspose2d(64, 32, 3, stride=2, padding=1),  # [32, 132, 132]
            nn.ReLU(True),
            nn.ConvTranspose2d(32, 32, 3, stride=2, padding=1),  # [32, 264, 264]
            nn.ReLU(True),
            nn.ConvTranspose2d(32, 3, 3, stride=2, padding=0, output_padding=1),  # [3, 529, 529]
            nn.Sigmoid()
        )
        self.classifier = nn.Sequential(
            nn.Linear(latent_dim, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes)
        )

    def forward(self, x):
        x = self.encoder(x)
        x = x.view(x.size(0), -1)  # [64, 65536]
        latent = self.fc_latent(x)  # [64, 128]
        decoder_input = self.fc_to_decoder(latent).view(-1, 256, 16, 16)
        recon = self.decoder(decoder_input)  # [64, 3, 529, 529]
        recon = recon[:, :, :256, :256]  # Crop to [64, 3, 256, 256]
        logits = self.classifier(latent)  # [64, 6]
        return recon, logits


In [9]:
# os.makedirs("images")

In [None]:
# !cp -r "/content/drive/MyDrive/project/images" "/content/images"

In [7]:
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms as T
from PIL import Image
from pathlib import Path
import torch
import pandas as pd

class RecipeDataset(Dataset):
    def __init__(self, df, image_dir, transform=None):
        self.df = df
        self.image_dir = Path(image_dir)  # Faster with Pathlib
        self.transform = transform
        self.label_map = {lbl: idx for idx, lbl in enumerate(sorted(df['diet'].unique()))}

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        if "_aug" in row['recipe_id']:
          img_path = self.image_dir / "images_aug" / f"{row['recipe_id']}.jpg"
        else:
          img_path = self.image_dir / f"{row['recipe_id']}.jpg"

        # Handle missing files to avoid crashes
        if not img_path.exists():
            print(f"Warning: Missing image {img_path}")
            return torch.zeros(3, 256, 256), 0  # Return empty tensor if missing

        img = Image.open(img_path).convert('RGB')
        if self.transform:
            img = self.transform(img)

        label = self.label_map[row['diet']]
        return img, label


In [8]:
from torch.utils.data import random_split


# Load data
df = pd.read_csv("/content/drive/MyDrive/project/data/recipes_augmented_balanced.csv")

# Define transformations including normalization
transform = T.Compose([
    T.Resize((256, 256)),  # Resize to 256x256
    T.ToTensor(),  # Convert image to tensor
    T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))  # Normalize with mean and std
])

dataset = RecipeDataset(df, "/content/drive/MyDrive/project/images", transform)
# Split dataset into 80% training and 20% testing
train_size = int(0.8 * len(dataset))  # 80% for training
test_size = len(dataset) - train_size  # 20% for testing

# Use random_split to create training and testing datasets
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

# Create DataLoader for training and testing sets
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=5, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=5, pin_memory=True)



In [None]:
import torch
import torch.nn as nn
from pathlib import Path
from tqdm import tqdm

# Setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CBIRCAutoEncoder(latent_dim=128, num_classes=6).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
recon_loss_fn = nn.MSELoss()
class_loss_fn = nn.CrossEntropyLoss()

In [None]:
print(model)

CBIRCAutoEncoder(
  (encoder): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
  )
  (fc_latent): Linear(in_features=65536, out_features=128, bias=Tr

In [None]:
EPOCH = 2
MODELS_DIR = Path(DIR) / 'models'
MODELS_DIR.mkdir(parents=True, exist_ok=True)

best_loss = float('inf')

# Initialize a list to store loss history for plotting
loss_history = []

for epoch in range(EPOCH):
    epoch_loss = 0.0
    recon_loss_epoch = 0.0
    class_loss_epoch = 0.0

    with tqdm(train_loader, desc=f"Epoch {epoch+1}") as pbar:
        for images, labels in pbar:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()

            # Forward pass
            recon, logits = model(images)

            # Compute losses
            recon_loss = recon_loss_fn(recon, images)
            class_loss = class_loss_fn(logits, labels)

            # Combine losses
            loss = recon_loss + 0.5 * class_loss

            # Backpropagation and optimization
            loss.backward()
            optimizer.step()

            # Accumulate losses
            epoch_loss += loss.item() * images.size(0)
            recon_loss_epoch += recon_loss.item() * images.size(0)
            class_loss_epoch += class_loss.item() * images.size(0)

            # Update progress bar with detailed loss information
            pbar.set_postfix(
                loss=f"{loss.item():.4f}",
                recon_loss=f"{recon_loss.item():.4f}",
                class_loss=f"{class_loss.item():.4f}"
            )

    # Average losses for the epoch
    epoch_loss /= len(train_loader.dataset)
    recon_loss_epoch /= len(train_loader.dataset)
    class_loss_epoch /= len(train_loader.dataset)

    # Store losses in history for plotting
    loss_history.append({
        'epoch': epoch + 1,
        'total_loss': epoch_loss,
        'recon_loss': recon_loss_epoch,
        'class_loss': class_loss_epoch
    })

    # Print epoch-wise loss
    print(f"Epoch {epoch+1}, Total Loss: {epoch_loss:.4f}, Recon Loss: {recon_loss_epoch:.4f}, Class Loss: {class_loss_epoch:.4f}")

    # Save best model based on total loss
    if epoch_loss < best_loss:
        best_loss = epoch_loss
        save_path = MODELS_DIR / f"{epoch+1}_{epoch_loss:.4f}.pth"
        torch.save(model.state_dict(), save_path)
        print(f"Saved best model to {save_path}")

# Save the final model
final_path = MODELS_DIR / 'final.pth'
torch.save(model.state_dict(), final_path)
print(f"Saved final model to {final_path}")

Epoch 1:   7%|▋         | 25/384 [05:31<34:55,  5.84s/it, class_loss=1.7333, loss=1.1964, recon_loss=0.3297]



Epoch 1:  12%|█▏        | 45/384 [08:50<33:47,  5.98s/it, class_loss=1.7508, loss=1.2118, recon_loss=0.3364]



Epoch 1:  17%|█▋        | 65/384 [11:44<31:19,  5.89s/it, class_loss=1.6607, loss=1.1590, recon_loss=0.3287]



Epoch 1:  17%|█▋        | 65/384 [12:00<31:19,  5.89s/it, class_loss=1.6607, loss=1.1590, recon_loss=0.3287]



Epoch 1:  48%|████▊     | 185/384 [30:30<19:47,  5.97s/it, class_loss=1.6683, loss=1.1366, recon_loss=0.3025]



Epoch 1:  69%|██████▉   | 265/384 [42:40<11:29,  5.79s/it, class_loss=1.8009, loss=1.4138, recon_loss=0.5134]



Epoch 1:  84%|████████▍ | 324/384 [51:37<07:17,  7.28s/it, class_loss=1.7538, loss=1.1927, recon_loss=0.3158]



Epoch 1:  85%|████████▌ | 328/384 [52:17<07:19,  7.84s/it, class_loss=1.7015, loss=1.1628, recon_loss=0.3121]



Epoch 1:  93%|█████████▎| 359/384 [56:58<03:01,  7.27s/it, class_loss=1.8115, loss=1.2446, recon_loss=0.3388]



Epoch 1: 100%|██████████| 384/384 [1:00:49<00:00,  9.50s/it, class_loss=1.7454, loss=1.2140, recon_loss=0.3413]


Epoch 1, Total Loss: 1.2124, Recon Loss: 0.3376, Class Loss: 1.7496
Saved best model to /content/drive/MyDrive/project/models/1_1.2124.pth


Epoch 2:  12%|█▏        | 45/384 [00:07<00:44,  7.63it/s, class_loss=1.7659, loss=1.2063, recon_loss=0.3233]



Epoch 2:  14%|█▍        | 55/384 [00:09<00:42,  7.80it/s, class_loss=1.7489, loss=1.2084, recon_loss=0.3340]



Epoch 2:  17%|█▋        | 66/384 [00:11<01:00,  5.28it/s, class_loss=1.7292, loss=1.2097, recon_loss=0.3451]



Epoch 2:  27%|██▋       | 102/384 [00:17<00:46,  6.05it/s, class_loss=1.8161, loss=1.2354, recon_loss=0.3274]



Epoch 2:  30%|██▉       | 114/384 [00:18<00:39,  6.88it/s, class_loss=1.7027, loss=1.1818, recon_loss=0.3305]



Epoch 2:  30%|███       | 117/384 [00:19<00:49,  5.42it/s, class_loss=1.7822, loss=1.2196, recon_loss=0.3285]



Epoch 2:  32%|███▏      | 121/384 [00:19<00:39,  6.58it/s, class_loss=1.6896, loss=1.1582, recon_loss=0.3134]



Epoch 2:  34%|███▎      | 129/384 [00:21<00:38,  6.65it/s, class_loss=1.8275, loss=1.2290, recon_loss=0.3153]



Epoch 2:  62%|██████▏   | 237/384 [00:38<00:26,  5.64it/s, class_loss=1.7261, loss=1.2003, recon_loss=0.3372]



Epoch 2: 100%|██████████| 384/384 [01:00<00:00,  6.33it/s, class_loss=1.7154, loss=1.1470, recon_loss=0.2893]


Epoch 2, Total Loss: 1.2049, Recon Loss: 0.3312, Class Loss: 1.7476
Saved best model to /content/drive/MyDrive/project/models/2_1.2049.pth
Saved final model to /content/drive/MyDrive/project/models/final.pth


In [None]:
import os

folder_path = DIR + "images/"
file_count = len(os.listdir(folder_path))
print(f"Total files and folders: {file_count}")


Total files and folders: 24750


In [None]:
from pathlib import Path
import pandas as pd

# Define paths
folder_path = Path(DIR) / "images"
csv_path = Path(DIR) / "data/final.csv"

# Load recipe IDs from DataFrame and clean them
df = pd.read_csv(csv_path, dtype={"recipe_id": str})  # Ensure recipe_id is string
df["recipe_id"] = df["recipe_id"].str.strip()  # Remove spaces
valid_ids = set(df["recipe_id"])

# Get all image filenames (without extensions)
image_filenames = {img.stem.strip() for img in folder_path.glob("*.jpg")}

# Find missing images
missing_images = valid_ids - image_filenames
extra_images = image_filenames - valid_ids

print(f"Total missing images: {len(missing_images)}")
print(f"Sample missing images: {list(missing_images)[:5]}")
print(f"Total extra images: {len(extra_images)}")
print(f"Sample extra images: {list(extra_images)[:5]}")


Total missing images: 10
Sample missing images: ['7bc84360b0', '8376f5af72', 'd637804a15', '4090db5c8c', '6b1a599fcf']
Total extra images: 0
Sample extra images: []


In [None]:
len(df)

25415

In [None]:
# {'$schema': 'http://json-schema.org/schema#',
#  'type': 'array',
#  'items': {'type': 'object',
#   'properties': {'id': {'type': 'string'},
#    'images': {'type': 'array',
#     'items': {'type': 'object',
#      'properties': {'id': {'type': 'string'}, 'url': {'type': 'string'}},
#      'required': ['id', 'url']}}},
#   'required': ['id', 'images']}}

In [None]:
# {'$schema': 'http://json-schema.org/schema#',
#  'type': 'array',
#  'items': {'type': 'object',
#   'properties': {'ingredients': {'type': 'array',
#     'items': {'type': 'object',
#      'properties': {'text': {'type': 'string'}},
#      'required': ['text']}},
#    'url': {'type': 'string'},
#    'partition': {'type': 'string'},
#    'title': {'type': 'string'},
#    'id': {'type': 'string'},
#    'instructions': {'type': 'array',
#     'items': {'type': 'object',
#      'properties': {'text': {'type': 'string'}},
#      'required': ['text']}}},
#   'required': ['id',
#    'ingredients',
#    'instructions',
#    'partition',
#    'title',
#    'url']}}

In [9]:
import torch

# Define the device (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the model (ensure it matches the architecture used during training)
model = CBIRCAutoEncoder()  # Replace with your actual model class
model.load_state_dict(torch.load(DIR + "models/final.pth", map_location=device))  # Load weights
model.to(device)
model.eval()  # Set model to evaluation mode


CBIRCAutoEncoder(
  (encoder): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
  )
  (fc_latent): Linear(in_features=65536, out_features=128, bias=Tr

In [11]:
# Assuming test_loader is defined (e.g., batch_size=64, same as train_loader)
# Verify test_loader setup
images, labels = next(iter(test_loader))
print(f"Test batch shape: {images.shape}, Labels shape: {labels.shape}")  # [64, 3, 256, 256], [64]

Test batch shape: torch.Size([64, 3, 256, 256]), Labels shape: torch.Size([64])


In [None]:
# Inference
all_preds = []
all_labels = []
correct = 0
total = 0

with torch.no_grad():  # No gradients needed for inference
    for images, labels in tqdm(test_loader, desc="Inference"):
        images, labels = images.to(device), labels.to(device)
        _, logits = model(images)  # Ignore recon, we want logits
        preds = torch.argmax(logits, dim=1)  # Predicted class indices

        # Accumulate for accuracy
        correct += (preds == labels).sum().item()
        total += labels.size(0)

        # Store for detailed analysis
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

# Calculate accuracy
accuracy = correct / total * 100
print(f"Test Accuracy: {accuracy:.2f}% ({correct}/{total})")

Inference:  16%|█▌        | 15/96 [01:41<10:04,  7.46s/it]



Inference:  74%|███████▍  | 71/96 [10:56<03:27,  8.28s/it]

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Class names (adjust if your labels differ)
class_names = ['LCHFib', 'Junk', 'Balanced', 'HPLC', 'LCHF', 'HCLF']

# Convert to numpy arrays
all_preds = np.array(all_preds)
all_labels = np.array(all_labels)

# Classification report
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names=class_names))

# Confusion matrix
conf_matrix = confusion_matrix(all_labels, all_preds)
print("\nConfusion Matrix:")
print(conf_matrix)

# Optional: Visualize confusion matrix
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()