In [54]:
import os
import pandas as pd
# Root directory of the reconstructed dataset, relative to this notebook.
# (A stale earlier assignment of '../data/...' was removed: it was always
# overwritten by this one before being used.)
dataset_path = '../../data/nutrition5k_reconstructed/'

# Locations of the images, the label table and the train/test id lists.
image_path = os.path.join(dataset_path, 'images')
label_path = os.path.join(dataset_path, 'labels/labels.csv')
train_path = os.path.join(dataset_path, 'metadata/train_ids.csv')
test_path = os.path.join(dataset_path, 'metadata/test_ids.csv')

# Training ids: the file is read as one entry per line.
with open(train_path, 'r') as f:
    train_files = f.read().splitlines()
print('Number of training files:', len(train_files))

# Testing ids: same one-entry-per-line format.
with open(test_path, 'r') as f:
    test_files = f.read().splitlines()
print('Number of testing files:', len(test_files))

# Label table (an `id` column followed by the numeric targets).
labels = pd.read_csv(label_path)
print('Number of labels:', len(labels))
print('\n------------------------------------\n')

# Displaying the first 5 rows of the labels
print(labels.head())

# Keep an untouched copy of the raw labels; later cells transform `labels`
# and need the original values alongside the transformed ones.
original_labels = labels.copy()


Number of training files: 2755
Number of testing files: 507
Number of labels: 3262

------------------------------------

                id    calories   mass        fat       carb    protein
0  dish_1566851198  463.047455  342.0  21.119635  42.903687  25.897209
1  dish_1565119439  465.412231  189.0  19.563423  42.858475  27.351355
2  dish_1561753432  380.321686  238.0  19.463066  16.882177  33.369026
3  dish_1561662054  419.438782  292.0  23.838249  26.351543  25.910593
4  dish_1563205982  316.239502  469.0   4.363000  54.792000  16.148998


In [55]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns currently in `labels`. The bare `stats` on the last line makes the
# table the cell's displayed output.
stats = labels.describe()
stats

Unnamed: 0,calories,mass,fat,carb,protein
count,3262.0,3262.0,3262.0,3262.0,3262.0
mean,255.012738,214.980074,12.727633,19.386111,18.004492
std,219.63757,161.497428,13.526409,21.605225,20.044535
min,0.0,1.0,0.0,0.0,0.0
25%,80.114996,92.0,1.845926,6.1715,2.128136
50%,209.110062,177.0,8.845456,15.36,11.488166
75%,375.122963,305.0,19.03252,27.532451,28.034391
max,3943.325195,3051.0,106.343002,844.568604,147.491821


In [56]:
import numpy as np
import pandas as pd

# Columns holding the numeric targets (every column of the label table except `id`).
target_columns = ['calories', 'mass', 'fat', 'carb', 'protein']

# Always start from the untouched raw copy so that re-running this cell does
# not apply the log / min-max transforms a second time on already-normalized
# values (which would also overwrite min_max_values.csv with wrong bounds).
labels = original_labels.copy()

# Log transformation. log1p(x) = log(1 + x), so zero-valued targets map to 0
# instead of -inf.
labels[target_columns] = np.log1p(labels[target_columns])

print('Log Transformation:')
print(labels.head(3))
print(labels.describe())

# Normalizing outputs to the range [0, 1] using min-max normalization
# (computed on the log-transformed values, per column).
min_values = labels[target_columns].min()
max_values = labels[target_columns].max()
labels[target_columns] = (labels[target_columns] - min_values) / (max_values - min_values)

# Saving the per-column minimum and maximum (in log1p space) so predictions
# can be mapped back: x = expm1(y * (max - min) + min).
with open('min_max_values.csv', 'w') as f:
    f.write("category,min,max\n")
    for column in target_columns:
        f.write(f"{column},{min_values[column]},{max_values[column]}\n")

print('\n------------------------------------\n')
print('Normalized labels:')
print(labels.head(3))

# Adding full image name column...
labels['img_indx'] = labels["id"] + ".jpeg"

# Add the original (untransformed) values next to the normalized ones,
# as original_calories, original_mass, original_fat, original_carb, original_protein.
for column in target_columns:
    labels[f'original_{column}'] = original_labels[column]


Log Transformation:
                id  calories      mass       fat      carb   protein
0  dish_1566851198  6.139987  5.837730  3.096466  3.781998  3.292023
1  dish_1565119439  6.145070  5.247024  3.023514  3.780968  3.344675
2  dish_1561753432  5.943643  5.476464  3.018622  2.883805  3.537156
          calories         mass          fat         carb      protein
count  3262.000000  3262.000000  3262.000000  3262.000000  3262.000000
mean      5.086636     5.080756     2.045539     2.633352     2.309239
std       1.112333     0.836975     1.192373     0.948255     1.231602
min       0.000000     0.693147     0.000000     0.000000     0.000000
25%       4.395868     4.532599     1.045888     1.970112     1.140437
50%       5.347631     5.181784     2.287010     2.794839     2.524780
75%       5.929916     5.723585     2.997357     3.351042     3.368481
max       8.280033     8.023552     4.676029     6.740009     5.000530

------------------------------------

Normalized labels:
       

In [57]:
# Summary statistics of the numeric columns currently in `labels`. When run
# after the normalization cell this covers both the normalized targets and
# the original_* columns that cell adds. The bare `stats` on the last line
# makes the table the cell's displayed output.
stats = labels.describe()
stats

Unnamed: 0,calories,mass,fat,carb,protein,original_calories,original_mass,original_fat,original_carb,original_protein
count,3262.0,3262.0,3262.0,3262.0,3262.0,3262.0,3262.0,3262.0,3262.0,3262.0
mean,0.614326,0.598549,0.437452,0.390705,0.461799,255.012738,214.980074,12.727633,19.386111,18.004492
std,0.134339,0.114179,0.254997,0.14069,0.246294,219.63757,161.497428,13.526409,21.605225,20.044535
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,0.5309,0.523771,0.22367,0.292301,0.228063,80.114996,92.0,1.845926,6.1715,2.128136
50%,0.645847,0.612331,0.489092,0.414664,0.504903,209.110062,177.0,8.845456,15.36,11.488166
75%,0.716171,0.686243,0.641005,0.497187,0.673625,375.122963,305.0,19.03252,27.532451,28.034391
max,1.0,1.0,1.0,1.0,1.0,3943.325195,3051.0,106.343002,844.568604,147.491821


In [58]:
# Splitting the data into training, validation and testing
from sklearn.model_selection import train_test_split
import shutil

# Create Training Dataframe: label rows whose id is listed in the training ids.
train_labels = labels[labels['id'].isin(train_files)]
print('Number of training labels:', len(train_labels))

# Split the held-out ids into testing (60%) and validation (40%).
# random_state is fixed so the split is reproducible across runs.
test_ids, val_ids = train_test_split(test_files, test_size=0.4, random_state=42)

# Convert the id lists into label dataframes.
val_labels = labels[labels['id'].isin(val_ids)]
test_labels = labels[labels['id'].isin(test_ids)]

# Report the number of label rows actually selected (consistent with the
# training count above), not merely the number of ids in each list.
print('Number of validation labels:', len(val_labels))
print('Number of testing labels:', len(test_labels))

# Saving the training, validation and testing labels.
# NOTE: the output directory is wiped first so stale files from a previous
# run never survive; anything else stored under ./data is deleted too.
base_dir = './data'
if os.path.exists(base_dir):
    shutil.rmtree(base_dir)
os.makedirs(base_dir, exist_ok=True)

train_labels.to_csv(os.path.join(base_dir, 'train_labels.csv'), index=False)
val_labels.to_csv(os.path.join(base_dir, 'val_labels.csv'), index=False)
test_labels.to_csv(os.path.join(base_dir, 'test_labels.csv'), index=False)

Number of training labels: 2755
Number of validation labels: 203
Number of testing labels: 304


In [59]:
# Special task: predicting from the image together with the dish mass.
# Re-read the label table from disk and keep every column except the
# nutrient targets, then store the result next to the split label files.
# NOTE(review): this rebinds `labels` to the freshly read table, replacing
# the transformed frame built in earlier cells.
labels = pd.read_csv(label_path)
nutrient_columns = ['calories', 'fat', 'carb', 'protein']
mass_inputs = labels.drop(columns=nutrient_columns)
mass_inputs_file = os.path.join(base_dir, 'mass_inputs.csv')
mass_inputs.to_csv(mass_inputs_file, index=False)