In [13]:
# data preprocessing
import os
import shutil

# Locations of the raw Nutrition5k data, relative to the notebook.
nutrition5k_path = '../data/nutrition5k'
nutrition5k_overhead_path = os.path.join(nutrition5k_path, 'realsense_overhead')
nutrition5k_side_path = os.path.join(nutrition5k_path, 'side_angles')
metadata_path = os.path.join(nutrition5k_path, 'metadata/dish_metadata_cafe1.csv')
ingredients_path = os.path.join(nutrition5k_path, 'metadata/ingredients_metadata.csv')

"""
realsense_overhead
    - image_index
        - depth_color.png
        - depth_raw.png
        - rgb.png
        
        
side_angles
    - image_index
        - frames_sampled30
            - camera_{}_frame_{}.png
            
metadata
    - dish_metadata_cafe1.csv
        - dish_id, dish_name, calories, mass, fat, carb, protein, ingredients content
    - dish_metadata_cafe2.csv
    - ingredients_metadata.csv
"""

# The dish id is the first comma-separated field of every metadata row.
# The file is opened in a `with` block so the handle is closed deterministically,
# and blank rows are skipped so the empty string never counts as an id.
with open(metadata_path) as metadata_file:
    metadata_ids = {
        row.strip().split(',')[0]
        for row in metadata_file
        if row.strip()
    }

print('Number of unique meta ids: ', len(metadata_ids))

# Each entry directly under the overhead / side-angle folders is taken as a dish id.
overhead_ids = set(os.listdir(nutrition5k_overhead_path))
print('Number of unique overhead ids: ', len(overhead_ids))

side_ids = set(os.listdir(nutrition5k_side_path))
print('Number of unique side ids: ', len(side_ids))

# Dishes that have metadata, overhead images and side-angle images.
all_ids = metadata_ids & overhead_ids & side_ids
print('Number of unique ids overall: ', len(all_ids))

# Dishes that have metadata and side-angle images (overhead images optional).
side_meta_ids = metadata_ids & side_ids
print('Number of unique side meta ids: ', len(side_meta_ids))

Number of unique meta ids:  4768
Number of unique overhead ids:  3490
Number of unique side ids:  4793
Number of unique ids overall:  3112
Number of unique side meta ids:  4606


In [None]:
# create a new dataset containing only the valid ids
data_path = '../data/nutrition5k_revised'
image_path = os.path.join(data_path, 'images')
# Creating image_path also creates data_path; exist_ok keeps the cell re-runnable.
os.makedirs(image_path, exist_ok=True)


def _copy_files(src_dir, dst_dir):
    """Copy every regular file directly inside ``src_dir`` into ``dst_dir``.

    ``dst_dir`` is created on demand, so nothing is created when ``src_dir``
    holds no regular files. Sub-directories are skipped, which matches what a
    plain ``cp`` (without ``-r``) does. Unlike the previous shell-based copy,
    paths are never interpreted by a shell and a failed copy raises instead of
    being silently ignored.

    Raises:
        OSError: if ``src_dir`` cannot be listed or a file cannot be copied.
    """
    for name in os.listdir(src_dir):
        src_file = os.path.join(src_dir, name)
        if not os.path.isfile(src_file):
            continue
        os.makedirs(dst_dir, exist_ok=True)
        shutil.copy(src_file, dst_dir)


for dish_id in side_meta_ids:
    dish_image_dir = os.path.join(image_path, dish_id)

    # Side-angle frames are copied for every dish that has metadata.
    _copy_files(
        os.path.join(nutrition5k_side_path, dish_id, 'frames_sampled30'),
        dish_image_dir,
    )

    # Overhead images exist only for the dishes in all_ids; they land in the
    # same per-dish folder as the side-angle frames.
    if dish_id in all_ids:
        _copy_files(os.path.join(nutrition5k_overhead_path, dish_id), dish_image_dir)


print('Data preprocessing done!')
print('Data stored in: ', data_path)
# NOTE: this counts the per-dish folders under image_path, not individual image files.
print('Number of images: ', len(os.listdir(image_path)))


Data preprocessing done!
Data stored in:  ../data/nutrition5k_revised
Number of images:  4606


In [None]:
# label preprocessing
label_path = os.path.join(data_path, 'labels')
os.makedirs(label_path, exist_ok=True)

# Keep only the metadata rows whose dish id (first field) has side-angle images.
# The source is opened before the destination so that a failure to read
# metadata_path cannot leave behind a freshly truncated, empty labels.csv.
with open(metadata_path) as metadata, \
        open(os.path.join(label_path, 'labels.csv'), 'w') as f:
    for line in metadata:
        # Named dish_id rather than `id` so the builtin id() stays usable in the kernel.
        dish_id = line.strip().split(',')[0]
        if dish_id in side_meta_ids:
            f.write(line)

# Map each dish id in labels.csv to the list of numeric ingredient ids on its row.
food_ingredients = {}

with open(os.path.join(label_path, 'labels.csv'), 'r') as labels:
    for record in labels:
        fields = record.strip().split(',')
        # Any field starting with 'ingr_' is treated as an ingredient id; the
        # piece after the first underscore is parsed as an integer.
        food_ingredients[fields[0]] = [
            int(field.split('_')[1])
            for field in fields
            if field.startswith('ingr_')
        ]
            
# Look-up table: numeric ingredient id (second column) -> first column of the
# ingredients metadata file (presumably the ingredient name -- confirm against the CSV).
ingredients_dict = {}
with open(ingredients_path) as ingredients_file:
    next(ingredients_file)  # the first line is skipped (treated as a header)
    for record in ingredients_file:
        columns = record.strip().split(',')
        ingredients_dict[int(columns[1])] = columns[0]

# One line per dish: '<dish id>' + TAB + comma-joined look-up values of its ingredients.
with open(os.path.join(label_path, 'ingredients.txt'), 'w') as f:
    for food_id, ingredient_ids in food_ingredients.items():
        names = [ingredients_dict[ingredient_id] for ingredient_id in ingredient_ids]
        f.write(food_id + '\t' + ','.join(names) + '\n')
        
# Write the per-dish nutrition table for the revised dataset.
# Rows are restricted to side_meta_ids, the same filter used for labels.csv,
# so nutrition.csv never lists a dish that has no images in the revised dataset.
# The source is opened first so a read failure cannot truncate an existing file.
with open(metadata_path) as metadata, \
        open(os.path.join(label_path, 'nutrition.csv'), 'w') as f:
    f.write('id,calories,mass,fat,carb,protein\n')
    for line in metadata:
        fields = line.strip().split(',')
        if fields[0] not in side_meta_ids:
            continue
        # take only the first 6 columns (id, calories, mass, fat, carb, protein)
        f.write(','.join(fields[:6]) + '\n')

In [19]:
# NOTE(review): this rebinds metadata_path from the raw dish-metadata CSV to a
# directory inside the revised dataset. Earlier cells that open(metadata_path)
# will fail if re-run after this cell without first re-running the setup cell.
# The name is kept because later cells may rely on it.
metadata_path = os.path.join(data_path, 'metadata')
os.makedirs(metadata_path, exist_ok=True)

# Write one dish id per line. all_ids is a set, so it is sorted here to make
# the file identical from run to run.
with open(os.path.join(metadata_path, 'metadata.csv'), 'w') as f:
    for dish_id in sorted(all_ids):
        f.write(dish_id + '\n')

In [22]:
# Print every per-dish folder under image_path that does not contain the
# file 'camera_A_frame_001.jpeg'.
required_frame = 'camera_A_frame_001.jpeg'
for dish_dir in os.listdir(image_path):
    frames = os.listdir(os.path.join(image_path, dish_dir))
    if required_frame in frames:
        continue
    print(dish_dir)

dish_1559332822
dish_1561997933
dish_1558722636
dish_1558639979
dish_1558636388
dish_1559243974
dish_1558642420
dish_1559319971
dish_1558635523
dish_1558723475
dish_1558722685
dish_1566488956
dish_1558636483
dish_1559059954
dish_1558638954
dish_1558637423
dish_1558630038
dish_1558642458
dish_1559240908
dish_1558725286
dish_1559320019
dish_1559233055
dish_1559332418
dish_1558632130
dish_1558632080
dish_1558722171
dish_1558640941
dish_1558723866
dish_1558639555
dish_1558636868
dish_1558725193
dish_1559239753
dish_1558642490
dish_1559233144
dish_1558638896
dish_1558721827
dish_1558721980
dish_1558639335
dish_1558109864
dish_1558723512
dish_1558723414
dish_1561996647
dish_1561996813
dish_1567785778
dish_1558636813
dish_1558629013
dish_1558110436
dish_1558641989
dish_1558721272
dish_1558637698
dish_1567801520
dish_1561996612
dish_1558629517
dish_1558641164
dish_1559578599
dish_1559319902
dish_1558640105
dish_1567785709
dish_1558724891
dish_1558109945
dish_1558639662
dish_1559245848
dish_155