## Transform for HF dataset

In [1]:
import pandas as pd

data = pd.read_csv('../data/drink_recipies_monin.csv')
data.head()

Unnamed: 0,name,recipie,category,image_name,image_ext,image_url
0,'Luck Of The Irish' Frozen Lemonade,3/4 oz. Monin Pistachio Syrup\n3/4 oz. Monin G...,lemonade,1725657501_6166165.png,png,https://www.monin.com/media/catalog/product/ca...
1,4Th Of July Picnic Refresher,1 wedge(s) watermelon\nice\n5 oz. green tea\n1...,lemonade,1725657506_5769687.png,png,https://www.monin.com/media/catalog/product/ca...
2,Adult Key Lime Pie Lemonade,ice\n1 1/4 oz. citrus vodka\n3/4 oz. Monin Key...,lemonade,1725657511_360125.png,png,https://www.monin.com/media/catalog/product/ca...
3,After The Siesta Lemonade,1 pinch(es) fresh cilantro\n1 pinch(es) black ...,lemonade,1725657516_4292703.png,png,https://www.monin.com/media/catalog/product/ca...
4,Agave Limeade,1/2 oz. Monin Agave Organic Nectar\n3/4 oz. Mo...,lemonade,1725657521_6316235.png,png,https://www.monin.com/media/catalog/product/ca...


In [None]:
# check duplicates


In [4]:
import os
import shutil
import random

# Create necessary directories
train_dir = '../data/monin/combined_dataset/train'
test_dir = '../data/monin/combined_dataset/test'

os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# Define source folders
source_folders = ['coffee', 'cold_tea', 'lemonade']
source_paths = [f'../data/monin/{folder}' for folder in source_folders]

# Collect all files from the source folders
all_files = []
for folder in source_paths:
    for file in os.listdir(folder):
        file_path = os.path.join(folder, file)
        if os.path.isfile(file_path):  # Ensure it's a file
            all_files.append(file_path)

# Shuffle the files to randomize the split
random.shuffle(all_files)

# Calculate the split index
split_idx = int(0.8 * len(all_files))

# Split files into train and test sets
train_files = all_files[:split_idx]
test_files = all_files[split_idx:]

# Move the files into the respective directories
for file in train_files:
    shutil.copy(file, train_dir)

for file in test_files:
    shutil.copy(file, test_dir)

print(f"Total files: {len(all_files)}, Train: {len(train_files)}, Test: {len(test_files)}")


Total files: 687, Train: 549, Test: 138


In [6]:
data.columns

Index(['name', 'recipie', 'category', 'image_name', 'image_ext', 'image_url'], dtype='object')

In [17]:
# now add metadata.jsonl to train and test folders
import json
metadata = data.to_dict(orient='records')

# before dumpingto json, change 'image_name' to 'file_name'
for item in metadata:
    item['file_name'] = item.pop('image_name')
    
# create a column "text" with format
#Name: {name}, Recipie: {recipie}, Category: {category}
for item in metadata:
    item['text'] = f"Name: {item['name']}; Recipie: {item['recipie']}; Category: {item['category']}"


# dump metadata to json to metadata.jsonl
with open(os.path.join(train_dir, 'metadata.jsonl'), 'w') as f:
    for item in metadata:
        json.dump(item, f)
        f.write('\n')
        
with open(os.path.join(test_dir, 'metadata.jsonl'), 'w') as f:
    for item in metadata:
        json.dump(item, f)
        f.write('\n')

In [10]:
from datasets import load_dataset

dataset = load_dataset("imagefolder", data_dir="../data/monin/combined_dataset", split="train")

Downloading data: 100%|██████████| 552/552 [00:00<00:00, 27605.95files/s]
Downloading data: 100%|██████████| 141/141 [00:00<?, ?files/s]
Generating train split: 549 examples [00:00, 6168.33 examples/s]
Generating test split: 138 examples [00:00, 4749.01 examples/s]


In [14]:
dataset[0]['recipie']

'1 wedge(s) watermelon\nice\n5 oz. green tea\n1 oz. Monin Watermelon Syrup\n2 oz. lemon-lime soda\n\nCombine ingredients, except sparkling beverage  in serving glass.\nStir well to combine.\nTop with sparkling beverage.\nGarnish.'

In [15]:
# check cuda
import torch
torch.cuda.is_available()

True