In [None]:
!pip install kaggle -q
!pip install split-folders -q

In [None]:
import json
import os
from getpass import getpass
from pathlib import Path

import numpy as np
import splitfolders
import tensorflow as tf

In [None]:
# Kaggle API setup
#
# SECURITY: credentials must never be hardcoded in a notebook. An API key used to be written
# in plain text in this cell; treat that key as leaked and revoke it from the Kaggle account
# settings. Credentials are now taken from the KAGGLE_USERNAME / KAGGLE_KEY environment
# variables, falling back to an interactive prompt (the key is not echoed).
kaggle_dir = Path.home() / ".kaggle"
kaggle_dir.mkdir(parents=True, exist_ok=True)  # unlike a bare `mkdir`, safe to re-run

kaggle_json = kaggle_dir / "kaggle.json"
if not kaggle_json.exists():
    # Only ask for credentials when no config file is present, so an existing setup is kept.
    kaggle_credentials = {
        "username": os.environ.get("KAGGLE_USERNAME") or input("Kaggle username: "),
        "key": os.environ.get("KAGGLE_KEY") or getpass("Kaggle API key: "),
    }
    kaggle_json.write_text(json.dumps(kaggle_credentials))

# Keep the key file readable/writable by the current user only (same as `chmod 600`).
kaggle_json.chmod(0o600)

In [None]:
# Donglod dataset
!kaggle datasets download -d trolukovich/food11-image-dataset
!kaggle datasets download -d kmader/food41

Downloading food11-image-dataset.zip to /content
 99% 1.07G/1.08G [00:06<00:00, 172MB/s]
100% 1.08G/1.08G [00:06<00:00, 168MB/s]
Downloading food41.zip to /content
100% 5.30G/5.30G [01:26<00:00, 49.9MB/s]
100% 5.30G/5.30G [01:26<00:00, 66.0MB/s]


# Extracting & Splitting Dataset

Target number of images per class:

- Training: 700
- Validation: 150
- Test: 150
- *Total: 1000*

Dataset 1 (`food11-image-dataset`):
- Bread
- Dessert
- Egg
- Meat
- Soup
- Vegetable-Fruit

Dataset 2 (`food41`):
- fried_rice
- french_fries
- spaghetti_carbonara
- hamburger
- pizza
- sushi

In [None]:
!mkdir dataset
!mkdir dataset/train
!mkdir dataset/dev
!mkdir dataset/test

## Dataset 1 - Extraction & Cleaning

In [None]:
!unzip -j -q food11-image-dataset.zip "training/Bread/*" -d "dataset/train/bread"
!unzip -j -q food11-image-dataset.zip "training/Dessert/*" -d "dataset/train/dessert"
!unzip -j -q food11-image-dataset.zip "training/Egg/*" -d "dataset/train/egg"
!unzip -j -q food11-image-dataset.zip "training/Meat/*" -d "dataset/train/meat"
!unzip -j -q food11-image-dataset.zip "training/Soup/*" -d "dataset/train/soup"
!unzip -j -q food11-image-dataset.zip "training/Vegetable-Fruit/*" -d "dataset/train/vegfruit"

In [None]:
!unzip -j -q food11-image-dataset.zip "validation/Bread/*" -d "dataset/dev/bread"
!unzip -j -q food11-image-dataset.zip "validation/Dessert/*" -d "dataset/dev/dessert"
!unzip -j -q food11-image-dataset.zip "validation/Egg/*" -d "dataset/dev/egg"
!unzip -j -q food11-image-dataset.zip "validation/Meat/*" -d "dataset/dev/meat"
!unzip -j -q food11-image-dataset.zip "validation/Soup/*" -d "dataset/dev/soup"
!unzip -j -q food11-image-dataset.zip "validation/Vegetable-Fruit/*" -d "dataset/dev/vegfruit"

In [None]:
!unzip -j -q food11-image-dataset.zip "evaluation/Bread/*" -d "dataset/test/bread"
!unzip -j -q food11-image-dataset.zip "evaluation/Dessert/*" -d "dataset/test/dessert"
!unzip -j -q food11-image-dataset.zip "evaluation/Egg/*" -d "dataset/test/egg"
!unzip -j -q food11-image-dataset.zip "evaluation/Meat/*" -d "dataset/test/meat"
!unzip -j -q food11-image-dataset.zip "evaluation/Soup/*" -d "dataset/test/soup"
!unzip -j -q food11-image-dataset.zip "evaluation/Vegetable-Fruit/*" -d "dataset/test/vegfruit"

In [None]:
# Delete unnecessary data

!ls -d dataset/train/bread/* | tail -n +701 | xargs rm
!ls -d dataset/train/dessert/* | tail -n +701 | xargs rm
!ls -d dataset/train/egg/* | tail -n +701 | xargs rm
!ls -d dataset/train/meat/* | tail -n +701 | xargs rm
!ls -d dataset/train/soup/* | tail -n +701 | xargs rm
!ls -d dataset/train/vegfruit/* | tail -n +701 | xargs rm

In [None]:
!ls -d dataset/dev/bread/* | tail -n +151 | xargs rm
!ls -d dataset/dev/dessert/* | tail -n +151 | xargs rm
!ls -d dataset/dev/egg/* | tail -n +151 | xargs rm
!ls -d dataset/dev/meat/* | tail -n +151 | xargs rm
!ls -d dataset/dev/soup/* | tail -n +151 | xargs rm
!ls -d dataset/dev/vegfruit/* | tail -n +151 | xargs rm

In [None]:
!ls -d dataset/test/bread/* | tail -n +151 | xargs rm
!ls -d dataset/test/dessert/* | tail -n +151 | xargs rm
!ls -d dataset/test/egg/* | tail -n +151 | xargs rm
!ls -d dataset/test/meat/* | tail -n +151 | xargs rm
!ls -d dataset/test/soup/* | tail -n +151 | xargs rm
!ls -d dataset/test/vegfruit/* | tail -n +151 | xargs rm

## Dataset 2 - Extraction & Split

In [None]:
!mkdir dirty_data

In [None]:
!unzip -j -q food41.zip "images/fried_rice/*" -d "dirty_data/fried_rice"
!unzip -j -q food41.zip "images/french_fries/*" -d "dirty_data/french_fries"
!unzip -j -q food41.zip "images/hamburger/*" -d "dirty_data/hamburger"
!unzip -j -q food41.zip "images/pizza/*" -d "dirty_data/pizza"
!unzip -j -q food41.zip "images/spaghetti_carbonara/*" -d "dirty_data/spaghetti"
!unzip -j -q food41.zip "images/sushi/*" -d "dirty_data/sushi"

In [None]:
# Split every class folder under dirty_data into train / val / test at 70% / 15% / 15%,
# i.e. 700 / 150 / 150 images for a class of 1000. The result is written to output_data.
split_ratio = (.7, .15, .15)
splitfolders.ratio(
    "dirty_data",
    output="output_data",
    seed=42,
    ratio=split_ratio,
    group_prefix=None,
)

Copying files: 6000 files [00:01, 3556.91 files/s]


In [None]:
# Cleaning up folders

!mv output_data/train/* dataset/train/
!mv output_data/val/* dataset/dev/
!mv output_data/test/* dataset/test/
!rm -rf output_data
!rm -rf dirty_data

## Dataset 3 - Scraping

In [None]:
from bs4 import BeautifulSoup
import requests

In [None]:
# Fetch one Bing image-search result page for "nasi rendang" and collect the elements
# matching the `.imgbox` selector.
# Fix: the path previously read "images/s/earch", which is not the search endpoint.
base_url = "https://www.bing.com/images/search?q=nasi+rendang&form=HDRSC2&first=1&tsc=ImageBasicHover"
# Other sources that were tried:
# base_url = "https://www.google.com/search?q=nasi+rendang&tbm=isch&ved=2ahUKEwi2j4zMgLfwAhVYG3IKHRInAGIQ2-cCegQIABAA&oq=nasi+rendang&gs_lcp=CgNpbWcQDFAAWABgpp7qAmgAcAB4AIABAIgBAJIBAJgBAKoBC2d3cy13aXotaW1n&sclient=img&ei=deaUYPaAEdi2yAOSzoCQBg"
# base_url = "https://cookpad.com/id"
# base_url = "https://image.baidu.com/search/index?tn=baiduimage&ps=1&ct=201326592&lm=-1&cl=2&nc=1&ie=utf-8&word=rendang"
# food_url = "https://www.bing.com/images/search?view=detailV2&ccid=vu01BajQ&id=83631D8938F489950B36394260A2E056AE1201FD&thid=OIP.vu01BajQ8gHARX1imUU6kQHaHa&mediaurl=https%3a%2f%2fi1.wp.com%2ficone-inc.org%2fwp-content%2fuploads%2f2019%2f03%2fNasi-Padang-Vera.jpg%3ffit%3d4096%252C4096%26ssl%3d1&exph=4096&expw=4096&q=nasi+rendang&simid=608048948364126468&ck=C3942BA57F808B87ACBC0F30B78619F5&selectedIndex=0&FORM=IRPRST"
hdr = {'User-Agent': 'Mozilla/5.0'}

# A timeout keeps the cell from hanging forever; raise_for_status() fails loudly on a
# 4xx/5xx response instead of silently parsing an error page into an empty result.
response = requests.get(base_url, headers=hdr, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'lxml')
food = []
# print(soup)
# food_urls = {food_element: None for food_element in soup.select(".block bg-black-transparent rounded-t overflow-hidden, .tofu_image")}
# NOTE(review): confirm that ".imgbox" still matches Bing's result markup; if this prints {}
# with the corrected URL, inspect `soup` and adjust the selector.
food_urls = {food_element: None for food_element in soup.select(".imgbox")}
# soup.find_all("div", {"class": "stylelistrow"})
print(food_urls)

{}
